In [1]:
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Silences every warning category for the whole kernel session.
# NOTE(review): this also hides pandas diagnostics (e.g. chained-assignment or
# fragmentation warnings) raised by the functions below - consider narrowing it
# to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Seed shared by the notebook's random operations: it seeds NumPy's global RNG
# here (consumed by np.random.shuffle in make_npys) and is passed as
# random_state to DataFrame.sample in reshape.
SEED = 1337
np.random.seed(SEED)

In [3]:
# Rows with time < SPLIT_DATE_TRAIN form the training split (preprocess_df fits
# its scalers on them, split_all emits them as x_train / y_train).
SPLIT_DATE_TRAIN = "2020-01-01"
# Rows with SPLIT_DATE_TRAIN <= time < SPLIT_DATE_VAL are validation; rows at or
# after SPLIT_DATE_VAL are test (see split_all).
SPLIT_DATE_VAL = "2023-01-01"
# Neighbourhood radius around a cell's centre point; compared against the output
# of haversine_distance (which uses an Earth radius of 6371.0, so presumably km).
RADIUS = 300
# Minimum number of training-period events within RADIUS for filter_regions to
# keep a cell.
THRESHOLD = 750
# Time steps per sample: the current event plus BLOCK_SIZE - 1 lagged events.
BLOCK_SIZE = 64

In [4]:
# Tunable preprocessing constants read by preprocess_df and make_block.
PREPROC_PARAMS = {
    # Clip bounds applied to "mag" before min-max scaling.
    "mag_low": -1,
    "mag_high": 7,
    # Clip bounds applied to the log-transformed "depth" before min-max scaling.
    "depth_low": 2,
    "depth_high": 1e8,
    # Divisor for the current event's distance to the cell centre (make_block).
    "scale_distance": 78.44,
    # Divisor for the lagged events' distances to the cell centre (make_block).
    "scale_distance_lag": 300,
}

In [5]:
# Load the feature table and keep only rows without any missing value.
df = pd.read_csv("../data/with_features_final.csv").dropna()

In [6]:
# Display the loaded frame (pandas truncates the repr to the first/last rows).
df

Unnamed: 0,time,longitude,latitude,depth,mag,longitude_disc,latitude_disc,pos,lat_cent,lon_cent,plate_region,dist_region,dist,plate,label
0,1973-01-01,-155.360833,19.443667,7.302,1.85,-156,19,19_-156,19.5,-155.5,22,3527.668174,3522.498687,22.0,0
1,1973-01-02,-155.204333,19.326000,6.589,2.21,-156,19,19_-156,19.5,-155.5,22,3527.668174,3521.228924,22.0,0
2,1973-01-02,-155.253167,19.314833,7.041,1.93,-156,19,19_-156,19.5,-155.5,22,3527.668174,3525.575831,22.0,0
3,1973-01-03,-155.290167,19.399833,7.864,1.76,-156,19,19_-156,19.5,-155.5,22,3527.668174,3521.146991,22.0,0
4,1973-01-03,-155.273667,19.408500,13.166,1.94,-156,19,19_-156,19.5,-155.5,22,3527.668174,3519.272394,22.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4111174,2023-09-17,-16.551100,36.338800,10.000,4.30,-17,36,36_-17,36.5,-16.5,18,106.166869,123.068500,18.0,0
4111175,2023-09-29,-63.741300,60.600500,10.000,4.20,-64,60,60_-64,60.5,-63.5,18,1730.151056,1743.541742,18.0,0
4111176,2023-09-28,-110.487800,32.234100,5.000,3.10,-111,32,32_-111,32.5,-110.5,31,364.340172,356.687073,31.0,0
4111177,2023-09-27,93.112900,0.862200,10.000,5.50,93,0,0_93,0.5,93.5,15,50.773571,35.611755,15.0,0


In [7]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float, numpy.ndarray or pandas.Series
        Coordinates in decimal degrees. Scalars and arrays/Series may be
        mixed; they combine element-wise.

    Returns
    -------
    float, numpy.ndarray or pandas.Series
        Distance expressed in the unit of the Earth radius used below
        (6371.0, i.e. kilometres).
    """
    R = 6371.0  # mean Earth radius in km
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points); sqrt/arcsin would then return NaN, so clamp it.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

In [8]:
def filter_regions(df: pd.DataFrame, threshold: int, radius: int) -> list:
    """Select the grid cells that have enough training-period events nearby.

    For every distinct ``pos`` key (``"<lat>_<lon>"``) among the rows dated
    before the module-level ``SPLIT_DATE_TRAIN``, count the rows whose
    haversine distance to the point ``(lat + 0.5, lon + 0.5)`` is at most
    ``radius`` and keep the key when that count reaches ``threshold``.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``time``, ``pos``, ``latitude`` and
        ``longitude``. It is not modified.
    threshold : int
        Minimum number of events within ``radius`` for a cell to be kept.
    radius : int
        Search radius, in the unit returned by ``haversine_distance``.

    Returns
    -------
    list
        The retained ``pos`` keys, in order of first appearance.
    """
    # Strict "<" so the training period is the same one used by preprocess_df
    # and split_all; events dated exactly on the split belong to validation.
    df_f = df[df["time"] < SPLIT_DATE_TRAIN]

    # Pull the coordinates out once; the loop below only needs these arrays.
    latitudes = df_f["latitude"].to_numpy()
    longitudes = df_f["longitude"].to_numpy()

    # Cheap bounding-box pre-filter in degrees (radius / 111 per degree, plus a
    # margin of 3) applied before the exact haversine check.
    # NOTE(review): the same margin is applied to longitude, where a degree
    # covers less ground away from the equator, and the box does not wrap at
    # +/-180 - confirm this is acceptable for high-latitude / dateline cells.
    diff = int(radius / 111) + 3

    regions = []
    for pos in tqdm.tqdm(df_f["pos"].unique()):
        lat, lon = pos.split("_")
        lat, lon = float(lat), float(lon)
        in_box = (
            (latitudes >= lat - diff)
            & (latitudes <= lat + diff)
            & (longitudes >= lon - diff)
            & (longitudes <= lon + diff)
        )
        distance = haversine_distance(latitudes[in_box], longitudes[in_box], lat + 0.5, lon + 0.5)
        if np.count_nonzero(distance <= radius) >= threshold:
            regions.append(pos)
    return regions

In [9]:
def preprocess_df(df, PREPROC_PARAMS, SPLIT_DATE_TRAIN):
    """Min-max scale the model features, fitting on the training period only.

    Every scaler is fitted on rows with ``time < SPLIT_DATE_TRAIN`` and then
    applied to all rows:

    * ``mag``: clipped to ``[mag_low, mag_high]``, then scaled.
    * ``depth``: shifted by ``|min(depth)| + 1``, log-transformed, clipped to
      ``[depth_low, depth_high]``, then scaled.
    * ``latitude`` / ``longitude``: scaled into the new columns
      ``latitude_new`` / ``longitude_new`` (the raw columns are kept).
    * ``lat_cent`` / ``lon_cent``: scaled in place.
    * ``dist`` / ``dist_region``: ``log(x + 1)``, then scaled in place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the columns listed above plus ``time``. It is modified
        in place and also returned.
    PREPROC_PARAMS : dict
        Provides ``mag_low``, ``mag_high``, ``depth_low`` and ``depth_high``.
    SPLIT_DATE_TRAIN
        Upper bound (exclusive) of the training period.

    Returns
    -------
    tuple
        ``(df, scaler_dict)`` where ``scaler_dict`` maps each output column
        name to its fitted ``MinMaxScaler``.
    """
    # Snapshot of the training rows, taken before any column is overwritten.
    train_rows = df[df["time"] < SPLIT_DATE_TRAIN]
    scaler_dict = {}

    def as_column(values):
        # scikit-learn scalers expect a 2-D (n_samples, 1) array.
        return values.reshape(-1, 1)

    def fit_scaler(key, train_values):
        # Fit a fresh scaler on the training values and register it under `key`.
        fitted = MinMaxScaler()
        fitted.fit(as_column(train_values))
        scaler_dict[key] = fitted
        return fitted

    # --- magnitude: clip, then scale -------------------------------------
    mag_low, mag_high = PREPROC_PARAMS["mag_low"], PREPROC_PARAMS["mag_high"]
    mag_scaler = fit_scaler("mag", np.clip(train_rows["mag"].values, mag_low, mag_high))
    df["mag"] = mag_scaler.transform(as_column(np.clip(df["mag"].values, mag_low, mag_high)))

    # --- depth: shift to positive, log, clip, then scale -----------------
    depth_low, depth_high = PREPROC_PARAMS["depth_low"], PREPROC_PARAMS["depth_high"]
    depth_shift = np.abs(df["depth"].min())  # computed over all rows, not just training
    train_depth = np.log(train_rows["depth"] + depth_shift + 1).values
    depth_scaler = fit_scaler("depth", np.clip(train_depth, depth_low, depth_high))
    df["depth"] = np.log(df["depth"] + depth_shift + 1)
    df["depth"] = depth_scaler.transform(as_column(np.clip(df["depth"].values, depth_low, depth_high)))

    # --- plain min-max columns: (source column, output column) -----------
    plain_columns = (
        ("latitude", "latitude_new"),
        ("longitude", "longitude_new"),
        ("lat_cent", "lat_cent"),
        ("lon_cent", "lon_cent"),
    )
    for source, target in plain_columns:
        plain_scaler = fit_scaler(target, train_rows[source].values)
        df[target] = plain_scaler.transform(as_column(df[source].values))

    # --- distance columns: log(x + 1), then scale -------------------------
    for column in ("dist", "dist_region"):
        log_scaler = fit_scaler(column, np.log(train_rows[column] + 1).values)
        df[column] = log_scaler.transform(as_column(np.log(df[column] + 1).values))

    return df, scaler_dict

In [10]:
def make_block(df, pos, radius, block_size, PREPROC_PARAMS):
    """Build the lagged-feature table for one grid cell.

    Events of the cell ``pos`` are merged with events of other cells that lie
    within ``radius`` of the point ``(lat + 0.5, lon + 0.5)``. The merged
    events are ordered by time, and for each event the values of the
    ``block_size - 1`` preceding events are attached as ``<feature>_<k>``
    columns. Only the cell's own events that have a complete history (no NaN
    in any column) are returned.

    Parameters
    ----------
    df : pd.DataFrame
        Preprocessed events. ``time`` must be datetime-like (the ``.dt``
        accessor is used); the columns ``pos``, ``latitude``, ``longitude``,
        ``label``, ``mag``, ``depth``, ``latitude_new``, ``longitude_new``,
        ``dist`` and ``plate`` are read. It is not modified.
    pos : str
        Cell key of the form ``"<lat>_<lon>"``.
    radius : int
        Neighbourhood radius, in the unit returned by ``haversine_distance``.
    block_size : int
        Number of time steps per sample (current event + lags).
    PREPROC_PARAMS : dict
        Provides ``scale_distance`` (divisor for the current event's distance)
        and ``scale_distance_lag`` (divisor for the lagged distances).

    Returns
    -------
    pd.DataFrame
        One row per retained event of the cell, with the original columns,
        ``distance``, ``diff_days`` and all lag columns.
    """
    # Bucket edges (in days) used to discretise the gap to the previous event.
    bins = [0, 1, 2, 3, 4, 5, 6, 7, 10, 14, 21, 30, 60, 180, 1e8]
    lagged_features = ["mag", "depth", "latitude_new", "longitude_new", "dist", "distance", "plate", "diff_days"]

    lat, lon = pos.split("_")
    lat, lon = float(lat), float(lon)

    # Cheap bounding-box pre-filter in degrees before the exact distance check.
    # NOTE(review): the same margin is used for longitude, where a degree
    # covers less ground away from the equator, and the box does not wrap at
    # +/-180 - confirm this is acceptable for high-latitude / dateline cells.
    diff = int(radius / 111) + 3
    in_box = (
        (df["latitude"] >= lat - diff)
        & (df["latitude"] <= lat + diff)
        & (df["longitude"] >= lon - diff)
        & (df["longitude"] <= lon + diff)
    )

    own_events = df[df["pos"] == pos]
    # Neighbouring-cell events only provide history; label -1 marks them so
    # they can be removed again once the lags are built. `.assign` returns a
    # new frame, so no column is ever set on a slice of `df`.
    neighbour_events = df[in_box & (df["pos"] != pos)].assign(label=-1)

    events = pd.concat([own_events, neighbour_events], axis=0)
    events["distance"] = haversine_distance(events["latitude"], events["longitude"], lat + 0.5, lon + 0.5)
    events = events[events["distance"] <= radius].sort_values(by=["time"])

    gap_days = (events["time"] - events["time"].shift(1)).dt.days
    events["diff_days"] = np.digitize(gap_days, bins=bins) - 1

    # Build every lag column first and attach them with a single concat:
    # inserting hundreds of columns one at a time fragments the frame and is
    # much slower. Column order matches the per-lag, per-feature order.
    lag_columns = []
    for idx in range(1, block_size):
        suffix = "_" + str(idx)
        for feature in lagged_features:
            shifted = events[feature].shift(idx)
            if feature == "distance":
                shifted = shifted / PREPROC_PARAMS["scale_distance_lag"]
            lag_columns.append(shifted.rename(feature + suffix))
    events = pd.concat([events, *lag_columns], axis=1)

    # Keep only the cell's own events; the explicit copy makes the following
    # column update independent of `events`.
    result = events[events["label"] != -1].copy()
    result["distance"] = result["distance"] / PREPROC_PARAMS["scale_distance"]
    # Rows lacking a full history (or any other value) are discarded.
    return result.dropna()

In [11]:
def reshape(df, block_size, feature_order, featrues_region):
    """Shuffle the rows of ``df`` and convert them into model input arrays.

    Rows are shuffled with ``random_state=SEED`` (module-level constant).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain every column named in ``feature_order`` and
        ``featrues_region`` plus ``label``.
    block_size : int
        Number of time steps each row is unfolded into.
    feature_order : list
        Column names of the sequence features; each row's values are reshaped
        to ``(block_size, len(feature_order) // block_size)``.
    featrues_region : list
        Column names of the per-row region features.

    Returns
    -------
    tuple
        ``(x, x_region, y)`` with shapes
        ``(n, block_size, len(feature_order) // block_size)``,
        ``(n, len(featrues_region))`` and ``(n, 1)``.
    """
    shuffled = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

    per_step = len(feature_order) // block_size
    region_width = len(featrues_region)

    sequence_values = shuffled[feature_order].to_numpy()
    region_values = shuffled[featrues_region].to_numpy()
    label_values = shuffled["label"].to_numpy()

    return (
        sequence_values.reshape(-1, block_size, per_step),
        region_values.reshape(-1, region_width),
        label_values.reshape(-1, 1),
    )

In [12]:
def split_all(df, block_size, feature_order, features_region, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL):
    """Split ``df`` chronologically and reshape each part into arrays.

    * train: ``time < SPLIT_DATE_TRAIN``
    * validation: ``SPLIT_DATE_TRAIN <= time < SPLIT_DATE_VAL``
    * test: ``time >= SPLIT_DATE_VAL``

    Each part is passed through ``reshape``.

    Returns
    -------
    tuple
        Nine arrays in the order ``x_train, x_train_region, y_train, x_val,
        x_val_region, y_val, x_test, x_test_region, y_test``.
    """
    when = df["time"]
    is_train = when < SPLIT_DATE_TRAIN
    is_val = (when >= SPLIT_DATE_TRAIN) & (when < SPLIT_DATE_VAL)
    is_test = when >= SPLIT_DATE_VAL

    arrays = []
    for mask in (is_train, is_val, is_test):
        # reshape yields (x, x_region, y) for the selected period.
        arrays.extend(reshape(df[mask], block_size, feature_order, features_region))
    return tuple(arrays)

In [13]:
def make_npys(df, radius, th, block_size, features_order, features_region, PREPROC_PARAMS, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL, out_dir="../data/npys"):
    """Run the full pipeline and write one set of ``.npy`` files per region.

    Steps: parse and sort ``time``, select regions with ``filter_regions``,
    scale the features with ``preprocess_df``, shuffle the region list with
    NumPy's global RNG, then for each region build its lagged table
    (``make_block``), split it (``split_all``) and save the nine resulting
    arrays as ``<name>_<idx>.npy`` inside ``out_dir``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw event table. It is modified in place (``time`` is converted and
        the rows are re-sorted, and ``preprocess_df`` overwrites feature
        columns), so pass a copy if the original is still needed.
    radius, th, block_size
        Forwarded to ``filter_regions`` / ``make_block`` / ``split_all``.
    features_order, features_region : list
        Column lists forwarded to ``split_all``.
    PREPROC_PARAMS : dict
        Forwarded to ``preprocess_df`` and ``make_block``.
    SPLIT_DATE_TRAIN, SPLIT_DATE_VAL
        Split boundaries forwarded to ``preprocess_df`` and ``split_all``.
    out_dir : str, optional
        Directory receiving the ``.npy`` files; created if missing.

    Returns
    -------
    dict
        The fitted scalers returned by ``preprocess_df``.
    """
    # Create the target directory up front so a missing folder is not what
    # makes the first np.save fail.
    os.makedirs(out_dir, exist_ok=True)

    df["time"] = pd.to_datetime(df["time"], format="mixed")
    df.sort_values(by="time", inplace=True)
    regions = filter_regions(df, th, radius)
    df, scaler_dict = preprocess_df(df, PREPROC_PARAMS, SPLIT_DATE_TRAIN)

    # File indices follow the shuffled order, which depends on the state of
    # NumPy's global RNG at call time.
    # NOTE(review): the idx -> pos mapping is not persisted anywhere.
    np.random.shuffle(regions)

    # Same order as the tuple returned by split_all.
    array_names = (
        "x_train", "x_train_region", "y_train",
        "x_val", "x_val_region", "y_val",
        "x_test", "x_test_region", "y_test",
    )
    for idx, pos in enumerate(tqdm.tqdm(regions)):
        df_pos = make_block(df, pos, radius, block_size, PREPROC_PARAMS)
        arrays = split_all(df_pos, block_size, features_order, features_region, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL)
        for name, array in zip(array_names, arrays):
            np.save(os.path.join(out_dir, name + "_" + str(idx) + ".npy"), array)
    return scaler_dict

In [14]:
# Per-sample region descriptors (one value each per row).
features_region = ["lat_cent", "lon_cent", "dist_region", "plate_region"]
# Per-event features; each one also exists as lagged columns "<name>_<k>".
featrues = ["mag", "depth", "latitude_new", "longitude_new", "dist", "distance", "plate", "diff_days"]
# Column order for the sequence input: oldest lag first (BLOCK_SIZE - 1 down to
# 1), all features of one lag grouped together, then the current event last.
lagged_names = [f"{name}_{lag}" for lag in range(BLOCK_SIZE - 1, 0, -1) for name in featrues]
featrues_order = [*lagged_names, *featrues]

In [15]:
# Run the full pipeline. make_npys modifies the frame it receives, so it is
# given a deep copy and the loaded `df` stays untouched. The call writes the
# per-region .npy files and returns the fitted scalers.
scalers = make_npys(df.copy(deep=True), RADIUS, THRESHOLD, BLOCK_SIZE, featrues_order, features_region, PREPROC_PARAMS, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL)

100%|██████████| 1378/1378 [35:03<00:00,  1.53s/it] 


In [16]:
# Pickle the dict of fitted scalers returned by make_npys.
with open("../data/scalers_for_npys.pkl", "wb") as scaler_file:
    pickle.dump(scalers, scaler_file)