In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import tqdm
import pickle
warnings.filterwarnings("ignore")

In [2]:
# One seed for the whole notebook: it seeds NumPy's global RNG (consumed by
# np.random.shuffle in make_npys) and is also passed as random_state to
# DataFrame.sample in reshape.
SEED = 1337
np.random.seed(SEED)

In [3]:
# Date boundaries used by preprocess_df and split_all:
#   time <  SPLIT_DATE_TRAIN                    -> train
#   SPLIT_DATE_TRAIN <= time < SPLIT_DATE_VAL   -> validation
#   time >= SPLIT_DATE_VAL                      -> test
SPLIT_DATE_TRAIN = "2020-01-01"
SPLIT_DATE_VAL = "2023-01-01"
# Neighbourhood radius handed to make_block; compared against the output of
# haversine_distance (which uses R = 6371.0, so presumably kilometres).
RADIUS = 300
# Minimum number of events a `pos` cell must have to be returned by filter_regions.
THRESHOLD = 150
# Sequence length per sample: the current event plus BLOCK_SIZE - 1 lagged events.
BLOCK_SIZE = 64

In [4]:
# Clipping bounds and scale factors consumed by preprocess_df and make_block.
PREPROC_PARAMS = {
    # Clip bounds applied to the raw magnitude before min-max scaling.
    "mag_low": -1,
    "mag_high": 7,
    # Clip bounds applied to the log-transformed (shifted) depth.
    "depth_low": 2,
    "depth_high": 1e8,
    # Clip bounds applied to log(dist + 1).
    "dist_low": 1,
    "dist_high": 1e8,
    # Clip bounds applied to log(dist_region + 1).
    "dist_region_low": 2,
    "dist_region_high": 1e8,
    # Divisor for the current event's distance to the cell centre (make_block).
    "scale_distance": 78.44,
    # Divisor for the lagged distance_<k> columns (make_block).
    "scale_distance_lag": 300,
}

In [5]:
# Load the feature table and keep only rows without any missing value.
df = pd.read_csv("../data/with_features.csv").dropna()

In [6]:
# Preview the loaded events table (the notebook display truncates it to the first and last rows).
df

Unnamed: 0,time,longitude,latitude,depth,mag,magType,time_disc,longitude_disc,latitude_disc,pos,lat_cent,lon_cent,plate_region,dist_region,dist,plate,label
0,1973-01-01 01:05:56.150,-117.588000,34.189833,6.000,1.70,5,1973-01-01,-118,34,34_-118,34.5,-117.5,1,16.691592,19.302507,1,0
1,1973-01-01 04:46:09.800,150.634000,-9.214000,41.000,5.30,3,1973-01-01,150,-10,-10_150,-9.5,150.5,51,21.124956,55.729840,61,0
2,1973-01-01 05:20:59.780,-122.117333,48.308667,13.680,2.20,2,1973-01-01,-123,48,48_-123,48.5,-122.5,3,296.114618,314.738391,3,0
3,1973-01-01 06:22:29.800,-173.958000,-15.012000,33.000,5.00,3,1973-01-01,-174,-16,-16_-174,-15.5,-173.5,34,84.317941,18.239739,42,0
4,1973-01-01 08:58:11.460,-155.360833,19.443667,7.302,1.85,1,1973-01-01,-156,19,19_-156,19.5,-155.5,2,3527.668174,3522.498687,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4100001,2023-10-01 01:27:41.780,-179.015000,51.730000,10.980,1.22,1,2023-10-01,-180,51,51_-180,51.5,-179.5,1,113.570487,137.833943,1,0
4100002,2023-10-01 01:30:30.046,152.699600,-4.342900,65.434,4.90,3,2023-10-01,152,-5,-5_152,-4.5,152.5,33,39.066841,17.721537,35,0
4100003,2023-10-01 01:33:05.281,-152.662000,60.099900,109.400,1.90,1,2023-10-01,-153,60,60_-153,60.5,-152.5,1,397.084905,387.824459,1,0
4100004,2023-10-01 01:41:39.645,-175.317600,51.902800,75.100,2.10,1,2023-10-01,-176,51,51_-176,51.5,-175.5,1,84.433512,123.080765,1,0


In [7]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    All four arguments are angles in decimal degrees and may be scalars,
    NumPy arrays or pandas Series (NumPy broadcasting rules apply).

    Returns the distance scaled by R = 6371.0 (mean Earth radius in km), with
    the same shape/type as the broadcast inputs.
    """
    R = 6371.0
    lat1, lon1, lat2, lon2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` a few ulps above 1 for (near-)antipodal points,
    # which would make sqrt(a) > 1 and arcsin return NaN. Clamp the upper end.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

In [8]:
def filter_regions(df: pd.DataFrame, threshold: int, split_date=None) -> np.ndarray:
    """Return the `pos` cells that have enough training-period events.

    Parameters
    ----------
    df : DataFrame with at least the columns "time", "pos" and "mag".
    threshold : minimum number of non-null "mag" rows a cell needs to be kept.
    split_date : upper bound (exclusive) of the training period. Defaults to
        the module-level SPLIT_DATE_TRAIN when omitted.

    Returns
    -------
    A 1-D NumPy array of the qualifying `pos` values, in sorted group order.
    The array is a fresh copy, so callers may shuffle it in place.
    """
    if split_date is None:
        split_date = SPLIT_DATE_TRAIN
    # Strict `<` so the boundary matches the train split used by
    # preprocess_df and split_all (an event exactly at the split date is not train).
    train_events = df[df["time"] < split_date]
    counts = train_events.groupby("pos")["mag"].count()
    return counts[counts >= threshold].index.to_numpy(copy=True)

In [9]:
def preprocess_df(df, PREPROC_PARAMS, SPLIT_DATE_TRAIN):
    """Min-max scale the model input columns of `df` in place.

    Every scaler is fitted only on rows with time < SPLIT_DATE_TRAIN and then
    applied to the whole frame:

    * "mag" is clipped to [mag_low, mag_high] first.
    * "depth" is shifted by |min depth of the full frame| + 1, log-transformed
      and clipped to [depth_low, depth_high].
    * "latitude" / "longitude" are scaled into new "latitude_new" /
      "longitude_new" columns; the originals are left untouched.
    * "lat_cent" / "lon_cent" are scaled in place.
    * "dist" / "dist_region" go through log(x + 1) and are clipped to their
      respective *_low / *_high bounds.

    Returns the (mutated) frame and a dict mapping each output column name to
    its fitted MinMaxScaler.
    """
    train_rows = df[df["time"] < SPLIT_DATE_TRAIN]
    fitted_scalers = {}

    def bounded(values, prefix):
        # Clip to the [<prefix>_low, <prefix>_high] range from PREPROC_PARAMS.
        return np.clip(values, PREPROC_PARAMS[prefix + "_low"], PREPROC_PARAMS[prefix + "_high"])

    def fit_and_store(column, train_values, all_values):
        # Fit on the training values only, then write the scaled full column.
        mm = MinMaxScaler()
        mm.fit(train_values.reshape(-1, 1))
        df[column] = mm.transform(all_values.reshape(-1, 1))
        fitted_scalers[column] = mm

    fit_and_store("mag", bounded(train_rows["mag"].values, "mag"), bounded(df["mag"].values, "mag"))

    # The shift uses the minimum over the full frame (not just the training rows).
    depth_floor = np.abs(df["depth"].min())
    fit_and_store(
        "depth",
        bounded(np.log(train_rows["depth"] + depth_floor + 1).values, "depth"),
        bounded(np.log(df["depth"] + depth_floor + 1).values, "depth"),
    )

    plain_columns = (
        ("latitude_new", "latitude"),
        ("longitude_new", "longitude"),
        ("lat_cent", "lat_cent"),
        ("lon_cent", "lon_cent"),
    )
    for target_column, source_column in plain_columns:
        fit_and_store(target_column, train_rows[source_column].values, df[source_column].values)

    for column in ("dist", "dist_region"):
        fit_and_store(
            column,
            bounded(np.log(train_rows[column] + 1).values, column),
            bounded(np.log(df[column] + 1).values, column),
        )

    return df, fitted_scalers

In [10]:
def add_target(df, block_size):
    """Append shifted target columns to `df` (in place) and return it.

    For every shift s in -1 .. block_size - 2 the following columns are added,
    each named "target_<name>_<s>":

    * lat / lon: fractional part of the absolute latitude / longitude,
    * mag, depth, diff_days: the shifted column value.

    A shift of -1 takes the value from the following row, 0 the row itself
    and positive shifts earlier rows; rows without a source value get NaN.
    """
    coordinate_sources = (("lat", "latitude"), ("lon", "longitude"))
    copied_sources = ("mag", "depth", "diff_days")
    for step in range(-1, block_size - 1):
        for short_name, column in coordinate_sources:
            df[f"target_{short_name}_{step}"] = np.abs(df[column].shift(step)) % 1
        for column in copied_sources:
            df[f"target_{column}_{step}"] = df[column].shift(step)
    return df

In [11]:
def make_block(df, pos, radius, block_size, PREPROC_PARAMS):
    """Build the per-event feature/target rows for one `pos` cell.

    Parameters
    ----------
    df : preprocessed events frame ("time" must be datetime-like).
    pos : cell key of the form "<lat>_<lon>"; the cell centre is taken as
        (lat + 0.5, lon + 0.5).
    radius : events farther than this from the cell centre (as measured by
        haversine_distance) are discarded.
    block_size : sequence length; lags 1 .. block_size - 1 are materialised.
    PREPROC_PARAMS : needs "scale_distance" and "scale_distance_lag".

    Returns
    -------
    A new DataFrame holding only the events of `pos` itself, with lagged
    "<feature>_<k>" columns computed over ALL events inside the radius
    (neighbouring cells included), the target columns from add_target, and
    every row containing a NaN removed. `df` is not modified.
    """
    day_bins = [0, 1, 2, 3, 4, 5, 6, 7, 10, 14, 21, 30, 60, 180, 1e8]
    lag_sources = ["mag", "depth", "latitude_new", "longitude_new", "dist", "distance", "plate", "diff_days"]
    lat, lon = (float(part) for part in pos.split("_"))

    in_cell = df["pos"] == pos
    # Neighbouring events only provide history: tag them with label -1 so they
    # can be removed again once the lag columns exist. `.assign` works on a copy,
    # avoiding the chained assignment on a slice. The cell's own rows stay first,
    # which keeps the row order fed into the sort unchanged.
    tmp = pd.concat([df[in_cell], df[~in_cell].assign(label=-1)], axis=0)
    tmp["distance"] = haversine_distance(tmp["latitude"], tmp["longitude"], lat + 0.5, lon + 0.5)
    tmp = tmp[tmp["distance"] <= radius].sort_values(by=["time"])

    # Whole days since the previous event inside the radius, bucketed by day_bins.
    tmp["diff_days"] = np.digitize((tmp["time"] - tmp["time"].shift(1)).dt.days, bins=day_bins)

    # Collect all lag columns first and attach them with one concat; inserting
    # ~8 * block_size columns one by one fragments the DataFrame.
    lagged = {}
    for idx in range(1, block_size):
        for name in lag_sources:
            shifted = tmp[name].shift(idx)
            if name == "distance":
                shifted = shifted / PREPROC_PARAMS["scale_distance_lag"]
            lagged[name + "_" + str(idx)] = shifted.to_numpy()
    if lagged:
        tmp = pd.concat([tmp, pd.DataFrame(lagged, index=tmp.index)], axis=1)

    # Keep only the cell's own events; copy so the assignment below is not on a view.
    tmp = tmp[tmp["label"] != -1].copy()
    tmp["distance"] = tmp["distance"] / PREPROC_PARAMS["scale_distance"]
    tmp = add_target(tmp, block_size)
    return tmp.dropna()

In [12]:
def reshape(df, block_size, feature_order, featrues_region, targets_order):
    """Shuffle the rows of `df` and lay them out as model-ready arrays.

    Rows are shuffled with DataFrame.sample(frac=1, random_state=SEED).

    Returns a tuple of three arrays:
      * sequence inputs, shape (rows, block_size, len(feature_order) // block_size)
      * region inputs,   shape (rows, len(featrues_region))
      * targets,         shape (rows, block_size, len(targets_order) // block_size)
    """
    shuffled = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
    per_step_features = len(feature_order) // block_size
    per_step_targets = len(targets_order) // block_size
    sequence_inputs = shuffled[feature_order].to_numpy().reshape(-1, block_size, per_step_features)
    region_inputs = shuffled[featrues_region].to_numpy().reshape(-1, len(featrues_region))
    sequence_targets = shuffled[targets_order].to_numpy().reshape(-1, block_size, per_step_targets)
    return sequence_inputs, region_inputs, sequence_targets

In [13]:
def split_all(df, block_size, feature_order, features_region, targets_order, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL):
    """Split `df` by time into train / validation / test and reshape each part.

    Train is time < SPLIT_DATE_TRAIN, validation is
    SPLIT_DATE_TRAIN <= time < SPLIT_DATE_VAL, test is time >= SPLIT_DATE_VAL.

    Returns a flat 9-tuple:
    (x_train, x_train_region, y_train, x_val, x_val_region, y_val,
     x_test, x_test_region, y_test).
    """
    when = df["time"]
    partitions = (
        df[when < SPLIT_DATE_TRAIN],
        df[(when >= SPLIT_DATE_TRAIN) & (when < SPLIT_DATE_VAL)],
        df[when >= SPLIT_DATE_VAL],
    )
    # reshape yields (x, x_region, y) per partition; flatten in train, val, test order.
    return tuple(
        array
        for part in partitions
        for array in reshape(part, block_size, feature_order, features_region, targets_order)
    )

In [14]:
def make_npys(df, radius, th, block_size, features_order, features_region, targets_order, PREPROC_PARAMS, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL, out_dir="../data/npys"):
    """Generate and save the train/val/test arrays for every qualifying region.

    Steps: parse and sort "time", pick the regions via filter_regions, scale
    the columns via preprocess_df, shuffle the region order with NumPy's
    global RNG, then for each region build its block (make_block), split it
    (split_all) and write nine ``.npy`` files named
    ``<array>_<region index>.npy`` into `out_dir`.

    Parameters
    ----------
    df : events frame; it is modified in place (time conversion, sort, scaling).
    radius, block_size, PREPROC_PARAMS : forwarded to make_block.
    th : event-count threshold forwarded to filter_regions.
    features_order, features_region, targets_order : column lists forwarded to split_all.
    SPLIT_DATE_TRAIN, SPLIT_DATE_VAL : split boundaries forwarded to
        preprocess_df / split_all.
    out_dir : directory for the ``.npy`` files; created if it does not exist.

    Returns
    -------
    The dict of fitted scalers produced by preprocess_df.
    """
    import os  # local import: only needed for the output-directory handling

    # Create the target directory up front so a missing folder does not make
    # np.save fail part-way through the run.
    os.makedirs(out_dir, exist_ok=True)

    df["time"] = pd.to_datetime(df["time"], format="mixed")
    df.sort_values(by="time", inplace=True)
    # NOTE: filter_regions is called without a date, so it falls back to the
    # module-level SPLIT_DATE_TRAIN rather than the argument passed in here.
    regions = filter_regions(df, th)
    df, scaler_dict = preprocess_df(df, PREPROC_PARAMS, SPLIT_DATE_TRAIN)
    np.random.shuffle(regions)

    # Same order as the tuple returned by split_all.
    array_names = (
        "x_train", "x_train_region", "y_train",
        "x_val", "x_val_region", "y_val",
        "x_test", "x_test_region", "y_test",
    )
    for idx, pos in enumerate(tqdm.tqdm(regions)):
        df_pos = make_block(df, pos, radius, block_size, PREPROC_PARAMS)
        arrays = split_all(df_pos, block_size, features_order, features_region, targets_order, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL)
        for name, array in zip(array_names, arrays):
            np.save(os.path.join(out_dir, name + "_" + str(idx) + ".npy"), array)
    return scaler_dict

In [15]:
# Static per-sample region inputs.
features_region = ["lat_cent", "lon_cent", "dist_region", "plate_region"]
# Per-event inputs; make_block adds a "<name>_<lag>" copy of each for lags 1 .. BLOCK_SIZE - 1.
featrues = ["mag", "depth", "latitude_new", "longitude_new", "dist", "distance", "plate", "diff_days"]
# Largest lag first, un-suffixed current event last, so that one row reshapes
# to (BLOCK_SIZE, len(featrues)).
featrues_order = [f"{name}_{lag}" for lag in range(BLOCK_SIZE - 1, 0, -1) for name in featrues] + featrues
# Target column stems created by add_target for shifts -1 .. BLOCK_SIZE - 2.
targets = ["target_lat", "target_lon", "target_mag", "target_depth", "target_diff_days"]
# Largest shift first, ending with shift -1.
targets_order = [f"{name}_{shift}" for shift in range(BLOCK_SIZE - 2, -2, -1) for name in targets]

In [16]:
# Build and save the per-region arrays. A deep copy is passed because make_npys
# converts/sorts `time` in place and preprocess_df overwrites columns of the frame it receives.
scalers = make_npys(df.copy(deep=True), RADIUS, THRESHOLD, BLOCK_SIZE, featrues_order, features_region, targets_order, PREPROC_PARAMS, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL)

100%|██████████| 1362/1362 [1:12:45<00:00,  3.21s/it]  


In [17]:
# Persist the fitted scalers (dict: column name -> scaler) returned by make_npys.
# NOTE: this is a pickle file; only unpickle it from a trusted location.
with open("../data/MEM_scalers_for_npys.pkl", "wb") as f:
    pickle.dump(scalers, f)