In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import tqdm
from sklearn.preprocessing import MinMaxScaler
# Silences every warning category for the whole session, which includes the
# pandas warnings that the DataFrame code further down would otherwise emit.
warnings.filterwarnings("ignore")

In [2]:
# One seed for the whole notebook: it seeds NumPy's global RNG here (used by
# np.random.shuffle in make_npys) and is passed as random_state to the row
# shuffle in reshape().
SEED = 1337
np.random.seed(SEED)

In [3]:
# Rows with time < SPLIT_DATE_TRAIN form the training split, rows in
# [SPLIT_DATE_TRAIN, SPLIT_DATE_VAL) the validation split, and rows with
# time >= SPLIT_DATE_VAL the test split (see split_all).
SPLIT_DATE_TRAIN = "2020-01-01"
SPLIT_DATE_VAL = "2023-01-01"
# Passed through make_npys to make_block, which does not read it.
RADIUS = 300
# Minimum number of events a "pos" cell needs in the pre-split period to be
# kept by filter_regions. Pairing of grid size and threshold used so far:
# GEO_SPLIT = 3 - THRESHOLD = 300
# GEO_SPLIT = 5 - THRESHOLD = 600
# GEO_SPLIT = 7 - THRESHOLD = 900
THRESHOLD = 900
# Events per sample: the current event plus BLOCK_SIZE - 1 lagged events.
BLOCK_SIZE = 32

In [4]:
# Clipping bounds applied in preprocess_df before min-max scaling.
PREPROC_PARAMS = {
    # Bounds for the raw magnitude.
    "mag_low": -1,
    "mag_high": 7,
    # Bounds for log(depth + |min depth| + 1).
    "depth_low": 2,
    "depth_high": 1e8,
    # Bounds for log(dist + 1).
    "dist_low": 1,
    "dist_high": 1e8,
    # Bounds for log(dist_region + 1).
    "dist_region_low": 2,
    "dist_region_high": 1e8,
    # NOTE(review): the two keys below are not read by any cell shown here.
    "scale_distance": 78.44,
    "scale_distance_lag": 300,
}

In [5]:
# Read the feature table and keep only rows without any missing value.
df = pd.read_csv("../data/with_features_7.csv").dropna()

In [6]:
# Bare expression: the notebook renders a truncated preview of the frame.
df

Unnamed: 0,time,longitude,latitude,depth,mag,magType,time_disc,longitude_disc,latitude_disc,pos,lat_cent,lon_cent,plate_region,dist_region,dist,plate,label
0,1973-01-01 01:05:56.150,-117.588000,34.189833,6.0,1.70,5,1973-01-01,-119,28,28_-119,31.5,-115.5,1,88.654893,19.302507,1,0
1,1973-01-02 03:45:48.510,-117.347500,33.629333,6.1,2.98,1,1973-01-02,-119,28,28_-119,31.5,-115.5,1,88.654893,63.761504,1,0
2,1973-01-03 01:59:38.420,-116.359500,34.815833,6.0,2.98,1,1973-01-03,-119,28,28_-119,31.5,-115.5,1,88.654893,90.352341,1,0
3,1973-01-03 04:21:54.300,-115.751167,31.611000,6.0,3.77,1,1973-01-03,-119,28,28_-119,31.5,-115.5,1,88.654893,93.714839,1,0
4,1973-01-03 07:30:44.150,-115.264333,32.603333,6.0,2.34,5,1973-01-03,-119,28,28_-119,31.5,-115.5,1,88.654893,11.278141,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4100001,2022-11-04 04:52:34.626,-104.514400,75.805300,10.0,4.70,3,2022-11-04,-105,70,70_-105,73.5,-101.5,18,2048.234902,1850.853940,19,0
4100002,2022-11-04 22:31:20.740,-104.955000,75.750900,10.0,4.80,3,2022-11-04,-105,70,70_-105,73.5,-101.5,18,2048.234902,1861.477168,19,0
4100003,2022-11-05 08:59:36.443,-104.806600,75.711900,10.0,4.80,3,2022-11-05,-105,70,70_-105,73.5,-101.5,18,2048.234902,1863.662303,19,0
4100004,2023-06-17 03:18:21.683,40.539100,-28.318500,10.0,4.30,3,2023-06-17,35,-35,-35_35,-31.5,38.5,51,529.940200,598.532143,61,0


In [7]:
# Remove the *_disc helper columns, which no later cell shown here reads.
# errors="ignore" plus reassignment keeps the cell re-runnable: executing it a
# second time no longer raises KeyError for the already-removed columns.
df = df.drop(columns=["time_disc", "longitude_disc", "latitude_disc"], errors="ignore")

In [8]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two points given in degrees.

    Arguments may be scalars, NumPy arrays or pandas Series; they are combined
    element-wise, so the result has the broadcast shape of the inputs.

    Returns the distance scaled by ``R = 6371.0`` (mean Earth radius, i.e. the
    result is in kilometres).
    """
    R = 6371.0
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, in which case arcsin(sqrt(a)) would be NaN. Clamp to the valid
    # domain; NaN inputs still propagate as NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

In [9]:
def filter_regions(df: pd.DataFrame, threshold: int, split_date=None) -> np.ndarray:
    """Return the ``pos`` cells that have enough training-period events.

    Parameters
    ----------
    df : DataFrame with at least the columns ``time``, ``pos`` and ``mag``.
    threshold : minimum number of non-null ``mag`` values a cell needs.
    split_date : exclusive upper bound on ``time`` for the rows that are
        counted. Defaults to the module-level ``SPLIT_DATE_TRAIN``.

    Returns
    -------
    A freshly allocated 1-D array of the qualifying ``pos`` values, in group
    (sorted) order. It is a copy, so callers may shuffle it in place.
    """
    if split_date is None:
        split_date = SPLIT_DATE_TRAIN
    # Strict "<" matches the training boundary used by preprocess_df and
    # split_all, so an event stamped exactly at the split date is not counted
    # as training data.
    train_rows = df[df["time"] < split_date]
    counts = train_rows.groupby("pos")["mag"].count()
    kept = counts[counts >= threshold]
    return np.array(kept.index)

In [10]:
def preprocess_df(df, PREPROC_PARAMS, SPLIT_DATE_TRAIN):
    """Min-max scale the model features, fitting each scaler on training rows only.

    Training rows are those with ``time < SPLIT_DATE_TRAIN``. ``df`` is
    modified in place (and also returned):

    * ``mag``: clipped to ``[mag_low, mag_high]``, then scaled.
    * ``depth``: ``log(depth + |min depth over df| + 1)``, clipped, then scaled.
    * ``latitude`` / ``longitude``: scaled into the new columns
      ``latitude_new`` / ``longitude_new`` (the originals are kept).
    * ``lat_cent`` / ``lon_cent``: scaled in place.
    * ``dist`` / ``dist_region``: ``log(x + 1)``, clipped, then scaled.

    Returns ``(df, scaler_dict)`` where ``scaler_dict`` maps each written
    column name to its fitted ``MinMaxScaler``.
    """
    train_rows = df[df["time"] < SPLIT_DATE_TRAIN]
    fitted_scalers = {}

    def _scale_into(column, train_values, all_values):
        # Fit on the training values, write the transformed full column back
        # into df and remember the scaler under the column's name.
        mm = MinMaxScaler()
        mm.fit(np.asarray(train_values).reshape(-1, 1))
        df[column] = mm.transform(np.asarray(all_values).reshape(-1, 1))
        fitted_scalers[column] = mm

    # Magnitude: clip, then scale.
    mag_lo, mag_hi = PREPROC_PARAMS["mag_low"], PREPROC_PARAMS["mag_high"]
    _scale_into(
        "mag",
        np.clip(train_rows["mag"].values, mag_lo, mag_hi),
        np.clip(df["mag"].values, mag_lo, mag_hi),
    )

    # Depth: shift by the absolute minimum of the whole frame before the log,
    # using the same shift for the training rows and for the full column.
    depth_shift = np.abs(df["depth"].min())
    depth_lo, depth_hi = PREPROC_PARAMS["depth_low"], PREPROC_PARAMS["depth_high"]
    train_depth = np.log(train_rows["depth"] + depth_shift + 1).values
    all_depth = np.log(df["depth"] + depth_shift + 1).values
    _scale_into(
        "depth",
        np.clip(train_depth, depth_lo, depth_hi),
        np.clip(all_depth, depth_lo, depth_hi),
    )

    # Coordinates: plain min-max scaling. Event coordinates go to new columns,
    # cell centroids are overwritten.
    for target, source in (
        ("latitude_new", "latitude"),
        ("longitude_new", "longitude"),
        ("lat_cent", "lat_cent"),
        ("lon_cent", "lon_cent"),
    ):
        _scale_into(target, train_rows[source].values, df[source].values)

    # Distances: log(x + 1), clip, then scale.
    for name, lo_key, hi_key in (
        ("dist", "dist_low", "dist_high"),
        ("dist_region", "dist_region_low", "dist_region_high"),
    ):
        low, high = PREPROC_PARAMS[lo_key], PREPROC_PARAMS[hi_key]
        train_log = np.log(train_rows[name] + 1).values
        all_log = np.log(df[name] + 1).values
        _scale_into(name, np.clip(train_log, low, high), np.clip(all_log, low, high))

    return df, fitted_scalers

In [11]:
def make_block(df, pos, radius, block_size, PREPROC_PARAMS):
    """Build the lagged-feature table for one ``pos`` cell.

    Parameters
    ----------
    df : preprocessed events; must contain ``pos``, a datetime ``time`` column
        and the feature columns listed in ``lag_features`` below (except
        ``distance`` and ``diff_days``, which are derived here).
    pos : cell identifier of the form ``"<lat>_<lon>"``.
    radius, PREPROC_PARAMS : accepted for interface compatibility; not used.
    block_size : number of events per sample; ``block_size - 1`` lagged copies
        of every feature are added as ``<feature>_<k>`` for k = 1..block_size-1.

    Returns
    -------
    A new DataFrame holding the cell's events in time order with the extra
    columns ``distance``, ``diff_days`` and all lag columns. Rows containing a
    NaN in any column (in particular the first ``block_size`` events, which
    lack a full history) are dropped.
    """
    bins = [0, 1, 2, 3, 4, 5, 6, 7, 10, 14, 21, 30, 60, 180, 1e8]
    lag_features = ["mag", "depth", "latitude_new", "longitude_new", "dist",
                    "distance", "plate", "diff_days", "magType"]
    lat, lon = pos.split("_")
    lat, lon = float(lat), float(lon)

    # Explicit copy: the original assigned new columns into a filtered slice
    # of df, which is chained assignment (SettingWithCopyWarning).
    tmp = df[df["pos"] == pos].copy()

    # NOTE(review): the reference point is (lat + 0.5, lon + 0.5), while the
    # data preview shows cell 28_-119 with centroid (31.5, -115.5), i.e. an
    # offset of 3.5. The divisor 549 is likewise an unexplained constant.
    # Both are kept unchanged here; confirm they are intended.
    tmp["distance"] = haversine_distance(tmp["latitude"], tmp["longitude"], lat + 0.5, lon + 0.5) / 549

    tmp = tmp.sort_values(by=["time"])
    # Whole days since the previous event in this cell; the first event has no
    # predecessor and is removed by the dropna below.
    tmp["diff_days"] = (tmp["time"] - tmp["time"].shift(1)).dt.days
    tmp = tmp.dropna()
    # Replace the day gap by the index of the bin it falls into.
    tmp["diff_days"] = np.digitize(tmp["diff_days"], bins=bins) - 1

    # Build every lag column first and attach them in a single concat rather
    # than inserting (block_size - 1) * 9 columns one by one, which fragments
    # the frame. Column order matches the original insertion order.
    lagged = {
        name + "_" + str(idx): tmp[name].shift(idx)
        for idx in range(1, block_size)
        for name in lag_features
    }
    if lagged:
        tmp = pd.concat([tmp, pd.concat(lagged, axis=1)], axis=1)
    return tmp.dropna()

In [12]:
def reshape(df, block_size, feature_order, featrues_region):
    """Shuffle the rows of ``df`` and convert them into model input arrays.

    The shuffle uses the module-level ``SEED`` as ``random_state``.

    Returns ``(x, x_region, y)`` where

    * ``x`` has shape ``(-1, block_size, len(feature_order) // block_size)``,
      built from the ``feature_order`` columns,
    * ``x_region`` has shape ``(-1, len(featrues_region))``,
    * ``y`` is the ``label`` column with shape ``(-1, 1)``.
    """
    shuffled = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

    features_per_step = len(feature_order) // block_size
    sequence_values = shuffled[feature_order].to_numpy()
    region_values = shuffled[featrues_region].to_numpy()
    label_values = shuffled["label"].to_numpy()

    return (
        sequence_values.reshape(-1, block_size, features_per_step),
        region_values.reshape(-1, len(featrues_region)),
        label_values.reshape(-1, 1),
    )

In [13]:
def split_all(df, block_size, feature_order, features_region, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL):
    """Split ``df`` by time into train / validation / test and reshape each part.

    * train: ``time < SPLIT_DATE_TRAIN``
    * validation: ``SPLIT_DATE_TRAIN <= time < SPLIT_DATE_VAL``
    * test: ``time >= SPLIT_DATE_VAL``

    Returns a 9-tuple ``(x_train, x_train_region, y_train, x_val,
    x_val_region, y_val, x_test, x_test_region, y_test)``.
    """
    when = df["time"]
    partitions = (
        df[when < SPLIT_DATE_TRAIN],
        df[(when >= SPLIT_DATE_TRAIN) & (when < SPLIT_DATE_VAL)],
        df[when >= SPLIT_DATE_VAL],
    )

    arrays = []
    for part in partitions:
        # reshape() yields (x, x_region, y) for this partition.
        arrays.extend(reshape(part, block_size, feature_order, features_region))
    return tuple(arrays)

In [14]:
def make_npys(df, radius, th, block_size, features_order, features_region, PREPROC_PARAMS, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL, out_dir="../data/npys"):
    """Preprocess ``df`` and write one set of train/val/test ``.npy`` files per region.

    Steps: parse and sort ``time``, select regions with ``filter_regions(df, th)``,
    scale features with ``preprocess_df``, shuffle the region order with NumPy's
    global RNG, then for every region build lagged blocks (``make_block``),
    split them by date (``split_all``) and save the nine resulting arrays as
    ``<out_dir>/<name>_<idx>.npy``, where ``idx`` is the region's position in
    the shuffled order.

    Parameters
    ----------
    df : raw feature table. It is modified in place, so pass a copy if the
        original must be preserved.
    radius : forwarded to ``make_block``.
    th : event-count threshold forwarded to ``filter_regions``.
    block_size, features_order, features_region : forwarded to ``make_block``
        / ``split_all``.
    PREPROC_PARAMS, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL : preprocessing bounds and
        split dates.
    out_dir : directory receiving the files; created if missing.

    Returns
    -------
    The dict of fitted scalers produced by ``preprocess_df``.
    """
    # Create the target directory up front so that a missing folder fails fast
    # instead of at the first np.save, after the preprocessing has already run.
    os.makedirs(out_dir, exist_ok=True)

    df["time"] = pd.to_datetime(df["time"], format="mixed")
    df.sort_values(by="time", inplace=True)
    # NOTE: filter_regions is called with two arguments, so its cut-off comes
    # from the module-level SPLIT_DATE_TRAIN rather than the argument above.
    regions = filter_regions(df, th)
    df, scaler_dict = preprocess_df(df, PREPROC_PARAMS, SPLIT_DATE_TRAIN)
    np.random.shuffle(regions)

    # Same order as the tuple returned by split_all.
    array_names = (
        "x_train", "x_train_region", "y_train",
        "x_val", "x_val_region", "y_val",
        "x_test", "x_test_region", "y_test",
    )
    for idx, pos in enumerate(tqdm.tqdm(regions)):
        df_pos = make_block(df, pos, radius, block_size, PREPROC_PARAMS)
        arrays = split_all(df_pos, block_size, features_order, features_region, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL)
        for name, values in zip(array_names, arrays):
            np.save(os.path.join(out_dir, name + "_" + str(idx) + ".npy"), values)
    return scaler_dict

In [15]:
# Per-region (static) inputs.
features_region = ["lat_cent", "lon_cent", "dist_region", "plate_region"]
# Per-event inputs; each one also exists as lagged copies "<name>_<k>".
featrues = ["mag", "depth", "latitude_new", "longitude_new", "dist", "distance", "plate", "diff_days", "magType"]
# Column order for the sequence tensor: largest lag first, counting down to
# lag 1, with the un-suffixed (current event) features as the final step.
featrues_order = [
    f"{name}_{lag}"
    for lag in reversed(range(1, BLOCK_SIZE))
    for name in featrues
] + featrues

In [16]:
# Run the full export on a copy so the notebook-level df stays untouched;
# the fitted scalers are kept for later use.
scalers = make_npys(
    df=df.copy(),
    radius=RADIUS,
    th=THRESHOLD,
    block_size=BLOCK_SIZE,
    features_order=featrues_order,
    features_region=features_region,
    PREPROC_PARAMS=PREPROC_PARAMS,
    SPLIT_DATE_TRAIN=SPLIT_DATE_TRAIN,
    SPLIT_DATE_VAL=SPLIT_DATE_VAL,
)

100%|██████████| 141/141 [08:29<00:00,  3.61s/it]
