In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tqdm
import pickle
warnings.filterwarnings("ignore")

2023-10-29 03:27:45.344194: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Seed both NumPy's and TensorFlow's global RNGs from one constant so that
# re-running the notebook from a fresh kernel is repeatable.
SEED = 1337
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Window / batching configuration.
BLOCK_SIZE = 128    # number of consecutive events stacked into one sample
BATCH_SIZE = 1024

# Chronological split boundaries (train < SPLIT_DATE_TRAIN <= val < SPLIT_DATE_VAL <= test).
SPLIT_DATE_TRAIN = "2018-09-14"  # df["time"].quantile(0.8)
SPLIT_DATE_VAL = "2020-12-03"  # df["time"].quantile(0.9)

In [4]:
# Load the per-event feature table. The path is relative to the notebook's
# working directory.
df = pd.read_csv("../data/with_features.csv")

In [5]:
# Show the loaded frame as this cell's output (sanity check of columns/values).
df

Unnamed: 0,time,longitude,latitude,depth,mag,time_new,latitude_disc,longitude_disc,pos,diff_days,label
0,1973-02-01 01:33:03.700,167.1750,-15.5650,45.000,5.0,1973-02-01,-16,167,-16_167,,0
1,1973-02-15 16:40:55.500,167.1410,-15.1390,64.000,4.8,1973-02-01,-16,167,-16_167,14.0,0
2,1973-03-27 20:55:27.900,167.2960,-15.0080,135.000,4.9,1973-03-01,-16,167,-16_167,40.0,1
3,1973-04-08 13:41:02.000,167.2180,-15.7790,35.000,6.4,1973-04-01,-16,167,-16_167,11.0,1
4,1973-04-21 21:30:35.700,167.2830,-15.8820,33.000,5.3,1973-04-01,-16,167,-16_167,13.0,1
...,...,...,...,...,...,...,...,...,...,...,...
3972926,2023-01-24 06:52:23.260,-97.8194,37.2064,5.000,2.6,2023-01-01,37,-98,37_-98,37.0,0
3972927,2023-02-11 03:29:01.909,-97.8680,37.5394,5.000,3.2,2023-02-01,37,-98,37_-98,17.0,0
3972928,2023-02-26 06:48:59.171,-97.2213,37.7068,2.198,2.3,2023-02-01,37,-98,37_-98,15.0,0
3972929,2023-03-05 20:35:51.407,-97.8587,37.0150,5.000,2.2,2023-03-01,37,-98,37_-98,7.0,0


In [6]:
def preprocess_df(df, SPLIT_DATE_TRAIN):
    """Scale the model input columns using statistics from the training period only.

    Columns handled:

    * ``mag``        -> ``StandardScaler``
    * ``depth``      -> shifted ``log`` then ``StandardScaler``
    * ``diff_days``  -> ``log(x + 1)`` then ``StandardScaler``
    * ``latitude``   -> ``MinMaxScaler``
    * ``longitude``  -> ``MinMaxScaler``

    Every scaler is fitted on rows with ``time < SPLIT_DATE_TRAIN`` (the same
    column and boundary ``split_all`` uses for the training partition) and then
    applied to the whole column. The log transforms are applied to the data
    that is scaled as well as to the data the scaler is fitted on, so the
    output is standardised on the scale the scaler actually saw.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time``, ``mag``, ``depth``, ``diff_days``, ``latitude``
        and ``longitude``. **Modified in place** (the five feature columns are
        overwritten) and also returned.
    SPLIT_DATE_TRAIN : str or datetime-like
        Exclusive upper bound of the training period.

    Returns
    -------
    (pandas.DataFrame, dict)
        The scaled frame and a dict with the fitted scalers under the keys
        ``"mag"``, ``"depth"``, ``"diff_days"``, ``"latitude"``,
        ``"longitude"``, plus ``"depth_offset"`` (a float: the shift added to
        ``depth`` before taking the log, needed to invert the transform).
    """
    scaler_dict = {}
    # Use the real event time, not the month-truncated "time_new", so the
    # fitting subset matches the training partition produced by split_all.
    train_mask = df["time"] < SPLIT_DATE_TRAIN

    def _as_column(values):
        # scikit-learn scalers expect a 2-D (n_samples, 1) array.
        return values.to_numpy().reshape(-1, 1)

    # --- magnitude: plain standardisation ---------------------------------
    scaler = StandardScaler()
    scaler.fit(_as_column(df.loc[train_mask, "mag"]))
    df["mag"] = scaler.transform(_as_column(df["mag"]))
    scaler_dict["mag"] = scaler

    # --- depth: shift so the training minimum is >= 1, log, standardise ---
    depth_offset = np.abs(df.loc[train_mask, "depth"].min()) + 1
    # Rows outside the training period may be shallower than anything seen in
    # training; clamp so the log never receives a non-positive value (which
    # would give -inf/NaN and make the scaler reject the input).
    depth_log = np.log((df["depth"] + depth_offset).clip(lower=1.0))
    scaler = StandardScaler()
    scaler.fit(_as_column(depth_log[train_mask]))
    df["depth"] = scaler.transform(_as_column(depth_log))
    scaler_dict["depth"] = scaler
    scaler_dict["depth_offset"] = float(depth_offset)

    # --- inter-event gap: log(x + 1), standardise --------------------------
    # NOTE(review): assumes diff_days is non-negative (NaN is tolerated by
    # StandardScaler and passed through) - confirm against the feature builder.
    diff_days_log = np.log(df["diff_days"] + 1)
    scaler = StandardScaler()
    scaler.fit(_as_column(diff_days_log[train_mask]))
    df["diff_days"] = scaler.transform(_as_column(diff_days_log))
    scaler_dict["diff_days"] = scaler

    # --- coordinates: min-max scaling --------------------------------------
    for column in ("latitude", "longitude"):
        scaler = MinMaxScaler()
        scaler.fit(_as_column(df.loc[train_mask, column]))
        df[column] = scaler.transform(_as_column(df[column]))
        scaler_dict[column] = scaler

    return df, scaler_dict

In [7]:
def make_block(df, block_size):
    """Append lagged copies of the five feature columns and drop incomplete rows.

    For every lag ``k`` in ``1 .. block_size - 1`` the columns ``mag_k``,
    ``depth_k``, ``latitude_k``, ``longitude_k`` and ``diff_days_k`` are added,
    each holding the value from ``k`` rows earlier. Rows containing a NaN in
    any column (which includes the first rows, whose history is incomplete)
    are then removed.

    All lag columns are built first and attached with a single ``pd.concat``.
    This avoids inserting hundreds of columns one by one (which fragments the
    frame) and leaves the caller's ``df`` untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in chronological order; must contain ``mag``, ``depth``,
        ``latitude``, ``longitude`` and ``diff_days``. Not modified.
    block_size : int
        Number of time steps per sample (current row plus ``block_size - 1``
        lags).

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the lag columns in
        the order ``mag_1, depth_1, latitude_1, longitude_1, diff_days_1,
        mag_2, ...``.
    """
    base_columns = ("mag", "depth", "latitude", "longitude", "diff_days")
    lagged = [
        df[column].shift(lag).rename(column + "_" + str(lag))
        for lag in range(1, block_size)
        for column in base_columns
    ]
    widened = pd.concat([df, *lagged], axis=1)
    return widened.dropna()

In [8]:
def split(df, block_size, feature_order):
    """Turn a widened frame into model inputs and labels.

    The columns listed in ``feature_order`` are read row by row and each row
    is folded into a ``(block_size, n_features)`` matrix, where
    ``n_features = len(feature_order) // block_size``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Inputs of shape ``(n_rows, block_size, n_features)`` and the ``label``
        column as an ``(n_rows, 1)`` array.
    """
    features_per_step = len(feature_order) // block_size
    flat_rows = df[feature_order].to_numpy()
    inputs = flat_rows.reshape(-1, block_size, features_per_step)
    targets = df["label"].to_numpy().reshape(-1, 1)
    return inputs, targets

In [9]:
def split_all(df, block_size, feature_order, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL):
    """Partition ``df`` chronologically and convert each part with ``split``.

    Partitions on the ``time`` column:

    * train: ``time < SPLIT_DATE_TRAIN``
    * val:   ``SPLIT_DATE_TRAIN <= time < SPLIT_DATE_VAL``
    * test:  ``time >= SPLIT_DATE_VAL``

    Returns
    -------
    tuple
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``.
    """
    event_time = df["time"]
    partitions = (
        df[event_time < SPLIT_DATE_TRAIN],
        df[(event_time >= SPLIT_DATE_TRAIN) & (event_time < SPLIT_DATE_VAL)],
        df[event_time >= SPLIT_DATE_VAL],
    )
    arrays = []
    for part in partitions:
        # split() yields an (inputs, labels) pair; keep them adjacent.
        arrays.extend(split(part, block_size, feature_order))
    return tuple(arrays)

In [10]:
# Per-event features, in the order they appear inside one time step.
# NOTE: "featrues" is a misspelling of "features"; the name is kept because
# the driver cell refers to `featrues_order`.
featrues = ["mag", "depth", "latitude", "longitude", "diff_days"]

# Flat column order for one sample: oldest lag first (BLOCK_SIZE-1 ... 1),
# then the un-suffixed columns of the current event.
featrues_order = [
    name + "_" + str(lag)
    for lag in range(BLOCK_SIZE - 1, 0, -1)
    for name in featrues
]
featrues_order = featrues_order + featrues

In [11]:
def make_npys(df, block_size, features_order, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL,
              output_dir="../data/npys"):
    """Build windowed train/val/test arrays per position and save them as ``.npy``.

    Steps:

    1. Parse ``time`` and sort the frame chronologically.
    2. Scale the feature columns with ``preprocess_df``.
    3. For every distinct ``pos`` (in order of first appearance), build the
       lagged window with ``make_block``, split it with ``split_all`` and
       write six files ``{x,y}_{train,val,test}_<idx>.npy`` into
       ``output_dir``, where ``<idx>`` is the running index of the position.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table. **Modified in place** (``time`` is converted, rows are
        sorted, feature columns are scaled); pass a copy to keep the original.
    block_size : int
        Number of time steps per sample.
    features_order : list of str
        Flat column order handed to ``split_all``.
    SPLIT_DATE_TRAIN, SPLIT_DATE_VAL : str or datetime-like
        Chronological split boundaries.
    output_dir : str, optional
        Directory that receives the ``.npy`` files; created if it does not
        exist. Defaults to the previously hard-coded ``"../data/npys"``.

    Returns
    -------
    dict
        The scaler dictionary returned by ``preprocess_df``.
    """
    import os  # local import: only needed for output path handling

    df["time"] = pd.to_datetime(df["time"], format="mixed")
    df.sort_values(by="time", inplace=True)
    df, scaler_dict = preprocess_df(df, SPLIT_DATE_TRAIN)

    os.makedirs(output_dir, exist_ok=True)
    # Must match the order of the tuple returned by split_all.
    array_names = ("x_train", "y_train", "x_val", "y_val", "x_test", "y_test")

    # One grouping pass instead of a full-frame boolean mask per position.
    # sort=False keeps groups in order of first appearance, i.e. the same
    # numbering as enumerate(df["pos"].unique()).
    # NOTE(review): groupby skips rows whose "pos" is missing - confirm the
    # column has no NaN if file indices must match earlier runs exactly.
    by_position = df.groupby("pos", sort=False)
    progress = tqdm.tqdm(by_position, total=by_position.ngroups)
    for idx, (_pos, df_pos) in enumerate(progress):
        df_pos = make_block(df_pos, block_size)
        arrays = split_all(df_pos, block_size, features_order, SPLIT_DATE_TRAIN, SPLIT_DATE_VAL)
        for name, array in zip(array_names, arrays):
            np.save(os.path.join(output_dir, name + "_" + str(idx) + ".npy"), array)
    return scaler_dict

In [13]:
# make_npys converts, sorts and rescales the frame it is given, so hand it a
# deep copy and keep the raw `df` intact for later cells.
scalers = make_npys(
    df.copy(deep=True),
    BLOCK_SIZE,
    featrues_order,
    SPLIT_DATE_TRAIN,
    SPLIT_DATE_VAL,
)

100%|██████████| 1349/1349 [24:56<00:00,  1.11s/it] 


In [14]:
# Persist the fitted scalers next to the generated arrays so the same
# transforms can be re-applied (or inverted) later.
with open("../data/scalers_for_npys.pkl", "wb") as scaler_file:
    pickle.dump(scalers, scaler_file)