In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
import xgboost as xgb
warnings.filterwarnings('ignore')
SEED = 1337
tf.random.set_seed(SEED)
np.random.seed(SEED)

2023-10-23 23:05:48.421049: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('../data/new_usgs_small.csv')
df.head()

Unnamed: 0,time,longitude,latitude,depth,mag
0,1949-12-31 23:30:08.230000,-117.522,34.191167,4.49,2.58
1,1949-12-31 07:16:07.740000,-117.650667,35.859333,0.0,2.01
2,1949-12-31 00:14:20.310000,-116.628833,32.143333,6.0,3.3
3,1949-12-30 21:27:38.960000,-118.089333,33.856167,0.25,1.83
4,1949-12-30 12:13:50.070000,-116.806333,32.113,6.0,3.02


In [3]:
df = df[["time", "latitude", "longitude", "mag"]]
df.dropna(inplace=True)

In [4]:
df["time_new"] = df["time"].apply(lambda x: x[:7])
df

Unnamed: 0,time,latitude,longitude,mag,time_new
0,1949-12-31 23:30:08.230000,34.191167,-117.522000,2.58,1949-12
1,1949-12-31 07:16:07.740000,35.859333,-117.650667,2.01,1949-12
2,1949-12-31 00:14:20.310000,32.143333,-116.628833,3.30,1949-12
3,1949-12-30 21:27:38.960000,33.856167,-118.089333,1.83,1949-12
4,1949-12-30 12:13:50.070000,32.113000,-116.806333,3.02,1949-12
...,...,...,...,...,...
4293100,2023-09-01 02:45:43.100000,17.937167,-66.917667,2.28,2023-09
4293101,2023-09-01 02:42:08.428000,60.279600,-147.859600,2.00,2023-09
4293102,2023-09-01 02:15:42.220000,33.486000,-116.586500,0.64,2023-09
4293103,2023-09-01 02:09:45.740000,35.348667,-97.926667,0.84,2023-09


In [5]:
time_cut = "1973-01"
df = df[df["time_new"] > time_cut]
df["energy"] = 10**(1.44*df["mag"]+5.24)

In [6]:
geo_split = 1

In [7]:
df["latitude_old"] = df["latitude"]
df["longitude_old"] = df["longitude"]
df["latitude"] = df["latitude"] // geo_split * geo_split

df["longitude"] = df["longitude"] // geo_split * geo_split
df["pos"] = df["latitude"].astype(str) + "_" + df["longitude"].astype(str)

In [8]:
def filter_regions(df: pd.DataFrame, threshold: int) -> pd.DataFrame:
    tmp = []
    df_agg = df.groupby(["pos"])["mag"].count().reset_index()
    for _, row in df_agg.iterrows():
        if row["mag"] >= threshold:
            tmp.append(row["pos"])
    df = df[df["pos"].isin(tmp)]
    return df

In [9]:
th = 500
df = filter_regions(df, th)

In [10]:
def make_label(df):
    dfs = []
    df["time_new"] = pd.to_datetime(df["time_new"])
    for pos in df["pos"].unique():
        tmp = df[df["pos"] == pos]
        tmp.sort_values("time", inplace=True)
        for time in tmp["time_new"].unique():
            tmp_t0 = tmp[tmp["time_new"] == time]
            tmp_t1 = tmp[tmp["time_new"] == time + pd.DateOffset(months=1)]
            max_mag = tmp_t1["mag"].max()
            tmp_t0["label"] = 0 if max_mag < 5 else 1
            dfs.append(tmp_t0)
    return pd.concat(dfs)

In [11]:
df = make_label(df)
df

Unnamed: 0,time,latitude,longitude,mag,time_new,energy,latitude_old,longitude_old,pos,label
203350,1980-02-01 00:57:28.090000,40.0,-125.0,1.37,1980-02-01,1.632300e+07,40.501833,-124.158000,40.0_-125.0,0
207919,1980-02-01 12:22:28.500000,40.0,-125.0,1.91,1980-02-01,9.781377e+07,40.620167,-124.590833,40.0_-125.0,0
207883,1980-02-01 23:33:40.950000,40.0,-125.0,1.77,1980-02-01,6.148936e+07,40.293667,-124.761000,40.0_-125.0,0
207843,1980-02-02 12:38:21.930000,40.0,-125.0,1.21,1980-02-01,9.602847e+06,40.267000,-124.224333,40.0_-125.0,0
207808,1980-02-03 00:13:04.330000,40.0,-125.0,1.57,1980-02-01,3.168108e+07,40.863167,-124.050167,40.0_-125.0,0
...,...,...,...,...,...,...,...,...,...,...
4282349,2023-09-30 01:47:59.481000,31.0,-105.0,2.10,2023-09-01,1.836538e+08,31.564000,-104.136000,31.0_-105.0,1
4282309,2023-09-30 03:51:39.968000,31.0,-105.0,1.90,2023-09-01,9.462372e+07,31.594000,-104.551000,31.0_-105.0,1
4282197,2023-09-30 11:58:45.961000,31.0,-105.0,2.30,2023-09-01,3.564511e+08,31.507000,-104.017000,31.0_-105.0,1
4282193,2023-09-30 12:08:31.439000,31.0,-105.0,2.40,2023-09-01,4.965923e+08,31.525000,-104.008000,31.0_-105.0,1


In [12]:
def make_batch(df, batch_size, block_size, pos_amount):
    x, y = [], []
    pos = np.random.choice(list(pos_amount.keys()))
    tmp = df[df["pos"] == pos]
    for j in range(batch_size):
        idx = np.random.randint(block_size, pos_amount[pos])
        x.append(tmp[idx-block_size:idx][["energy", "latitude_old", "longitude_old"]].to_numpy())
        y.append(tmp["label"].values[idx-1])
    x , y = np.array(x), np.array(y)
    idx = np.random.permutation(len(x))
    x, y = x[idx], y[idx]
    return x, y

In [13]:
df_train = df[df["time_new"] < "2016-01"]
df_val = df[df["time_new"] >= "2016-01"]

In [14]:
pos_amount_train = {}
pos_amount_val = {}
for pos in df_train["pos"].unique():
    pos_amount_train[pos] = len(df_train[df_train["pos"] == pos])
for pos in df_val["pos"].unique():
    pos_amount_val[pos] = len(df_val[df_val["pos"] == pos])

In [15]:
# keep only keys with over 1000 samples
pos_amount_train = {k: v for k, v in pos_amount_train.items() if v > 1000}
pos_amount_val = {k: v for k, v in pos_amount_val.items() if v > 1000}

In [16]:
block_size = 64
batch_size = 1024

In [21]:
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(512, input_shape=(block_size, 3), return_sequences=True),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(2)
])

In [22]:
EPOCHS = 20
MAX_STEPS_TRAIN = 3000
MAX_STEPS_VAL = 600
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.CategoricalAccuracy(name='val_accuracy')
train_f1 = tf.keras.metrics.F1Score(average="micro")
val_f1 = tf.keras.metrics.F1Score(average="micro")

In [23]:
for epoch in range(EPOCHS):
    train_loss.reset_states()
    train_accuracy.reset_states()
    val_loss.reset_states()
    val_accuracy.reset_states()
    train_f1.reset_states()
    val_f1.reset_states()
    for step in range(MAX_STEPS_TRAIN):
        x_train, y_train = make_batch(df, batch_size, block_size, pos_amount_train)
        y_train = y_train.reshape(-1, 1)
        y_train = np.concatenate((y_train, 1 - y_train), axis=1)
        with tf.GradientTape() as tape:
            predictions = model(x_train)
            loss = loss_object(y_train, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        train_loss(loss)
        train_accuracy(y_train, predictions)
        train_f1(y_train, predictions)

    for step in range(MAX_STEPS_VAL):
        x_val, y_val = make_batch(df_val, batch_size, block_size, pos_amount_val)
        y_val = y_val.reshape(-1, 1)
        y_val = np.concatenate((y_val, 1 - y_val), axis=1)
        predictions = model(x_val)
        loss = loss_object(y_val, predictions)
        val_loss(loss)
        val_accuracy(y_val, predictions)
        val_f1(y_val, predictions)


    template = 'Epoch {}, Loss: {}, Accuracy: {}, Val Loss: {}, Val Accuracy: {}, F1: {}, Val F1: {}'
    print(template.format(epoch+1,
                train_loss.result(),
                train_accuracy.result()*100,
                val_loss.result(),
                val_accuracy.result()*100,
                train_f1.result()*100,
                val_f1.result()*100))
    

Epoch 1, Loss: 0.3019588887691498, Accuracy: 91.12985229492188, Val Loss: 0.25012001395225525, Val Accuracy: 93.15250396728516, F1: 91.12985229492188, Val F1: 93.15250396728516
Epoch 2, Loss: 0.2997852563858032, Accuracy: 91.16409301757812, Val Loss: 0.2675309181213379, Val Accuracy: 93.22542572021484, F1: 91.16409301757812, Val F1: 93.22542572021484


KeyboardInterrupt: 