In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
import xgboost as xgb
warnings.filterwarnings('ignore')
SEED = 1337
tf.random.set_seed(SEED)
np.random.seed(SEED)

2023-10-24 21:28:34.005062: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('../../data/new_usgs_small.csv')
df.dropna(inplace=True)
df.head()

Unnamed: 0,time,longitude,latitude,depth,mag
0,1949-12-31 23:30:08.230000,-117.522,34.191167,4.49,2.58
1,1949-12-31 07:16:07.740000,-117.650667,35.859333,0.0,2.01
2,1949-12-31 00:14:20.310000,-116.628833,32.143333,6.0,3.3
3,1949-12-30 21:27:38.960000,-118.089333,33.856167,0.25,1.83
4,1949-12-30 12:13:50.070000,-116.806333,32.113,6.0,3.02


In [3]:
df = df[["time", "latitude", "longitude", "mag"]]

In [4]:
df["time_new"] = df["time"].apply(lambda x: x[:7])
df

Unnamed: 0,time,latitude,longitude,mag,time_new
0,1949-12-31 23:30:08.230000,34.191167,-117.522000,2.58,1949-12
1,1949-12-31 07:16:07.740000,35.859333,-117.650667,2.01,1949-12
2,1949-12-31 00:14:20.310000,32.143333,-116.628833,3.30,1949-12
3,1949-12-30 21:27:38.960000,33.856167,-118.089333,1.83,1949-12
4,1949-12-30 12:13:50.070000,32.113000,-116.806333,3.02,1949-12
...,...,...,...,...,...
4293100,2023-09-01 02:45:43.100000,17.937167,-66.917667,2.28,2023-09
4293101,2023-09-01 02:42:08.428000,60.279600,-147.859600,2.00,2023-09
4293102,2023-09-01 02:15:42.220000,33.486000,-116.586500,0.64,2023-09
4293103,2023-09-01 02:09:45.740000,35.348667,-97.926667,0.84,2023-09


In [5]:
time_cut = "1980-01"
df = df[df["time_new"] > time_cut]
df["energy"] = 10**(1.44*df["mag"]+5.24)

In [6]:
geo_split = 1

In [7]:
df["latitude_old"] = df["latitude"]
df["longitude_old"] = df["longitude"]
df["latitude"] = df["latitude"] // geo_split * geo_split

df["longitude"] = df["longitude"] // geo_split * geo_split
df["pos"] = df["latitude"].astype(str) + "_" + df["longitude"].astype(str)

In [8]:
def filter_regions(df: pd.DataFrame, threshold: int) -> pd.DataFrame:
    tmp = []
    df_agg = df.groupby(["pos"])["mag"].count().reset_index()
    for _, row in df_agg.iterrows():
        if row["mag"] >= threshold:
            tmp.append(row["pos"])
    df = df[df["pos"].isin(tmp)]
    return df

In [9]:
th = 100
df = filter_regions(df, th)

In [10]:
def make_label(df):
    dfs = []
    df["time_new"] = pd.to_datetime(df["time_new"])
    for pos in df["pos"].unique():
        tmp = df[df["pos"] == pos]
        tmp.sort_values("time", inplace=True)
        for time in tmp["time_new"].unique():
            tmp_t0 = tmp[tmp["time_new"] == time]
            tmp_t1 = tmp[tmp["time_new"] == time + pd.DateOffset(months=1)]
            max_mag = tmp_t1["mag"].max()
            tmp_t0["label"] = 0 if max_mag < 5 else 1
            dfs.append(tmp_t0)
    return pd.concat(dfs)

In [11]:
df = make_label(df)
df

Unnamed: 0,time,latitude,longitude,mag,time_new,energy,latitude_old,longitude_old,pos,label
203350,1980-02-01 00:57:28.090000,40.0,-125.0,1.37,1980-02-01,1.632300e+07,40.501833,-124.158000,40.0_-125.0,0
207919,1980-02-01 12:22:28.500000,40.0,-125.0,1.91,1980-02-01,9.781377e+07,40.620167,-124.590833,40.0_-125.0,0
207883,1980-02-01 23:33:40.950000,40.0,-125.0,1.77,1980-02-01,6.148936e+07,40.293667,-124.761000,40.0_-125.0,0
207843,1980-02-02 12:38:21.930000,40.0,-125.0,1.21,1980-02-01,9.602847e+06,40.267000,-124.224333,40.0_-125.0,0
207808,1980-02-03 00:13:04.330000,40.0,-125.0,1.57,1980-02-01,3.168108e+07,40.863167,-124.050167,40.0_-125.0,0
...,...,...,...,...,...,...,...,...,...,...
4282349,2023-09-30 01:47:59.481000,31.0,-105.0,2.10,2023-09-01,1.836538e+08,31.564000,-104.136000,31.0_-105.0,1
4282309,2023-09-30 03:51:39.968000,31.0,-105.0,1.90,2023-09-01,9.462372e+07,31.594000,-104.551000,31.0_-105.0,1
4282197,2023-09-30 11:58:45.961000,31.0,-105.0,2.30,2023-09-01,3.564511e+08,31.507000,-104.017000,31.0_-105.0,1
4282193,2023-09-30 12:08:31.439000,31.0,-105.0,2.40,2023-09-01,4.965923e+08,31.525000,-104.008000,31.0_-105.0,1


In [10]:
def make_ds(df, block_size):
    dfs_train, dfs_val = [], []
    df["time"] = pd.to_datetime(df["time"], format="mixed")
    df["diff_days"] = (df["time"] - df["time"].shift(1)).dt.days
    df["energy"] = np.log(df["energy"])   
    for i in df["latitude"].unique():
        for j in df["longitude"].unique():
            tmp = df[(df["latitude"] == i) & (df["longitude"] == j)]
            if not tmp.empty:
                tmp.sort_values(by="time", inplace=True)
                for idx in range(1, block_size):
                    tmp["energy_" + str(idx)] = tmp["energy"].shift(idx)
                    tmp["latitude_" + str(idx)] = tmp["latitude_old"].shift(idx)
                    tmp["longitude_" + str(idx)] = tmp["longitude_old"].shift(idx)
                    tmp["diff_days_" + str(idx)] = tmp["diff_days"].shift(idx)
                dfs_train.append(tmp[tmp["time_new"] < "2016-01"])
                dfs_val.append(tmp[tmp["time_new"] >= "2016-01"])
    df_train = pd.concat(dfs_train)
    df_val = pd.concat(dfs_val)
    return df_train, df_val

In [11]:
block_size = 3
df_train, df_val = make_ds(df.copy(deep=True), block_size)
df_train

Unnamed: 0,time,latitude,longitude,mag,time_new,energy,latitude_old,longitude_old,pos,diff_days,energy_1,latitude_1,longitude_1,diff_days_1,energy_2,latitude_2,longitude_2,diff_days_2
203350,1980-02-01 00:57:28.090,40.0,-125.0,1.37,1980-02,16.608086,40.501833,-124.158000,40.0_-125.0,,,,,,,,,
207919,1980-02-01 12:22:28.500,40.0,-125.0,1.91,1980-02,18.398576,40.620167,-124.590833,40.0_-125.0,-1.0,16.608086,40.501833,-124.158000,,,,,
207883,1980-02-01 23:33:40.950,40.0,-125.0,1.77,1980-02,17.934375,40.293667,-124.761000,40.0_-125.0,-1.0,18.398576,40.620167,-124.590833,-1.0,16.608086,40.501833,-124.158000,
207843,1980-02-02 12:38:21.930,40.0,-125.0,1.21,1980-02,16.077570,40.267000,-124.224333,40.0_-125.0,-1.0,17.934375,40.293667,-124.761000,-1.0,18.398576,40.620167,-124.590833,-1.0
207808,1980-02-03 00:13:04.330,40.0,-125.0,1.57,1980-02,17.271230,40.863167,-124.050167,40.0_-125.0,-1.0,16.077570,40.267000,-124.224333,-1.0,17.934375,40.293667,-124.761000,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078980,2015-04-24 05:36:42.400,-43.0,173.0,6.10,2015-04,32.291453,-42.060200,173.006600,-43.0_173.0,-1.0,25.991581,-42.335000,173.909800,-1.0,26.323153,-42.369400,173.099000,-1.0
3078770,2015-04-24 16:35:24.680,-43.0,173.0,4.20,2015-04,25.991581,-42.177900,173.105100,-43.0_173.0,-1.0,32.291453,-42.060200,173.006600,-1.0,25.991581,-42.335000,173.909800,-1.0
3090089,2015-05-22 19:40:39.660,-43.0,173.0,4.40,2015-05,26.654725,-42.156400,173.082300,-43.0_173.0,-1.0,25.991581,-42.177900,173.105100,-1.0,32.291453,-42.060200,173.006600,-1.0
2619262,2011-06-10 23:14:38.090,-63.0,-59.0,4.70,2011-06,27.649442,-62.240000,-58.613000,-63.0_-59.0,-1.0,,,,,,,,


In [14]:
df_train.dropna(inplace=True)
df_val.dropna(inplace=True)

In [15]:
# x_train = df_train.drop(["time", "time_new", "mag", "label", "latitude", "longitude", "pos"], axis=1).iloc[:, ::-1].to_numpy().reshape(-1, block_size, 4)
y_train = df_train["label"].to_numpy().reshape(-1, 1)
# x_val = df_val.drop(["time", "time_new", "mag", "label", "latitude", "longitude", "pos"], axis=1).iloc[:, ::-1].to_numpy().reshape(-1, block_size, 4)
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [16]:
# calculate balance of classes
unique, counts = np.unique(y_train, return_counts=True)
counts[0] / len(y_train)

0.9094897724382652

In [17]:
# add anoother column whith 1 - label
y_train = np.concatenate((y_train, 1 - y_train), axis=1, dtype=np.float32)
y_val = np.concatenate((y_val, 1 - y_val), axis=1, dtype=np.float32)

In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(512, input_shape=(block_size, 4), return_sequences=True),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(2)
])

2023-10-23 05:04:12.915533: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-23 05:04:13.169429: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-23 05:04:13.169538: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-23 05:04:13.204455: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-23 05:04:13.204630: I tensorflow/compile

In [19]:
model.compile(optimizer=tf.keras.optimizers.Adam(),
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy', tf.keras.metrics.F1Score()])

In [20]:
model.fit(df_train.drop(["time", "time_new", "mag", "label", "latitude", "longitude", "pos"], axis=1).iloc[:, ::-1].to_numpy().reshape(-1, block_size, 4), y_train, epochs=10, batch_size=1024, validation_data=(df_val.drop(["time", "time_new", "mag", "label", "latitude", "longitude", "pos"], axis=1).iloc[:, ::-1].to_numpy().reshape(-1, block_size, 4), y_val))

2023-10-23 05:06:50.812973: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1056908800 exceeds 10% of free system memory.
2023-10-23 05:07:01.764480: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1056908800 exceeds 10% of free system memory.


Epoch 1/10


2023-10-23 05:07:18.592136: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
Could not load symbol cublasGetSmCountTarget from libcublas.so.11. Error: /home/majkel/miniconda3/lib/libcublas.so.11: undefined symbol: cublasGetSmCountTarget
2023-10-23 05:07:19.972365: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7efc1c0d33d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-10-23 05:07:19.972450: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2060 with Max-Q Design, Compute Capability 7.5
2023-10-23 05:07:20.096555: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-10-23 05:07:20.847829: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the proces



2023-10-23 05:12:21.308437: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 549693952 exceeds 10% of free system memory.
2023-10-23 05:12:23.987435: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 549693952 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7efcbcdcff10>