# data_world.csv

Only seismic energy, aggregated by region and by month, is used as the input feature. Models: LSTM, Conv1D, Dense, XGBoost.

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
import xgboost as xgb
warnings.filterwarnings('ignore')

In [20]:
# Read the raw event table from disk and preview its first rows.
csv_path = '../../data/new_usgs_small.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,time,longitude,latitude,depth,mag
0,1949-12-31 23:30:08.230000,-117.522,34.191167,4.49,2.58
1,1949-12-31 07:16:07.740000,-117.650667,35.859333,0.0,2.01
2,1949-12-31 00:14:20.310000,-116.628833,32.143333,6.0,3.3
3,1949-12-30 21:27:38.960000,-118.089333,33.856167,0.25,1.83
4,1949-12-30 12:13:50.070000,-116.806333,32.113,6.0,3.02


In [21]:
# Only the timestamp, the coordinates and the magnitude are used from here on.
kept_columns = ["time", "latitude", "longitude", "mag"]
df = df[kept_columns]

In [22]:
# Reduce every timestamp string to its 'YYYY-MM' prefix.
# The vectorised .str accessor replaces a per-row Python lambda, which matters
# on a frame with millions of rows, and it leaves missing values as NaN
# instead of raising on them.
df["time"] = df["time"].str[:7]
df

Unnamed: 0,time,latitude,longitude,mag
0,1949-12,34.191167,-117.522000,2.58
1,1949-12,35.859333,-117.650667,2.01
2,1949-12,32.143333,-116.628833,3.30
3,1949-12,33.856167,-118.089333,1.83
4,1949-12,32.113000,-116.806333,3.02
...,...,...,...,...
4293100,2023-09,17.937167,-66.917667,2.28
4293101,2023-09,60.279600,-147.859600,2.00
4293102,2023-09,33.486000,-116.586500,0.64
4293103,2023-09,35.348667,-97.926667,0.84


In [23]:
# Keep only rows whose 'YYYY-MM' string sorts strictly after the cut-off
# (the cut-off month itself is excluded by the ">" comparison).
time_cut = "1980-01"
after_cut = df["time"] > time_cut
df = df[after_cut]

# Per-event energy derived from magnitude: log10(E) = 1.44 * mag + 5.24.
log10_energy = 1.44*df["mag"]+5.24
df["energy"] = 10**log10_energy

# Binary target: 1 when the magnitude is above 5, otherwise 0.
is_strong = df["mag"] > 5
df["label"] = np.where(is_strong, 1, 0)

In [24]:
# Size of one spatial bin: latitude and longitude are floor-divided by this
# value when the events are assigned to grid cells.
geo_split = 10

In [25]:
# Assign every event to a grid cell by floor-dividing its coordinates by
# geo_split, then re-base the cell indices so that the smallest one is 0.
lat_cell = (df["latitude"] // geo_split).astype(int)
lon_cell = (df["longitude"] // geo_split).astype(int)

# Subtracting the minimum gives the same result as adding |min| whenever the
# minimum is <= 0, and still yields zero-based indices when every coordinate
# in the (possibly filtered) data is positive.
df["latitude"] = lat_cell - lat_cell.min()
df["longitude"] = lon_cell - lon_cell.min()

# Textual cell identifier "<lat index>_<lon index>".
df["pos"] = df["latitude"].astype(str) + "_" + df["longitude"].astype(str)

In [26]:
# NOTE: disabled experiment, kept for reference only (nothing below calls it).
# filter_regions would keep the rows of those regions ("pos") whose share of
# months with a positive next-month label is greater than `threshold`.
# def filter_regions(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
#     tmp_pos = []
#     df_agg = df.groupby(["pos", "time"]).agg({"mag": "count", "label": max}).reset_index()
#     for pos in df_agg["pos"].unique():
#         tmp = df_agg[df_agg["pos"] == pos]
#         tmp = tmp.set_index("time")
#         idx = pd.date_range(tmp.index.min(), df_agg["time"].max(), freq="MS").strftime('%Y-%m')
#         tmp = tmp.reindex(idx, fill_value=0)
#         tmp.index.name = "time"
#         tmp["label"] = tmp["label"].shift(-1)
#         if tmp["label"].sum() / len(tmp) > threshold:
#             tmp_pos.append(pos)
#     df = df[df["pos"].isin(tmp_pos)]
#     return df

In [27]:
# th = 0.1
# df = filter_regions(df, th)

In [28]:
# Show the frame after filtering and binning (pandas abbreviates long frames).
df

Unnamed: 0,time,latitude,longitude,mag,energy,label,pos
203350,1980-02,13,5,1.37,1.632300e+07,0,13_5
203351,1980-02,12,5,2.19,2.475141e+08,0,12_5
205898,1980-03,12,5,2.97,3.287002e+09,0,12_5
205899,1980-03,12,6,1.70,4.875285e+07,0,12_6
205900,1980-03,12,6,2.10,1.836538e+08,0,12_6
...,...,...,...,...,...,...,...
4293100,2023-09,10,11,2.28,3.335800e+08,0,10_11
4293101,2023-09,15,3,2.00,1.318257e+08,0,15_3
4293102,2023-09,12,6,0.64,1.450775e+06,0,12_6
4293103,2023-09,12,8,0.84,2.815788e+06,0,12_8


In [29]:
# One row per (latitude cell, longitude cell, month): the summed energy of all
# events in that cell and month, and the maximum label (1 if any event was flagged).
per_cell_month = df.groupby(["latitude", "longitude", "time"])
df_agg = per_cell_month.agg(
    energy=("energy", "sum"),
    label=("label", "max"),
).reset_index()
df_agg

Unnamed: 0,latitude,longitude,time,energy,label
0,0,13,1995-01,1.018591e+12,0
1,0,22,2012-08,3.767038e+11,0
2,0,29,2020-09,1.940886e+11,0
3,0,31,2003-10,5.248075e+11,0
4,0,31,2011-11,7.311391e+11,0
...,...,...,...,...,...
74709,17,30,2021-07,5.248075e+11,0
74710,17,30,2022-02,5.248075e+11,0
74711,17,30,2022-04,1.940886e+11,0
74712,17,30,2022-09,2.703958e+11,0


In [40]:
def make_ds(df_agg: pd.DataFrame, block_size: int) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Build lagged-energy training and validation frames, region by region.

    For every (latitude, longitude) pair that has more than one row:

    1. the rows are reindexed to a gap-free monthly range between the first and
       last month present (missing months are filled with 0),
    2. ``label`` is shifted by -1 so that each row carries the label of the
       following month (the last month therefore gets NaN),
    3. the first 80% of the months become training rows, the rest validation rows,
    4. ``energy`` is min-max scaled with a scaler fitted on the training rows only,
    5. columns ``energy1`` .. ``energy{block_size - 1}`` are added, holding the
       energy of 1 .. block_size - 1 months earlier (NaN where no history exists
       inside the respective split).

    Parameters
    ----------
    df_agg : pd.DataFrame
        Columns ``latitude``, ``longitude``, ``time`` ('YYYY-MM' strings),
        ``energy`` and ``label``. The frame is NOT modified.
    block_size : int
        Number of energy columns per output row (current month plus lags).

    Returns
    -------
    (train, val) : two DataFrames indexed by ``time`` with columns
        ``latitude``, ``longitude``, ``energy``, ``label``, ``energy1``, ...

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when no region has more than one row.
    """
    # Work on a private copy. The log transform used to be written back into
    # the caller's frame, so calling the function twice applied the log twice.
    source = df_agg.copy()
    source["energy"] = np.log(source["energy"] + 1)

    train_parts, val_parts = [], []
    for lat in source["latitude"].unique():
        for lon in source["longitude"].unique():
            region = source[(source["latitude"] == lat) & (source["longitude"] == lon)]
            if len(region) <= 1:
                continue

            # Gap-free monthly index; months without any record get zeros.
            region = region.set_index("time")
            months = pd.date_range(region.index.min(), region.index.max(), freq="MS").strftime('%Y-%m')
            region = region.reindex(months, fill_value=0)
            region.index.name = "time"

            # Target = label of the next month.
            region["label"] = region["label"].shift(-1)
            # reindex() zero-filled the coordinates of the inserted months.
            region["latitude"] = lat
            region["longitude"] = lon

            # Chronological 80/20 split; explicit copies so the column
            # assignments below never write into a view of `region`.
            split_at = int(0.8 * len(region))
            train = region.iloc[:split_at].copy()
            val = region.iloc[split_at:].copy()

            # Scale with training statistics only to avoid leaking validation data.
            scaler = MinMaxScaler()
            train["energy"] = scaler.fit_transform(train[["energy"]])
            val["energy"] = scaler.transform(val[["energy"]])

            for lag in range(1, block_size):
                train["energy" + str(lag)] = train["energy"].shift(lag)
                val["energy" + str(lag)] = val["energy"].shift(lag)

            train_parts.append(train)
            val_parts.append(val)

    return pd.concat(train_parts), pd.concat(val_parts)

In [41]:
# Number of energy columns per sample (current month plus block_size - 1 lags).
block_size = 24
# Hand make_ds a copy so that df_agg keeps its aggregated energy values however
# make_ds treats its argument; this cell can then be re-run safely.
df_train, df_val = make_ds(df_agg.copy(), block_size)
df_train

Unnamed: 0_level_0,latitude,longitude,energy,label,energy1,energy2,energy3,energy4,energy5,energy6,...,energy14,energy15,energy16,energy17,energy18,energy19,energy20,energy21,energy22,energy23
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-10,0,31,1.000000,0.0,,,,,,,...,,,,,,,,,,
2003-11,0,31,0.000000,0.0,1.0,,,,,,...,,,,,,,,,,
2003-12,0,31,0.000000,0.0,0.0,1.0,,,,,...,,,,,,,,,,
2004-01,0,31,0.000000,0.0,0.0,0.0,1.0,,,,...,,,,,,,,,,
2004-02,0,31,0.000000,0.0,0.0,0.0,0.0,1.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-07,17,23,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-08,17,23,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-09,17,23,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-10,17,23,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Discard rows that still contain NaN: the first rows of every series have no
# full lag history and the last row has no next-month label.
df_train = df_train.dropna()
df_val = df_val.dropna()

In [48]:
# Two stacked bidirectional LSTM layers over (block_size, 1) windows, followed
# by a dense layer and a 2-unit output without activation (raw logits).
model = tf.keras.models.Sequential()
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(512, return_sequences=True),
        input_shape=(block_size, 1),
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(2))

In [49]:
# The network outputs raw logits, hence from_logits=True on the loss.
logit_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
tracked_metrics = ['accuracy', tf.keras.metrics.F1Score()]
model.compile(optimizer='adam', loss=logit_loss, metrics=tracked_metrics)

In [50]:
# Every row of energy columns becomes one (block_size, 1) window; the targets
# are kept as an (n, 1) column vector.
non_feature_cols = ["label", "latitude", "longitude"]

x_train = df_train.drop(columns=non_feature_cols).to_numpy().reshape(-1, block_size, 1)
y_train = df_train["label"].to_numpy().reshape(-1, 1)

x_val = df_val.drop(columns=non_feature_cols).to_numpy().reshape(-1, block_size, 1)
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [51]:
# Share of negative (label == 0) samples among the training targets.
# The label column is compared with 0 directly: counts[0] from np.unique is the
# count of the smallest value present, which is not label 0 if that class is
# missing, and reading only column 0 keeps the figure right even after the
# targets have been expanded to two columns.
(y_train[:, 0] == 0).mean()

0.8810124114627331

In [52]:
# Build two-column targets: column 0 is the label, column 1 is 1 - label.
# Only the first column is read, so re-running this cell rebuilds the same
# (n, 2) array instead of appending two more columns each time.
y_train = np.concatenate((y_train[:, :1], 1 - y_train[:, :1]), axis=1)
y_val = np.concatenate((y_val[:, :1], 1 - y_val[:, :1]), axis=1)

In [53]:
# Train for 20 epochs, evaluating on the validation arrays after each epoch.
model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=2048,
    validation_data=(x_val, y_val),
    shuffle=True,
)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20

KeyboardInterrupt: 

In [43]:
# Rebuild the (n, block_size, 1) inputs and (n, 1) targets for the next model.
non_feature_cols = ["label", "latitude", "longitude"]

x_train = df_train.drop(columns=non_feature_cols).to_numpy().reshape(-1, block_size, 1)
y_train = df_train["label"].to_numpy().reshape(-1, 1)

x_val = df_val.drop(columns=non_feature_cols).to_numpy().reshape(-1, block_size, 1)
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [44]:
# Build two-column targets: column 0 is the label, column 1 is 1 - label.
# Only the first column is read, so re-running this cell rebuilds the same
# (n, 2) array instead of appending two more columns each time.
y_train = np.concatenate((y_train[:, :1], 1 - y_train[:, :1]), axis=1)
y_val = np.concatenate((y_val[:, :1], 1 - y_val[:, :1]), axis=1)

In [45]:
# 1-D convolutional classifier over (block_size, 1) windows; the last layer
# has 2 units and no activation (raw logits).
conv_layers = [
    tf.keras.layers.Conv1D(
        filters=64,
        kernel_size=3,
        activation='relu',
        padding="same",
        input_shape=(block_size, 1),
    ),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding="same"),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=2),
]
model = tf.keras.models.Sequential(conv_layers)

In [46]:
# The network outputs raw logits, hence from_logits=True on the loss.
logit_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
tracked_metrics = ['accuracy', tf.keras.metrics.F1Score()]
model.compile(optimizer='adam', loss=logit_loss, metrics=tracked_metrics)

In [47]:
# Train for 20 epochs, evaluating on the validation arrays after each epoch.
model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=2048,
    validation_data=(x_val, y_val),
    shuffle=True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f217aaefe50>

In [None]:
# Flat (n, block_size) feature matrix for the dense network; targets as (n, 1).
# In the frames returned by make_ds "time" is the index, not a column, so a
# plain drop of "time" raises KeyError. errors="ignore" skips it in that case
# and still removes it if the frame has been reset_index()-ed.
non_feature_cols = ["time", "label", "latitude", "longitude"]
x_train = df_train.drop(columns=non_feature_cols, errors="ignore").to_numpy()
y_train = df_train["label"].to_numpy().reshape(-1, 1)
x_val = df_val.drop(columns=non_feature_cols, errors="ignore").to_numpy()
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [79]:
# Build two-column targets: column 0 is the label, column 1 is 1 - label.
# Only the first column is read, so re-running this cell rebuilds the same
# (n, 2) array instead of appending two more columns each time.
y_train = np.concatenate((y_train[:, :1], 1 - y_train[:, :1]), axis=1)
y_val = np.concatenate((y_val[:, :1], 1 - y_val[:, :1]), axis=1)

In [80]:
# Fully connected classifier on flat block_size-long feature vectors; the last
# layer has 2 units and no activation (raw logits).
dense_layers = [
    tf.keras.layers.Dense(units=block_size, activation='relu', input_shape=(block_size, )),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=2),
]
model = tf.keras.models.Sequential(dense_layers)

In [81]:
# The network outputs raw logits, hence from_logits=True on the loss.
logit_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
tracked_metrics = ['accuracy', tf.keras.metrics.F1Score()]
model.compile(optimizer='adam', loss=logit_loss, metrics=tracked_metrics)

In [84]:
# Train for 20 epochs, evaluating on the validation arrays after each epoch.
model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f6c90bd5420>

In [86]:
# Flat (n, block_size) feature matrix and single-column targets for XGBoost.
# In the frames returned by make_ds "time" is the index, not a column, so a
# plain drop of "time" raises KeyError. errors="ignore" skips it in that case
# and still removes it if the frame has been reset_index()-ed.
non_feature_cols = ["time", "label", "latitude", "longitude"]
x_train = df_train.drop(columns=non_feature_cols, errors="ignore").to_numpy()
y_train = df_train["label"].to_numpy().reshape(-1, 1)
x_val = df_val.drop(columns=non_feature_cols, errors="ignore").to_numpy()
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [87]:
# Gradient-boosted tree classifier; n_jobs=-1 uses all available cores.
xgb_params = dict(
    n_estimators=1000,
    max_depth=10,
    learning_rate=0.01,
    n_jobs=-1,
)
model = xgb.XGBClassifier(**xgb_params)

In [88]:
# eval_metric / early_stopping_rounds as fit() arguments are deprecated since
# XGBoost 1.6 and no longer accepted by XGBoost 2.x, so they are configured on
# the estimator instead. Training stops once the validation log-loss has not
# improved for 10 consecutive rounds.
model.set_params(eval_metric='logloss', early_stopping_rounds=10)
model.fit(x_train, y_train, eval_set=[(x_val, y_val)])

[0]	validation_0-logloss:0.68654
[1]	validation_0-logloss:0.68007
[2]	validation_0-logloss:0.67372
[3]	validation_0-logloss:0.66750
[4]	validation_0-logloss:0.66138
[5]	validation_0-logloss:0.65537
[6]	validation_0-logloss:0.64949
[7]	validation_0-logloss:0.64371
[8]	validation_0-logloss:0.63804
[9]	validation_0-logloss:0.63247
[10]	validation_0-logloss:0.62700
[11]	validation_0-logloss:0.62163
[12]	validation_0-logloss:0.61636
[13]	validation_0-logloss:0.61116
[14]	validation_0-logloss:0.60607
[15]	validation_0-logloss:0.60106
[16]	validation_0-logloss:0.59615
[17]	validation_0-logloss:0.59132
[18]	validation_0-logloss:0.58656
[19]	validation_0-logloss:0.58189
[20]	validation_0-logloss:0.57731
[21]	validation_0-logloss:0.57280
[22]	validation_0-logloss:0.56836
[23]	validation_0-logloss:0.56399
[24]	validation_0-logloss:0.55970
[25]	validation_0-logloss:0.55548
[26]	validation_0-logloss:0.55133
[27]	validation_0-logloss:0.54724
[28]	validation_0-logloss:0.54321
[29]	validation_0-loglos

In [90]:
# Predict a class label for every validation row with the fitted classifier.
y_pred = model.predict(x_val)

In [91]:
# Validation accuracy and F1 score of the predictions, shown as a tuple.
accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)

(0.8938714841070203, 0.4426564188783475)