# data_world.csv

Energia grupowana po regionie i czasie + informacje o regionie + informacje o sasiadach. LSTM, Dense, XGB

In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import datetime as dt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
import xgboost as xgb
warnings.filterwarnings('ignore')

In [144]:
df = pd.read_csv('../../data/merge.csv')
df.head()

Unnamed: 0,network_code,receiver_code,receiver_type,receiver_latitude,receiver_longitude,receiver_elevation_m,p_arrival_sample,p_status,p_weight,p_travel_sec,...,source_magnitude_author,source_mechanism_strike_dip_rake,source_distance_deg,source_distance_km,back_azimuth_deg,snr_db,coda_end_sample,trace_start_time,trace_category,trace_name
0,TA,109C,HH,32.8889,-117.1051,150.0,,,,,...,,,,,,,,2015-10-21 05:55:00,noise,109C.TA_201510210555_NO
1,TA,109C,HH,32.8889,-117.1051,150.0,,,,,...,,,,,,,,2015-11-06 14:50:00,noise,109C.TA_201511061450_NO
2,TA,109C,HH,32.8889,-117.1051,150.0,,,,,...,,,,,,,,2015-11-07 02:20:00,noise,109C.TA_201511070220_NO
3,TA,109C,HH,32.8889,-117.1051,150.0,,,,,...,,,,,,,,2015-11-14 05:15:00,noise,109C.TA_201511140515_NO
4,TA,109C,HH,32.8889,-117.1051,150.0,,,,,...,,,,,,,,2015-12-25 18:50:00,noise,109C.TA_201512251850_NO


In [145]:
df = df[df["trace_category"] == 'earthquake_local']
df.rename(columns={"source_latitude": "latitude", "source_longitude": "longitude", "source_magnitude": "mag", "trace_start_time": "time"}, inplace=True)

In [146]:
df = df[["time", "latitude", "longitude", "mag"]]

In [147]:
df["time"] = pd.to_datetime(df["time"], format="mixed")
df

Unnamed: 0,time,latitude,longitude,mag
235426,2006-07-23 15:59:00.960,33.74960,-117.49380,3.60
235427,2006-11-03 15:56:53.610,32.70770,-116.04460,4.30
235428,2006-11-03 16:12:24.700,32.72530,-116.03480,3.60
235429,2006-11-14 13:32:22.540,32.70630,-116.02410,3.80
235430,2006-11-27 10:46:41.060,31.96790,-117.19440,3.60
...,...,...,...,...
1265652,2017-06-21 09:40:43.810,44.77950,-111.03833,0.39
1265653,2017-06-21 12:21:05.390,44.76967,-110.99700,1.72
1265654,2017-06-21 12:23:40.200,44.77100,-110.99900,1.61
1265655,2017-06-21 13:34:40.330,44.77567,-111.03983,1.25


In [148]:
time_cut = dt.datetime(2004, 1, 1)
df = df[df["time"] > time_cut]
df["energy"] = 10**(1.44*df["mag"]+5.24)
df["label"] = np.where(df["mag"] > 5, 1, 0)

In [149]:
# keep only year and month in time
df["time"] = df["time"].dt.strftime('%Y-%m')

In [150]:
geo_split = 10

In [151]:
df["latitude"] = (df["latitude"] // geo_split).astype(int)
df["latitude"] = df["latitude"] + np.abs(np.min(df["latitude"]))

df["longitude"] = (df["longitude"] // geo_split).astype(int)
df["longitude"] = df["longitude"] + np.abs(np.min(df["longitude"]))
df["pos"] = df["latitude"].astype(str) + "_" + df["longitude"].astype(str)

In [152]:
def filter_regions(df: pd.DataFrame, threshold: int) -> pd.DataFrame:
    tmp_pos = []
    df_agg = df.groupby(["pos", "time"]).agg({"mag": "count", "label": max}).reset_index()
    for pos in df_agg["pos"].unique():
        tmp = df_agg[df_agg["pos"] == pos]
        tmp = tmp.set_index("time")
        idx = pd.date_range(tmp.index.min(), df_agg["time"].max(), freq="MS").strftime('%Y-%m')
        tmp = tmp.reindex(idx, fill_value=0)
        tmp.index.name = "time"
        tmp["label"] = tmp["label"].shift(-1)
        if tmp["label"].sum() / len(tmp) > threshold:
            tmp_pos.append(pos)
    df = df[df["pos"].isin(tmp_pos)]
    return df

In [153]:
th = 0.05
df = filter_regions(df, th)

In [154]:
df

Unnamed: 0,time,latitude,longitude,mag,energy,label,pos
235426,2006-07,8,6,3.6,2.654606e+10,0,8_6
235427,2006-11,8,6,4.3,2.703958e+11,0,8_6
235428,2006-11,8,6,3.6,2.654606e+10,0,8_6
235429,2006-11,8,6,3.8,5.152286e+10,0,8_6
235430,2006-11,8,6,3.6,2.654606e+10,0,8_6
...,...,...,...,...,...,...,...
1263522,2018-07,8,6,1.2,9.289664e+06,0,8_6
1263523,2018-07,8,6,0.9,3.435579e+06,0,8_6
1263524,2018-07,8,6,0.9,3.435579e+06,0,8_6
1263525,2018-07,8,6,1.1,6.668068e+06,0,8_6


In [155]:
df_agg = df.groupby(["latitude", "longitude", "time"]).agg({"energy": "sum", "label": "max"}).reset_index()
df_agg

Unnamed: 0,latitude,longitude,time,energy,label
0,1,10,2005-03,1.107843e+12,0
1,1,10,2010-06,5.058247e+09,0
2,1,10,2010-07,1.509850e+13,1
3,1,10,2010-08,5.393239e+12,0
4,1,10,2010-09,1.742608e+13,1
...,...,...,...,...,...
1573,11,3,2018-08,6.167129e+14,1
1574,11,3,2018-09,8.170281e+11,0
1575,11,3,2018-10,1.054681e+13,0
1576,11,3,2018-11,4.498881e+15,1


In [156]:
df_agg["label"].value_counts()[0] / df_agg["label"].value_counts().sum()

0.8523447401774398

In [102]:
def make_ds(df_agg):
    dfs = []
    for i in df_agg["latitude"].unique():
        for j in df_agg["longitude"].unique():
            tmp = df_agg[(df_agg["latitude"] == i) & (df_agg["longitude"] == j)]
            if len(tmp) > 1:
                tmp = tmp.set_index("time")
                idx = pd.date_range(tmp.index.min(), df_agg["time"].max(), freq="MS").strftime('%Y-%m')
                tmp = tmp.reindex(idx, fill_value=0)
                tmp.index.name = "time"
                tmp["label"] = tmp["label"].shift(-1)
                tmp["latitude"] = i
                tmp["longitude"] = j
                tmp["neighbor0"] = 0 
                dfs.append(tmp)
    df_final = pd.concat(dfs)
    return df_final

In [43]:
df_final = make_ds(df_agg)

In [44]:
df_final

Unnamed: 0_level_0,latitude,longitude,energy,label,neighbor0
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-07,2,354,5.248075e+11,0.0,0
2004-08,2,354,0.000000e+00,0.0,0
2004-09,2,354,0.000000e+00,0.0,0
2004-10,2,354,0.000000e+00,0.0,0
2004-11,2,354,0.000000e+00,0.0,0
...,...,...,...,...,...
2018-08,113,35,4.093718e+14,0.0,0
2018-09,113,35,5.342576e+11,0.0,0
2018-10,113,35,1.808764e+12,0.0,0
2018-11,113,35,1.097306e+12,0.0,0


In [45]:
for i in df_final["latitude"].unique():
    for j in df_final["longitude"].unique():
        if not df_final[(df_final["latitude"] == i) & (df_final["longitude"] == j)].empty:
            for x in [-5, 0, 5]:
                for y in [-5, 0, 5]:
                    if not df_final[(df_final["latitude"] == i + x) & (df_final["longitude"] == j + y)].empty and not (x == 0 and y == 0):
                        df_final.loc[(df_final["latitude"] == i) & (df_final["longitude"] == j), "neighbor0"] += df_final[(df_final["latitude"] == i + x) & (df_final["longitude"] == j + y)]["energy"]
                        df_final["neighbor0"] = df_final["neighbor0"].fillna(0)

In [57]:
block_size = 60
dfs_train, dfs_val = [], []
for i in df_final["latitude"].unique():
    for j in df_final["longitude"].unique():
        tmp = df_final[(df_final["latitude"] == i) & (df_final["longitude"] == j)]
        n = int(len(tmp) * 0.2)
        tmp1 = tmp[:-n]
        tmp2 = tmp[-n:]
        if not tmp1.empty and not tmp2.empty:
            scaler = StandardScaler()
            tmp1["neighbor0"] = scaler.fit_transform(tmp1[["neighbor0"]])
            tmp2["neighbor0"] = scaler.transform(tmp2[["neighbor0"]])
            tmp1["energy0"] = scaler.fit_transform(tmp1[["energy"]])
            tmp2["energy0"] = scaler.transform(tmp2[["energy"]])
            for idx in range(1, block_size):
                tmp1["neighbor" + str(idx)] = tmp1["neighbor0"].shift(idx)
                tmp2["neighbor" + str(idx)] = tmp2["neighbor0"].shift(idx)
                tmp1["energy" + str(idx)] = tmp1["energy0"].shift(idx)
                tmp2["energy" + str(idx)] = tmp2["energy0"].shift(idx)  
            dfs_train.append(tmp1)
            dfs_val.append(tmp2)
df_final_train = pd.concat(dfs_train)
df_final_val = pd.concat(dfs_val)

In [58]:
df_final_train

Unnamed: 0_level_0,latitude,longitude,energy,label,neighbor0,energy0,neighbor1,energy1,neighbor2,energy2,...,neighbor55,energy55,neighbor56,energy56,neighbor57,energy57,neighbor58,energy58,neighbor59,energy59
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-07,2,354,5.248075e+11,0.0,0.0,-0.110567,,,,,...,,,,,,,,,,
2004-08,2,354,0.000000e+00,0.0,0.0,-0.115002,0.0,-0.110567,,,...,,,,,,,,,,
2004-09,2,354,0.000000e+00,0.0,0.0,-0.115002,0.0,-0.115002,0.0,-0.110567,...,,,,,,,,,,
2004-10,2,354,0.000000e+00,0.0,0.0,-0.115002,0.0,-0.115002,0.0,-0.115002,...,,,,,,,,,,
2004-11,2,354,0.000000e+00,0.0,0.0,-0.115002,0.0,-0.115002,0.0,-0.115002,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07,113,35,2.130508e+08,0.0,0.0,-0.151674,0.0,-0.152197,0.0,-0.152725,...,,,,,,,,,,
2017-08,113,35,2.987360e+09,0.0,0.0,-0.137989,0.0,-0.151674,0.0,-0.152197,...,,,,,,,,,,
2017-09,113,35,2.037611e+08,0.0,0.0,-0.151720,0.0,-0.137989,0.0,-0.151674,...,,,,,,,,,,
2017-10,113,35,4.164859e+09,0.0,0.0,-0.132180,0.0,-0.151720,0.0,-0.137989,...,,,,,,,,,,


In [59]:
df_final_train.dropna(inplace=True)
df_final_val.dropna(inplace=True)

In [60]:
x_train = df_final_train.drop(["label", "longitude", "latitude", "energy"], axis=1).to_numpy()
y_train = df_final_train["label"].to_numpy()
x_val = df_final_val.drop(["label", "longitude", "latitude", "energy"], axis=1).to_numpy()
y_val = df_final_val["label"].to_numpy()

In [61]:
x_train = x_train.reshape(-1, block_size, 2)
x_val = x_val.reshape(-1, block_size, 2)
y_train = y_train.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

In [62]:
# add anoother column whith 1 - label
y_train = np.concatenate((y_train, 1 - y_train), axis=1)
y_val = np.concatenate((y_val, 1 - y_val), axis=1)

In [63]:
x_pos_train = df_final_train[["longitude", "latitude"]].to_numpy()
x_pos_val = df_final_val[["longitude", "latitude"]].to_numpy()

In [64]:
class MyModel(tf.keras.Model):
    def __init__(self, n_embed, vocab_size):
        super(MyModel, self).__init__()
        self.n_embed = n_embed
        self.loc_emb_x = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.loc_emb_y = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.conc = tf.keras.layers.Concatenate()
        self.dense_h = tf.keras.layers.Dense(2 * n_embed)
        self.dense_c = tf.keras.layers.Dense(2 * n_embed)
        self.lstm1 = tf.keras.layers.LSTM(n_embed * 2, return_sequences=True)
        self.lstm2 = tf.keras.layers.LSTM(n_embed * 2)
        self.dense = tf.keras.layers.Dense(n_embed, activation='relu')
        self.out = tf.keras.layers.Dense(2)
        

    def call(self, inputs):
        loc, timestep = inputs
        x = self.loc_emb_x(loc[:, 0])
        y = self.loc_emb_y(loc[:, 1])
        x_loc = self.conc([x, y])
        x_loc = tf.reshape(x_loc, [-1, 2 * self.n_embed])
        h = self.dense_h(x_loc)
        c = self.dense_c(x_loc)
        x = self.lstm1(timestep, initial_state=[c, h])
        x = self.lstm2(x, initial_state=[c, h])
        x = self.dense(x)
        return self.out(x)

In [65]:
n_embed = 256
vocab_size = 2000
model = MyModel(n_embed, vocab_size)

In [66]:
model.compile(optimizer='adam',
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy', tf.keras.metrics.F1Score()])

In [67]:
model.fit([x_pos_train, x_train], y_train, epochs=20, batch_size=32, validation_data=([x_pos_val, x_val], y_val))

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20

KeyboardInterrupt: 

In [168]:
x_train = df_final_train.drop(["label", "energy"], axis=1).to_numpy()
y_train = df_final_train["label"].to_numpy()
x_val = df_final_val.drop(["label", "energy"], axis=1).to_numpy()
y_val = df_final_val["label"].to_numpy()

In [169]:
model = xgb.XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.01, n_jobs=-1)

In [170]:
model.fit(x_train, y_train, eval_set=[(x_val, y_val)], eval_metric='logloss', early_stopping_rounds=10)

[0]	validation_0-logloss:0.68624
[1]	validation_0-logloss:0.67951
[2]	validation_0-logloss:0.67290
[3]	validation_0-logloss:0.66642
[4]	validation_0-logloss:0.66005
[5]	validation_0-logloss:0.65382
[6]	validation_0-logloss:0.64769
[7]	validation_0-logloss:0.64167
[8]	validation_0-logloss:0.63576
[9]	validation_0-logloss:0.62997
[10]	validation_0-logloss:0.62427
[11]	validation_0-logloss:0.61870
[12]	validation_0-logloss:0.61321
[13]	validation_0-logloss:0.60781
[14]	validation_0-logloss:0.60252
[15]	validation_0-logloss:0.59730
[16]	validation_0-logloss:0.59220
[17]	validation_0-logloss:0.58717
[18]	validation_0-logloss:0.58222
[19]	validation_0-logloss:0.57740
[20]	validation_0-logloss:0.57263
[21]	validation_0-logloss:0.56793
[22]	validation_0-logloss:0.56329
[23]	validation_0-logloss:0.55871
[24]	validation_0-logloss:0.55423
[25]	validation_0-logloss:0.54980
[26]	validation_0-logloss:0.54549
[27]	validation_0-logloss:0.54126
[28]	validation_0-logloss:0.53705
[29]	validation_0-loglos

In [171]:
y_pred = model.predict(x_val)
accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)

(0.9064726515400636, 0.5715003138731952)