# data_world.csv

Energia grupowana po regionie i czasie + informacje o regionie. LSTM, Dense, XGB

In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
import xgboost as xgb
warnings.filterwarnings('ignore')

In [195]:
# Load the raw event table; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/data_world.csv')
df.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,1930-12-08T08:01:02.000Z,23.261,120.277,15.0,6.3,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,
1,1930-12-03T18:51:47.000Z,18.233,96.298,10.0,7.4,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,
2,1930-12-02T07:01:30.000Z,25.854,98.356,35.0,6.2,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,
3,1930-11-28T07:32:56.000Z,18.779,-106.767,15.0,6.3,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,
4,1930-11-25T19:02:53.000Z,35.05,139.129,15.0,6.9,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,


In [196]:
# Keep only the columns used downstream. The explicit copy makes `df` an
# independent frame, so the column assignments in the following cells do not
# operate on a selection of the raw table (which pandas reports with
# SettingWithCopyWarning).
df = df[["time", "latitude", "longitude", "mag"]].copy()

In [197]:
# Parse the timestamp strings; the trailing "Z" is matched as a literal
# character, so the resulting datetimes are timezone-naive.
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%dT%H:%M:%S.%fZ")
df

Unnamed: 0,time,latitude,longitude,mag
0,1930-12-08 08:01:02.000,23.2610,120.2770,6.3
1,1930-12-03 18:51:47.000,18.2330,96.2980,7.4
2,1930-12-02 07:01:30.000,25.8540,98.3560,6.2
3,1930-11-28 07:32:56.000,18.7790,-106.7670,6.3
4,1930-11-25 19:02:53.000,35.0500,139.1290,6.9
...,...,...,...,...
797041,2018-09-01 01:14:38.230,-30.4830,-177.9279,4.3
797042,2018-09-01 01:07:59.120,-10.7558,124.3621,4.0
797043,2018-09-01 01:00:13.810,-5.5167,147.1735,4.6
797044,2018-09-01 00:27:11.440,46.8819,155.6566,4.3


In [198]:
# Keep events strictly after 1973-01-01. The copy detaches the filtered frame
# so the column assignments below do not target a slice of the previous `df`.
time_cut = dt.datetime(1973, 1, 1)
df = df[df["time"] > time_cut].copy()
# Magnitude -> energy via log10(energy) = 1.44 * mag + 5.24.
# NOTE(review): the coefficients look like a standard magnitude-energy
# relation; confirm the source and the resulting units.
df["energy"] = 10**(1.44*df["mag"]+5.24)
# Binary target: 1 when the event magnitude is strictly greater than 5.
# NOTE(review): filter_regions counts events with mag >= 5, so a magnitude of
# exactly 5 is treated differently in the two places; confirm the intent.
df["label"] = np.where(df["mag"] > 5, 1, 0)

In [199]:
# Collapse timestamps to 'YYYY-MM' strings so events can be grouped by month.
# After this cell `time` holds strings, so running the cell a second time
# fails (`.dt` requires datetime values).
df["time"] = df["time"].dt.strftime('%Y-%m')

In [200]:
# Width of one spatial bin, in coordinate units (used as the floor-division
# step for latitude and longitude in the next cell).
geo_split = 1

In [201]:
# Turn both coordinates into integer grid indices: floor-divide by the bin
# width, then add the absolute value of the smallest bin index.
for coord in ("latitude", "longitude"):
    binned = (df[coord] // geo_split).astype(int)
    df[coord] = binned + np.abs(np.min(binned))

# String key identifying one grid cell, e.g. "24_153".
df["pos"] = df["latitude"].astype(str) + "_" + df["longitude"].astype(str)

In [202]:
def filter_regions(df: pd.DataFrame, threshold: int, min_mag: float = 5) -> pd.DataFrame:
    """Keep only the rows of regions that have enough strong events.

    A region is identified by the ``pos`` column. It is kept when it contains
    at least ``threshold`` rows whose ``mag`` is ``>= min_mag``. All rows of a
    kept region are returned, including those below ``min_mag``.

    Args:
        df: Frame with at least the columns ``pos`` and ``mag``.
        threshold: Minimum number of strong events a region must contain.
        min_mag: Magnitude from which an event counts as strong (inclusive).

    Returns:
        The rows of ``df`` that belong to the kept regions, in original order.
    """
    # Count strong events per region in one vectorized pass instead of
    # iterating over the aggregated rows.
    strong_counts = df.loc[df["mag"] >= min_mag].groupby("pos")["mag"].count()
    kept_regions = strong_counts[strong_counts >= threshold].index
    return df[df["pos"].isin(kept_regions)]

In [203]:
# Minimum number of mag >= 5 events a grid cell needs in order to be kept.
th = 45
df = filter_regions(df, th)

In [204]:
# One row per (grid cell, month): total released energy and whether any event
# in that month carried label 1.
df_agg = (
    df.groupby(["latitude", "longitude", "time"], as_index=False)
      .agg(energy=("energy", "sum"), label=("label", "max"))
)
df_agg

Unnamed: 0,latitude,longitude,time,energy,label
0,24,153,1974-04,1.037528e+13,1
1,24,153,1974-05,3.772820e+12,0
2,24,153,1976-10,1.419058e+12,0
3,24,153,1977-05,1.060177e+13,1
4,24,153,1978-04,1.976970e+12,0
...,...,...,...,...,...
80377,149,162,2018-05,2.703958e+11,0
80378,149,162,2018-06,1.018591e+12,0
80379,149,162,2018-09,5.248075e+11,0
80380,149,162,2018-10,5.248075e+11,0


In [205]:
def make_ds(df_agg: pd.DataFrame, block_size: int) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Build per-region monthly train/validation frames with lagged energy.

    For every (latitude, longitude) pair with more than one row in ``df_agg``:

    1. the region is reindexed to a gap-free monthly index between its first
       and last month, missing months being filled with 0;
    2. ``label`` is replaced by the next month's label (``shift(-1)``), which
       leaves NaN in the region's last row;
    3. the months are split chronologically, the first 80 % for training and
       the rest for validation;
    4. ``energy`` is min-max scaled with a scaler fitted on the training part;
    5. columns ``energy0`` .. ``energy{block_size - 1}`` are added, where
       ``energy{k}`` is ``energy`` shifted by ``k`` rows within the split, so
       the first rows of each split contain NaN lags.

    Rows containing NaN are not removed here. As a side effect the function
    prints, per region, the share of zero-energy months in the training part.

    Args:
        df_agg: Frame with columns ``latitude``, ``longitude``, ``time``
            ('%Y-%m' strings), ``energy`` and ``label``.
        block_size: Number of lag columns to create.

    Returns:
        ``(train, val)`` frames indexed by ``time``, all regions concatenated.

    Raises:
        ValueError: Raised by ``pd.concat`` when no region has more than one row.
    """
    # Split the frame by region once; the loops below only look regions up
    # instead of re-filtering the whole frame for every lat/lon combination.
    regions = {
        key: group
        for key, group in df_agg.groupby(["latitude", "longitude"], sort=False)
    }
    dfs_train, dfs_val = [], []
    # The iteration order (latitudes, then longitudes, each in order of first
    # appearance) fixes the row order of the concatenated result.
    for i in df_agg["latitude"].unique():
        for j in df_agg["longitude"].unique():
            region = regions.get((i, j))
            if region is None or len(region) <= 1:
                continue
            tmp = region.set_index("time")
            months = pd.date_range(tmp.index.min(), tmp.index.max(), freq="MS").strftime('%Y-%m')
            tmp = tmp.reindex(months, fill_value=0)
            tmp.index.name = "time"
            # The target of month t is the label of month t + 1.
            tmp["label"] = tmp["label"].shift(-1)
            # reindex() filled the coordinates of inserted months with 0.
            tmp["latitude"] = i
            tmp["longitude"] = j
            n = int(0.8 * len(tmp))
            # Copy the slices so the assignments below write to independent
            # frames rather than to selections of `tmp`.
            df_train = tmp[:n].copy()
            df_val = tmp[n:].copy()
            print(i,j,(df_train["energy"] == 0).sum() / len(df_train))
            # Fit the scaler on the training months only and reuse it for
            # validation.
            scaler = MinMaxScaler()
            df_train["energy"] = scaler.fit_transform(df_train[["energy"]])
            df_val["energy"] = scaler.transform(df_val[["energy"]])
            for lag in range(block_size):
                df_train["energy" + str(lag)] = df_train["energy"].shift(lag)
                df_val["energy" + str(lag)] = df_val["energy"].shift(lag)
            dfs_train.append(df_train)
            dfs_val.append(df_val)
    df_final_train = pd.concat(dfs_train)
    df_final_val = pd.concat(dfs_val)
    return df_final_train, df_final_val

In [206]:
# Window length: number of lag columns (energy0 .. energy15) created per row.
block_size = 16
df_train, df_val = make_ds(df_agg, block_size)
df_train

24 153 0.8787878787878788
25 153 0.7231121281464531
26 154 0.6819221967963387
27 154 0.634090909090909
28 153 0.6933638443935927
28 154 0.8073394495412844
28 152 0.5636363636363636
29 153 0.7840909090909091
29 152 0.5795454545454546
29 151 0.8
46 106 0.9227272727272727
47 106 0.8590909090909091
48 106 0.8758782201405152
48 357 0.7624703087885986
50 107 0.5844748858447488
50 108 0.5023923444976076
51 107 0.4470046082949309
51 108 0.44874715261959
51 0 0.6113636363636363
51 1 0.6727272727272727
52 108 0.3325740318906606
52 0 0.6903669724770642
52 1 0.6032110091743119
53 108 0.45
53 0 0.6450116009280742
53 1 0.5694444444444444
53 2 0.7136150234741784
53 112 0.6866359447004609
54 108 0.47165532879818595
54 1 0.5194508009153318
54 2 0.4590909090909091
55 108 0.547945205479452
55 1 0.6990740740740741
55 2 0.5818181818181818
55 3 0.7909090909090909
56 2 0.726027397260274
56 3 0.7454128440366973
57 108 0.7522727272727273
57 3 0.6628440366972477
58 2 0.7695852534562212
58 3 0.7928571428571428
5

Unnamed: 0_level_0,latitude,longitude,energy,label,energy0,energy1,energy2,energy3,energy4,energy5,energy6,energy7,energy8,energy9,energy10,energy11,energy12,energy13,energy14,energy15
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1974-04,24,153,0.004837,0.0,0.004837,,,,,,,,,,,,,,,
1974-05,24,153,0.001759,0.0,0.001759,0.004837,,,,,,,,,,,,,,
1974-06,24,153,0.000000,0.0,0.000000,0.001759,0.004837,,,,,,,,,,,,,
1974-07,24,153,0.000000,0.0,0.000000,0.000000,0.001759,0.004837,,,,,,,,,,,,
1974-08,24,153,0.000000,0.0,0.000000,0.000000,0.000000,0.001759,0.004837,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2009-06,149,162,0.000000,0.0,0.000000,0.000000,0.000000,0.047918,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-07,149,162,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.047918,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-08,149,162,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.047918,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-09,149,162,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.047918,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [207]:
# Discard rows with NaN: incomplete lag windows at the start of each split and
# rows whose shifted label is missing.
df_train = df_train.dropna()
df_val = df_val.dropna()

In [208]:
# Class distribution of the training target.
df_train["label"].value_counts()

label
0.0    138906
1.0     15202
Name: count, dtype: int64

In [209]:
class MyModel(tf.keras.Model):
    """Two-layer LSTM classifier whose initial state depends on the location.

    The two location indices are embedded separately, concatenated and mapped
    by two dense layers to the initial hidden and cell state of the first
    LSTM. The second LSTM output passes through a ReLU dense layer and a final
    2-unit dense layer without activation (logits).

    NOTE(review): a second class with the same name is defined further down in
    this notebook and replaces this one once its cell has run.

    Args:
        n_embed: Embedding size per coordinate; the first LSTM has
            ``2 * n_embed`` units, the second ``n_embed``.
        vocab_size: Number of rows of each embedding table; every location
            index must be smaller than this value.
    """

    def __init__(self, n_embed, vocab_size):
        super().__init__()
        self.n_embed = n_embed
        self.loc_emb_x = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.loc_emb_y = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.lstm1 = tf.keras.layers.LSTM(n_embed * 2, return_sequences=True)
        self.lstm2 = tf.keras.layers.LSTM(n_embed)
        self.dense_h = tf.keras.layers.Dense(2 * n_embed)
        self.dense_c = tf.keras.layers.Dense(2 * n_embed)
        self.dense = tf.keras.layers.Dense(n_embed, activation='relu')
        self.conc = tf.keras.layers.Concatenate()
        self.out = tf.keras.layers.Dense(2)

    def call(self, inputs):
        """Return 2 logits per sample for ``inputs = (loc, timestep)``.

        ``loc`` is indexed as ``loc[:, 0]`` and ``loc[:, 1]`` (two integer
        indices per sample). ``timestep`` is passed to the LSTM unchanged;
        presumably shaped (batch, steps, features) — confirm with the caller.
        """
        loc, timestep = inputs
        x = self.loc_emb_x(loc[:, 0])
        y = self.loc_emb_y(loc[:, 1])
        x_loc = self.conc([x, y])
        x_loc = tf.reshape(x_loc, [-1, 2 * self.n_embed])
        h = self.dense_h(x_loc)
        c = self.dense_c(x_loc)
        # Keras expects the LSTM initial state as [hidden state, cell state].
        x = self.lstm1(timestep, initial_state=[h, c])
        x = self.lstm2(x)
        x = self.dense(x)
        return self.out(x)

In [210]:
# Build LSTM inputs of shape (samples, block_size, 1) from the lag columns.
# make_ds defines energy{k} as energy shifted by k months, so energy0 is the
# most recent month. The columns are selected from the largest lag down to
# lag 0 so that each window is presented to the LSTM in chronological order
# (oldest month first, current month last).
lag_cols = ["energy" + str(k) for k in reversed(range(block_size))]
x_train = df_train[lag_cols].to_numpy().reshape(-1, block_size, 1)
y_train = df_train["label"].to_numpy().reshape(-1, 1)
x_val = df_val[lag_cols].to_numpy().reshape(-1, block_size, 1)
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [211]:
# Share of the smallest label value (class 0 here) in the training target.
unique, counts = np.unique(y_train, return_counts=True)
counts[0] / len(y_train)

0.9013548939704623

In [212]:
# Integer grid indices per sample, column order [latitude, longitude]; these
# feed the location embeddings of the model.
x_pos_train = df_train[["latitude", "longitude"]].to_numpy()
x_pos_val = df_val[["latitude", "longitude"]].to_numpy()

In [213]:
# Add another column with 1 - label, giving two-column targets:
# column 0 is the original label, column 1 its complement.
y_train = np.concatenate((y_train, 1 - y_train), axis=1)
y_val = np.concatenate((y_val, 1 - y_val), axis=1)

In [214]:
# Embedding width and number of rows per embedding table; vocab_size must
# exceed the largest latitude/longitude index passed to the model.
n_embed = 256
vocab_size = 2000
model = MyModel(n_embed, vocab_size)

In [215]:
# The model outputs two logits per sample, hence cross-entropy on logits
# against the two-column targets.
model.compile(optimizer='adam',
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy', tf.keras.metrics.F1Score()])

In [216]:
# Inputs are passed as [location indices, energy windows].
# NOTE: the saved output of this cell ends in a KeyboardInterrupt, i.e. the
# recorded run did not finish all 20 epochs.
model.fit([x_pos_train, x_train], y_train, epochs=20, batch_size=2048, validation_data=([x_pos_val, x_val], y_val))

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

KeyboardInterrupt: 

: 

In [24]:
class MyModel(tf.keras.Model):
    """Two-layer LSTM classifier with location-embedded initial states.

    Two embedding tables map the location indices directly to the initial
    hidden state and the initial cell state. Both coordinates of a sample are
    looked up in the same table and their vectors are flattened side by side,
    which yields states of size ``2 * n_embed``. The same initial state is fed
    to both LSTM layers. The head is a ReLU dense layer followed by a 2-unit
    dense layer without activation (logits).

    NOTE(review): this definition reuses the name of the class defined earlier
    in the notebook and replaces it once this cell has run.

    Args:
        n_embed: Embedding size per coordinate; both LSTMs have
            ``2 * n_embed`` units.
        vocab_size: Number of rows of each embedding table; every location
            index must be smaller than this value.
    """

    def __init__(self, n_embed, vocab_size):
        super().__init__()
        self.n_embed = n_embed
        self.loc_emb_h = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.loc_emb_c = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.lstm1 = tf.keras.layers.LSTM(n_embed * 2, return_sequences=True)
        self.lstm2 = tf.keras.layers.LSTM(n_embed * 2)
        self.dense = tf.keras.layers.Dense(n_embed, activation='relu')
        self.out = tf.keras.layers.Dense(2)

    def call(self, inputs):
        """Return 2 logits per sample for ``inputs = (loc, timestep)``.

        Assumes ``loc`` holds two integer indices per sample, so that the
        flattened embeddings have ``2 * n_embed`` values — confirm with the
        caller. ``timestep`` is passed to the first LSTM unchanged.
        """
        loc, timestep = inputs
        h = self.loc_emb_h(loc)
        c = self.loc_emb_c(loc)
        # Flatten the per-coordinate embeddings to (batch_size, 2 * n_embed).
        h = tf.reshape(h, [-1, 2 * self.n_embed])
        c = tf.reshape(c, [-1, 2 * self.n_embed])
        # Keras expects the LSTM initial state as [hidden state, cell state].
        x = self.lstm1(timestep, initial_state=[h, c])
        x = self.lstm2(x, initial_state=[h, c])
        x = self.dense(x)
        return self.out(x)

In [25]:
# Same hyper-parameters as for the first model; MyModel now refers to the
# class defined in the preceding cell.
n_embed = 256
vocab_size = 2000
model = MyModel(n_embed, vocab_size)

In [26]:
# The model outputs two logits per sample, hence cross-entropy on logits
# against the two-column targets.
model.compile(optimizer='adam',
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy', tf.keras.metrics.F1Score()])

In [27]:
# Requires x_pos_*, the 3-D x_* windows and the two-column y_* arrays from the
# LSTM data-preparation cells; later cells rebind x_train/y_train.
model.fit([x_pos_train, x_train], y_train, epochs=20, batch_size=512, validation_data=([x_pos_val, x_val], y_val))

Epoch 1/20


2023-09-21 20:09:23.402333: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
Could not load symbol cublasGetSmCountTarget from libcublas.so.11. Error: /home/majkel/miniconda3/lib/libcublas.so.11: undefined symbol: cublasGetSmCountTarget
2023-09-21 20:09:24.129018: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x3e6b57f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-09-21 20:09:24.129172: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2060 with Max-Q Design, Compute Capability 7.5
2023-09-21 20:09:24.414868: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-09-21 20:09:25.767611: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7fbb71191f30>

In [38]:
# Tabular features for the dense network: every column except the target.
# In the frames returned by make_ds `time` is the index, not a column, so
# requiring it in drop() raises KeyError; errors="ignore" removes it only if
# it happens to be a regular column.
# NOTE(review): `energy` and `energy0` hold the same values, so one of the
# two features is redundant.
x_train = df_train.drop(columns=["label"]).drop(columns=["time"], errors="ignore").to_numpy()
y_train = df_train["label"].to_numpy().reshape(-1, 1)
x_val = df_val.drop(columns=["label"]).drop(columns=["time"], errors="ignore").to_numpy()
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [39]:
# Add another column with 1 - label, giving two-column targets:
# column 0 is the original label, column 1 its complement.
y_train = np.concatenate((y_train, 1 - y_train), axis=1)
y_val = np.concatenate((y_val, 1 - y_val), axis=1)

In [40]:
# Fully connected baseline on the tabular features; the last layer has no
# activation and emits two logits per sample.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(block_size + 2, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(2)
])

In [41]:
# The model outputs two logits per sample, hence cross-entropy on logits
# against the two-column targets.
model.compile(optimizer='adam',
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy', tf.keras.metrics.F1Score()])

In [42]:
# Train the dense baseline on the 2-D feature matrix and two-column targets.
model.fit(x_train, y_train, epochs=20, batch_size=512, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7fbb707f4250>

In [29]:
# Tabular features and single-column targets for XGBoost.
# In the frames returned by make_ds `time` is the index, not a column, so
# requiring it in drop() raises KeyError; errors="ignore" removes it only if
# it happens to be a regular column.
x_train = df_train.drop(columns=["label"]).drop(columns=["time"], errors="ignore").to_numpy()
y_train = df_train["label"].to_numpy().reshape(-1, 1)
x_val = df_val.drop(columns=["label"]).drop(columns=["time"], errors="ignore").to_numpy()
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [30]:
# eval_metric and early_stopping_rounds are estimator parameters: passing
# them to fit() is deprecated since XGBoost 1.6 and no longer accepted by
# XGBoost 2.x, so they are set on the classifier itself.
model = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=10,
    learning_rate=0.01,
    n_jobs=-1,
    eval_metric='logloss',
    early_stopping_rounds=10,
)
# Training stops once the validation log-loss has not improved for 10 rounds.
model.fit(x_train, y_train, eval_set=[(x_val, y_val)])

[0]	validation_0-logloss:0.68621
[1]	validation_0-logloss:0.67941
[2]	validation_0-logloss:0.67275
[3]	validation_0-logloss:0.66620
[4]	validation_0-logloss:0.65977
[5]	validation_0-logloss:0.65347
[6]	validation_0-logloss:0.64731
[7]	validation_0-logloss:0.64124
[8]	validation_0-logloss:0.63528
[9]	validation_0-logloss:0.62942
[10]	validation_0-logloss:0.62366
[11]	validation_0-logloss:0.61802
[12]	validation_0-logloss:0.61247
[13]	validation_0-logloss:0.60702
[14]	validation_0-logloss:0.60168
[15]	validation_0-logloss:0.59642
[16]	validation_0-logloss:0.59126
[17]	validation_0-logloss:0.58617
[18]	validation_0-logloss:0.58118
[19]	validation_0-logloss:0.57629
[20]	validation_0-logloss:0.57147
[21]	validation_0-logloss:0.56674
[22]	validation_0-logloss:0.56211
[23]	validation_0-logloss:0.55753
[24]	validation_0-logloss:0.55303
[25]	validation_0-logloss:0.54859
[26]	validation_0-logloss:0.54425
[27]	validation_0-logloss:0.53996
[28]	validation_0-logloss:0.53574
[29]	validation_0-loglos

In [33]:
# Hard class predictions on the validation split, scored with accuracy and
# binary F1.
# NOTE(review): class 0 makes up about 90 % of the training target, so the
# accuracy should be read against that majority baseline.
y_pred = model.predict(x_val)
accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)

(0.9099254631152033, 0.5674430846163969)