# data_world.csv

Energia grupowana po regionie i czasie + informacje o regionie + ruchoma średnia. Modele: LSTM, Dense, XGB.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
import xgboost as xgb
warnings.filterwarnings('ignore')

2023-10-13 01:49:37.112634: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the raw earthquake catalogue and preview its first rows.
raw_csv_path = '../../data/data_world.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,1930-12-08T08:01:02.000Z,23.261,120.277,15.0,6.3,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,
1,1930-12-03T18:51:47.000Z,18.233,96.298,10.0,7.4,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,
2,1930-12-02T07:01:30.000Z,25.854,98.356,35.0,6.2,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,
3,1930-11-28T07:32:56.000Z,18.779,-106.767,15.0,6.3,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,
4,1930-11-25T19:02:53.000Z,35.05,139.129,15.0,6.9,mw,,,,,...,2015-05-13T18:52:43.000Z,,,,,,,,,


In [3]:
# Keep only the columns used downstream. `.copy()` detaches the selection from
# the raw frame so that the column assignments in the following cells write to
# an independent DataFrame rather than to a slice of the original.
df = df[["time", "latitude", "longitude", "mag"]].copy()

In [4]:
# Parse the ISO-8601 timestamp strings into datetime values.
parsed_time = pd.to_datetime(df["time"], format="%Y-%m-%dT%H:%M:%S.%fZ")
df["time"] = parsed_time
df

Unnamed: 0,time,latitude,longitude,mag
0,1930-12-08 08:01:02.000,23.2610,120.2770,6.3
1,1930-12-03 18:51:47.000,18.2330,96.2980,7.4
2,1930-12-02 07:01:30.000,25.8540,98.3560,6.2
3,1930-11-28 07:32:56.000,18.7790,-106.7670,6.3
4,1930-11-25 19:02:53.000,35.0500,139.1290,6.9
...,...,...,...,...
797041,2018-09-01 01:14:38.230,-30.4830,-177.9279,4.3
797042,2018-09-01 01:07:59.120,-10.7558,124.3621,4.0
797043,2018-09-01 01:00:13.810,-5.5167,147.1735,4.6
797044,2018-09-01 00:27:11.440,46.8819,155.6566,4.3


In [5]:
# Keep only events strictly after 1973-01-01. `.copy()` makes the filtered
# frame independent, so the new columns below are not written onto a slice of
# the unfiltered frame.
time_cut = dt.datetime(1973, 1, 1)
df = df[df["time"] > time_cut].copy()
# Per-event energy derived from magnitude: 10 ** (1.44 * mag + 5.24).
df["energy"] = 10**(1.44*df["mag"]+5.24)
# Binary target: 1 for events with magnitude above 5, otherwise 0.
df["label"] = np.where(df["mag"] > 5, 1, 0)

In [6]:
# Width of one time bucket, in days (used to integer-divide the day offsets).
time_split = 30

In [7]:
# Bucket event times into consecutive `time_split`-day intervals, counted from
# the earliest event day in the frame.
event_day = df["time"].dt.floor('d')
days_since_first = (event_day - event_day.min()).dt.days
df["time"] = (days_since_first // time_split).astype(int)

In [8]:
# Side length of one spatial grid cell, in the units of the latitude/longitude
# columns (used to integer-divide the coordinates).
geo_split = 10

In [9]:
# Snap coordinates onto a `geo_split`-sized grid and shift the bin indices so
# the smallest bin becomes 0. Subtracting the minimum (instead of adding its
# absolute value) zero-bases the index whatever the sign of the smallest bin;
# for a negative minimum the result is identical to the previous behaviour.
lat_bin = df["latitude"] // geo_split
df["latitude"] = lat_bin - lat_bin.min()

lon_bin = df["longitude"] // geo_split
df["longitude"] = lon_bin - lon_bin.min()

In [10]:
# Inspect the frame after time bucketing and coordinate binning.
df

Unnamed: 0,time,latitude,longitude,mag,energy,label
21801,12,7.0,34.0,4.5,5.248075e+11,0
21802,12,6.0,11.0,4.4,3.767038e+11,0
21803,12,7.0,0.0,5.1,3.837072e+12,1
21804,12,7.0,34.0,4.4,3.767038e+11,0
21805,12,6.0,34.0,4.6,7.311391e+11,0
...,...,...,...,...,...,...
797041,555,5.0,0.0,4.3,2.703958e+11,0
797042,555,7.0,30.0,4.0,1.000000e+11,0
797043,555,8.0,32.0,4.6,7.311391e+11,0
797044,555,13.0,33.0,4.3,2.703958e+11,0


In [11]:
# Sum energy and the per-event labels for every (grid cell, time bucket) pair.
cell_keys = ["latitude", "longitude", "time"]
df_agg = df.groupby(cell_keys, as_index=False).agg(
    energy=("energy", "sum"),
    label=("label", "sum"),
)
df_agg

Unnamed: 0,latitude,longitude,time,energy,label
0,0.0,13.0,268,1.018591e+12,0
1,0.0,22.0,482,3.767038e+11,0
2,0.0,31.0,375,5.248075e+11,0
3,0.0,31.0,473,7.311391e+11,0
4,0.0,33.0,280,2.703958e+11,0
...,...,...,...,...,...
71216,17.0,30.0,487,1.018591e+12,0
71217,17.0,30.0,489,3.767038e+11,0
71218,17.0,30.0,509,7.311391e+11,0
71219,17.0,30.0,538,3.767038e+11,0


In [12]:
# Collapse the summed labels into a 0/1 flag: 1 when the bucket has any positive event.
has_positive_event = df_agg["label"] > 0
df_agg["label"] = has_positive_event.astype(int)

In [13]:
def _add_lag_features(frame, block_size, windows):
    """Return `frame` extended with moving averages of `energy` and lagged copies.

    Columns are appended in this order: `energy_ma_{ws}` for every window, then
    for each lag `idx` in `range(block_size)` the shifted moving averages
    `energy_ma_{ws}_{idx}`, followed (for `idx > 0`) by `energy{idx}`.
    All new columns are built first and attached with a single concat, which
    avoids inserting ~100 columns one by one into the frame.
    """
    energy = frame["energy"]
    moving_avg = {ws: energy.rolling(ws).mean() for ws in windows}
    features = {f"energy_ma_{ws}": ma for ws, ma in moving_avg.items()}
    for idx in range(block_size):
        for ws in windows:
            features[f"energy_ma_{ws}_{idx}"] = moving_avg[ws].shift(idx)
        if idx > 0:
            features[f"energy{idx}"] = energy.shift(idx)
    return pd.concat([frame, pd.DataFrame(features, index=frame.index)], axis=1)


def make_ds(df_agg, block_size, windows=(4, 8, 12, 24, 36), train_frac=0.8):
    """Build per-grid-cell train/validation feature tables.

    For every (latitude, longitude) pair present in `df_agg` the time axis is
    densified (missing buckets filled with 0), the label is shifted so each row
    carries the label of the *next* bucket, the series is split chronologically,
    `energy` is min-max scaled with a scaler fitted on the training part only,
    and moving-average / lag features are computed separately on each part.

    Parameters
    ----------
    df_agg : DataFrame with columns `latitude`, `longitude`, `time`, `energy`, `label`.
    block_size : int, number of lags to generate and padding (in buckets)
        added around each cell's first/last observed bucket.
    windows : iterable of int, rolling-mean window lengths.
    train_frac : float in (0, 1), fraction of each cell's series used for training.

    Returns
    -------
    (df_train, df_val) : concatenation of the per-cell parts. Rows still
        contain NaNs (rolling/shift warm-up, last label); callers drop them.

    Raises
    ------
    ValueError if `train_frac` is not in (0, 1) or no cell yields any data.
    """
    if not 0 < train_frac < 1:
        raise ValueError("train_frac must be strictly between 0 and 1")
    windows = tuple(windows)

    dfs_train, dfs_val = [], []
    last_bucket = df_agg["time"].max()
    # groupby visits only the cells that exist, instead of masking the full
    # frame for every latitude x longitude combination.
    for (lat, lon), cell in df_agg.groupby(["latitude", "longitude"]):
        start = max(cell["time"].min() - block_size, 0)
        end = min(cell["time"].max() + block_size, last_bucket)
        # NOTE: range() excludes `end`, so when `end` is capped at the global
        # last bucket that bucket is not part of the series (kept as in the
        # original implementation).
        series = (
            cell.set_index("time")
            .reindex(range(start, end))
            .fillna(0)
            .rename_axis("time")
            .reset_index()
        )
        # Target for a row is the label of the following time bucket.
        series["label"] = series["label"].shift(-1)
        series["latitude"] = lat
        series["longitude"] = lon

        n = int(train_frac * len(series))
        if n == 0:
            # Nothing to fit the scaler on (series too short) - skip the cell
            # instead of failing inside fit_transform.
            continue
        # Explicit copies: the parts are modified below and must not alias `series`.
        part_train = series.iloc[:n].copy()
        part_val = series.iloc[n:].copy()

        # Fit on train only so validation statistics do not leak into scaling.
        scaler = MinMaxScaler()
        part_train["energy"] = scaler.fit_transform(part_train[["energy"]])
        part_val["energy"] = scaler.transform(part_val[["energy"]])

        dfs_train.append(_add_lag_features(part_train, block_size, windows))
        dfs_val.append(_add_lag_features(part_val, block_size, windows))

    if not dfs_train:
        raise ValueError("df_agg produced no usable grid cells")
    df_final_train = pd.concat(dfs_train)
    df_final_val = pd.concat(dfs_val)
    return df_final_train, df_final_val

In [14]:
# Number of lagged time buckets per sample; also the padding used by make_ds.
block_size = 16
df_train, df_val = make_ds(df_agg, block_size=block_size)

In [15]:
# Remove rows containing NaNs (rolling/shift warm-up rows and rows without a next-bucket label).
df_train = df_train.dropna()
df_val = df_val.dropna()

In [16]:
# Inspect the training table after dropping incomplete rows.
df_train

Unnamed: 0,time,latitude,longitude,energy,label,energy_ma_4,energy_ma_8,energy_ma_12,energy_ma_24,energy_ma_36,...,energy_ma_12_14,energy_ma_24_14,energy_ma_36_14,energy14,energy_ma_4_15,energy_ma_8_15,energy_ma_12_15,energy_ma_24_15,energy_ma_36_15,energy15
50,409,0.0,31.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.027778,...,0.000000,0.041667,0.027778,0.0,0.000000,0.000000,0.000000,0.041667,0.027778,0.0
51,410,0.0,31.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.027778,...,0.000000,0.041667,0.027778,0.0,0.000000,0.000000,0.000000,0.041667,0.027778,0.0
52,411,0.0,31.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.041667,0.027778,0.0,0.000000,0.000000,0.000000,0.041667,0.027778,0.0
53,412,0.0,31.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.041667,0.027778,0.0,0.000000,0.000000,0.000000,0.041667,0.027778,0.0
54,413,0.0,31.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.027778,0.0,0.000000,0.000000,0.000000,0.041667,0.027778,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,488,17.0,23.0,0.000000,0.0,0.000000,0.011464,0.007643,0.009145,0.006097,...,0.010647,0.005324,0.008494,0.0,0.031942,0.015971,0.010647,0.005324,0.008494,0.0
260,489,17.0,23.0,0.000000,0.0,0.000000,0.011464,0.007643,0.009145,0.006097,...,0.010647,0.005324,0.008494,0.0,0.031942,0.015971,0.010647,0.005324,0.008494,0.0
261,490,17.0,23.0,0.178002,0.0,0.044501,0.033714,0.022476,0.016562,0.011041,...,0.010647,0.005324,0.008494,0.0,0.031942,0.015971,0.010647,0.005324,0.008494,0.0
262,491,17.0,23.0,0.000000,0.0,0.044501,0.033714,0.022476,0.016562,0.011041,...,0.010647,0.005324,0.008494,0.0,0.000000,0.015971,0.010647,0.005324,0.008494,0.0


In [17]:
# Select the sequence features in an explicit order so that, after the reshape
# to (samples, block_size, 6), every timestep has the same channel layout:
# [energy, ma_4, ma_8, ma_12, ma_24, ma_36]. Relying on the DataFrame column
# order put `energy` first for lag 0 but last for lags 1..block_size-1, which
# misaligned the channels between timesteps.
# Timestep index equals the lag: timestep 0 is the most recent bucket.
ma_windows = [4, 8, 12, 24, 36]
seq_cols = []
for lag in range(block_size):
    seq_cols.append("energy" if lag == 0 else f"energy{lag}")
    seq_cols.extend(f"energy_ma_{ws}_{lag}" for ws in ma_windows)
x_train = df_train[seq_cols].to_numpy()
y_train = df_train["label"].to_numpy()
x_val = df_val[seq_cols].to_numpy()
y_val = df_val["label"].to_numpy()

In [18]:
# Reshape inputs to (samples, timesteps, features) and targets to column vectors.
seq_shape = (-1, block_size, 6)
x_train, x_val = x_train.reshape(seq_shape), x_val.reshape(seq_shape)
y_train, y_val = y_train.reshape(-1, 1), y_val.reshape(-1, 1)

In [19]:
# Class balance: share of training targets equal to the smallest label value.
class_values, class_counts = np.unique(y_train, return_counts=True)
class_counts[0] / y_train.shape[0]

0.8831725474939169

In [74]:
# Add another column with 1 - label, giving two-column targets for the
# categorical loss: column 0 = label, column 1 = 1 - label.
# Only the first column is read, so re-running this cell rebuilds the same two
# columns instead of growing the array to four.
y_train = np.concatenate((y_train[:, :1], 1 - y_train[:, :1]), axis=1)
y_val = np.concatenate((y_val[:, :1], 1 - y_val[:, :1]), axis=1)

In [75]:
# Two stacked bidirectional LSTMs followed by a dense head; the last layer
# emits 2 raw logits (no activation).
layers = tf.keras.layers
first_lstm = layers.LSTM(512, input_shape=(block_size, 6), return_sequences=True)
second_lstm = layers.LSTM(256)
model = tf.keras.Sequential([
    layers.Bidirectional(first_lstm),
    layers.Bidirectional(second_lstm),
    layers.Dense(256, activation='relu'),
    layers.Dense(2),
])

In [76]:
# Categorical cross-entropy on raw logits, tracked with accuracy and F1.
logit_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
tracked_metrics = ['accuracy', tf.keras.metrics.F1Score()]
model.compile(optimizer='adam', loss=logit_loss, metrics=tracked_metrics)

In [77]:
# Train for 10 epochs with batches of 512, evaluating on the validation split after each epoch.
model.fit(x_train, y_train, epochs=10, batch_size=512, validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7efb4ce37070>

In [78]:
# Rebuild the sequence inputs and 1-D targets from the feature tables.
# Columns are selected in an explicit order so that, after the reshape to
# (samples, block_size, 6), every timestep has the same channel layout:
# [energy, ma_4, ma_8, ma_12, ma_24, ma_36]. The implicit DataFrame order put
# `energy` first for lag 0 but last for lags 1..block_size-1.
# Timestep index equals the lag: timestep 0 is the most recent bucket.
ma_windows = [4, 8, 12, 24, 36]
seq_cols = []
for lag in range(block_size):
    seq_cols.append("energy" if lag == 0 else f"energy{lag}")
    seq_cols.extend(f"energy_ma_{ws}_{lag}" for ws in ma_windows)
x_train = df_train[seq_cols].to_numpy()
y_train = df_train["label"].to_numpy()
x_val = df_val[seq_cols].to_numpy()
y_val = df_val["label"].to_numpy()

In [79]:
# Reshape inputs to (samples, timesteps, features) and targets to column vectors.
seq_shape = (-1, block_size, 6)
x_train, x_val = x_train.reshape(seq_shape), x_val.reshape(seq_shape)
y_train, y_val = y_train.reshape(-1, 1), y_val.reshape(-1, 1)

In [80]:
# Grid-cell coordinates (latitude bin, longitude bin) of every sample, used as a second model input.
pos_cols = ["latitude", "longitude"]
x_pos_train = df_train[pos_cols].to_numpy()
x_pos_val = df_val[pos_cols].to_numpy()

In [81]:
# Add another column with 1 - label, giving two-column targets for the
# categorical loss: column 0 = label, column 1 = 1 - label.
# Only the first column is read, so re-running this cell rebuilds the same two
# columns instead of growing the array to four.
y_train = np.concatenate((y_train[:, :1], 1 - y_train[:, :1]), axis=1)
y_val = np.concatenate((y_val[:, :1], 1 - y_val[:, :1]), axis=1)

In [90]:
class MyModel(tf.keras.Model):
    """Two-layer LSTM classifier whose initial recurrent state is looked up from the sample's location.

    The model takes `(loc, timestep)`:
      * `loc` - two indices per sample (latitude bin, longitude bin); each is
        embedded to `n_embed` values and the pair is flattened to
        `2 * n_embed`, matching the LSTM width.
      * `timestep` - the feature sequence fed to the first LSTM.

    Separate embedding tables produce the hidden state (`loc_emb_h`) and the
    cell state (`loc_emb_c`); both LSTM layers start from the same pair.
    The output layer returns 2 raw logits.

    NOTE(review): latitude and longitude indices share each embedding table, so
    equal bin numbers on the two axes map to the same vector - confirm this is intended.
    """

    def __init__(self, n_embed, vocab_size):
        super().__init__()
        self.n_embed = n_embed
        self.loc_emb_h = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.loc_emb_c = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.lstm1 = tf.keras.layers.LSTM(n_embed * 2, return_sequences=True)
        self.lstm2 = tf.keras.layers.LSTM(n_embed * 2)
        self.dense = tf.keras.layers.Dense(n_embed, activation='relu')
        self.out = tf.keras.layers.Dense(2)

    def call(self, inputs):
        loc, timestep = inputs
        # Embedding lookups take integer indices; make the conversion explicit
        # because the coordinate bins may arrive as floats.
        loc = tf.cast(loc, tf.int32)
        # Flatten the two per-coordinate embeddings to (batch_size, 2 * n_embed).
        h = tf.reshape(self.loc_emb_h(loc), [-1, 2 * self.n_embed])
        c = tf.reshape(self.loc_emb_c(loc), [-1, 2 * self.n_embed])
        # Keras LSTM expects its state as [hidden_state, cell_state]; the
        # previous [c, h] order fed each embedding into the other slot.
        initial_state = [h, c]
        x = self.lstm1(timestep, initial_state=initial_state)
        x = self.lstm2(x, initial_state=initial_state)
        x = self.dense(x)
        return self.out(x)

In [91]:
# n_embed: embedding width per coordinate (the LSTM layers use 2 * n_embed units).
# vocab_size: number of rows in each location embedding table.
# NOTE(review): vocab_size only has to exceed the largest latitude/longitude
# bin index passed to the model; 2000 looks much larger than needed - confirm.
n_embed = 256
vocab_size = 2000
model = MyModel(n_embed, vocab_size)

In [92]:
# Categorical cross-entropy on raw logits, tracked with accuracy and F1.
logit_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
tracked_metrics = ['accuracy', tf.keras.metrics.F1Score()]
model.compile(optimizer='adam', loss=logit_loss, metrics=tracked_metrics)

In [93]:
# Train for 40 epochs with batches of 512; inputs are [location indices, feature sequence].
model.fit([x_pos_train, x_train], y_train, epochs=40, batch_size=512, validation_data=([x_pos_val, x_val], y_val))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x7efb50f88250>

In [None]:
# a single initial_state = [c, h]

In [89]:
# Same training call as above: 40 epochs, batches of 512, inputs [location indices, feature sequence].
# NOTE(review): this cell's execution count (89) is lower than that of the
# model definition cell (90), so its output presumably comes from an earlier
# variant of the model - confirm before relying on it.
model.fit([x_pos_train, x_train], y_train, epochs=40, batch_size=512, validation_data=([x_pos_val, x_val], y_val))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x7efb4e6df820>

In [105]:
# Flat feature matrix for the dense network: the latitude/longitude bins stay
# in alongside the lagged features; targets become column vectors.
non_feature_cols = ["label", "time", "energy_ma_4", "energy_ma_8", "energy_ma_12", "energy_ma_24", "energy_ma_36"]
x_train = df_train.drop(columns=non_feature_cols).to_numpy()
x_val = df_val.drop(columns=non_feature_cols).to_numpy()
y_train = df_train["label"].to_numpy().reshape(-1, 1)
y_val = df_val["label"].to_numpy().reshape(-1, 1)

In [106]:
# Add another column with 1 - label, giving two-column targets for the
# categorical loss: column 0 = label, column 1 = 1 - label.
# Only the first column is read, so re-running this cell rebuilds the same two
# columns instead of growing the array to four.
y_train = np.concatenate((y_train[:, :1], 1 - y_train[:, :1]), axis=1)
y_val = np.concatenate((y_val[:, :1], 1 - y_val[:, :1]), axis=1)

In [107]:
# Fully connected baseline: the first layer width equals the number of input
# features (block_size * 6 lagged values + 2 coordinates); the last layer emits 2 raw logits.
layers = tf.keras.layers
hidden_widths = [block_size * 6 + 2, 128, 256, 128]
model = tf.keras.Sequential(
    [layers.Dense(width, activation='relu') for width in hidden_widths]
    + [layers.Dense(2)]
)

In [108]:
# Categorical cross-entropy on raw logits, tracked with accuracy and F1.
logit_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
tracked_metrics = ['accuracy', tf.keras.metrics.F1Score()]
model.compile(optimizer='adam', loss=logit_loss, metrics=tracked_metrics)

In [109]:
# Train the dense network for 40 epochs with batches of 512, validating after each epoch.
model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x7efb50e66bc0>

In [94]:
# Flat feature matrix and 1-D targets for the gradient-boosted trees.
non_feature_cols = ["label", "time", "energy_ma_4", "energy_ma_8", "energy_ma_12", "energy_ma_24", "energy_ma_36"]
x_train = df_train.drop(columns=non_feature_cols).to_numpy()
x_val = df_val.drop(columns=non_feature_cols).to_numpy()
y_train = df_train["label"].to_numpy()
y_val = df_val["label"].to_numpy()

In [95]:
# Gradient-boosted tree classifier: up to 1000 trees of depth 10, small learning rate, all CPU cores.
xgb_params = dict(n_estimators=1000, max_depth=10, learning_rate=0.01, n_jobs=-1)
model = xgb.XGBClassifier(**xgb_params)

In [96]:
# `eval_metric` and `early_stopping_rounds` are estimator parameters; passing
# them to fit() is deprecated since XGBoost 1.6 and no longer accepted in 2.0.
# Setting them on the estimator keeps the same behaviour: log-loss on the
# validation set, stopping after 10 rounds without improvement.
model.set_params(eval_metric='logloss', early_stopping_rounds=10)
model.fit(x_train, y_train, eval_set=[(x_val, y_val)])

[0]	validation_0-logloss:0.68658
[1]	validation_0-logloss:0.68017
[2]	validation_0-logloss:0.67383
[3]	validation_0-logloss:0.66764
[4]	validation_0-logloss:0.66153
[5]	validation_0-logloss:0.65556
[6]	validation_0-logloss:0.64969
[7]	validation_0-logloss:0.64394
[8]	validation_0-logloss:0.63826
[9]	validation_0-logloss:0.63271
[10]	validation_0-logloss:0.62726
[11]	validation_0-logloss:0.62192
[12]	validation_0-logloss:0.61663
[13]	validation_0-logloss:0.61150
[14]	validation_0-logloss:0.60645
[15]	validation_0-logloss:0.60144
[16]	validation_0-logloss:0.59655
[17]	validation_0-logloss:0.59174
[18]	validation_0-logloss:0.58700
[19]	validation_0-logloss:0.58234
[20]	validation_0-logloss:0.57777
[21]	validation_0-logloss:0.57326
[22]	validation_0-logloss:0.56884
[23]	validation_0-logloss:0.56449
[24]	validation_0-logloss:0.56020
[25]	validation_0-logloss:0.55600
[26]	validation_0-logloss:0.55189
[27]	validation_0-logloss:0.54782
[28]	validation_0-logloss:0.54383
[29]	validation_0-loglos

In [97]:
# Validation accuracy and F1 of the XGBoost predictions.
y_pred = model.predict(x_val)
val_accuracy = accuracy_score(y_val, y_pred)
val_f1 = f1_score(y_val, y_pred)
(val_accuracy, val_f1)

(0.8998212773962938, 0.5549519431675721)