In [85]:
import pandas as pd

In [86]:
df = pd.read_csv('STORM_preprocessed_medianfill_1.csv', index_col=0)

**Target 1 : TotalDeaths**

In [97]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

LINEAR_TARGETS = ["TotalDeaths", "NoInjured", "TotalDamageAdjusted(000US$)"]
ATTRIBUTES = ['Year', 'Month', 'MainLandfallLocation', 'OFDAResponse', 'Appeal', 'Declaration', 'LandfallMagnitude(kph)', 'LandfallPressure(mb)']

X = df[ATTRIBUTES]
y = df[LINEAR_TARGETS[0]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**TabNet**

In [67]:
!pip install pytorch-tabnet



In [98]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from pytorch_tabnet.tab_model import TabNetRegressor
import torch

# Initialize TabNet Regressor
reg = TabNetRegressor()

# Train the model
reg.fit(
    X_train.values, y_train.values.reshape(-1, 1),
    eval_set=[(X_test.values, y_test.values.reshape(-1, 1))],
    eval_metric=['rmse'],
    max_epochs=100,
    patience=100,
    batch_size=32,
    virtual_batch_size=64
)



epoch 0  | loss: 275648.84229| val_0_rmse: 597.82883|  0:00:00s
epoch 1  | loss: 274991.08685| val_0_rmse: 603.84134|  0:00:00s
epoch 2  | loss: 269967.74829| val_0_rmse: 769.86386|  0:00:00s
epoch 3  | loss: 273765.76123| val_0_rmse: 658.80863|  0:00:00s
epoch 4  | loss: 267876.98798| val_0_rmse: 624.7017|  0:00:00s
epoch 5  | loss: 271841.46069| val_0_rmse: 608.09086|  0:00:00s
epoch 6  | loss: 270977.59723| val_0_rmse: 611.33247|  0:00:00s
epoch 7  | loss: 60917.63281| val_0_rmse: 610.20857|  0:00:00s
epoch 8  | loss: 269494.67648| val_0_rmse: 612.50961|  0:00:00s
epoch 9  | loss: 268754.3208| val_0_rmse: 604.35402|  0:00:00s
epoch 10 | loss: 59609.57074| val_0_rmse: 601.77483|  0:00:01s
epoch 11 | loss: 262947.07831| val_0_rmse: 590.98906|  0:00:01s
epoch 12 | loss: 258476.49295| val_0_rmse: 600.46178|  0:00:01s
epoch 13 | loss: 262432.14478| val_0_rmse: 595.19218|  0:00:01s
epoch 14 | loss: 256387.64868| val_0_rmse: 595.27197|  0:00:01s
epoch 15 | loss: 263034.09619| val_0_rmse: 5



In [99]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_regression(model, X_test, y_test):
    """Evaluates a regression model's performance.

    Args:
        model (object): The trained regression model.
        X_test (array-like): The input features for testing.
        y_test (array-like): The true labels for testing.

    Returns:
        dict: A dictionary containing evaluation metrics:
              - 'rmse': Root Mean Squared Error (rounded to 2 decimal places)
              - 'mae': Mean Absolute Error (rounded to 2 decimal places)
              - 'r2': R² Score (rounded to 2 decimal places)
    """
    y_pred = model.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return {
        'rmse': round(rmse, 2),
        'mae': round(mae, 2),
        'r2': round(r2, 2)
    }

In [101]:
# Evaluate the regression model
results_1 = evaluate_regression(reg, X_test.values, y_test.values)

# Display the results
print(results_1)

{'rmse': 544.9, 'mae': 142.73, 'r2': 0.16}




Đánh giá : mô hình có độ sai lệch khá đáng kể, chưa giải thích tốt sự biến thiên của dữ liệu

**ResNet**

In [102]:
from tensorflow import keras
from tensorflow.keras import layers

In [103]:
# Define the residual block
def residual_block(x, filters):
    shortcut = x
    x = layers.Dense(filters, activation='relu')(x)
    x = layers.Dense(filters)(x)
    x = layers.add([x, shortcut])
    x = layers.Activation('relu')(x)
    return x

# Build the ResNet model
def build_resnet(input_shape):
    inputs = keras.Input(shape=input_shape)
    x = layers.Dense(64, activation='relu')(inputs)

    # Stack of residual blocks
    for _ in range(3):  # Number of residual blocks
        x = residual_block(x, 64)

    # Output layer for regression
    outputs = layers.Dense(1)(x)
    model_res = keras.Model(inputs, outputs)
    return model

# Create and compile the model
input_shape = (X_train_scaled.shape[1],)  # Number of features
model_res = build_resnet(input_shape)
model_res.compile(optimizer='adam', loss='mean_squared_error')

# Print the model summary
model_res.summary()

# Fit the model
model_res.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - loss: 7234.8271 - val_loss: 203788656.0000
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 9530.7568 - val_loss: 232960464.0000
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 5629.0928 - val_loss: 161450080.0000
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 3707.8176 - val_loss: 145227312.0000
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 5712.4844 - val_loss: 169632064.0000
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 5018.6353 - val_loss: 144603168.0000
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 3046.2612 - val_loss: 125304808.0000
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 3279.

<keras.src.callbacks.history.History at 0x7d2d41c20220>

In [123]:
# Evaluate the regression model
results_2 = evaluate_regression(model_res, X_test, y_test)

# Display the results
print(results_2)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
{'rmse': 5597.71, 'mae': 5559.15, 'r2': -87.43}




sai số quá lớn, không giải thích được sự biến thiên, có thể đang bị overfitting

**Simple NN**

In [105]:
import tensorflow as tf

In [106]:
# Build the Sequential Model
model_simple = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=[X_train_scaled.shape[1]]),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer for regression
])

# Compile the model
model_simple.compile(optimizer='adam', loss='mean_squared_error')

# Print the model summary
model_simple.summary()

# Train the model
model_simple.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - loss: 376461.0312 - val_loss: 428434.5312
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 179966.1719 - val_loss: 747374.5625
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 144014.0938 - val_loss: 1434318.7500
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 208460.7500 - val_loss: 2764322.2500
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 148407.6875 - val_loss: 5090626.0000
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 395843.2500 - val_loss: 9022145.0000
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 269512.0938 - val_loss: 14983600.0000
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 125532.1406

<keras.src.callbacks.history.History at 0x7d2d417c7400>

In [124]:
# Evaluate the regression model
results_3 = evaluate_regression(model_simple, X_test, y_test)

# Display the results
print(results_3)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
{'rmse': 7355.41, 'mae': 7231.63, 'r2': -151.68}




cùng một vấn đề

**LSTM**

In [118]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [119]:
# Reshape the data for LSTM
def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y.iloc[i + time_steps])  # Corresponding y value
    return np.array(Xs), np.array(ys)

TIME_STEPS = 1  # You can change this value based on your needs
X_train_lstm, y_train_lstm = create_dataset(pd.DataFrame(X_train_scaled), pd.Series(y_train), TIME_STEPS)
X_test_lstm, y_test_lstm = create_dataset(pd.DataFrame(X_test_scaled), pd.Series(y_test), TIME_STEPS)

# Reshape input to be [samples, time steps, features]
X_train_lstm = X_train_lstm.reshape((X_train_lstm.shape[0], X_train_lstm.shape[1], X_train_lstm.shape[2]))
X_test_lstm = X_test_lstm.reshape((X_test_lstm.shape[0], X_test_lstm.shape[1], X_test_lstm.shape[2]))

# Build the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1))  # Output layer for regression (adjust this based on the number of targets)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_lstm, y_train_lstm, epochs=100, batch_size=32, validation_data=(X_test_lstm, y_test_lstm))

Epoch 1/100


  super().__init__(**kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 115ms/step - loss: 278259.5625 - val_loss: 386140.3438
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 172526.3281 - val_loss: 386135.7500
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 268747.6562 - val_loss: 386130.6562
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 188537.5625 - val_loss: 386125.5000
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 389057.6875 - val_loss: 386120.1562
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 164200.3125 - val_loss: 386114.8750
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 238250.6719 - val_loss: 386109.7500
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 121464.6562 - val_loss:

In [122]:
# Evaluate the regression model
evaluation_metrics = evaluate_regression(model, X_test_lstm, y_test_lstm)
# Display the results
print(evaluation_metrics)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
{'rmse': 619.36, 'mae': 144.78, 'r2': -0.05}




**Target 2 : NoInjured**

In [125]:
LINEAR_TARGETS = ["TotalDeaths", "NoInjured", "TotalDamageAdjusted(000US$)"]
ATTRIBUTES = ['Year', 'Month', 'MainLandfallLocation', 'OFDAResponse', 'Appeal', 'Declaration', 'LandfallMagnitude(kph)', 'LandfallPressure(mb)']

X = df[ATTRIBUTES]
y = df[LINEAR_TARGETS[1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Qua 4 model đã thử thì nhận thấy TabNet cho ra kết quả tốt nhất tính đến thời điểm hiện tại nên quyết định dùng tương tự cho các target còn lại

In [126]:
# Initialize TabNet Regressor
reg_2 = TabNetRegressor()

# Train the model
reg_2.fit(
    X_train.values, y_train.values.reshape(-1, 1),
    eval_set=[(X_test.values, y_test.values.reshape(-1, 1))],
    eval_metric=['rmse'],
    max_epochs=100,
    patience=100,
    batch_size=32,
    virtual_batch_size=64
)



epoch 0  | loss: 36780.60132| val_0_rmse: 898.77783|  0:00:00s
epoch 1  | loss: 30328.0672| val_0_rmse: 727.93183|  0:00:00s
epoch 2  | loss: 37122.86316| val_0_rmse: 530.11816|  0:00:00s
epoch 3  | loss: 28632.47192| val_0_rmse: 616.61393|  0:00:00s
epoch 4  | loss: 39843.71173| val_0_rmse: 823.05195|  0:00:00s
epoch 5  | loss: 36546.33862| val_0_rmse: 595.58399|  0:00:00s
epoch 6  | loss: 39422.11987| val_0_rmse: 828.90607|  0:00:00s
epoch 7  | loss: 34363.52478| val_0_rmse: 608.85554|  0:00:00s
epoch 8  | loss: 38283.04907| val_0_rmse: 580.49125|  0:00:00s
epoch 9  | loss: 38398.02673| val_0_rmse: 526.7911|  0:00:00s
epoch 10 | loss: 37460.18542| val_0_rmse: 437.28665|  0:00:01s
epoch 11 | loss: 34438.10913| val_0_rmse: 447.37077|  0:00:01s
epoch 12 | loss: 29068.36884| val_0_rmse: 438.05758|  0:00:01s
epoch 13 | loss: 28073.38892| val_0_rmse: 431.91003|  0:00:01s
epoch 14 | loss: 32136.65759| val_0_rmse: 476.99857|  0:00:01s
epoch 15 | loss: 31006.41809| val_0_rmse: 1200.29353|  0:



In [129]:
# Evaluate the regression model
results_5 = evaluate_regression(reg_2, X_test.values, y_test.values)

# Display the results
print(results_5)

{'rmse': 286200.35, 'mae': 123548.4, 'r2': -0.23}




**Target 3 : TotalDamageAdjusted(000US$)**

In [128]:
LINEAR_TARGETS = ["TotalDeaths", "NoInjured", "TotalDamageAdjusted(000US$)"]
ATTRIBUTES = ['Year', 'Month', 'MainLandfallLocation', 'OFDAResponse', 'Appeal', 'Declaration', 'LandfallMagnitude(kph)', 'LandfallPressure(mb)']

X = df[ATTRIBUTES]
y = df[LINEAR_TARGETS[2]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [130]:
# Initialize TabNet Regressor
reg_3 = TabNetRegressor()

# Train the model
reg_3.fit(
    X_train.values, y_train.values.reshape(-1, 1),
    eval_set=[(X_test.values, y_test.values.reshape(-1, 1))],
    eval_metric=['rmse'],
    max_epochs=100,
    patience=100,
    batch_size=32,
    virtual_batch_size=64
)



epoch 0  | loss: 59829126144.0| val_0_rmse: 286109.74289|  0:00:00s
epoch 1  | loss: 59332948992.0| val_0_rmse: 285418.81337|  0:00:00s
epoch 2  | loss: 49645234176.0| val_0_rmse: 285411.34264|  0:00:00s
epoch 3  | loss: 48122540032.0| val_0_rmse: 285986.31638|  0:00:00s
epoch 4  | loss: 58960413696.0| val_0_rmse: 286011.34011|  0:00:00s
epoch 5  | loss: 43847360000.0| val_0_rmse: 286300.15076|  0:00:00s
epoch 6  | loss: 60881985536.0| val_0_rmse: 286145.4567|  0:00:00s
epoch 7  | loss: 53245045248.0| val_0_rmse: 286295.2832|  0:00:00s
epoch 8  | loss: 56217275392.0| val_0_rmse: 286293.32935|  0:00:00s
epoch 9  | loss: 51271887872.0| val_0_rmse: 286259.51593|  0:00:00s
epoch 10 | loss: 53665156096.0| val_0_rmse: 286268.26689|  0:00:01s
epoch 11 | loss: 45710773248.0| val_0_rmse: 286245.17359|  0:00:01s
epoch 12 | loss: 52665954560.0| val_0_rmse: 286273.14794|  0:00:01s
epoch 13 | loss: 56987848192.0| val_0_rmse: 286207.11298|  0:00:01s
epoch 14 | loss: 46395379712.0| val_0_rmse: 286184



In [131]:
# Evaluate the regression model
results_6 = evaluate_regression(reg_3, X_test.values, y_test.values)

# Display the results
print(results_6)

{'rmse': 284683.36, 'mae': 122522.9, 'r2': -0.22}


