In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer

from joblib import dump, load

import pandas as pd
import matplotlib.pyplot as plt

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
from src.const import DATA_HEAD, WEIGHTS_HEAD
ML_dir = WEIGHTS_HEAD / "ML"

data_filename = DATA_HEAD / "STORM_preprocessed_medianfill_1.csv"
base_df = pd.read_csv(str(data_filename), index_col=0)
base_df.head()

Unnamed: 0,ID,EventName,Year,Month,DisasterType,DisasterSubtype,MainLandfallLocation,Flood,Slide,OFDAResponse,Appeal,Declaration,LandfallMagnitude(kph),LandfallPressure(mb),TotalDeaths,NoInjured,TotalDamage(000US$),TotalDamageAdjusted(000US$),CPI
0,STORM_NAN_1953,,1953,9,Storm,Tropical cyclone,8,0,0,0,0,0,92,989,1000,20,19400,211880,9.156133
1,STORM_VIOLET_1964,Violet,1964,9,Storm,Tropical cyclone,4,1,1,0,0,0,92,989,18,20,10000,94354,10.598376
2,STORM_IRIS_1964,Iris,1964,11,Storm,Tropical cyclone,5,1,0,0,0,0,150,960,5100,20,70000,660479,10.598376
3,STORM_JOAN_1964,Joan,1964,11,Storm,Tropical cyclone,5,1,0,0,0,0,130,980,2500,20,15000,141531,10.598376
4,STORM_KATE_1964,Kate,1964,11,Storm,Tropical cyclone,6,1,0,0,0,0,130,970,0,20,3000,28306,10.598376


In [3]:
base_df['MainLandfallLocation'] = base_df['MainLandfallLocation'].astype(int)

# Featue importance

target: death, injure, total dmg adjust . metric: mse, terminate the large error in predict dmg, phu hop voi gbms


| **Mô hình**               | **Ưu điểm**                            | **Nhược điểm**                          | **Khi nào dùng?**                       |
|---------------------------|----------------------------------------|-----------------------------------------|----------------------------------------|
| XGBoost                   | Hiệu quả cao, chống overfitting        | Tốn tài nguyên, huấn luyện chậm         | Dữ liệu lớn, nhiều tính năng                   |
| LightGBM                  | Nhanh, hiệu quả trên dữ liệu lớn       | Thiếu chính xác với dữ liệu nhỏ         | Dữ liệu lớn, cần tốc độ        |
| CatBoost                  | Tốt cho dữ liệu phân loại              | Cần nhiều tài nguyên                    | Khi có nhiều tính năng phân loại                       |
| AdaBoost                  | Đơn giản, dễ triển khai                | Hiệu suất kém hơn các GBM khác          | Dữ liệu nhỏ, ít biến động                       |
| Decision Trees (GBDT)     | Dự đoán chính xác với dữ liệu phi tuyến| Chậm khi số lượng cây lớn               | Dữ liệu phi tuyến, phức tạp                     |
| NGBoost                   | Mô hình hóa phân phối xác suất         | Phức tạp hơn, ít phổ biến               | Khi cần dự đoán phân phối xác suất                   |

**Model selection**
1. `XGBoost`
   - XGBoost is highly effective for data that contains both linear and non-linear relationships. 
   - It is suitable for datasets with many features, where overfitting might be a concern. XGBoost provides robust feature importance, making it ideal for understanding key factors in complex datasets.
2. `LightGBM`
   - LightGBM is a fast and efficient model that is particularly effective on datasets with a large number of features. 
   - It is suitable when quick training is required without sacrificing performance, making it a strong option when speed is prioritized and feature importance needs to be calculated quickly.
3. `CatBoost`
   - CatBoost is designed to handle datasets with many categorical variables without the need for extensive preprocessing, such as one-hot encoding. 
   - It is particularly useful when working with datasets that contain mixed feature types (numerical and categorical) and when minimal data preprocessing is desired, while still delivering accurate feature importance.
4. `Decision Trees (GBDT)`
   - Decision Trees (GBDT) are effective at capturing complex, non-linear patterns in data.
   - It is suitable for datasets with non-linear relationships and complex interactions between features, especially when the goal is to model accurate predictions while identifying the most important features contributing to those predictions. 

## One hot encoding ( Not use yet)

In [24]:
x_label = ['North_East', 'North_West', 'Red_River_Delta', 'North_Central_Coast', 'South_Central_Coast', 'Central_Highlands', 'South_East', 'Mekong_River_Delta']
map_dict = {i: name for i, name in enumerate(x_label, start=1)}

base_df['MainLandfallLocation'] = base_df['MainLandfallLocation'].map(map_dict)

## XGBOOST

### PREDICT LINEAR_TARGET

In [20]:
from src.const import LINEAR_TARGETS, ATTRIBUTES, CATEGORICAL_TARGETS 

predictors = ATTRIBUTES + CATEGORICAL_TARGETS 

# Exclude highly correlated variables like TotalDamage and focus on TotalDamageAdjusted
target = LINEAR_TARGETS

X = base_df[predictors]
y = base_df[target]

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor(random_state=42)

In [22]:
param_grid = {
    'n_estimators': [100, 200, 300],  # Increase number of trees
    'learning_rate': [0.01, 0.05, 0.1],  # Try smaller learning rates for smoother learning
    'max_depth': [3, 5, 7],  # Test deeper trees for better performance with complex relationships
    'subsample': [0.7, 0.8, 1.0],  # Introduce randomness to reduce overfitting
    'colsample_bytree': [0.7, 0.8, 1.0],  # Randomly sample columns to increase model robustness
    'min_child_weight': [1, 3, 5],  # Prevent splitting with too few samples
    'gamma': [0, 0.1, 0.3],  # Regularization to reduce overly complex trees
    'reg_alpha': [0, 0.1, 0.5],  # L1 regularization for feature selection
    'reg_lambda': [1, 1.5, 2]  # L2 regularization to prevent overfitting
}



# Set up cross-validation with grid search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

# Step 4: Train the model using cross-validation to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 5: Use the best found model to make predictions
best_model = grid_search.best_estimator_

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Step 6: Evaluate the model using MSE for both training and testing sets
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

# Step 7: Analyze feature importance from the best model
feature_importance = best_model.feature_importances_

# Create a DataFrame to display feature importance alongside their names
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# Output the results: MSE and feature importance
mse_train, mse_test, grid_search.best_params_, importance_df.head(10)

Fitting 3 folds for each of 512 candidates, totalling 1536 fits


(2196904077.038385,
 12995833700.612913,
 {'colsample_bytree': 0.8,
  'gamma': 0.1,
  'learning_rate': 0.05,
  'max_depth': 3,
  'min_child_weight': 3,
  'n_estimators': 200,
  'reg_alpha': 0.1,
  'reg_lambda': 2,
  'subsample': 0.7},
                   Feature  Importance
 4                  Appeal    0.199368
 6  LandfallMagnitude(kph)    0.143474
 9                   Slide    0.131806
 0                    Year    0.115112
 2    MainLandfallLocation    0.099156
 3            OFDAResponse    0.094092
 7    LandfallPressure(mb)    0.092087
 1                   Month    0.080761
 8                   Flood    0.044143
 5             Declaration    0.000000)

### PREDICT CATEGORICAL_TARGETS 

In [25]:
from src.const import ATTRIBUTES, CATEGORICAL_TARGETS 

predictors = ATTRIBUTES 

# Exclude highly correlated variables like TotalDamage and focus on TotalDamageAdjusted
target = CATEGORICAL_TARGETS

X = base_df[predictors]
y = base_df[target]

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],  # Increase number of trees
    'learning_rate': [0.01, 0.05, 0.1],  # Try smaller learning rates for smoother learning
    'max_depth': [3, 5, 7],  # Test deeper trees for better performance with complex relationships
    'subsample': [0.7, 0.8, 1.0],  # Introduce randomness to reduce overfitting
    'colsample_bytree': [0.7, 0.8, 1.0],  # Randomly sample columns to increase model robustness
    'min_child_weight': [1, 3, 5],  # Prevent splitting with too few samples
    'gamma': [0, 0.1, 0.3],  # Regularization to reduce overly complex trees
    'reg_alpha': [0, 0.1, 0.5],  # L1 regularization for feature selection
    'reg_lambda': [1, 1.5, 2]  # L2 regularization to prevent overfitting
}


# Set up cross-validation with grid search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

# Step 4: Train the model using cross-validation to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 5: Use the best found model to make predictions
best_model = grid_search.best_estimator_

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Step 6: Evaluate the model using MSE for both training and testing sets
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

# Step 7: Analyze feature importance from the best model
feature_importance = best_model.feature_importances_

# Create a DataFrame to display feature importance alongside their names
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# Output the results: MSE and feature importance
mse_train, mse_test, grid_search.best_params_, importance_df.head(10)

Fitting 3 folds for each of 19683 candidates, totalling 59049 fits


## LightGBM 

In [None]:
param_grid_lgb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [15, 31, 50],
    'max_depth': [-1, 10, 20],
    'feature_fraction': [0.6, 0.8, 1.0],
    'bagging_fraction': [0.6, 0.8, 1.0],
    'bagging_freq': [3, 5, 10]
}

### PREDITC LINEAR_TARGET

In [16]:
from src.const import LINEAR_TARGETS, ATTRIBUTES

predictors = ATTRIBUTES + LINEAR_TARGETS

# Exclude highly correlated variables like TotalDamage and focus on TotalDamageAdjusted
target = LINEAR_TARGETS

X = base_df[predictors]
y = base_df[target]

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

AttributeError: module 'pandas.core.strings' has no attribute 'StringMethods'

In [12]:
import lightgbm as lgb
from src.const import LINEAR_TARGETS, ATTRIBUTES

predictors = ATTRIBUTES + CATEGORICAL_TARGETS 
mse_results = {}

for target in (LINEAR_TARGETS + CATEGORICAL_TARGETS):
    X = base_df[predictors]
    y = base_df[target]  

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


AttributeError: module 'pandas.core.strings' has no attribute 'StringMethods'

In [29]:
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='mse',
    boosting_type='gbdt',
    verbose=-1
)

# Create an MSE scoring function for GridSearch
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=5,  # 5-fold cross-validation
    verbose=1
)

# Train the model with grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Train the final model using the best parameters
best_model = lgb.LGBMRegressor(**best_params)

# Fit the model on the training data
best_model.fit(X_train, y_train)

# Make predictions
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: MainLandfallLocation: object

In [14]:
# Feature importance
feature_importance = bst.feature_importance()
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# Display the top 10 features
importance_df.head(10)

Unnamed: 0,Feature,Importance
4,Year,72
1,LandfallPressure(mb),42
5,Month,42
0,LandfallMagnitude(kph),36
3,Slide,30
12,DisasterSubtype_Tropical cyclone,7
2,Flood,0
6,DisasterSubtype_Hail,0
7,DisasterSubtype_Lightning/Thunderstorms,0
8,DisasterSubtype_Severe weather,0


### PREDICT CATEGORICAL_TARGET

In [None]:
from src.const import ATTRIBUTES, CATEGORICAL_TARGETS 

predictors = ATTRIBUTES + CATEGORICAL_TARGETS 
mse_results = {}

for target in (LINEAR_TARGETS + CATEGORICAL_TARGETS):
    X = base_df[predictors]
    y = base_df[target]  

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='mse',
    boosting_type='gbdt',
    verbose=-1
)

# Create an MSE scoring function for GridSearch
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=5,  # 5-fold cross-validation
    verbose=1
)

# Train the model with grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Train the final model using the best parameters
best_model = lgb.LGBMRegressor(**best_params)

# Fit the model on the training data
best_model.fit(X_train, y_train)

# Make predictions
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

In [None]:
# Feature importance
feature_importance = bst.feature_importance()
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# Display the top 10 features
importance_df.head(10)

## CatBoost

In [18]:
# Step 2: Set up parameter grid
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 5, 7],
    'l2_leaf_reg': [1, 3, 5],
    'border_count': [32, 64, 128],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bylevel': [0.7, 0.8, 1.0]
}

In [17]:
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor(silent=True)

### PREDICT LINEAR_TARGET

In [19]:
grid_search = GridSearchCV(estimator=cat_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test)

feature_importance = best_model.get_feature_importance()

importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

mse_train, mse_test, grid_search.best_params_, importance_df.head(10)

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


6561 fits failed out of a total of 6561.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6561 fits failed with the following error:
Traceback (most recent call last):
  File "d:\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\anaconda3\lib\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
  File "d:\anaconda3\lib\site-packages\catboost\core.py", line 2410, in _fit
    self._train(
  File "d:\anaconda3\lib\site-packages\catboost\core.py", line 1790, in _train
    self._object._train(train_pool,

CatBoostError: catboost/private/libs/target/data_providers.cpp:639: Currently only multi-regression, multilabel and survival objectives work with multidimensional target

### PREDICT CATEGORICAL_TARGET

## NGBoost

In [None]:
from ngboost import NGBRegressor
ngb_model = NGBRegressor()  

param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting iterations
    'learning_rate': [0.01, 0.05, 0.1],  # Learning rate for each boosting step
    'minibatch_frac': [0.7, 0.8, 1.0],  # Fraction of data to use for each boosting step
    'max_depth': [3, 5],  # Maximum depth of base learners (trees)
    'colsample_bytree': [0.7, 0.8, 1.0],  # Fraction of features to be used in each boosting round
    'natural_gradient': [True, False]  # Whether to use natural gradients (default is True)
}


In [None]:
grid_search = GridSearchCV(estimator=ngb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test)

mse_train, mse_test, grid_search.best_params_