# Gradient Boosting: Hyperparameter Tuning

## Packages and Presets

In [46]:
import pandas as pd
import numpy as np
import yaml
import os
import joblib

from sklearn.metrics import (
    f1_score, 
    balanced_accuracy_score,
)

from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns 

from imblearn.over_sampling import SMOTE

import catboost as cb

from tsfresh.transformers.per_column_imputer import PerColumnImputer
from tsfresh.feature_extraction.settings import EfficientFCParameters, ComprehensiveFCParameters
from tsfresh.feature_extraction import extract_features


pd.set_option('display.max_columns', None)
%load_ext blackcellmagic
%load_ext autoreload

import optuna
import warnings

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
NUM_TRIALS = 2

with open("classical_ml_config.yaml", "r") as file:
    config = yaml.safe_load(file)

## No Feature Engineering

In [27]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [28]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [61]:
# create objective function for optuna
def objective_no_feat_eng(trial):
    use_smote = trial.suggest_categorical("use_smote", [True, False]),

    # see: https://forecastegy.com/posts/catboost-hyperparameter-tuning-guide-with-optuna/
    hyperparams = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.5, log=True),
        "depth": trial.suggest_int("depth", 5, 10),
        "iterations": trial.suggest_int("n_estimators", 100, 1000),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-4, 1e2, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_all, y_train_all)):
        X_train, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
        
        if use_smote:
            smote = SMOTE(random_state=config["general"]["seed"])
            X_train, y_train = smote.fit_resample(X_train, y_train) 
        
        
        boost = cb.CatBoostClassifier(
            task_type="GPU", 
            random_state=config["general"]["seed"],
            **hyperparams
        )
        boost.fit(X_train, y_train, verbose=0)
        
        y_preds = boost.predict(X_val)
        
        score = f1_score(y_val, y_preds)
            
        trial.report(score, fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        f1_scores.append(score)
            
        
    return np.mean(f1_scores)

In [62]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_no_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="boost_no_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-04-29 22:44:28,893] A new study created in memory with name: boost_no_feat_eng


In [63]:
study_no_feat_eng.optimize(
    objective_no_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 8 * 60 * 60, # timeout after 8 hours
    show_progress_bar=True
)

  0%|          | 0/2 [00:00<?, ?it/s]

[I 2024-04-29 22:44:42,143] Trial 0 finished with value: 0.9816911897769195 and parameters: {'use_smote': False, 'learning_rate': 0.05100627805979915, 'depth': 8, 'n_estimators': 240, 'l2_leaf_reg': 0.0008629132190071859, 'min_data_in_leaf': 6}. Best is trial 0 with value: 0.9816911897769195.
[I 2024-04-29 22:46:07,822] Trial 1 finished with value: 0.9764760784173653 and parameters: {'use_smote': True, 'learning_rate': 0.04160439645256607, 'depth': 5, 'n_estimators': 973, 'l2_leaf_reg': 9.877700294007917, 'min_data_in_leaf': 22}. Best is trial 0 with value: 0.9816911897769195.


In [35]:
best_params = study_no_feat_eng.best_params
    
print(best_params)

if "catboost_no_feat_eng" in config:
    config["catboost_no_feat_eng"].update(best_params)
else:
    config["catboost_no_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'use_smote': False, 'learning_rate': 0.05100627805979915, 'depth': 8, 'n_estimators': 240, 'l2_leaf_reg': 0.0008629132190071859, 'min_data_in_leaf': 6}


In [34]:
p_importance_no_feat_eng = optuna.visualization.plot_param_importances(study_no_feat_eng)
p_importance_no_feat_eng.show()

In [36]:
p_history_no_feat_eng = optuna.visualization.plot_optimization_history(study_no_feat_eng)
p_history_no_feat_eng.show()

## With Feature Engineering

In [37]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [38]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [None]:
X_train_all.columns = X_train_all.columns.astype(str)

In [55]:
# create objective function for optuna
def objective_feat_eng(trial):
    use_smote = trial.suggest_categorical("use_smote", [True, False]),

    # see: https://forecastegy.com/posts/catboost-hyperparameter-tuning-guide-with-optuna/
    hyperparams = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.5, log=True),
        "depth": trial.suggest_int("depth", 5, 10),
        "iterations": trial.suggest_int("n_estimators", 100, 1000),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-4, 1e2, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_all, y_train_all)):
        X_train, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
        
        
        if use_smote:
            smote = SMOTE(random_state=config["general"]["seed"])
            X_train, y_train = smote.fit_resample(X_train, y_train) 
        
        
        X_train["id"] = X_train.index
        X_train_melted =(
            X_train
            .melt(id_vars="id", var_name="time", value_name="value")
            .sort_values(by=["id", "time"])
        )
        # get rid of padding to not ruin the engineered features
        # for simplicity, we drop all 0s, as only few "true" 0s are in the data
        X_train_melted["value"] = X_train_melted["value"].replace(0, np.nan)
        X_train_melted = X_train_melted.dropna()
        
        X_val["id"] = X_val.index
        X_val_melted =(
            X_val
            .melt(id_vars="id", var_name="time", value_name="value")
            .sort_values(by=["id", "time"])
        )
        # get rid of padding to not ruin the engineered features
        # for simplicity, we drop all 0s, as only few "true" 0s are in the data
        X_val_melted["value"] = X_val_melted["value"].replace(0, np.nan)
        X_val_melted = X_val_melted.dropna()
        
        
        # We will follow this tutorial:
        # https://towardsdatascience.com/expanding-your-regression-repertoire-with-regularisation-903d2c9f7b28
        # but will use the ComprehensiveFCParameters instead of the EfficientFCParameters
        # for feature extraction
        X_train_augmented = extract_features(
            X_train_melted,
            column_id="id",
            column_sort="time",
            column_value="value",
            default_fc_parameters=ComprehensiveFCParameters(),
        )  
        X_val_augmented = extract_features(
            X_val_melted,
            column_id="id",
            column_sort="time",
            column_value="value",
            default_fc_parameters=ComprehensiveFCParameters(),
        )  
        
        X_train_merged = pd.merge(
            X_train, X_train_augmented, left_index=True, right_index=True
        )
        X_val_merged = pd.merge(
            X_val, X_val_augmented, left_index=True, right_index=True
        )

        # assert that no rows were lost
        assert X_train_merged.shape[0] == X_train.shape[0]
        assert X_val_merged.shape[0] == X_val.shape[0]
        assert X_train_merged.index.equals(X_train.index)
        assert X_val_merged.index.equals(X_val.index)
        
        imputer = PerColumnImputer()
        X_train_merged = imputer.fit_transform(X_train_merged)
        X_val_merged = imputer.transform(X_val_merged)
        
        boost = cb.CatBoostClassifier(
            task_type="GPU", 
            random_state=config["general"]["seed"],
            **hyperparams
        )
        boost.fit(X_train_merged, y_train, verbose=0)
        
        y_preds = boost.predict(X_val_merged)
        
        score = f1_score(y_val, y_preds)
            
        trial.report(score, fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        f1_scores.append(f1_score)
            
        
    return np.mean(score)

In [53]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="boost_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-04-29 22:17:32,165] A new study created in memory with name: boost_feat_eng


In [56]:
warnings.filterwarnings('ignore') #ignore pandas warnings

study_feat_eng.optimize(
    objective_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 10 * 60 * 60, # timeout after 8 hours
    show_progress_bar=True
)

  0%|          | 0/2 [00:00<?, ?it/s]

Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.98it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 11.11it/s]
Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.99it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.97it/s]
Feature Extraction: 100%|██████████| 80/80 [00:42<00:00,  1.88it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.41it/s]
Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.99it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.84it/s]
Feature Extraction: 100%|██████████| 80/80 [00:39<00:00,  2.00it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00,  9.93it/s]


[I 2024-04-29 22:33:59,907] Trial 1 finished with value: 0.9475968992248062 and parameters: {'use_smote': True, 'learning_rate': 0.04160439645256607, 'depth': 5, 'n_estimators': 973, 'l2_leaf_reg': 9.877700294007917, 'min_data_in_leaf': 22}. Best is trial 1 with value: 0.9475968992248062.


Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.96it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.50it/s]
Feature Extraction: 100%|██████████| 80/80 [00:42<00:00,  1.89it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.22it/s]
Feature Extraction: 100%|██████████| 80/80 [00:41<00:00,  1.93it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.42it/s]
Feature Extraction: 100%|██████████| 80/80 [00:43<00:00,  1.85it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.80it/s]
Feature Extraction: 100%|██████████| 80/80 [00:39<00:00,  2.02it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 11.03it/s]


[I 2024-04-29 22:40:00,671] Trial 2 finished with value: 0.9158023184868822 and parameters: {'use_smote': False, 'learning_rate': 0.00133469775741781, 'depth': 8, 'n_estimators': 489, 'l2_leaf_reg': 0.005589524205217926, 'min_data_in_leaf': 62}. Best is trial 1 with value: 0.9475968992248062.


In [57]:
best_params = study_feat_eng.best_params
    
print(best_params)

if "catboost_feat_eng" in config:
    config["catboost_feat_eng"].update(best_params)
else:
    config["catboost_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'use_smote': True, 'learning_rate': 0.04160439645256607, 'depth': 5, 'n_estimators': 973, 'l2_leaf_reg': 9.877700294007917, 'min_data_in_leaf': 22}


In [58]:
p_importance_feat_eng = optuna.visualization.plot_param_importances(study_feat_eng)
p_importance_feat_eng.show()

In [59]:
p_history_feat_eng = optuna.visualization.plot_optimization_history(study_feat_eng)
p_history_feat_eng.show()