# Gradient Boosting: Hyperparameter Tuning

## Packages and Presets

In [1]:
import pandas as pd
import numpy as np
import yaml
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import f1_score

from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns 

from imblearn.over_sampling import SMOTE

from lightgbm import LGBMClassifier

from tsfresh.transformers.per_column_imputer import PerColumnImputer
from tsfresh.feature_extraction.settings import EfficientFCParameters, ComprehensiveFCParameters
from tsfresh.feature_extraction import extract_features


pd.set_option('display.max_columns', None)
%load_ext blackcellmagic
%load_ext autoreload

import optuna
import warnings

In [2]:
NUM_TRIALS = 500

with open("classical_ml_config.yaml", "r") as file:
    config = yaml.safe_load(file)

## No Feature Engineering

In [3]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [4]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [5]:
smote = SMOTE(random_state=config["general"]["seed"])
X_train_all, y_train_all = smote.fit_resample(X_train_all, y_train_all)

In [6]:
# create objective function for optuna
def objective_no_feat_eng(trial):
    
    # see: https://towardsdatascience.com/beginners-guide-to-the-must-know-lightgbm-hyperparameters-a0005a812702
    hyperparams = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("depth", 5, 15),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15)
    }
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_all, y_train_all)):
        X_train, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]

        
        
        boost = LGBMClassifier(
            random_state=config["general"]["seed"],
            verbose=-1,
            n_jobs=-1,
            **hyperparams
        )
        boost.fit(X_train, y_train)
        
        y_preds = boost.predict(X_val)
        
        score = f1_score(y_val, y_preds)
        f1_scores.append(score)
            
        trial.report(np.mean(f1_scores), fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
            
        
    return np.mean(f1_scores)

In [7]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_no_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="boost_no_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-05-10 01:48:25,344] A new study created in memory with name: boost_no_feat_eng


In [8]:
study_no_feat_eng.optimize(
    objective_no_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 10 * 60 * 60, # timeout after 10 hours
    show_progress_bar=True
)

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-05-10 01:48:54,888] Trial 0 finished with value: 0.9727231946106423 and parameters: {'learning_rate': 0.005611516415334507, 'depth': 15, 'num_leaves': 149, 'n_estimators': 639, 'min_data_in_leaf': 24, 'min_gain_to_split': 2.3399178050430396}. Best is trial 0 with value: 0.9727231946106423.
[I 2024-05-10 01:49:22,860] Trial 1 finished with value: 0.9281055158226851 and parameters: {'learning_rate': 0.0013066739238053278, 'depth': 14, 'num_leaves': 124, 'n_estimators': 737, 'min_data_in_leaf': 11, 'min_gain_to_split': 14.548647782429915}. Best is trial 0 with value: 0.9727231946106423.
[I 2024-05-10 01:49:26,018] Trial 2 finished with value: 0.9532628853965488 and parameters: {'learning_rate': 0.04622589001020832, 'depth': 7, 'num_leaves': 44, 'n_estimators': 265, 'min_data_in_leaf': 37, 'min_gain_to_split': 7.871346474483568}. Best is trial 0 with value: 0.9727231946106423.
[I 2024-05-10 01:49:50,549] Trial 3 finished with value: 0.9293390945984374 and parameters: {'learning_rat

In [9]:
best_params = study_no_feat_eng.best_params
    
print(best_params)

if "catboost_no_feat_eng" in config:
    config["catboost_no_feat_eng"].update(best_params)
else:
    config["catboost_no_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'learning_rate': 0.09991472256779813, 'depth': 14, 'num_leaves': 184, 'n_estimators': 308, 'min_data_in_leaf': 80, 'min_gain_to_split': 0.002058283006754594}


In [10]:
p_importance_no_feat_eng = optuna.visualization.plot_param_importances(study_no_feat_eng)
p_importance_no_feat_eng.show()

In [11]:
p_history_no_feat_eng = optuna.visualization.plot_optimization_history(study_no_feat_eng)
p_history_no_feat_eng.show()

## With Feature Engineering

In [12]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [13]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [15]:
X_train_all.columns = X_train_all.columns.astype(str)

Note that for reasons of time, we will not carry out feature engineering for every validation split separately. Instead, we will use the same feature engineering for all validation splits. This is not ideal, but it is a compromise that we have to make in order to keep the runtime of this notebook within reasonable limits.

In [16]:
X_train_all["id"] = X_train_all.index
X_train_melted =(
    X_train_all
    .melt(id_vars="id", var_name="time", value_name="value")
    .sort_values(by=["id", "time"])
)

# get rid of padding to not ruin the engineered features
# for simplicity, we drop all 0s, as only few "true" 0s are in the data
X_train_melted["value"] = X_train_melted["value"].replace(0, np.nan)
X_train_melted = X_train_melted.dropna()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [17]:
# We will follow this tutorial:
# https://towardsdatascience.com/expanding-your-regression-repertoire-with-regularisation-903d2c9f7b28
# but will use the ComprehensiveFCParameters instead of the EfficientFCParameters
# for feature extraction
X_train_augmented = extract_features(
    X_train_melted,
    column_id="id",
    column_sort="time",
    column_value="value",
    default_fc_parameters=EfficientFCParameters(),
)  

X_train_merged = pd.merge(
    X_train_all, X_train_augmented, left_index=True, right_index=True
)

# assert that no rows were lost
assert X_train_merged.shape[0] == X_train_all.shape[0]
assert X_train_merged.index.equals(X_train_all.index)

Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.97it/s]


Impute the newly created missing values:

In [18]:
imputer = PerColumnImputer()
X_train_merged = imputer.fit_transform(X_train_merged)


The columns ['value__fft_coefficient__attr_"real"__coeff_94'
 'value__fft_coefficient__attr_"real"__coeff_95'
 'value__fft_coefficient__attr_"real"__coeff_96'
 'value__fft_coefficient__attr_"real"__coeff_97'
 'value__fft_coefficient__attr_"real"__coeff_98'
 'value__fft_coefficient__attr_"real"__coeff_99'
 'value__fft_coefficient__attr_"imag"__coeff_94'
 'value__fft_coefficient__attr_"imag"__coeff_95'
 'value__fft_coefficient__attr_"imag"__coeff_96'
 'value__fft_coefficient__attr_"imag"__coeff_97'
 'value__fft_coefficient__attr_"imag"__coeff_98'
 'value__fft_coefficient__attr_"imag"__coeff_99'
 'value__fft_coefficient__attr_"abs"__coeff_94'
 'value__fft_coefficient__attr_"abs"__coeff_95'
 'value__fft_coefficient__attr_"abs"__coeff_96'
 'value__fft_coefficient__attr_"abs"__coeff_97'
 'value__fft_coefficient__attr_"abs"__coeff_98'
 'value__fft_coefficient__attr_"abs"__coeff_99'
 'value__fft_coefficient__attr_"angle"__coeff_94'
 'value__fft_coefficient__attr_"angle"__coeff_95'
 'value__ff

Next, we create a Pipeline to standardize and one-hot encode the data. This is necessary since some of the newly created features are categorical or do not lie within the same range as the original features.

In [19]:
cat_cols = X_train_merged.loc[:, X_train_merged.nunique() < 5].columns
numeric_cols = X_train_merged.loc[:, X_train_merged.nunique() >= 5].columns

# preprocessing pipeline for numerical features
num_trans = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# preprocessing pipeline for categorical features
cat_trans = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
])


preprocessor = ColumnTransformer(transformers=[
    ("num", num_trans, numeric_cols),
    ("cat", cat_trans, cat_cols)
])

In [20]:
scaler = StandardScaler()

# create objective function for optuna
def objective_feat_eng(trial):

    # see: https://towardsdatascience.com/beginners-guide-to-the-must-know-lightgbm-hyperparameters-a0005a812702
    hyperparams = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15)
    }
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_merged, y_train_all)):
        X_train, X_val = X_train_merged.iloc[train_idx], X_train_merged.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx] 
        
        smote = SMOTE(random_state=config["general"]["seed"])
        X_train, y_train = smote.fit_resample(X_train, y_train)
        
        X_train = preprocessor.fit_transform(X_train)
        X_val = preprocessor.transform(X_val)       
        
        boost = LGBMClassifier(
            random_state=config["general"]["seed"],
            verbose=-1,
            n_jobs=-1,
            **hyperparams
        )
        boost.fit(X_train, y_train)
        
        y_preds = boost.predict(X_val)
        
        score = f1_score(y_val, y_preds)
        
        f1_scores.append(score)
            
        trial.report(np.mean(f1_scores), fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()            
        
    return np.mean(score)

In [21]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="boost_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-05-10 02:36:10,456] A new study created in memory with name: boost_feat_eng


In [22]:
warnings.filterwarnings('ignore') #ignore pandas warnings

study_feat_eng.optimize(
    objective_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 20 * 60 * 60, # timeout after 20 hours
    show_progress_bar=True
)

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-05-10 02:36:39,618] Trial 0 finished with value: 0.9831510493644694 and parameters: {'learning_rate': 0.005611516415334507, 'depth': 15, 'num_leaves': 149, 'n_estimators': 639, 'min_data_in_leaf': 24, 'min_gain_to_split': 2.3399178050430396}. Best is trial 0 with value: 0.9831510493644694.
[I 2024-05-10 02:37:00,272] Trial 1 finished with value: 0.9619214997070885 and parameters: {'learning_rate': 0.0013066739238053278, 'depth': 14, 'num_leaves': 124, 'n_estimators': 737, 'min_data_in_leaf': 11, 'min_gain_to_split': 14.548647782429915}. Best is trial 0 with value: 0.9831510493644694.
[I 2024-05-10 02:37:05,340] Trial 2 finished with value: 0.9781453041937389 and parameters: {'learning_rate': 0.04622589001020832, 'depth': 7, 'num_leaves': 44, 'n_estimators': 265, 'min_data_in_leaf': 37, 'min_gain_to_split': 7.871346474483568}. Best is trial 0 with value: 0.9831510493644694.
[I 2024-05-10 02:37:14,643] Trial 3 finished with value: 0.96901740926527 and parameters: {'learning_rate'

In [23]:
best_params = study_feat_eng.best_params
    
print(best_params)

if "catboost_feat_eng" in config:
    config["catboost_feat_eng"].update(best_params)
else:
    config["catboost_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'learning_rate': 0.062170719319985725, 'depth': 5, 'num_leaves': 91, 'n_estimators': 868, 'min_data_in_leaf': 15, 'min_gain_to_split': 0.002071608294861996}


In [24]:
p_importance_feat_eng = optuna.visualization.plot_param_importances(study_feat_eng)
p_importance_feat_eng.show()

In [25]:
p_history_feat_eng = optuna.visualization.plot_optimization_history(study_feat_eng)
p_history_feat_eng.show()