# Random Forest: Hyperparameter Tuning

## Packages and Presets

In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import joblib

from sklearn.metrics import (
    f1_score, 
    balanced_accuracy_score,
)

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns 

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier

from tsfresh.transformers.per_column_imputer import PerColumnImputer
from tsfresh.feature_extraction.settings import EfficientFCParameters, ComprehensiveFCParameters
from tsfresh.feature_extraction import extract_features

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

pd.set_option('display.max_columns', None)
%load_ext blackcellmagic
%load_ext autoreload

import optuna
import warnings

In [2]:
NUM_TRIALS = 500

with open("classical_ml_config.yaml", "r") as file:
    config = yaml.safe_load(file)

## No Feature Engineering

In [3]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [4]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [5]:
# create objective function for optuna
def objective_no_feat_eng(trial):

    # see: https://medium.com/@ethannabatchian/optimizing-random-forest-models-a-deep-dive-into-hyperparameter-tuning-with-optuna-b8e4fe7f3670
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_all, y_train_all)):
        X_train, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
        
        smote = SMOTE(random_state=config["general"]["seed"])
        X_train, y_train = smote.fit_resample(X_train, y_train)
        
        rf = RandomForestClassifier(
            random_state=config["general"]["seed"],
            n_jobs=-1,
            **hyperparams
        )
        rf.fit(X_train, y_train)
        
        y_preds = rf.predict(X_val)
        
        score = f1_score(y_val, y_preds)
        f1_scores.append(score)
            
        trial.report(np.mean(f1_scores), fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
                    
        
    return np.mean(f1_scores)

In [6]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_no_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="rf_no_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-05-13 02:47:37,097] A new study created in memory with name: rf_no_feat_eng


In [7]:
study_no_feat_eng.optimize(
    objective_no_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 20 * 60 * 60, # timeout after 20 hours
    show_progress_bar=True
)

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-05-13 02:47:43,491] Trial 0 finished with value: 0.9569141022666043 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20, 'criterion': 'gini', 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.9569141022666043.
[I 2024-05-13 02:48:00,785] Trial 1 finished with value: 0.9745820069295019 and parameters: {'n_estimators': 973, 'max_depth': 44, 'min_samples_split': 8, 'min_samples_leaf': 6, 'criterion': 'log_loss', 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 1 with value: 0.9745820069295019.
[I 2024-05-13 02:48:04,956] Trial 2 finished with value: 0.9517243808293319 and parameters: {'n_estimators': 363, 'max_depth': 25, 'min_samples_split': 16, 'min_samples_leaf': 26, 'criterion': 'log_loss', 'max_features': 'log2', 'bootstrap': True}. Best is trial 1 with value: 0.9745820069295019.
[I 2024-05-13 02:48:19,788] Trial 3 finished with value: 0.972773930680168 and parameters: {'n_estimators': 954, 'm

In [8]:
best_params = study_no_feat_eng.best_params
    
print(best_params)

if "random_forest_no_feat_eng" in config:
    config["random_forest_no_feat_eng"].update(best_params)
else:
    config["random_forest_no_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'n_estimators': 402, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'log_loss', 'max_features': 'sqrt', 'bootstrap': False}


In [9]:
p_importance_no_feat_eng = optuna.visualization.plot_param_importances(study_no_feat_eng)
p_importance_no_feat_eng.show()

In [10]:
p_history_no_feat_eng = optuna.visualization.plot_optimization_history(study_no_feat_eng)
p_history_no_feat_eng.show()

## With Feature Engineering

In [11]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [12]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [13]:
X_train_all.columns = X_train_all.columns.astype(str)

Note that for reasons of time, we will not carry out feature engineering for every validation split separately. Instead, we will use the same feature engineering for all validation splits. This is not ideal, but it is a compromise that we have to make in order to keep the runtime of this notebook within reasonable limits.

In [14]:
X_train_all["id"] = X_train_all.index
X_train_melted =(
    X_train_all
    .melt(id_vars="id", var_name="time", value_name="value")
    .sort_values(by=["id", "time"])
)

# get rid of padding to not ruin the engineered features
# for simplicity, we drop all 0s, as only few "true" 0s are in the data
X_train_melted["value"] = X_train_melted["value"].replace(0, np.nan)
X_train_melted = X_train_melted.dropna()

In [15]:
# We will follow this tutorial:
# https://towardsdatascience.com/expanding-your-regression-repertoire-with-regularisation-903d2c9f7b28
# and will use the EfficientFCParameters
# for feature extraction
X_train_augmented = extract_features(
    X_train_melted,
    column_id="id",
    column_sort="time",
    column_value="value",
    default_fc_parameters=EfficientFCParameters(),
)  

X_train_merged = pd.merge(
    X_train_all, X_train_augmented, left_index=True, right_index=True
)

# assert that no rows were lost
assert X_train_merged.shape[0] == X_train_all.shape[0]
assert X_train_merged.index.equals(X_train_all.index)

Feature Extraction: 100%|██████████| 80/80 [00:27<00:00,  2.86it/s]


In [16]:
imputer = PerColumnImputer()
X_train_merged = imputer.fit_transform(X_train_merged)


The columns ['value__fft_coefficient__attr_"real"__coeff_93'
 'value__fft_coefficient__attr_"real"__coeff_94'
 'value__fft_coefficient__attr_"real"__coeff_95'
 'value__fft_coefficient__attr_"real"__coeff_96'
 'value__fft_coefficient__attr_"real"__coeff_97'
 'value__fft_coefficient__attr_"real"__coeff_98'
 'value__fft_coefficient__attr_"real"__coeff_99'
 'value__fft_coefficient__attr_"imag"__coeff_93'
 'value__fft_coefficient__attr_"imag"__coeff_94'
 'value__fft_coefficient__attr_"imag"__coeff_95'
 'value__fft_coefficient__attr_"imag"__coeff_96'
 'value__fft_coefficient__attr_"imag"__coeff_97'
 'value__fft_coefficient__attr_"imag"__coeff_98'
 'value__fft_coefficient__attr_"imag"__coeff_99'
 'value__fft_coefficient__attr_"abs"__coeff_93'
 'value__fft_coefficient__attr_"abs"__coeff_94'
 'value__fft_coefficient__attr_"abs"__coeff_95'
 'value__fft_coefficient__attr_"abs"__coeff_96'
 'value__fft_coefficient__attr_"abs"__coeff_97'
 'value__fft_coefficient__attr_"abs"__coeff_98'
 'value__fft_

Next, we create a Pipeline to standardize and one-hot encode the data. This is necessary since some of the newly created features are categorical or do not lie within the same range as the original features.

In [17]:
cat_cols = X_train_merged.loc[:, X_train_merged.nunique() < 5].columns
numeric_cols = X_train_merged.loc[:, X_train_merged.nunique() >= 5].columns

# preprocessing pipeline for numerical features
num_trans = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# preprocessing pipeline for categorical features
cat_trans = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
])


preprocessor = ColumnTransformer(transformers=[
    ("num", num_trans, numeric_cols),
    ("cat", cat_trans, cat_cols)
])

In [18]:
# create objective function for optuna
def objective_feat_eng(trial):
    
    # see: https://medium.com/@ethannabatchian/optimizing-random-forest-models-a-deep-dive-into-hyperparameter-tuning-with-optuna-b8e4fe7f3670
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }
    
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_merged, y_train_all)):
        X_train, X_val = X_train_merged.iloc[train_idx], X_train_merged.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx] 
        
        smote = SMOTE(random_state=config["general"]["seed"])
        X_train, y_train = smote.fit_resample(X_train, y_train) 
        
        X_train = preprocessor.fit_transform(X_train)
        X_val = preprocessor.transform(X_val)
        
        rf = RandomForestClassifier(
            random_state=config["general"]["seed"],
            n_jobs=-1,
            **hyperparams
        )
        rf.fit(X_train, y_train)
        
        y_preds = rf.predict(X_val)
        
        score = f1_score(y_val, y_preds)
        f1_scores.append(score)
            
        trial.report(np.mean(f1_scores), fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
            
        
    return np.mean(score)

In [19]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="rf_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-05-13 03:51:42,654] A new study created in memory with name: rf_feat_eng


In [20]:
warnings.filterwarnings('ignore') #ignore pandas warnings

study_feat_eng.optimize(
    objective_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 20 * 60 * 60, # timeout after 8 hours
    show_progress_bar=True
)

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-05-13 03:51:56,556] Trial 0 finished with value: 0.9636904761904762 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20, 'criterion': 'gini', 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.9636904761904762.
[I 2024-05-13 03:52:26,324] Trial 1 finished with value: 0.9735512630014859 and parameters: {'n_estimators': 973, 'max_depth': 44, 'min_samples_split': 8, 'min_samples_leaf': 6, 'criterion': 'log_loss', 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 1 with value: 0.9735512630014859.
[I 2024-05-13 03:52:32,089] Trial 2 finished with value: 0.89125 and parameters: {'n_estimators': 363, 'max_depth': 25, 'min_samples_split': 16, 'min_samples_leaf': 26, 'criterion': 'log_loss', 'max_features': 'log2', 'bootstrap': True}. Best is trial 1 with value: 0.9735512630014859.
[I 2024-05-13 03:52:41,765] Trial 3 finished with value: 0.9119203236850296 and parameters: {'n_estimators': 954, 'max_depth':

In [21]:
best_params = study_feat_eng.best_params
    
print(best_params)

if "random_forest_feat_eng" in config:
    config["random_forest_feat_eng"].update(best_params)
else:
    config["random_forest_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'n_estimators': 998, 'max_depth': 25, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'entropy', 'max_features': 'sqrt', 'bootstrap': False}


In [22]:
p_importance_feat_eng = optuna.visualization.plot_param_importances(study_feat_eng)
p_importance_feat_eng.show()

In [23]:
p_history_feat_eng = optuna.visualization.plot_optimization_history(study_feat_eng)
p_history_feat_eng.show()