# Random Forest: Hyperparameter Tuning

## Packages and Presets

In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import joblib

from sklearn.metrics import (
    f1_score, 
    balanced_accuracy_score,
)

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns 

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier

from tsfresh.transformers.per_column_imputer import PerColumnImputer
from tsfresh.feature_extraction.settings import EfficientFCParameters, ComprehensiveFCParameters
from tsfresh.feature_extraction import extract_features


pd.set_option('display.max_columns', None)
%load_ext blackcellmagic
%load_ext autoreload

import optuna
import warnings

In [2]:
NUM_TRIALS = 200

with open("classical_ml_config.yaml", "r") as file:
    config = yaml.safe_load(file)

## No Feature Engineering

In [3]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [4]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [5]:
smote = SMOTE(random_state=config["general"]["seed"])
X_train_all, y_train_all = smote.fit_resample(X_train_all, y_train_all)

In [6]:
# create objective function for optuna
def objective_no_feat_eng(trial):

    # see: https://medium.com/@ethannabatchian/optimizing-random-forest-models-a-deep-dive-into-hyperparameter-tuning-with-optuna-b8e4fe7f3670
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
    }
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_all, y_train_all)):
        X_train, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
        
        
        rf = RandomForestClassifier(
            random_state=config["general"]["seed"],
            n_jobs=-1,
            **hyperparams
        )
        rf.fit(X_train, y_train)
        
        y_preds = rf.predict(X_val)
        
        score = f1_score(y_val, y_preds)
            
        trial.report(score, fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        f1_scores.append(score)
            
        
    return np.mean(f1_scores)

In [7]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_no_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="rf_no_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-05-09 02:12:42,643] A new study created in memory with name: rf_no_feat_eng


In [8]:
study_no_feat_eng.optimize(
    objective_no_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 20 * 60 * 60, # timeout after 20 hours
    show_progress_bar=True
)

  0%|          | 0/200 [00:00<?, ?it/s]

[I 2024-05-09 02:12:56,251] Trial 0 finished with value: 0.957659803364671 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 0.957659803364671.
[I 2024-05-09 02:13:16,571] Trial 1 finished with value: 0.9273140227649449 and parameters: {'n_estimators': 737, 'max_depth': 10, 'min_samples_split': 32, 'min_samples_leaf': 27, 'max_features': 'sqrt', 'criterion': 'log_loss'}. Best is trial 0 with value: 0.957659803364671.
[I 2024-05-09 02:13:29,205] Trial 2 finished with value: 0.9695713437811596 and parameters: {'n_estimators': 489, 'max_depth': 21, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 'log2', 'criterion': 'entropy'}. Best is trial 2 with value: 0.9695713437811596.
[I 2024-05-09 02:13:46,204] Trial 3 finished with value: 0.9578510929823476 and parameters: {'n_estimators': 563, 'max_depth': 34, 'min_samples_split': 3, 'min_samples_leaf

In [9]:
best_params = study_no_feat_eng.best_params
    
print(best_params)

if "random_forest_no_feat_eng" in config:
    config["random_forest_no_feat_eng"].update(best_params)
else:
    config["random_forest_no_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'n_estimators': 667, 'max_depth': 47, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'criterion': 'entropy'}


In [10]:
p_importance_no_feat_eng = optuna.visualization.plot_param_importances(study_no_feat_eng)
p_importance_no_feat_eng.show()

In [11]:
p_history_no_feat_eng = optuna.visualization.plot_optimization_history(study_no_feat_eng)
p_history_no_feat_eng.show()

## With Feature Engineering

In [12]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [13]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [14]:
smote = SMOTE(random_state=config["general"]["seed"])
X_train_all, y_train_all = smote.fit_resample(X_train_all, y_train_all)

In [15]:
X_train_all.columns = X_train_all.columns.astype(str)

Note that for reasons of time, we will not carry out feature engineering for every validation split separately. Instead, we will use the same feature engineering for all validation splits. This is not ideal, but it is a compromise that we have to make in order to keep the runtime of this notebook within reasonable limits.

In [16]:
X_train_all["id"] = X_train_all.index
X_train_melted =(
    X_train_all
    .melt(id_vars="id", var_name="time", value_name="value")
    .sort_values(by=["id", "time"])
)

# get rid of padding to not ruin the engineered features
# for simplicity, we drop all 0s, as only few "true" 0s are in the data
X_train_melted["value"] = X_train_melted["value"].replace(0, np.nan)
X_train_melted = X_train_melted.dropna()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [17]:
# We will follow this tutorial:
# https://towardsdatascience.com/expanding-your-regression-repertoire-with-regularisation-903d2c9f7b28
# but will use the ComprehensiveFCParameters instead of the EfficientFCParameters
# for feature extraction
X_train_augmented = extract_features(
    X_train_melted,
    column_id="id",
    column_sort="time",
    column_value="value",
    default_fc_parameters=ComprehensiveFCParameters(),
)  

X_train_merged = pd.merge(
    X_train_all, X_train_augmented, left_index=True, right_index=True
)

# assert that no rows were lost
assert X_train_merged.shape[0] == X_train_all.shape[0]
assert X_train_merged.index.equals(X_train_all.index)

Feature Extraction: 100%|██████████| 80/80 [01:23<00:00,  1.04s/it]


In [18]:
scaler = StandardScaler()


# create objective function for optuna
def objective_feat_eng(trial):
    use_smote = trial.suggest_categorical("use_smote", [True, False]),

    # see: https://medium.com/@ethannabatchian/optimizing-random-forest-models-a-deep-dive-into-hyperparameter-tuning-with-optuna-b8e4fe7f3670
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
    }
    
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_merged, y_train_all)):
        X_train, X_val = X_train_merged.iloc[train_idx], X_train_merged.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx] 
        
       
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        
        rf = RandomForestClassifier(
            random_state=config["general"]["seed"],
            n_jobs=-1,
            **hyperparams
        )
        rf.fit(X_train, y_train)
        
        y_preds = rf.predict(X_val)
        
        score = f1_score(y_val, y_preds)
            
        trial.report(score, fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        f1_scores.append(score)
            
        
    return np.mean(score)

In [19]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="rf_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-05-09 03:19:29,633] A new study created in memory with name: rf_feat_eng


In [20]:
warnings.filterwarnings('ignore') #ignore pandas warnings

study_feat_eng.optimize(
    objective_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 20 * 60 * 60, # timeout after 8 hours
    show_progress_bar=True
)

  0%|          | 0/200 [00:00<?, ?it/s]

[I 2024-05-09 03:19:59,949] Trial 0 finished with value: 0.9801833776989056 and parameters: {'use_smote': False, 'n_estimators': 759, 'max_depth': 34, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9801833776989056.
[I 2024-05-09 03:20:18,117] Trial 1 finished with value: 0.9764428739693758 and parameters: {'use_smote': True, 'n_estimators': 291, 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9801833776989056.
[I 2024-05-09 03:20:43,494] Trial 2 finished with value: 0.9692712906057945 and parameters: {'use_smote': False, 'n_estimators': 510, 'max_depth': 42, 'min_samples_split': 8, 'min_samples_leaf': 17, 'max_features': 'sqrt', 'criterion': 'gini'}. Best is trial 0 with value: 0.9801833776989056.
[I 2024-05-09 03:21:20,572] Trial 3 finished with value: 0.9712609970674487 and parameters: {'use_smote': False, 'n_e

In [21]:
best_params = study_feat_eng.best_params
    
print(best_params)

if "random_forest_feat_eng" in config:
    config["random_forest_feat_eng"].update(best_params)
else:
    config["random_forest_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'use_smote': False, 'n_estimators': 903, 'max_depth': 23, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'criterion': 'entropy'}


In [22]:
p_importance_feat_eng = optuna.visualization.plot_param_importances(study_feat_eng)
p_importance_feat_eng.show()

In [23]:
p_history_feat_eng = optuna.visualization.plot_optimization_history(study_feat_eng)
p_history_feat_eng.show()