# Random Forest: Hyperparameter Tuning

## Packages and Presets

In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import joblib

from sklearn.metrics import (
    f1_score, 
    balanced_accuracy_score,
)

from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns 

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier

from tsfresh.transformers.per_column_imputer import PerColumnImputer
from tsfresh.feature_extraction.settings import EfficientFCParameters, ComprehensiveFCParameters
from tsfresh.feature_extraction import extract_features


pd.set_option('display.max_columns', None)
%load_ext blackcellmagic
%load_ext autoreload

import optuna
import warnings

In [2]:
NUM_TRIALS = 200

with open("classical_ml_config.yaml", "r") as file:
    config = yaml.safe_load(file)

## No Feature Engineering

In [3]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [4]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [5]:
# create objective function for optuna
def objective_no_feat_eng(trial):
    use_smote = trial.suggest_categorical("use_smote", [True, False]),

    # see: https://medium.com/@ethannabatchian/optimizing-random-forest-models-a-deep-dive-into-hyperparameter-tuning-with-optuna-b8e4fe7f3670
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
    }
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_all, y_train_all)):
        X_train, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
        
        if use_smote:
            smote = SMOTE(random_state=config["general"]["seed"])
            X_train, y_train = smote.fit_resample(X_train, y_train) 
        
        
        rf = RandomForestClassifier(
            random_state=config["general"]["seed"],
            n_jobs=-1,
            **hyperparams
        )
        rf.fit(X_train, y_train)
        
        y_preds = rf.predict(X_val)
        
        score = f1_score(y_val, y_preds)
            
        trial.report(score, fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        f1_scores.append(score)
            
        
    return np.mean(f1_scores)

In [6]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_no_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="rf_no_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-04-29 23:13:23,069] A new study created in memory with name: rf_no_feat_eng


In [7]:
study_no_feat_eng.optimize(
    objective_no_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 20 * 60 * 60, # timeout after 20 hours
    show_progress_bar=True
)

  0%|          | 0/2 [00:00<?, ?it/s]

[I 2024-04-29 23:13:31,524] Trial 0 finished with value: 0.9739029780153174 and parameters: {'use_smote': False, 'n_estimators': 759, 'max_depth': 34, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9739029780153174.
[I 2024-04-29 23:13:36,761] Trial 1 finished with value: 0.9680749280411456 and parameters: {'use_smote': True, 'n_estimators': 291, 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9739029780153174.


In [8]:
best_params = study_no_feat_eng.best_params
    
print(best_params)

if "random_forest_no_feat_eng" in config:
    config["random_forest_no_feat_eng"].update(best_params)
else:
    config["random_forest_no_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'use_smote': False, 'n_estimators': 759, 'max_depth': 34, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2', 'criterion': 'entropy'}


In [9]:
p_importance_no_feat_eng = optuna.visualization.plot_param_importances(study_no_feat_eng)
p_importance_no_feat_eng.show()

In [10]:
p_history_no_feat_eng = optuna.visualization.plot_optimization_history(study_no_feat_eng)
p_history_no_feat_eng.show()

## With Feature Engineering

In [11]:
# Load data
train_df = pd.read_csv(config["paths"]["ptb_train"], header=None)

In [12]:
X_train_all = train_df.iloc[:, :-1]
y_train_all = train_df.iloc[:, -1]

In [13]:
X_train_all.columns = X_train_all.columns.astype(str)

In [14]:
# create objective function for optuna
def objective_feat_eng(trial):
    use_smote = trial.suggest_categorical("use_smote", [True, False]),

    # see: https://medium.com/@ethannabatchian/optimizing-random-forest-models-a-deep-dive-into-hyperparameter-tuning-with-optuna-b8e4fe7f3670
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
    }
    
    
    
    f1_scores = []
    
    skf = StratifiedKFold(n_splits=5, random_state=config["general"]["seed"], shuffle=True)
    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_all, y_train_all)):
        X_train, X_val = X_train_all.iloc[train_idx], X_train_all.iloc[val_idx]
        y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
        
        if use_smote:
            smote = SMOTE(random_state=config["general"]["seed"])
            X_train, y_train = smote.fit_resample(X_train, y_train) 
        
        
        X_train["id"] = X_train.index
        X_train_melted =(
            X_train
            .melt(id_vars="id", var_name="time", value_name="value")
            .sort_values(by=["id", "time"])
        )
        # get rid of padding to not ruin the engineered features
        # for simplicity, we drop all 0s, as only few "true" 0s are in the data
        X_train_melted["value"] = X_train_melted["value"].replace(0, np.nan)
        X_train_melted = X_train_melted.dropna()
        
        X_val["id"] = X_val.index
        X_val_melted =(
            X_val
            .melt(id_vars="id", var_name="time", value_name="value")
            .sort_values(by=["id", "time"])
        )
        # get rid of padding to not ruin the engineered features
        # for simplicity, we drop all 0s, as only few "true" 0s are in the data
        X_val_melted["value"] = X_val_melted["value"].replace(0, np.nan)
        X_val_melted = X_val_melted.dropna()
        
        
        # We will follow this tutorial:
        # https://towardsdatascience.com/expanding-your-regression-repertoire-with-regularisation-903d2c9f7b28
        # but will use the ComprehensiveFCParameters instead of the EfficientFCParameters
        # for feature extraction
        X_train_augmented = extract_features(
            X_train_melted,
            column_id="id",
            column_sort="time",
            column_value="value",
            default_fc_parameters=ComprehensiveFCParameters(),
        )  
        X_val_augmented = extract_features(
            X_val_melted,
            column_id="id",
            column_sort="time",
            column_value="value",
            default_fc_parameters=ComprehensiveFCParameters(),
        )  
        
        X_train_merged = pd.merge(
            X_train, X_train_augmented, left_index=True, right_index=True
        )
        X_val_merged = pd.merge(
            X_val, X_val_augmented, left_index=True, right_index=True
        )

        # assert that no rows were lost
        assert X_train_merged.shape[0] == X_train.shape[0]
        assert X_val_merged.shape[0] == X_val.shape[0]
        assert X_train_merged.index.equals(X_train.index)
        assert X_val_merged.index.equals(X_val.index)
        
        imputer = PerColumnImputer()
        X_train_merged = imputer.fit_transform(X_train_merged)
        X_val_merged = imputer.transform(X_val_merged)
        
        rf = RandomForestClassifier(
            random_state=config["general"]["seed"],
            n_jobs=-1,
            **hyperparams
        )
        rf.fit(X_train_merged, y_train)
        
        y_preds = rf.predict(X_val_merged)
        
        score = f1_score(y_val, y_preds)
            
        trial.report(score, fold_num)
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        f1_scores.append(score)
            
        
    return np.mean(score)

In [15]:
# prune bad trials 
pruner = optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=2)

study_feat_eng = optuna.create_study(
    direction="maximize",
    study_name="rf_feat_eng",
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    pruner=pruner,
)

[I 2024-04-29 23:13:37,451] A new study created in memory with name: rf_feat_eng


In [16]:
warnings.filterwarnings('ignore') #ignore pandas warnings

study_feat_eng.optimize(
    objective_feat_eng, 
    n_trials=NUM_TRIALS,
    timeout = 20 * 60 * 60, # timeout after 8 hours
    show_progress_bar=True
)

  0%|          | 0/2 [00:00<?, ?it/s]

Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.96it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.53it/s]
Feature Extraction: 100%|██████████| 80/80 [00:41<00:00,  1.95it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.60it/s]
Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.95it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.18it/s]
Feature Extraction: 100%|██████████| 80/80 [00:42<00:00,  1.87it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00,  9.88it/s]
Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.99it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.87it/s]


[I 2024-04-29 23:18:40,574] Trial 0 finished with value: 0.9760991442903512 and parameters: {'use_smote': False, 'n_estimators': 759, 'max_depth': 34, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9760991442903512.


Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.98it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.66it/s]
Feature Extraction: 100%|██████████| 80/80 [00:41<00:00,  1.95it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.59it/s]
Feature Extraction: 100%|██████████| 80/80 [00:40<00:00,  1.98it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.93it/s]
Feature Extraction: 100%|██████████| 80/80 [00:43<00:00,  1.83it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00, 10.37it/s]
Feature Extraction: 100%|██████████| 80/80 [00:42<00:00,  1.89it/s]
Feature Extraction: 100%|██████████| 78/78 [00:07<00:00,  9.91it/s]


[I 2024-04-29 23:23:45,783] Trial 1 finished with value: 0.975177304964539 and parameters: {'use_smote': True, 'n_estimators': 291, 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'criterion': 'entropy'}. Best is trial 0 with value: 0.9760991442903512.


In [17]:
best_params = study_feat_eng.best_params
    
print(best_params)

if "random_forest_feat_eng" in config:
    config["random_forest_feat_eng"].update(best_params)
else:
    config["random_forest_feat_eng"] = best_params

# see: https://stackoverflow.com/questions/12470665/how-can-i-write-data-in-yaml-format-in-a-file
with open("classical_ml_config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)

{'use_smote': False, 'n_estimators': 759, 'max_depth': 34, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2', 'criterion': 'entropy'}


In [18]:
p_importance_feat_eng = optuna.visualization.plot_param_importances(study_feat_eng)
p_importance_feat_eng.show()

In [19]:
p_history_feat_eng = optuna.visualization.plot_optimization_history(study_feat_eng)
p_history_feat_eng.show()