Imports and Setup

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
import numpy as np
import os
import warnings
from sklearn.exceptions import FitFailedWarning
#importing models
from utils.models import model_mapping

# Random seed handed to train_test_split so the split is reproducible.
seed = 123_456
# Value forwarded to each model's tune_params call; use 100 for final experiments.
runs = 10

  from .autonotebook import tqdm as notebook_tqdm


Loading Dataset

In [2]:

# data_dir holds the input CSV files; results_dir is the root of the
# summary and pickle output folders created below.
data_dir = "./data"
results_dir = "./results"

# Output sub-folders, both located directly under results_dir.
summary_dir, pickle_dir = (
    os.path.join(results_dir, subfolder) for subfolder in ("summaries", "pickles")
)

# Create both output folders up front; exist_ok keeps re-runs of this cell from failing.
for _output_dir in (summary_dir, pickle_dir):
    os.makedirs(_output_dir, exist_ok=True)

# Dataset identifiers: the names of the CSV files in data_dir without their
# extension, skipping hidden files (names starting with "."), in sorted order.
# Only the trailing ".csv" is stripped. str.replace would also remove ".csv"
# occurring earlier in a name (e.g. "a.csv.bak.csv" -> "a.bak"), producing an
# identifier that no longer maps back to the file when "<name>.csv" is read.
dataset_names = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(data_dir)
    if file_name.endswith(".csv") and not file_name.startswith(".")
)

# Suppress FitFailedWarning and UserWarning for the whole session: many of
# these are expected while tuning.
# NOTE(review): ignoring every UserWarning is broad and may hide unrelated warnings.
for _ignored_category in (FitFailedWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

Running Hyperparameter Tuning Pipeline


In [3]:
# Tune hyperparameters for every model in model_mapping that exposes tune_params.
# Only the dataset named here is tuned in this run. The check happens before any
# loading or preprocessing, so the remaining datasets are skipped without being read.
target_dataset = "rarefied-feature-table-labeled"

for dataset_name in dataset_names:
    if dataset_name != target_dataset:
        continue

    # loading data: the first CSV column is the index; both diagnosis columns are
    # removed from the features and "Diagnosis_labeled" is used as the target
    df = pd.read_csv(f"{data_dir}/{dataset_name}.csv", index_col=0)
    X = df.drop(columns=["Diagnosis", "Diagnosis_labeled"])
    y = df["Diagnosis_labeled"]

    # encoding labels as integers
    le = LabelEncoder()
    y = le.fit_transform(y)

    # initializing models from mapping: each class is instantiated exactly once
    # and kept only if the instance provides tune_params
    models = {
        name: model
        for name, model in (
            (model_name, model_class())
            for model_name, model_class in model_mapping.items()
        )
        if hasattr(model, 'tune_params')
    }

    # stratified 80/20 split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # preprocessing: every transformer is fitted on the training split only and
    # then applied to the test split
    vt = VarianceThreshold(threshold=0)  # removes zero-variance features
    X_train = vt.fit_transform(X_train)
    X_test = vt.transform(X_test)

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    # The scaled test split is not passed to the tuning calls below; it is kept
    # in case later cells read it.
    X_test_scaled = scaler.transform(X_test)

    for name, model in models.items():
        print(name)
        model.reset(name, dataset_name)
        model.tune_params(X_train_scaled, y_train, runs, name, dataset_name)


Decision Tree


[I 2025-04-21 03:59:29,840] A new study created in RDB with name: Decision Tree rarefied-feature-table-labeled
[I 2025-04-21 03:59:29,955] Trial 0 finished with value: 0.580609756097561 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.580609756097561.
[I 2025-04-21 03:59:30,108] Trial 1 finished with value: 0.6106097560975611 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 19, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 1 with value: 0.6106097560975611.
[I 2025-04-21 03:59:30,208] Trial 2 finished with value: 0.625609756097561 and parameters: {'criterion': 'log_loss', 'splitter': 'best', 'max_depth': 13, 'min_samples_split': 14, 'min_samples_leaf': 18, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.625609756097561.
[I 2025-04-21 03:59:30,304] Trial 3 finished with value: 0.56073170731

Random Forest


[I 2025-04-21 03:59:46,987] Trial 0 finished with value: 0.7145121951219512 and parameters: {'n_estimators': 388, 'criterion': 'entropy', 'max_depth': 33, 'min_samples_split': 18, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 0 with value: 0.7145121951219512.
[I 2025-04-21 04:00:06,092] Trial 1 finished with value: 0.6947560975609756 and parameters: {'n_estimators': 442, 'criterion': 'gini', 'max_depth': 49, 'min_samples_split': 16, 'min_samples_leaf': 6, 'max_features': None}. Best is trial 0 with value: 0.7145121951219512.
[I 2025-04-21 04:00:06,491] Trial 2 finished with value: 0.581219512195122 and parameters: {'n_estimators': 54, 'criterion': 'entropy', 'max_depth': 29, 'min_samples_split': 8, 'min_samples_leaf': 16, 'max_features': 'log2'}. Best is trial 0 with value: 0.7145121951219512.
[I 2025-04-21 04:00:07,034] Trial 3 finished with value: 0.6606097560975611 and parameters: {'n_estimators': 75, 'criterion': 'entropy', 'max_depth': 16, 'min_samples_split': 13, '

Gradient Boosting


[I 2025-04-21 04:00:38,685] Trial 0 finished with value: 0.5167073170731709 and parameters: {'loss': 'log_loss', 'learning_rate': 2.2479264704804223, 'n_estimators': 62, 'subsample': 0.9791981334481983, 'criterion': 'squared_error', 'max_depth': 12, 'min_samples_split': 6, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5167073170731709.
[I 2025-04-21 04:00:38,789] Trial 1 pruned. 
[I 2025-04-21 04:00:38,882] Trial 2 pruned. 


Skipping due to error
Skipping due to error


[I 2025-04-21 04:00:41,772] Trial 3 finished with value: 0.5019512195121951 and parameters: {'loss': 'log_loss', 'learning_rate': 2.0625581639022688, 'n_estimators': 34, 'subsample': 0.7604079442639889, 'criterion': 'squared_error', 'max_depth': 15, 'min_samples_split': 19, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 0 with value: 0.5167073170731709.
[I 2025-04-21 04:00:42,801] Trial 4 finished with value: 0.6848780487804877 and parameters: {'loss': 'log_loss', 'learning_rate': 0.0356379161962499, 'n_estimators': 93, 'subsample': 0.9419515436929495, 'criterion': 'friedman_mse', 'max_depth': 32, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.6848780487804877.
[I 2025-04-21 04:00:42,895] Trial 5 pruned. 
[I 2025-04-21 04:00:42,986] Trial 6 pruned. 


Skipping due to error
Skipping due to error


[I 2025-04-21 04:00:46,057] Trial 7 finished with value: 0.5857317073170731 and parameters: {'loss': 'exponential', 'learning_rate': 1.7089515632882035, 'n_estimators': 154, 'subsample': 0.6692738308925273, 'criterion': 'squared_error', 'max_depth': 35, 'min_samples_split': 5, 'min_samples_leaf': 20, 'max_features': None}. Best is trial 4 with value: 0.6848780487804877.
[I 2025-04-21 04:00:48,413] Trial 8 finished with value: 0.5570731707317073 and parameters: {'loss': 'log_loss', 'learning_rate': 2.868896013032783, 'n_estimators': 114, 'subsample': 0.43448014610501384, 'criterion': 'friedman_mse', 'max_depth': 29, 'min_samples_split': 17, 'min_samples_leaf': 11, 'max_features': None}. Best is trial 4 with value: 0.6848780487804877.
[I 2025-04-21 04:00:48,527] Trial 9 pruned. 
[I 2025-04-21 04:00:48,672] A new study created in RDB with name: Extreme Gradient Boosting rarefied-feature-table-labeled


Skipping due to error
Extreme Gradient Boosting


[I 2025-04-21 04:00:49,846] Trial 0 finished with value: 0.6893902439024391 and parameters: {'n_estimators': 294, 'max_depth': 8, 'max_leaves': 2, 'learning_rate': 0.662302651185219, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.6893902439024391.
[I 2025-04-21 04:00:50,019] Trial 1 pruned. 


Skipping due to error
Skipping due to error


[I 2025-04-21 04:00:50,207] Trial 2 pruned. 
[I 2025-04-21 04:00:51,800] Trial 3 finished with value: 0.6504878048780488 and parameters: {'n_estimators': 472, 'max_depth': 12, 'max_leaves': 275, 'learning_rate': 1.0129960142203185, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.6893902439024391.
[I 2025-04-21 04:00:52,568] Trial 4 finished with value: 0.6502439024390244 and parameters: {'n_estimators': 140, 'max_depth': 48, 'max_leaves': 12, 'learning_rate': 1.2424165016880622, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.6893902439024391.
[I 2025-04-21 04:00:52,724] Trial 5 pruned. 


Skipping due to error


[I 2025-04-21 04:00:53,233] Trial 6 finished with value: 0.6354878048780488 and parameters: {'n_estimators': 35, 'max_depth': 42, 'max_leaves': 726, 'learning_rate': 0.559157091830178, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.6893902439024391.
[I 2025-04-21 04:00:54,444] Trial 7 finished with value: 0.6453658536585365 and parameters: {'n_estimators': 321, 'max_depth': 24, 'max_leaves': 375, 'learning_rate': 0.8229338758545879, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.6893902439024391.
[I 2025-04-21 04:00:54,597] Trial 8 pruned. 


Skipping due to error


[I 2025-04-21 04:00:55,095] Trial 9 finished with value: 0.6501219512195122 and parameters: {'n_estimators': 60, 'max_depth': 48, 'max_leaves': 457, 'learning_rate': 1.60036929630167, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.6893902439024391.
[I 2025-04-21 04:00:55,220] A new study created in RDB with name: Light Gradient Boosting Model rarefied-feature-table-labeled


Light Gradient Boosting Model


[I 2025-04-21 04:00:55,468] Trial 0 pruned. 


Skipping due to error


[I 2025-04-21 04:00:55,740] Trial 1 finished with value: 0.5028048780487805 and parameters: {'num_leaves': 148, 'min_data_in_leaf': 12, 'max_depth': 34, 'learning_rate': 2.868479188652052, 'num_iterations': 17}. Best is trial 1 with value: 0.5028048780487805.
[I 2025-04-21 04:00:55,822] Trial 2 pruned. 


Skipping due to error


[I 2025-04-21 04:00:56,106] Trial 3 finished with value: 0.5174390243902438 and parameters: {'num_leaves': 109, 'min_data_in_leaf': 12, 'max_depth': 36, 'learning_rate': 2.2828444976133166, 'num_iterations': 90}. Best is trial 3 with value: 0.5174390243902438.
[I 2025-04-21 04:00:56,431] Trial 4 finished with value: 0.6652439024390244 and parameters: {'num_leaves': 126, 'min_data_in_leaf': 15, 'max_depth': 39, 'learning_rate': 0.661214369312674, 'num_iterations': 46}. Best is trial 4 with value: 0.6652439024390244.
[I 2025-04-21 04:00:56,512] Trial 5 pruned. 


Skipping due to error


[I 2025-04-21 04:00:56,845] Trial 6 finished with value: 0.6847560975609757 and parameters: {'num_leaves': 205, 'min_data_in_leaf': 21, 'max_depth': 47, 'learning_rate': 0.2789102889214625, 'num_iterations': 75}. Best is trial 6 with value: 0.6847560975609757.
[I 2025-04-21 04:00:57,156] Trial 7 finished with value: 0.5567073170731707 and parameters: {'num_leaves': 100, 'min_data_in_leaf': 10, 'max_depth': 36, 'learning_rate': 2.2656526347268198, 'num_iterations': 87}. Best is trial 6 with value: 0.6847560975609757.
[I 2025-04-21 04:00:57,594] Trial 8 finished with value: 0.586219512195122 and parameters: {'num_leaves': 283, 'min_data_in_leaf': 14, 'max_depth': 33, 'learning_rate': 2.3630895211754055, 'num_iterations': 81}. Best is trial 6 with value: 0.6847560975609757.
[I 2025-04-21 04:00:57,758] Trial 9 pruned. 
[I 2025-04-21 04:00:57,910] A new study created in RDB with name: Extremely Randomized Trees rarefied-feature-table-labeled


Skipping due to error
Extremely Randomized Trees


[I 2025-04-21 04:00:59,291] Trial 0 finished with value: 0.6357317073170731 and parameters: {'n_estimators': 201, 'criterion': 'entropy', 'max_depth': 37, 'min_samples_split': 11, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6357317073170731.
[I 2025-04-21 04:01:01,370] Trial 1 finished with value: 0.6599999999999999 and parameters: {'n_estimators': 346, 'criterion': 'log_loss', 'max_depth': 6, 'min_samples_split': 14, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.6599999999999999.
[W 2025-04-21 04:01:12,215] Trial 2 failed with parameters: {'n_estimators': 390, 'criterion': 'entropy', 'max_depth': 6, 'min_samples_split': 16, 'min_samples_leaf': 16, 'max_features': None} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python39\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  

KeyboardInterrupt: 