In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../src') 

from data_pipeline.extractor import execute_query
from gams.logistic_gam import train_logistic_gam_model
from gams.ebm_gam import train_ebm_model
from ml_models.random_forest import train_random_forest_model
from ml_models.xgb_model import train_xgboost_model

import optuna
from pygam import s, l, te
from sklearn.metrics import roc_auc_score

In [3]:
query = """
SELECT s.*, a.hospital_expire_flag as mortality
FROM apsiii s
LEFT JOIN admissions a
ON s.subject_id = a.subject_id
AND s.hadm_id = a.hadm_id;
"""

df = execute_query(query)

# Exclude specific columns
df_filtered = df.drop(columns=['subject_id', 'hadm_id', 'icustay_id', 'apsiii', 'apsiii_prob'])

# Replace NaNs with 0
df_filtered = df_filtered.fillna(0)

# Separate majority and minority classes
df_majority = df_filtered[df_filtered.mortality == 0]
df_minority = df_filtered[df_filtered.mortality == 1]

# Oversample minority class to twice its size
df_minority_oversampled = resample(df_minority,
                                   replace=True,     # sample with replacement
                                   n_samples=2*len(df_minority),  # double the minority class size
                                   random_state=42)  # reproducible results

# Downsample majority class to match the original minority class size
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority_oversampled),  # to match original minority class size
                                   random_state=42)  # reproducible results

# Combine downsampled majority class with oversampled minority class
df_balanced = pd.concat([df_majority_downsampled, df_minority_oversampled])

# Separate features and target variable
y_balanced = df_balanced['mortality']
X_balanced = df_balanced.drop(columns=['mortality'])

# Print class distribution 
print("Class distribution:")
print(y_balanced.value_counts())

Class distribution:
mortality
0    13218
1    13218
Name: count, dtype: int64


## LogisticGam Optuna Optimization

In [4]:
# Define the objective function for Optuna hyperparameter tuning
def objective_gam(trial: optuna.Trial):
    try:
        # Suggest using 'auto' or custom term setup
        terms_option = trial.suggest_categorical("terms_option", ['auto', 'custom'])

        if terms_option == 'auto':
            terms = 'auto'
        else:
            # Predefined set of terms focusing on 15 continuous features
            predefined_terms = [
                # Spline terms for features 0-14
                s(0, n_splines=10, spline_order=3),   # Spline term for feature 0
                s(1, n_splines=15, spline_order=3),   # Spline term for feature 1
                s(2, n_splines=20, spline_order=3),   # Spline term for feature 2
                s(3, n_splines=12, spline_order=3),   # Spline term for feature 3
                s(4, n_splines=18, spline_order=3),   # Spline term for feature 4
                s(5, n_splines=10, spline_order=3),   # Spline term for feature 5
                s(6, n_splines=14, spline_order=3),   # Spline term for feature 6
                s(7, n_splines=16, spline_order=3),   # Spline term for feature 7
                s(8, n_splines=12, spline_order=3),   # Spline term for feature 8
                s(9, n_splines=10, spline_order=3),   # Spline term for feature 9
                s(10, n_splines=15, spline_order=3),  # Spline term for feature 10
                s(11, n_splines=10, spline_order=3),  # Spline term for feature 11
                s(12, n_splines=18, spline_order=3),  # Spline term for feature 12
                s(13, n_splines=12, spline_order=3),  # Spline term for feature 13
                s(14, n_splines=20, spline_order=3),  # Spline term for feature 14
                s(15, n_splines=18, spline_order=3),  # Spline term for feature 14


                # Linear terms for features 0-14
                l(0), l(1), l(2), l(3), l(4), l(5), l(6), l(7), l(8), l(9), 
                l(10), l(11), l(12), l(13), l(14), l(15),

                # Interaction terms between pairs of features
                te(s(0), s(1)),  # Interaction between spline terms of features 0 and 1
                te(s(2), s(3)),  # Interaction between spline terms of features 2 and 3
                te(s(4), s(5)),  # Interaction between spline terms of features 4 and 5
                te(l(6), l(7)),  # Interaction between linear terms of features 6 and 7
                te(l(8), l(9)),  # Interaction between linear terms of features 8 and 9
                te(s(10), l(11)), # Interaction between spline term of feature 10 and linear term of feature 11
                te(s(12), s(13)), # Interaction between spline terms of features 12 and 13
                te(l(13), l(14)),  # Interaction between linear terms of features 13 and 14
                te(l(14), l(15))  # Interaction between linear terms of features 13 and 14

            ]
            
            # Let Optuna select one term from the predefined set
            selected_term_index = trial.suggest_int("selected_term", 0, len(predefined_terms) - 1)
            terms = predefined_terms[selected_term_index]

        # Hyperparameters for the LogisticGAM
        max_iter = trial.suggest_int("max_iter", 50, 500)
        tol = trial.suggest_loguniform("tol", 1e-5, 1e-1)
        fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])

        # Split the data into training and validation sets
        x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
            X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
        )

        # Initialize StandardScaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both the training and validation sets
        x_train_split = scaler.fit_transform(x_train_split)
        x_valid_split = scaler.transform(x_valid_split)

        # Train the model with the sampled hyperparameters
        _, results = train_logistic_gam_model(
            X_train=x_train_split,
            y_train=y_train_split,
            X_test=x_valid_split,
            terms=terms,
            max_iter=max_iter,
            tol=tol,
            fit_intercept=fit_intercept,
            verbose=False,
            include_summary = False,
        )

        # Calculate the ROC-AUC score on validation data
        roc_auc = roc_auc_score(y_valid_split, results['y_pred_prob'])

        # Maximize ROC-AUC
        return roc_auc

    except Exception as e:
        # Print the error for debugging purposes (optional)
        print(f"Trial failed with error: {e}")

        # Return 0.0 ROC-AUC to indicate failure
        return 0.0  # Low ROC-AUC to signal failure

In [5]:
# Create an Optuna study with SQLite storage and TPE pruner
study_name = "Optuna_hyperparameter_optimization"
storage_name = "sqlite:///{}.db".format(study_name)

study_gam = optuna.create_study(
    study_name="Case 13: LogisticGam_APSIII_Balanced_Data",
    direction="maximize",  # Maximize ROC-AUC
    storage=storage_name,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)

# Perform a trial with default values first
default_params = {
    'terms_option': 'auto',
    'max_iter': 100,
    'tol': 0.0001,
    'fit_intercept': True
}
study_gam.enqueue_trial(default_params)

# Define the number of trials (hyperparameter combinations) to search
num_trials = 100

# Perform hyperparameter optimization
study_gam.optimize(objective_gam, n_trials=num_trials, n_jobs=1)

[I 2024-10-22 05:04:07,181] A new study created in RDB with name: Case 13: LogisticGam_APSIII_Balanced_Data
[I 2024-10-22 05:04:13,422] Trial 0 finished with value: 0.8389464285305582 and parameters: {'terms_option': 'auto', 'max_iter': 100, 'tol': 0.0001, 'fit_intercept': True}. Best is trial 0 with value: 0.8389464285305582.
[I 2024-10-22 05:04:13,566] Trial 1 finished with value: 0.49990244231794767 and parameters: {'terms_option': 'custom', 'selected_term': 36, 'max_iter': 397, 'tol': 0.0003004756735232314, 'fit_intercept': False}. Best is trial 0 with value: 0.8389464285305582.
[I 2024-10-22 05:04:18,941] Trial 2 finished with value: 0.8389465715770128 and parameters: {'terms_option': 'auto', 'max_iter': 454, 'tol': 0.00463251623540913, 'fit_intercept': True}. Best is trial 2 with value: 0.8389465715770128.
[I 2024-10-22 05:04:19,212] Trial 3 finished with value: 0.6749215676289306 and parameters: {'terms_option': 'custom', 'selected_term': 1, 'max_iter': 487, 'tol': 0.01151387405

In [6]:
# Print the best hyperparameters and best ROC-AUC score
print("Best hyperparameters:", study_gam.best_params)
print("Best ROC-AUC score:", study_gam.best_value)

Best hyperparameters: {'terms_option': 'auto', 'max_iter': 368, 'tol': 0.069513604993303, 'fit_intercept': False}
Best ROC-AUC score: 0.8389574431075641


In [7]:
# Save the best configurations to a JSON file
config_file_path = Path("logisticgam_apsiii_balanced_data_best_configs.json")

# Load the saved configuration (if it exists)
if config_file_path.exists():
    with open(config_file_path, "r") as config_file:
        saved_configs = json.load(config_file)

    # Compare the new best value with the saved one
    if study_gam.best_value > saved_configs["best_value"]:
        print("New best value found. Replacing saved configuration.")
        saved_configs["best_params"] = study_gam.best_params
        saved_configs["best_value"] = study_gam.best_value

        # Update the saved configuration file
        with open(config_file_path, "w") as config_file:
            json.dump(saved_configs, config_file)
else:
    # If no saved configuration exists, create a new one
    saved_configs = {
        "best_params": study_gam.best_params,
        "best_value": study_gam.best_value
    }

    with open(config_file_path, "w") as config_file:
        json.dump(saved_configs, config_file)

## EBM Optuna Optimization

In [8]:
# Define the objective function for Optuna hyperparameter tuning
def objective_ebm(trial: optuna.Trial):
    try:
        # Split the data into training and validation sets
        x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
            X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
        )

        # Initialize StandardScaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both the training and validation sets
        x_train_split = scaler.fit_transform(x_train_split)
        x_valid_split = scaler.transform(x_valid_split)

        # Suggest hyperparameters for EBM
        max_bins = trial.suggest_int('max_bins', 64, 1024)
        max_interaction_bins = trial.suggest_int('max_interaction_bins', 16, 64)
        interactions = trial.suggest_float('interactions', 0.0, 1.0)
        outer_bags = trial.suggest_int('outer_bags', 1, 20)
        inner_bags = trial.suggest_int('inner_bags', 0, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 100)
        max_leaves = trial.suggest_int('max_leaves', 1, 10)
        max_rounds = trial.suggest_int('max_rounds', 5000, 25000)
        early_stopping_rounds = trial.suggest_int('early_stopping_rounds', 10, 100)
        smoothing_rounds = trial.suggest_int('smoothing_rounds', 50, 500)
        greedy_ratio = trial.suggest_float('greedy_ratio', 0.0, 3.0)

        # Train the model with the sampled hyperparameters
        _, results_ebm = train_ebm_model(
            X_train=x_train_split,
            y_train=y_train_split,
            X_test=x_valid_split,
            max_bins=max_bins,
            max_interaction_bins=max_interaction_bins,
            interactions=interactions,
            outer_bags=outer_bags,
            inner_bags=inner_bags,
            learning_rate=learning_rate,
            min_samples_leaf=min_samples_leaf,
            max_leaves=max_leaves,
            max_rounds=max_rounds,
            early_stopping_rounds=early_stopping_rounds,
            smoothing_rounds=smoothing_rounds,
            greedy_ratio=greedy_ratio,
            random_state=42,  # Fixed random state for reproducibility
        )

        # Calculate the ROC-AUC score on validation data
        roc_auc = roc_auc_score(y_valid_split, results_ebm['y_pred_prob'])

        # Maximize ROC-AUC
        return roc_auc

    except Exception as e:
        # Print the error for debugging purposes (optional)
        print(f"Trial failed with error: {e}")
        
        # Return 0.0 ROC-AUC to indicate failure
        return 0.0

In [9]:
# Create an Optuna study with SQLite storage and TPE pruner
study_name = "Optuna_hyperparameter_optimization"
storage_name = "sqlite:///{}.db".format(study_name)

study_ebm = optuna.create_study(
    study_name="Case 14: EBM_APSIII_Balanced_Data",
    direction="maximize",  # Maximize ROC-AUC
    storage=storage_name,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),  
)

# Perform a trial with default values first
default_params = {
    'max_bins': 1024,
    'max_interaction_bins': 32,
    'interactions': 0.9,
    'outer_bags': 14,
    'inner_bags': 0,
    'learning_rate': 0.01,
    'min_samples_leaf': 2,
    'max_leaves': 3,
    'max_rounds': 25000,
    'early_stopping_rounds': 50,
    'smoothing_rounds': 200,
    'greedy_ratio': 1.5
}
study_ebm.enqueue_trial(default_params)

# Define the number of trials (hyperparameter combinations) to search
num_trials = 100

# Perform hyperparameter optimization
study_ebm.optimize(objective_ebm, n_trials=num_trials, n_jobs=1)

[I 2024-10-22 05:15:21,140] A new study created in RDB with name: Case 14: EBM_APSIII_Balanced_Data
[I 2024-10-22 05:15:43,069] Trial 0 finished with value: 0.8534913920365467 and parameters: {'max_bins': 1024, 'max_interaction_bins': 32, 'interactions': 0.9, 'outer_bags': 14, 'inner_bags': 0, 'learning_rate': 0.01, 'min_samples_leaf': 2, 'max_leaves': 3, 'max_rounds': 25000, 'early_stopping_rounds': 50, 'smoothing_rounds': 200, 'greedy_ratio': 1.5}. Best is trial 0 with value: 0.8534913920365467.
[I 2024-10-22 05:15:44,464] Trial 1 finished with value: 0.5 and parameters: {'max_bins': 826, 'max_interaction_bins': 22, 'interactions': 0.001040149710192395, 'outer_bags': 6, 'inner_bags': 7, 'learning_rate': 0.030477301366052097, 'min_samples_leaf': 45, 'max_leaves': 1, 'max_rounds': 15471, 'early_stopping_rounds': 28, 'smoothing_rounds': 70, 'greedy_ratio': 0.5213660522894998}. Best is trial 0 with value: 0.8534913920365467.
[I 2024-10-22 05:15:46,575] Trial 2 finished with value: 0.8498

In [10]:
# Print the best hyperparameters and best ROC-AUC score
print("Best hyperparameters:", study_ebm.best_params)
print("Best ROC-AUC score:", study_ebm.best_value)

Best hyperparameters: {'max_bins': 581, 'max_interaction_bins': 47, 'interactions': 0.9998168404207781, 'outer_bags': 18, 'inner_bags': 10, 'learning_rate': 0.04367020510396994, 'min_samples_leaf': 96, 'max_leaves': 5, 'max_rounds': 10124, 'early_stopping_rounds': 97, 'smoothing_rounds': 386, 'greedy_ratio': 1.3695083018257845}
Best ROC-AUC score: 0.856105565994768


In [11]:
# Save the best configurations to a JSON file
config_file_path = Path("ebm_apsiii_balanced_data_best_configs.json")

# Load the saved configuration (if it exists)
if config_file_path.exists():
    with open(config_file_path, "r") as config_file:
        saved_configs = json.load(config_file)

    # Compare the new best value with the saved one
    if study_ebm.best_value > saved_configs["best_value"]:
        print("New best value found. Replacing saved configuration.")
        saved_configs["best_params"] = study_ebm.best_params
        saved_configs["best_value"] = study_ebm.best_value

        # Update the saved configuration file
        with open(config_file_path, "w") as config_file:
            json.dump(saved_configs, config_file)
else:
    # If no saved configuration exists, create a new one
    saved_configs = {
        "best_params": study_ebm.best_params,
        "best_value": study_ebm.best_value
    }

    with open(config_file_path, "w") as config_file:
        json.dump(saved_configs, config_file)

## Random Forest Optuna Optimization

In [12]:
def objective_rf(trial: optuna.Trial):
    try:
        # Split the data into training and validation sets (ensure stratification)
        x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
            X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
        )

        # Initialize StandardScaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both the training and validation sets
        x_train_split = scaler.fit_transform(x_train_split)
        x_valid_split = scaler.transform(x_valid_split)

        # Suggest hyperparameters for RandomForestClassifier, but ensure ranges are sensible
        n_estimators = trial.suggest_int('n_estimators', 100, 300)  # Limit range based on reasonable number of trees
        criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        max_depth = trial.suggest_int('max_depth', 10, 30)  # Limit max_depth to prevent overfitting
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
        min_weight_fraction_leaf = trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.2)  # Tighten the range
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # Avoid 'auto'

        # Handle None for max_leaf_nodes by using suggest_categorical
        max_leaf_nodes = trial.suggest_categorical('max_leaf_nodes', [None, 20, 50, 100])

        min_impurity_decrease = trial.suggest_float('min_impurity_decrease', 0.0, 0.1)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        # Handle oob_score: If bootstrap is False, oob_score must be False
        if bootstrap:
            oob_score = trial.suggest_categorical('oob_score', [True, False])
            max_samples = trial.suggest_categorical('max_samples', [None, 0.5, 0.75, 1.0])
        else:
            oob_score = False  # Set to False when bootstrap is False
            max_samples = None  # Force max_samples to be None if bootstrap is False

        class_weight = trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample'])
        ccp_alpha = trial.suggest_float('ccp_alpha', 0.0, 0.05)  # Tighten range

        # Train the model with the sampled hyperparameters
        _, results_rf = train_random_forest_model(
            X_train=x_train_split,
            y_train=y_train_split,
            X_test=x_valid_split,
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            bootstrap=bootstrap,
            oob_score=oob_score,
            class_weight=class_weight,
            ccp_alpha=ccp_alpha,
            max_samples=max_samples,
            random_state=42,  # Fix random state for reproducibility
            verbose=False
        )

        # Calculate the ROC-AUC score on validation data
        roc_auc = roc_auc_score(y_valid_split, results_rf['y_pred_prob'])

        # Maximize ROC-AUC
        return roc_auc

    except Exception as e:
        # Print the error for debugging purposes (optional)
        print(f"Trial failed with error: {e}")
        
        # Return 0.0 ROC-AUC to indicate failure
        return 0.0


In [13]:
# Create an Optuna study with SQLite storage and TPE pruner
study_name = "Optuna_hyperparameter_optimization"
storage_name = "sqlite:///{}.db".format(study_name)

study_rf = optuna.create_study(
    study_name="Case 15: Random_Forest_APSIII_Balanced_Data",
    direction="maximize",  # Maximize ROC-AUC
    storage=storage_name,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),  
)

# Perform a trial with default values first
default_params = {
    'n_estimators': 100,
    'criterion': 'gini',
    'max_depth': 10,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'max_leaf_nodes': 25,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'oob_score': False,
    'class_weight': None,
    'ccp_alpha': 0.0,
    'max_samples': 0.5
}
study_rf.enqueue_trial(default_params)

# Define the number of trials (hyperparameter combinations) to search
num_trials = 100

# Perform hyperparameter optimization
study_rf.optimize(objective_rf, n_trials=num_trials, n_jobs=1)

[I 2024-10-22 05:31:43,704] A new study created in RDB with name: Case 15: Random_Forest_APSIII_Balanced_Data
[I 2024-10-22 05:31:44,010] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 100, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.0.


Trial failed with error: '25' not in (None, 20, 50, 100).


[I 2024-10-22 05:31:45,705] Trial 1 finished with value: 0.7357555055719455 and parameters: {'n_estimators': 104, 'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 3, 'min_weight_fraction_leaf': 0.1070394843024652, 'max_features': 'log2', 'max_leaf_nodes': 20, 'min_impurity_decrease': 0.0007251930380897887, 'bootstrap': False, 'class_weight': 'balanced_subsample', 'ccp_alpha': 0.04249498865677911}. Best is trial 1 with value: 0.7357555055719455.
[I 2024-10-22 05:31:49,344] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 282, 'criterion': 'gini', 'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 3, 'min_weight_fraction_leaf': 0.09090411894024629, 'max_features': 'log2', 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.06781876464759538, 'bootstrap': True, 'oob_score': False, 'max_samples': 0.5, 'class_weight': 'balanced_subsample', 'ccp_alpha': 0.013716849734950888}. Best is trial 1 with value: 0.7357555055719455.
[I 2024-10-2

In [14]:
# Print the best hyperparameters and best ROC-AUC score
print("Best hyperparameters:", study_rf.best_params)
print("Best ROC-AUC score:", study_rf.best_value)

Best hyperparameters: {'n_estimators': 160, 'criterion': 'gini', 'max_depth': 14, 'min_samples_split': 7, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0043396778555745905, 'max_features': 'sqrt', 'max_leaf_nodes': 100, 'min_impurity_decrease': 0.0007340882183458019, 'bootstrap': True, 'oob_score': False, 'max_samples': 0.5, 'class_weight': 'balanced_subsample', 'ccp_alpha': 0.00023054698848445313}
Best ROC-AUC score: 0.8362949194476804


In [15]:
# Specify the path where you want to save the JSON file
config_file_path = Path("random_forest_apsiii_balanced_data_best_configs.json")

# Load the saved configuration (if it exists)
if config_file_path.exists():
    with open(config_file_path, "r") as config_file:
        saved_configs = json.load(config_file)

    # Compare the new best value with the saved one
    if study_rf.best_value > saved_configs["best_value"]:
        print("New best value found. Replacing saved configuration.")
        saved_configs["best_params"] = study_rf.best_params
        saved_configs["best_value"] = study_rf.best_value

        # Update the saved configuration file
        with open(config_file_path, "w") as config_file:
            json.dump(saved_configs, config_file)
else:
    # If no saved configuration exists, create a new one
    saved_configs = {
        "best_params": study_rf.best_params,
        "best_value": study_rf.best_value
    }

    with open(config_file_path, "w") as config_file:
        json.dump(saved_configs, config_file)


## XGBoost Optuna Optimization

In [16]:
# Define the objective function for Optuna hyperparameter tuning
def objective_xgboost(trial: optuna.Trial):
    try:
        # Split the data into training and validation sets
        x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
            X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
        )

        # Initialize StandardScaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both the training and validation sets
        x_train_split = scaler.fit_transform(x_train_split)
        x_valid_split = scaler.transform(x_valid_split)

        # Suggest hyperparameters for XGBoost
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        min_child_weight = trial.suggest_float('min_child_weight', 1, 10)
        subsample = trial.suggest_float('subsample', 0.5, 1.0)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
        reg_alpha = trial.suggest_float('reg_alpha', 0.0, 10.0)
        reg_lambda = trial.suggest_float('reg_lambda', 0.0, 10.0)
        scale_pos_weight = trial.suggest_float('scale_pos_weight', 1.0, 5.0)
        n_estimators = trial.suggest_int('n_estimators', 100, 500)
        gamma = trial.suggest_float('gamma', 0.0, 5.0)
        max_delta_step = trial.suggest_float('max_delta_step', 0.0, 10.0)

        # Train the model with the sampled hyperparameters
        _, results_xgb = train_xgboost_model(
            X_train=x_train_split,
            y_train=y_train_split,
            X_test=x_valid_split,
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_child_weight=min_child_weight,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            scale_pos_weight=scale_pos_weight,
            n_estimators=n_estimators,
            gamma=gamma,
            max_delta_step=max_delta_step,
            random_state=42,  # Fixed random state for reproducibility
            verbosity=0
        )

        # Calculate the ROC-AUC score on validation data
        roc_auc = roc_auc_score(y_valid_split, results_xgb['y_pred_prob'])

        # Maximize ROC-AUC
        return roc_auc

    except Exception as e:
        # Print the error for debugging purposes (optional)
        print(f"Trial failed with error: {e}")
        
        # Return 0.0 ROC-AUC to indicate failure
        return 0.0



In [17]:
# Create an Optuna study with SQLite storage and TPE pruner
study_name = "Optuna_hyperparameter_optimization"
storage_name = "sqlite:///{}.db".format(study_name)

study_xgb = optuna.create_study(
    study_name="Case 16: XGBoost_APSIII_Balanced_Data",
    direction="maximize",  # Maximize ROC-AUC
    storage=storage_name,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),  
)

# Perform a trial with default values first
default_params = {
    'learning_rate': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'reg_alpha': 0.0,
    'reg_lambda': 1.0,
    'scale_pos_weight': 1.0,
    'n_estimators': 100,
    'gamma': 0.0,
    'max_delta_step': 0.0
}
study_xgb.enqueue_trial(default_params)

# Define the number of trials (hyperparameter combinations) to search
num_trials = 100

# Perform hyperparameter optimization
study_xgb.optimize(objective_xgboost, n_trials=num_trials, n_jobs=1)

[I 2024-10-22 05:36:08,787] A new study created in RDB with name: Case 16: XGBoost_APSIII_Balanced_Data
[I 2024-10-22 05:36:10,498] Trial 0 finished with value: 0.8884486554777637 and parameters: {'learning_rate': 0.3, 'max_depth': 6, 'min_child_weight': 1.0, 'subsample': 1.0, 'colsample_bytree': 1.0, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 1.0, 'n_estimators': 100, 'gamma': 0.0, 'max_delta_step': 0.0}. Best is trial 0 with value: 0.8884486554777637.
[I 2024-10-22 05:36:12,688] Trial 1 finished with value: 0.8606243319730569 and parameters: {'learning_rate': 0.18364468947758936, 'max_depth': 5, 'min_child_weight': 4.235846919343394, 'subsample': 0.6269084981737464, 'colsample_bytree': 0.6727965957336687, 'reg_alpha': 7.992728691172348, 'reg_lambda': 2.171151775919271, 'scale_pos_weight': 2.7450352752415603, 'n_estimators': 397, 'gamma': 4.970738968984939, 'max_delta_step': 1.8336582537256174}. Best is trial 0 with value: 0.8884486554777637.
[I 2024-10-22 05:36:14,589] 

In [18]:
# Print the best hyperparameters and best ROC-AUC score
print("Best hyperparameters:", study_xgb.best_params)
print("Best ROC-AUC score:", study_xgb.best_value)

Best hyperparameters: {'learning_rate': 0.2896027779817393, 'max_depth': 10, 'min_child_weight': 1.2646569686590456, 'subsample': 0.9323226684735562, 'colsample_bytree': 0.6961040747673508, 'reg_alpha': 1.046220932209122, 'reg_lambda': 0.8626759113789945, 'scale_pos_weight': 4.541731121439532, 'n_estimators': 365, 'gamma': 0.004571785264911518, 'max_delta_step': 0.6586642621092984}
Best ROC-AUC score: 0.9370923176043267


In [19]:
# Save the best configurations to a JSON file
config_file_path = Path("xgboost_apsiii_balanced_data_best_configs.json")

# Load the saved configuration (if it exists)
if config_file_path.exists():
    with open(config_file_path, "r") as config_file:
        saved_configs = json.load(config_file)

    # Compare the new best value with the saved one
    if study_xgb.best_value > saved_configs["best_value"]:
        print("New best value found. Replacing saved configuration.")
        saved_configs["best_params"] = study_xgb.best_params
        saved_configs["best_value"] = study_xgb.best_value

        # Update the saved configuration file
        with open(config_file_path, "w") as config_file:
            json.dump(saved_configs, config_file)
else:
    # If no saved configuration exists, create a new one
    saved_configs = {
        "best_params": study_xgb.best_params,
        "best_value": study_xgb.best_value
    }

    with open(config_file_path, "w") as config_file:
        json.dump(saved_configs, config_file)

# End