In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../src') 

from data_pipeline.extractor import execute_query
from gams.logistic_gam import train_logistic_gam_model
from gams.ebm_gam import train_ebm_model
from ml_models.random_forest import train_random_forest_model
from ml_models.xgb_model import train_xgboost_model

import optuna
from pygam import s, l, te
from sklearn.metrics import roc_auc_score

In [2]:
query = """
SELECT s.*, a.hospital_expire_flag as mortality
FROM sapsii s
LEFT JOIN admissions a
ON s.subject_id = a.subject_id
AND s.hadm_id = a.hadm_id;
"""

# Execute the query and obtain the DataFrame
df = execute_query(query)

# Exclude specific columns
df_filtered = df.drop(columns=['subject_id', 'hadm_id', 'icustay_id', 'sapsii', 'sapsii_prob'])

# Replace NaNs with 0
df_filtered = df_filtered.fillna(0)

# Separate majority and minority classes
df_majority = df_filtered[df_filtered.mortality == 0]
df_minority = df_filtered[df_filtered.mortality == 1]

# Oversample minority class to twice its size
df_minority_oversampled = resample(df_minority,
                                   replace=True,     # sample with replacement
                                   n_samples=2*len(df_minority),  # double the minority class size
                                   random_state=42)  # reproducible results

# Downsample majority class to match the original minority class size
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority_oversampled),  # to match original minority class size
                                   random_state=42)  # reproducible results

# Combine downsampled majority class with oversampled minority class
df_balanced = pd.concat([df_majority_downsampled, df_minority_oversampled])

# Separate features and target variable
y_balanced = df_balanced['mortality']
X_balanced = df_balanced.drop(columns=['mortality'])

# Print class distribution 
print("Class distribution:")
print(y_balanced.value_counts())


Class distribution:
mortality
0    13218
1    13218
Name: count, dtype: int64


## LogisticGam Optuna Optimization

In [3]:
# Define the objective function for Optuna hyperparameter tuning
def objective_gam(trial: optuna.Trial):
    try:
        # Suggest using 'auto' or custom term setup
        terms_option = trial.suggest_categorical("terms_option", ['auto', 'custom'])

        if terms_option == 'auto':
            terms = 'auto'
        else:
            # Predefined set of terms focusing on 15 continuous features
            predefined_terms = [
                # Spline terms for features 0-14
                s(0, n_splines=10, spline_order=3),   # Spline term for feature 0
                s(1, n_splines=15, spline_order=3),   # Spline term for feature 1
                s(2, n_splines=20, spline_order=3),   # Spline term for feature 2
                s(3, n_splines=12, spline_order=3),   # Spline term for feature 3
                s(4, n_splines=18, spline_order=3),   # Spline term for feature 4
                s(5, n_splines=10, spline_order=3),   # Spline term for feature 5
                s(6, n_splines=14, spline_order=3),   # Spline term for feature 6
                s(7, n_splines=16, spline_order=3),   # Spline term for feature 7
                s(8, n_splines=12, spline_order=3),   # Spline term for feature 8
                s(9, n_splines=10, spline_order=3),   # Spline term for feature 9
                s(10, n_splines=15, spline_order=3),  # Spline term for feature 10
                s(11, n_splines=10, spline_order=3),  # Spline term for feature 11
                s(12, n_splines=18, spline_order=3),  # Spline term for feature 12
                s(13, n_splines=12, spline_order=3),  # Spline term for feature 13
                s(14, n_splines=20, spline_order=3),  # Spline term for feature 14

                # Linear terms for features 0-14
                l(0), l(1), l(2), l(3), l(4), l(5), l(6), l(7), l(8), l(9), 
                l(10), l(11), l(12), l(13), l(14),

                # Interaction terms between pairs of features
                te(s(0), s(1)),  # Interaction between spline terms of features 0 and 1
                te(s(2), s(3)),  # Interaction between spline terms of features 2 and 3
                te(s(4), s(5)),  # Interaction between spline terms of features 4 and 5
                te(l(6), l(7)),  # Interaction between linear terms of features 6 and 7
                te(l(8), l(9)),  # Interaction between linear terms of features 8 and 9
                te(s(10), l(11)), # Interaction between spline term of feature 10 and linear term of feature 11
                te(s(12), s(13)), # Interaction between spline terms of features 12 and 13
                te(l(13), l(14))  # Interaction between linear terms of features 13 and 14
            ]
            
            # Let Optuna select one term from the predefined set
            selected_term_index = trial.suggest_int("selected_term", 0, len(predefined_terms) - 1)
            terms = predefined_terms[selected_term_index]

        # Hyperparameters for the LogisticGAM
        max_iter = trial.suggest_int("max_iter", 50, 500)
        tol = trial.suggest_loguniform("tol", 1e-5, 1e-1)
        fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])

        # Split the data into training and validation sets
        x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
            X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
        )

        # Initialize StandardScaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both the training and validation sets
        x_train_split = scaler.fit_transform(x_train_split)
        x_valid_split = scaler.transform(x_valid_split)

        # Train the model with the sampled hyperparameters
        _, results = train_logistic_gam_model(
            X_train=x_train_split,
            y_train=y_train_split,
            X_test=x_valid_split,
            terms=terms,
            max_iter=max_iter,
            tol=tol,
            fit_intercept=fit_intercept,
            verbose=False,
            include_summary = False,
        )

        # Calculate the ROC-AUC score on validation data
        roc_auc = roc_auc_score(y_valid_split, results['y_pred_prob'])

        # Maximize ROC-AUC
        return roc_auc

    except Exception as e:
        # Print the error for debugging purposes (optional)
        print(f"Trial failed with error: {e}")

        # Return 0.0 ROC-AUC to indicate failure
        return 0.0  # Low ROC-AUC to signal failure

In [4]:
# Create an Optuna study with SQLite storage and TPE pruner
study_name = "Optuna_hyperparameter_optimization"
storage_name = "sqlite:///{}.db".format(study_name)

study_gam = optuna.create_study(
    study_name="Case 5: LogisticGam_SAPSII_Balanced_Data",
    direction="maximize",  # Maximize ROC-AUC
    storage=storage_name,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)

# Perform a trial with default values first
default_params = {
    'terms_option': 'auto',
    'max_iter': 100,
    'tol': 0.0001,
    'fit_intercept': True
}
study_gam.enqueue_trial(default_params)

# Define the number of trials (hyperparameter combinations) to search
num_trials = 100

# Perform hyperparameter optimization
study_gam.optimize(objective_gam, n_trials=num_trials, n_jobs=1)

[I 2024-10-22 03:47:32,083] A new study created in RDB with name: Case 5: LogisticGam_SAPSII_Balanced_Data
[I 2024-10-22 03:47:35,798] Trial 0 finished with value: 0.8461357287701896 and parameters: {'terms_option': 'auto', 'max_iter': 100, 'tol': 0.0001, 'fit_intercept': True}. Best is trial 0 with value: 0.8461357287701896.
[I 2024-10-22 03:47:36,011] Trial 1 finished with value: 0.5811333027023191 and parameters: {'terms_option': 'custom', 'selected_term': 1, 'max_iter': 155, 'tol': 0.01788555300247199, 'fit_intercept': True}. Best is trial 0 with value: 0.8461357287701896.
[I 2024-10-22 03:47:36,119] Trial 2 finished with value: 0.5716647002547371 and parameters: {'terms_option': 'custom', 'selected_term': 24, 'max_iter': 224, 'tol': 0.0007717613014337394, 'fit_intercept': True}. Best is trial 0 with value: 0.8461357287701896.
[I 2024-10-22 03:47:39,815] Trial 3 finished with value: 0.8461357287701896 and parameters: {'terms_option': 'auto', 'max_iter': 154, 'tol': 0.00080456249200

In [5]:
# Print the best hyperparameters and best ROC-AUC score
print("Best hyperparameters:", study_gam.best_params)
print("Best ROC-AUC score:", study_gam.best_value)

Best hyperparameters: {'terms_option': 'auto', 'max_iter': 272, 'tol': 0.09156529495554037, 'fit_intercept': True}
Best ROC-AUC score: 0.8461444546039214


In [6]:
# Save the best configurations to a JSON file
config_file_path = Path("logisticgam_sapsii_balanced_data_best_configs.json")

# Load the saved configuration (if it exists)
if config_file_path.exists():
    with open(config_file_path, "r") as config_file:
        saved_configs = json.load(config_file)

    # Compare the new best value with the saved one
    if study_gam.best_value > saved_configs["best_value"]:
        print("New best value found. Replacing saved configuration.")
        saved_configs["best_params"] = study_gam.best_params
        saved_configs["best_value"] = study_gam.best_value

        # Update the saved configuration file
        with open(config_file_path, "w") as config_file:
            json.dump(saved_configs, config_file)
else:
    # If no saved configuration exists, create a new one
    saved_configs = {
        "best_params": study_gam.best_params,
        "best_value": study_gam.best_value
    }

    with open(config_file_path, "w") as config_file:
        json.dump(saved_configs, config_file)

## EBM Optuna Optimization

In [7]:
# Define the objective function for Optuna hyperparameter tuning
def objective_ebm(trial: optuna.Trial):
    try:
        # Split the data into training and validation sets
        x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
            X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
        )

        # Initialize StandardScaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both the training and validation sets
        x_train_split = scaler.fit_transform(x_train_split)
        x_valid_split = scaler.transform(x_valid_split)

        # Suggest hyperparameters for EBM
        max_bins = trial.suggest_int('max_bins', 64, 1024)
        max_interaction_bins = trial.suggest_int('max_interaction_bins', 16, 64)
        interactions = trial.suggest_float('interactions', 0.0, 1.0)
        outer_bags = trial.suggest_int('outer_bags', 1, 20)
        inner_bags = trial.suggest_int('inner_bags', 0, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 100)
        max_leaves = trial.suggest_int('max_leaves', 1, 10)
        max_rounds = trial.suggest_int('max_rounds', 5000, 25000)
        early_stopping_rounds = trial.suggest_int('early_stopping_rounds', 10, 100)
        smoothing_rounds = trial.suggest_int('smoothing_rounds', 50, 500)
        greedy_ratio = trial.suggest_float('greedy_ratio', 0.0, 3.0)

        # Train the model with the sampled hyperparameters
        _, results_ebm = train_ebm_model(
            X_train=x_train_split,
            y_train=y_train_split,
            X_test=x_valid_split,
            max_bins=max_bins,
            max_interaction_bins=max_interaction_bins,
            interactions=interactions,
            outer_bags=outer_bags,
            inner_bags=inner_bags,
            learning_rate=learning_rate,
            min_samples_leaf=min_samples_leaf,
            max_leaves=max_leaves,
            max_rounds=max_rounds,
            early_stopping_rounds=early_stopping_rounds,
            smoothing_rounds=smoothing_rounds,
            greedy_ratio=greedy_ratio,
            random_state=42,  # Fixed random state for reproducibility
        )

        # Calculate the ROC-AUC score on validation data
        roc_auc = roc_auc_score(y_valid_split, results_ebm['y_pred_prob'])

        # Maximize ROC-AUC
        return roc_auc

    except Exception as e:
        # Print the error for debugging purposes (optional)
        print(f"Trial failed with error: {e}")
        
        # Return 0.0 ROC-AUC to indicate failure
        return 0.0

In [8]:
# Create an Optuna study with SQLite storage and TPE pruner
study_name = "Optuna_hyperparameter_optimization"
storage_name = "sqlite:///{}.db".format(study_name)

study_ebm = optuna.create_study(
    study_name="Case 6: EBM_SAPSII_Balanced_Data",
    direction="maximize",  # Maximize ROC-AUC
    storage=storage_name,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),  
)

# Perform a trial with default values first
default_params = {
    'max_bins': 1024,
    'max_interaction_bins': 32,
    'interactions': 0.9,
    'outer_bags': 14,
    'inner_bags': 0,
    'learning_rate': 0.01,
    'min_samples_leaf': 2,
    'max_leaves': 3,
    'max_rounds': 25000,
    'early_stopping_rounds': 50,
    'smoothing_rounds': 200,
    'greedy_ratio': 1.5
}
study_ebm.enqueue_trial(default_params)

# Define the number of trials (hyperparameter combinations) to search
num_trials = 100

# Perform hyperparameter optimization
study_ebm.optimize(objective_ebm, n_trials=num_trials, n_jobs=1)

[I 2024-10-22 03:52:36,600] A new study created in RDB with name: Case 6: EBM_SAPSII_Balanced_Data
[I 2024-10-22 03:52:41,308] Trial 0 finished with value: 0.8587581479260554 and parameters: {'max_bins': 1024, 'max_interaction_bins': 32, 'interactions': 0.9, 'outer_bags': 14, 'inner_bags': 0, 'learning_rate': 0.01, 'min_samples_leaf': 2, 'max_leaves': 3, 'max_rounds': 25000, 'early_stopping_rounds': 50, 'smoothing_rounds': 200, 'greedy_ratio': 1.5}. Best is trial 0 with value: 0.8587581479260554.
[I 2024-10-22 03:52:42,544] Trial 1 finished with value: 0.8584819252221797 and parameters: {'max_bins': 723, 'max_interaction_bins': 39, 'interactions': 0.9313296666266849, 'outer_bags': 7, 'inner_bags': 2, 'learning_rate': 0.031640978836777724, 'min_samples_leaf': 51, 'max_leaves': 4, 'max_rounds': 20094, 'early_stopping_rounds': 73, 'smoothing_rounds': 298, 'greedy_ratio': 0.29423862568504666}. Best is trial 0 with value: 0.8587581479260554.
[I 2024-10-22 03:52:45,227] Trial 2 finished with

In [9]:
# Print the best hyperparameters and best ROC-AUC score
print("Best hyperparameters:", study_ebm.best_params)
print("Best ROC-AUC score:", study_ebm.best_value)

Best hyperparameters: {'max_bins': 547, 'max_interaction_bins': 26, 'interactions': 0.9741496515249825, 'outer_bags': 1, 'inner_bags': 6, 'learning_rate': 0.015391514401568013, 'min_samples_leaf': 57, 'max_leaves': 6, 'max_rounds': 16190, 'early_stopping_rounds': 37, 'smoothing_rounds': 93, 'greedy_ratio': 0.8650897680807741}
Best ROC-AUC score: 0.8595679339056717


In [10]:
# Save the best configurations to a JSON file
config_file_path = Path("ebm_sapsii_balanced_data_best_configs.json")

# Load the saved configuration (if it exists)
if config_file_path.exists():
    with open(config_file_path, "r") as config_file:
        saved_configs = json.load(config_file)

    # Compare the new best value with the saved one
    if study_ebm.best_value > saved_configs["best_value"]:
        print("New best value found. Replacing saved configuration.")
        saved_configs["best_params"] = study_ebm.best_params
        saved_configs["best_value"] = study_ebm.best_value

        # Update the saved configuration file
        with open(config_file_path, "w") as config_file:
            json.dump(saved_configs, config_file)
else:
    # If no saved configuration exists, create a new one
    saved_configs = {
        "best_params": study_ebm.best_params,
        "best_value": study_ebm.best_value
    }

    with open(config_file_path, "w") as config_file:
        json.dump(saved_configs, config_file)

## Random Forest Optuna Optimization

In [11]:
def objective_rf(trial: optuna.Trial):
    try:
        # Split the data into training and validation sets (ensure stratification)
        x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
            X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
        )

        # Initialize StandardScaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both the training and validation sets
        x_train_split = scaler.fit_transform(x_train_split)
        x_valid_split = scaler.transform(x_valid_split)

        # Suggest hyperparameters for RandomForestClassifier, but ensure ranges are sensible
        n_estimators = trial.suggest_int('n_estimators', 100, 300)  # Limit range based on reasonable number of trees
        criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        max_depth = trial.suggest_int('max_depth', 10, 30)  # Limit max_depth to prevent overfitting
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
        min_weight_fraction_leaf = trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.2)  # Tighten the range
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # Avoid 'auto'

        # Handle None for max_leaf_nodes by using suggest_categorical
        max_leaf_nodes = trial.suggest_categorical('max_leaf_nodes', [None, 20, 50, 100])

        min_impurity_decrease = trial.suggest_float('min_impurity_decrease', 0.0, 0.1)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        # Handle oob_score: If bootstrap is False, oob_score must be False
        if bootstrap:
            oob_score = trial.suggest_categorical('oob_score', [True, False])
            max_samples = trial.suggest_categorical('max_samples', [None, 0.5, 0.75, 1.0])
        else:
            oob_score = False  # Set to False when bootstrap is False
            max_samples = None  # Force max_samples to be None if bootstrap is False

        class_weight = trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample'])
        ccp_alpha = trial.suggest_float('ccp_alpha', 0.0, 0.05)  # Tighten range

        # Train the model with the sampled hyperparameters
        _, results_rf = train_random_forest_model(
            X_train=x_train_split,
            y_train=y_train_split,
            X_test=x_valid_split,
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            bootstrap=bootstrap,
            oob_score=oob_score,
            class_weight=class_weight,
            ccp_alpha=ccp_alpha,
            max_samples=max_samples,
            random_state=42,  # Fix random state for reproducibility
            verbose=False
        )

        # Calculate the ROC-AUC score on validation data
        roc_auc = roc_auc_score(y_valid_split, results_rf['y_pred_prob'])

        # Maximize ROC-AUC
        return roc_auc

    except Exception as e:
        # Print the error for debugging purposes (optional)
        print(f"Trial failed with error: {e}")
        
        # Return 0.0 ROC-AUC to indicate failure
        return 0.0


In [12]:
# Create an Optuna study with SQLite storage and TPE pruner
study_name = "Optuna_hyperparameter_optimization"
storage_name = "sqlite:///{}.db".format(study_name)

study_rf = optuna.create_study(
    study_name="Case 7: Random_Forest_SAPSII_Balanced_Data",
    direction="maximize",  # Maximize ROC-AUC
    storage=storage_name,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),  
)

# Perform a trial with default values first
default_params = {
    'n_estimators': 100,
    'criterion': 'gini',
    'max_depth': 10,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'max_leaf_nodes': 25,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'oob_score': False,
    'class_weight': None,
    'ccp_alpha': 0.0,
    'max_samples': 0.5
}
study_rf.enqueue_trial(default_params)

# Define the number of trials (hyperparameter combinations) to search
num_trials = 100

# Perform hyperparameter optimization
study_rf.optimize(objective_rf, n_trials=num_trials, n_jobs=1)

[I 2024-10-22 03:55:23,107] A new study created in RDB with name: Case 7: Random_Forest_SAPSII_Balanced_Data
[I 2024-10-22 03:55:23,190] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 100, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.0.


Trial failed with error: '25' not in (None, 20, 50, 100).


[I 2024-10-22 03:55:24,820] Trial 1 finished with value: 0.7802326536147267 and parameters: {'n_estimators': 286, 'criterion': 'entropy', 'max_depth': 26, 'min_samples_split': 8, 'min_samples_leaf': 5, 'min_weight_fraction_leaf': 0.059236754922970626, 'max_features': 'sqrt', 'max_leaf_nodes': 20, 'min_impurity_decrease': 0.048917730712254026, 'bootstrap': True, 'oob_score': True, 'max_samples': 0.75, 'class_weight': 'balanced_subsample', 'ccp_alpha': 0.019778164610568746}. Best is trial 1 with value: 0.7802326536147267.
[I 2024-10-22 03:55:25,632] Trial 2 finished with value: 0.7705502682407117 and parameters: {'n_estimators': 275, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 2, 'min_weight_fraction_leaf': 0.14432244189922064, 'max_features': 'sqrt', 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.013331515657659089, 'bootstrap': False, 'class_weight': 'balanced', 'ccp_alpha': 0.030610789808492512}. Best is trial 1 with value: 0.7802326536147267.
[

In [13]:
# Print the best hyperparameters and best ROC-AUC score
print("Best hyperparameters:", study_rf.best_params)
print("Best ROC-AUC score:", study_rf.best_value)

Best hyperparameters: {'n_estimators': 250, 'criterion': 'gini', 'max_depth': 14, 'min_samples_split': 5, 'min_samples_leaf': 4, 'min_weight_fraction_leaf': 0.004925992524158425, 'max_features': 'sqrt', 'max_leaf_nodes': 20, 'min_impurity_decrease': 9.191618146343788e-05, 'bootstrap': True, 'oob_score': False, 'max_samples': 0.5, 'class_weight': 'balanced_subsample', 'ccp_alpha': 0.0002531071206515281}
Best ROC-AUC score: 0.8431261028881651


In [14]:
# Specify the path where you want to save the JSON file
config_file_path = Path("random_forest_sapsii_balanced_data_best_configs.json")

# Load the saved configuration (if it exists)
if config_file_path.exists():
    with open(config_file_path, "r") as config_file:
        saved_configs = json.load(config_file)

    # Compare the new best value with the saved one
    if study_rf.best_value > saved_configs["best_value"]:
        print("New best value found. Replacing saved configuration.")
        saved_configs["best_params"] = study_rf.best_params
        saved_configs["best_value"] = study_rf.best_value

        # Update the saved configuration file
        with open(config_file_path, "w") as config_file:
            json.dump(saved_configs, config_file)
else:
    # If no saved configuration exists, create a new one
    saved_configs = {
        "best_params": study_rf.best_params,
        "best_value": study_rf.best_value
    }

    with open(config_file_path, "w") as config_file:
        json.dump(saved_configs, config_file)


## XGBoost Optuna Optimization

In [15]:
# Define the objective function for Optuna hyperparameter tuning
def objective_xgboost(trial: optuna.Trial):
    try:
        # Split the data into training and validation sets
        x_train_split, x_valid_split, y_train_split, y_valid_split = train_test_split(
            X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
        )

        # Initialize StandardScaler
        scaler = StandardScaler()

        # Fit the scaler on the training data and transform both the training and validation sets
        x_train_split = scaler.fit_transform(x_train_split)
        x_valid_split = scaler.transform(x_valid_split)

        # Suggest hyperparameters for XGBoost
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        min_child_weight = trial.suggest_float('min_child_weight', 1, 10)
        subsample = trial.suggest_float('subsample', 0.5, 1.0)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
        reg_alpha = trial.suggest_float('reg_alpha', 0.0, 10.0)
        reg_lambda = trial.suggest_float('reg_lambda', 0.0, 10.0)
        scale_pos_weight = trial.suggest_float('scale_pos_weight', 1.0, 5.0)
        n_estimators = trial.suggest_int('n_estimators', 100, 500)
        gamma = trial.suggest_float('gamma', 0.0, 5.0)
        max_delta_step = trial.suggest_float('max_delta_step', 0.0, 10.0)

        # Train the model with the sampled hyperparameters
        _, results_xgb = train_xgboost_model(
            X_train=x_train_split,
            y_train=y_train_split,
            X_test=x_valid_split,
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_child_weight=min_child_weight,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            scale_pos_weight=scale_pos_weight,
            n_estimators=n_estimators,
            gamma=gamma,
            max_delta_step=max_delta_step,
            random_state=42,  # Fixed random state for reproducibility
            verbosity=0
        )

        # Calculate the ROC-AUC score on validation data
        roc_auc = roc_auc_score(y_valid_split, results_xgb['y_pred_prob'])

        # Maximize ROC-AUC
        return roc_auc

    except Exception as e:
        # Print the error for debugging purposes (optional)
        print(f"Trial failed with error: {e}")
        
        # Return 0.0 ROC-AUC to indicate failure
        return 0.0



In [16]:
# Create an Optuna study with SQLite storage and TPE pruner
study_name = "Optuna_hyperparameter_optimization"
storage_name = "sqlite:///{}.db".format(study_name)

study_xgb = optuna.create_study(
    study_name="Case 8: XGBoost_SAPSII_Balanced_Data",
    direction="maximize",  # Maximize ROC-AUC
    storage=storage_name,
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),  
)

# Perform a trial with default values first
default_params = {
    'learning_rate': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'reg_alpha': 0.0,
    'reg_lambda': 1.0,
    'scale_pos_weight': 1.0,
    'n_estimators': 100,
    'gamma': 0.0,
    'max_delta_step': 0.0
}
study_xgb.enqueue_trial(default_params)

# Define the number of trials (hyperparameter combinations) to search
num_trials = 100

# Perform hyperparameter optimization
study_xgb.optimize(objective_xgboost, n_trials=num_trials, n_jobs=1)

[I 2024-10-22 03:57:07,869] A new study created in RDB with name: Case 8: XGBoost_SAPSII_Balanced_Data
[I 2024-10-22 03:57:08,294] Trial 0 finished with value: 0.884053195543359 and parameters: {'learning_rate': 0.3, 'max_depth': 6, 'min_child_weight': 1.0, 'subsample': 1.0, 'colsample_bytree': 1.0, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 1.0, 'n_estimators': 100, 'gamma': 0.0, 'max_delta_step': 0.0}. Best is trial 0 with value: 0.884053195543359.
[I 2024-10-22 03:57:09,092] Trial 1 finished with value: 0.8755852030458594 and parameters: {'learning_rate': 0.023824452468402587, 'max_depth': 10, 'min_child_weight': 4.002126391492083, 'subsample': 0.936207796327098, 'colsample_bytree': 0.7561774377137691, 'reg_alpha': 5.401964835101298, 'reg_lambda': 6.944838823063764, 'scale_pos_weight': 2.5369785240531946, 'n_estimators': 214, 'gamma': 0.8636267993622987, 'max_delta_step': 7.443719181906802}. Best is trial 0 with value: 0.884053195543359.
[I 2024-10-22 03:57:09,547] Tri

In [17]:
# Print the best hyperparameters and best ROC-AUC score
print("Best hyperparameters:", study_xgb.best_params)
print("Best ROC-AUC score:", study_xgb.best_value)

Best hyperparameters: {'learning_rate': 0.26109738696965085, 'max_depth': 10, 'min_child_weight': 2.8618598048479336, 'subsample': 0.8658549356431035, 'colsample_bytree': 0.6765296008309722, 'reg_alpha': 3.268084681188794, 'reg_lambda': 5.5498416919261135, 'scale_pos_weight': 2.6674262770496138, 'n_estimators': 444, 'gamma': 0.0030755496701013584, 'max_delta_step': 7.604503809246733}
Best ROC-AUC score: 0.8949516188281177


In [18]:
# Save the best configurations to a JSON file
config_file_path = Path("xgboost_sapsii_balanced_data_best_configs.json")

# Load the saved configuration (if it exists)
if config_file_path.exists():
    with open(config_file_path, "r") as config_file:
        saved_configs = json.load(config_file)

    # Compare the new best value with the saved one
    if study_xgb.best_value > saved_configs["best_value"]:
        print("New best value found. Replacing saved configuration.")
        saved_configs["best_params"] = study_xgb.best_params
        saved_configs["best_value"] = study_xgb.best_value

        # Update the saved configuration file
        with open(config_file_path, "w") as config_file:
            json.dump(saved_configs, config_file)
else:
    # If no saved configuration exists, create a new one
    saved_configs = {
        "best_params": study_xgb.best_params,
        "best_value": study_xgb.best_value
    }

    with open(config_file_path, "w") as config_file:
        json.dump(saved_configs, config_file)

# End