In [2]:
#pip install optuna

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, confusion_matrix

# 1. Load Data
# We use the prepared datasets
X_train = pd.read_parquet('X_train_tree_ready.parquet', engine='fastparquet')
X_test = pd.read_parquet('X_test_tree_ready.parquet', engine='fastparquet')

y_train = pd.read_csv('y_train_tree_ready.csv').values.ravel()
y_test = pd.read_csv('y_test_tree_ready.csv').values.ravel()

print(f"Data Loaded. Train: {X_train.shape}")

Data Loaded. Train: (1258858, 105)


In [4]:
# 2. Create a Validation Set for Tuning
# We split the Training set to evaluate hyperparameter performance
# without touching the final Test set (avoiding data leakage).

X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(f"Optimization Train shape: {X_train_opt.shape}")
print(f"Optimization Val shape  : {X_val_opt.shape}")

Optimization Train shape: (1007086, 105)
Optimization Val shape  : (251772, 105)


In [5]:
# 3. Define the Objective Function
def objective(trial):
    # Define search space for Hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        
        # CRITICAL for Imbalanced Data:
        # Values > 1 force the model to focus more on Defaults
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0),
        
        'n_jobs': -1,
        'random_state': 42,
        'tree_method': 'hist' 
    }
    
    # Train model with suggested params
    model = XGBClassifier(**params)
    model.fit(X_train_opt, y_train_opt)
    
    # Evaluate
    y_prob = model.predict_proba(X_val_opt)[:, 1]
    
    # We optimize ROC-AUC 
    return roc_auc_score(y_val_opt, y_prob)

print("Objective function defined.")

Objective function defined.


In [6]:
# 4. Run Optimization
print("Starting Optuna Study... (This may take time)")

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30) # 30 trials is a good start

print("\n Optimization Complete.")
print(f"Best ROC-AUC: {study.best_value:.4f}")
print("Best Params:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

[I 2025-12-05 21:44:57,786] A new study created in memory with name: no-name-7863f5da-f969-48e6-a254-6ac8e7f7f36b


Starting Optuna Study... (This may take time)


[I 2025-12-05 21:46:10,624] Trial 0 finished with value: 0.8665980966947268 and parameters: {'n_estimators': 209, 'max_depth': 5, 'learning_rate': 0.06956629985753023, 'subsample': 0.7389874975774333, 'colsample_bytree': 0.6724380771616946, 'reg_alpha': 7.974145676139028, 'reg_lambda': 1.2639070982713396, 'min_child_weight': 7, 'scale_pos_weight': 6.6522637499373385}. Best is trial 0 with value: 0.8665980966947268.
[I 2025-12-05 21:47:28,496] Trial 1 finished with value: 0.8366453462396922 and parameters: {'n_estimators': 219, 'max_depth': 4, 'learning_rate': 0.011963832962751438, 'subsample': 0.6596125122187885, 'colsample_bytree': 0.7054373772954733, 'reg_alpha': 4.554494529594574, 'reg_lambda': 6.5360669756638625, 'min_child_weight': 9, 'scale_pos_weight': 3.983579389864797}. Best is trial 0 with value: 0.8665980966947268.
[I 2025-12-05 21:49:50,543] Trial 2 finished with value: 0.8644825363198132 and parameters: {'n_estimators': 365, 'max_depth': 8, 'learning_rate': 0.2147251737123


 Optimization Complete.
Best ROC-AUC: 0.8708
Best Params:
  n_estimators: 416
  max_depth: 7
  learning_rate: 0.0837556167812679
  subsample: 0.8651031003026716
  colsample_bytree: 0.7692061995983361
  reg_alpha: 0.2503693497199821
  reg_lambda: 0.8805999826209923
  min_child_weight: 7
  scale_pos_weight: 1.8696573221824924


In [12]:
best_params = {
    'n_estimators': 416,
    'max_depth': 7,
    'learning_rate': 0.0837556167812679,
    'subsample': 0.8651031003026716,
    'colsample_bytree': 0.7692061995983361,
    'reg_alpha': 0.2503693497199821,
    'reg_lambda': 0.8805999826209923,
    'min_child_weight': 7,
    'scale_pos_weight': 1.8696573221824924, 
    
    
    'objective': 'binary',
    'metric': 'auc',
    'n_jobs': -1,
    'random_state': 42,
    'verbose': -1
}

In [8]:
from lightgbm import LGBMClassifier

best_params['n_jobs'] = -1
best_params['random_state'] = 42

print("Retraining LightGBM with BEST parameters on full Train Set...")

final_model = LGBMClassifier(**best_params)
final_model.fit(X_train, y_train)

y_pred_tuned = final_model.predict(X_test)
y_prob_tuned = final_model.predict_proba(X_test)[:, 1]

print("Final Model Trained.")

Retraining LightGBM with BEST parameters on full Train Set...
Final Model Trained.


In [11]:
import pandas as pd
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score

# 5. Final Comparison Table: Before vs After Tuning

# Scores for Default LightGBM 
auc_default = 0.7055
rec_default = 0.4572
prec_default = 0.3840
f1_default = 0.4174
acc_default = 0.7236  

# Calculate Scores for Tuned Model
auc_tuned = roc_auc_score(y_test, y_prob_tuned)
rec_tuned = classification_report(y_test, y_pred_tuned, output_dict=True)['1']['recall']
prec_tuned = classification_report(y_test, y_pred_tuned, output_dict=True)['1']['precision']
f1_tuned = classification_report(y_test, y_pred_tuned, output_dict=True)['1']['f1-score']
acc_tuned = accuracy_score(y_test, y_pred_tuned) 

# Create Data Dictionary 
data = {
    "Metric": ["ROC-AUC", "Recall (Defaults)", "Precision", "F1-Score", "Accuracy"],
    "LightGBM (Default)": [auc_default, rec_default, prec_default, f1_default, acc_default],
    "LightGBM (Tuned)": [auc_tuned, rec_tuned, prec_tuned, f1_tuned, acc_tuned]
}

# Create DataFrame
df_compare = pd.DataFrame(data).set_index("Metric")

# Calculate Gain
df_compare["GAIN"] = df_compare["LightGBM (Tuned)"] - df_compare["LightGBM (Default)"]

print("\n=== OPTIMIZATION RESULTS ===")
display(df_compare.round(4))


=== OPTIMIZATION RESULTS ===


Unnamed: 0_level_0,LightGBM (Default),LightGBM (Tuned),GAIN
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ROC-AUC,0.7055,0.7196,0.0141
Recall (Defaults),0.4572,0.5841,0.1269
Precision,0.384,0.3683,-0.0157
F1-Score,0.4174,0.4518,0.0344
Accuracy,0.7236,0.693,-0.0306
