# 3) Model Training & Evaluation

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go
import optuna
import joblib 
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset and perform preprocessing (from notebook 2)
df = pd.read_csv('./data/bank_marketing_preprocessed.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y,age_group
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,50-59
1,57,services,married,high.school,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,50-59
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,30-39
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,40-49
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,50-59


In [3]:
# Separate features and target
X = df.drop('y', axis=1)
y = df['y']
# Load the preprocessor pipeline
preprocessor = joblib.load('./models/preprocessor.pkl')
print("Preprocessor pipeline loaded successfully.")

# Split the raw data for a fair validation split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply the preprocessor pipeline to the split data
X_train_transformed = preprocessor.transform(X_train_raw)
X_test_transformed = preprocessor.transform(X_test_raw)


Preprocessor pipeline loaded successfully.


# 1. Hyperparameter Tuning with Optuna

In [4]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1
    }
    model = lgb.LGBMClassifier(**params)
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train_transformed, y_train, cv=cv, scoring='roc_auc')
    
    return np.mean(scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest hyperparameters found by Optuna:")
print(study.best_params)

[I 2025-08-14 13:02:24,870] A new study created in memory with name: no-name-5bc2a0f7-ecbf-4757-b64b-e3bf86b4895e
Best trial: 0. Best value: 0.75816:   2%|▏         | 1/50 [00:02<02:23,  2.92s/it]

[I 2025-08-14 13:02:27,792] Trial 0 finished with value: 0.7581598658986768 and parameters: {'n_estimators': 269, 'learning_rate': 0.2560767376596851, 'num_leaves': 79, 'max_depth': 14, 'subsample': 0.6830501029173188, 'colsample_bytree': 0.7792291375098078, 'min_child_samples': 100, 'reg_alpha': 0.8853997738987127, 'reg_lambda': 0.5866419241992956}. Best is trial 0 with value: 0.7581598658986768.


Best trial: 1. Best value: 0.762114:   4%|▍         | 2/50 [00:11<05:06,  6.39s/it]

[I 2025-08-14 13:02:36,604] Trial 1 finished with value: 0.7621137078740393 and parameters: {'n_estimators': 957, 'learning_rate': 0.10091423601487537, 'num_leaves': 70, 'max_depth': 14, 'subsample': 0.6328498870294438, 'colsample_bytree': 0.6336434185356036, 'min_child_samples': 97, 'reg_alpha': 0.7679747801430942, 'reg_lambda': 0.8424781596000473}. Best is trial 1 with value: 0.7621137078740393.


Best trial: 2. Best value: 0.777341:   6%|▌         | 3/50 [00:14<03:47,  4.84s/it]

[I 2025-08-14 13:02:39,604] Trial 2 finished with value: 0.7773410573719417 and parameters: {'n_estimators': 645, 'learning_rate': 0.10416170830749841, 'num_leaves': 31, 'max_depth': 14, 'subsample': 0.9396397545285105, 'colsample_bytree': 0.7715497903907783, 'min_child_samples': 21, 'reg_alpha': 0.7933946567741941, 'reg_lambda': 0.4165981977227977}. Best is trial 2 with value: 0.7773410573719417.


Best trial: 3. Best value: 0.78336:   8%|▊         | 4/50 [00:16<02:53,  3.77s/it] 

[I 2025-08-14 13:02:41,732] Trial 3 finished with value: 0.7833597952461597 and parameters: {'n_estimators': 313, 'learning_rate': 0.11014809532262076, 'num_leaves': 46, 'max_depth': 12, 'subsample': 0.7542050263204322, 'colsample_bytree': 0.6627464754069933, 'min_child_samples': 38, 'reg_alpha': 0.8886963879580861, 'reg_lambda': 0.9195668747427601}. Best is trial 3 with value: 0.7833597952461597.


Best trial: 3. Best value: 0.78336:  10%|█         | 5/50 [00:21<03:00,  4.00s/it]

[I 2025-08-14 13:02:46,147] Trial 4 finished with value: 0.753717513125409 and parameters: {'n_estimators': 401, 'learning_rate': 0.1892972680303908, 'num_leaves': 95, 'max_depth': 14, 'subsample': 0.9984370573178054, 'colsample_bytree': 0.6000217493074156, 'min_child_samples': 71, 'reg_alpha': 0.11083212626531902, 'reg_lambda': 0.24579615917709585}. Best is trial 3 with value: 0.7833597952461597.


Best trial: 3. Best value: 0.78336:  12%|█▏        | 6/50 [00:22<02:11,  3.00s/it]

[I 2025-08-14 13:02:47,193] Trial 5 finished with value: 0.7761657603483306 and parameters: {'n_estimators': 213, 'learning_rate': 0.2230765622872942, 'num_leaves': 35, 'max_depth': 7, 'subsample': 0.9378644914729241, 'colsample_bytree': 0.7333940296242453, 'min_child_samples': 65, 'reg_alpha': 0.06466903467824181, 'reg_lambda': 0.02336040027616204}. Best is trial 3 with value: 0.7833597952461597.


Best trial: 3. Best value: 0.78336:  14%|█▍        | 7/50 [00:28<02:50,  3.97s/it]

[I 2025-08-14 13:02:53,169] Trial 6 finished with value: 0.7436658038178556 and parameters: {'n_estimators': 679, 'learning_rate': 0.29553861797913783, 'num_leaves': 75, 'max_depth': 11, 'subsample': 0.8458915205585295, 'colsample_bytree': 0.9039503986511109, 'min_child_samples': 24, 'reg_alpha': 0.4864840593548274, 'reg_lambda': 0.8285485823977157}. Best is trial 3 with value: 0.7833597952461597.


Best trial: 3. Best value: 0.78336:  16%|█▌        | 8/50 [00:33<02:59,  4.28s/it]

[I 2025-08-14 13:02:58,101] Trial 7 finished with value: 0.7454597236418288 and parameters: {'n_estimators': 531, 'learning_rate': 0.2841901208180613, 'num_leaves': 75, 'max_depth': 11, 'subsample': 0.7299342370302481, 'colsample_bytree': 0.843225159806763, 'min_child_samples': 38, 'reg_alpha': 0.6517091015490347, 'reg_lambda': 0.5455597366637522}. Best is trial 3 with value: 0.7833597952461597.


Best trial: 8. Best value: 0.786274:  18%|█▊        | 9/50 [00:37<02:49,  4.13s/it]

[I 2025-08-14 13:03:01,897] Trial 8 finished with value: 0.786274278030162 and parameters: {'n_estimators': 732, 'learning_rate': 0.05932420716628583, 'num_leaves': 23, 'max_depth': 12, 'subsample': 0.6915796655848853, 'colsample_bytree': 0.8735896029051258, 'min_child_samples': 91, 'reg_alpha': 0.752351270025614, 'reg_lambda': 0.3554790901784769}. Best is trial 8 with value: 0.786274278030162.


Best trial: 8. Best value: 0.786274:  20%|██        | 10/50 [00:41<02:43,  4.09s/it]

[I 2025-08-14 13:03:05,897] Trial 9 finished with value: 0.7754540075555344 and parameters: {'n_estimators': 841, 'learning_rate': 0.07986667901011821, 'num_leaves': 37, 'max_depth': 8, 'subsample': 0.6103674458069269, 'colsample_bytree': 0.6586892299465275, 'min_child_samples': 79, 'reg_alpha': 0.4198152260471978, 'reg_lambda': 0.03712823890188699}. Best is trial 8 with value: 0.786274278030162.


Best trial: 10. Best value: 0.794634:  22%|██▏       | 11/50 [00:44<02:26,  3.76s/it]

[I 2025-08-14 13:03:08,916] Trial 10 finished with value: 0.7946341161241264 and parameters: {'n_estimators': 796, 'learning_rate': 0.031059635899339418, 'num_leaves': 22, 'max_depth': 5, 'subsample': 0.8323136438542311, 'colsample_bytree': 0.9783680445152194, 'min_child_samples': 86, 'reg_alpha': 0.28236091663373997, 'reg_lambda': 0.2962017051780916}. Best is trial 10 with value: 0.7946341161241264.


Best trial: 11. Best value: 0.798377:  24%|██▍       | 12/50 [00:46<02:10,  3.45s/it]

[I 2025-08-14 13:03:11,645] Trial 11 finished with value: 0.798377362643155 and parameters: {'n_estimators': 799, 'learning_rate': 0.012862344823567465, 'num_leaves': 21, 'max_depth': 5, 'subsample': 0.8269331190657346, 'colsample_bytree': 0.9774226282259786, 'min_child_samples': 84, 'reg_alpha': 0.29466810130422033, 'reg_lambda': 0.29437600945521236}. Best is trial 11 with value: 0.798377362643155.


Best trial: 11. Best value: 0.798377:  26%|██▌       | 13/50 [00:50<02:05,  3.40s/it]

[I 2025-08-14 13:03:14,943] Trial 12 finished with value: 0.7977399180572518 and parameters: {'n_estimators': 891, 'learning_rate': 0.013457407762564697, 'num_leaves': 48, 'max_depth': 5, 'subsample': 0.8367601818621775, 'colsample_bytree': 0.9930300801820109, 'min_child_samples': 82, 'reg_alpha': 0.30476567878320365, 'reg_lambda': 0.22563398948516242}. Best is trial 11 with value: 0.798377362643155.


Best trial: 11. Best value: 0.798377:  28%|██▊       | 14/50 [00:57<02:50,  4.73s/it]

[I 2025-08-14 13:03:22,729] Trial 13 finished with value: 0.7939532090343373 and parameters: {'n_estimators': 996, 'learning_rate': 0.025409241351044335, 'num_leaves': 55, 'max_depth': 5, 'subsample': 0.86998612792715, 'colsample_bytree': 0.9887672111134173, 'min_child_samples': 53, 'reg_alpha': 0.2542842453827534, 'reg_lambda': 0.19792848308092012}. Best is trial 11 with value: 0.798377362643155.


Best trial: 11. Best value: 0.798377:  30%|███       | 15/50 [01:02<02:41,  4.62s/it]

[I 2025-08-14 13:03:27,094] Trial 14 finished with value: 0.79161652260658 and parameters: {'n_estimators': 873, 'learning_rate': 0.022717958812960664, 'num_leaves': 52, 'max_depth': 7, 'subsample': 0.7823369993922727, 'colsample_bytree': 0.9260193522660574, 'min_child_samples': 78, 'reg_alpha': 0.31779117111595584, 'reg_lambda': 0.15496370545708574}. Best is trial 11 with value: 0.798377362643155.


Best trial: 11. Best value: 0.798377:  32%|███▏      | 16/50 [01:08<02:52,  5.08s/it]

[I 2025-08-14 13:03:33,243] Trial 15 finished with value: 0.7623400160272411 and parameters: {'n_estimators': 557, 'learning_rate': 0.15322101090062584, 'num_leaves': 43, 'max_depth': 8, 'subsample': 0.9073946104781385, 'colsample_bytree': 0.9382832585112102, 'min_child_samples': 57, 'reg_alpha': 0.1759018482209559, 'reg_lambda': 0.658169853473094}. Best is trial 11 with value: 0.798377362643155.


Best trial: 11. Best value: 0.798377:  34%|███▍      | 17/50 [01:15<03:05,  5.62s/it]

[I 2025-08-14 13:03:40,122] Trial 16 finished with value: 0.7659405041316156 and parameters: {'n_estimators': 886, 'learning_rate': 0.1533752513935231, 'num_leaves': 62, 'max_depth': 6, 'subsample': 0.7995087644252766, 'colsample_bytree': 0.8437439138344287, 'min_child_samples': 83, 'reg_alpha': 0.5775066224961514, 'reg_lambda': 0.4511509723561459}. Best is trial 11 with value: 0.798377362643155.


Best trial: 11. Best value: 0.798377:  36%|███▌      | 18/50 [01:19<02:45,  5.16s/it]

[I 2025-08-14 13:03:44,213] Trial 17 finished with value: 0.7808581403682038 and parameters: {'n_estimators': 764, 'learning_rate': 0.0580470936311545, 'num_leaves': 29, 'max_depth': 9, 'subsample': 0.8825037065293427, 'colsample_bytree': 0.99096957908219, 'min_child_samples': 71, 'reg_alpha': 0.4255620892946047, 'reg_lambda': 0.1248317945782326}. Best is trial 11 with value: 0.798377362643155.


Best trial: 18. Best value: 0.799056:  38%|███▊      | 19/50 [01:21<02:13,  4.30s/it]

[I 2025-08-14 13:03:46,518] Trial 18 finished with value: 0.7990562408740696 and parameters: {'n_estimators': 607, 'learning_rate': 0.015625070144960482, 'num_leaves': 92, 'max_depth': 5, 'subsample': 0.8246334895185483, 'colsample_bytree': 0.9485888883919794, 'min_child_samples': 50, 'reg_alpha': 0.35892876782015337, 'reg_lambda': 0.3393262058242598}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  40%|████      | 20/50 [01:22<01:36,  3.21s/it]

[I 2025-08-14 13:03:47,187] Trial 19 finished with value: 0.7984772021629889 and parameters: {'n_estimators': 101, 'learning_rate': 0.05687154238304269, 'num_leaves': 99, 'max_depth': 6, 'subsample': 0.7509222217413484, 'colsample_bytree': 0.9389128959676191, 'min_child_samples': 50, 'reg_alpha': 0.014163360362023036, 'reg_lambda': 0.6931423269109389}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  42%|████▏     | 21/50 [01:24<01:24,  2.91s/it]

[I 2025-08-14 13:03:49,408] Trial 20 finished with value: 0.788007416458436 and parameters: {'n_estimators': 435, 'learning_rate': 0.05660311379242482, 'num_leaves': 100, 'max_depth': 7, 'subsample': 0.7492166030138517, 'colsample_bytree': 0.8889345369038028, 'min_child_samples': 47, 'reg_alpha': 0.015391004573262184, 'reg_lambda': 0.7034907641558502}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  44%|████▍     | 22/50 [01:25<01:03,  2.28s/it]

[I 2025-08-14 13:03:50,206] Trial 21 finished with value: 0.7983824575666257 and parameters: {'n_estimators': 127, 'learning_rate': 0.046575041921884726, 'num_leaves': 88, 'max_depth': 6, 'subsample': 0.7910036724399508, 'colsample_bytree': 0.9392825939875558, 'min_child_samples': 46, 'reg_alpha': 0.1478094105320169, 'reg_lambda': 0.35735456848369174}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  46%|████▌     | 23/50 [01:26<00:50,  1.87s/it]

[I 2025-08-14 13:03:51,116] Trial 22 finished with value: 0.7959572489098381 and parameters: {'n_estimators': 142, 'learning_rate': 0.07373785426931025, 'num_leaves': 90, 'max_depth': 6, 'subsample': 0.7062964528112673, 'colsample_bytree': 0.9395370673110518, 'min_child_samples': 43, 'reg_alpha': 0.16604547844504683, 'reg_lambda': 0.49427805285130105}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  48%|████▊     | 24/50 [01:27<00:40,  1.55s/it]

[I 2025-08-14 13:03:51,926] Trial 23 finished with value: 0.7919531387146546 and parameters: {'n_estimators': 113, 'learning_rate': 0.13541541616621666, 'num_leaves': 87, 'max_depth': 6, 'subsample': 0.7771472195136403, 'colsample_bytree': 0.9453070490253073, 'min_child_samples': 32, 'reg_alpha': 0.019996150738239588, 'reg_lambda': 0.6719744810326862}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  50%|█████     | 25/50 [01:28<00:40,  1.60s/it]

[I 2025-08-14 13:03:53,650] Trial 24 finished with value: 0.7929092378896143 and parameters: {'n_estimators': 174, 'learning_rate': 0.04431380934335676, 'num_leaves': 86, 'max_depth': 9, 'subsample': 0.79796197641776, 'colsample_bytree': 0.8568303792493082, 'min_child_samples': 51, 'reg_alpha': 0.16575194335984061, 'reg_lambda': 0.4145158481188421}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  52%|█████▏    | 26/50 [01:31<00:45,  1.89s/it]

[I 2025-08-14 13:03:56,197] Trial 25 finished with value: 0.7819634540993075 and parameters: {'n_estimators': 359, 'learning_rate': 0.08964784710756446, 'num_leaves': 100, 'max_depth': 8, 'subsample': 0.7183534685901338, 'colsample_bytree': 0.8112052810324905, 'min_child_samples': 61, 'reg_alpha': 0.4042787300786001, 'reg_lambda': 0.3523780053936206}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  54%|█████▍    | 27/50 [01:33<00:44,  1.95s/it]

[I 2025-08-14 13:03:58,308] Trial 26 finished with value: 0.7767221290986746 and parameters: {'n_estimators': 503, 'learning_rate': 0.11704253383729446, 'num_leaves': 82, 'max_depth': 6, 'subsample': 0.6548178750303487, 'colsample_bytree': 0.9175919870043194, 'min_child_samples': 45, 'reg_alpha': 0.11881608546728822, 'reg_lambda': 0.6027684463359676}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  56%|█████▌    | 28/50 [01:34<00:39,  1.80s/it]

[I 2025-08-14 13:03:59,756] Trial 27 finished with value: 0.7951024722069756 and parameters: {'n_estimators': 221, 'learning_rate': 0.0427623117219514, 'num_leaves': 93, 'max_depth': 7, 'subsample': 0.7552234437810326, 'colsample_bytree': 0.9533740424944052, 'min_child_samples': 29, 'reg_alpha': 0.21407942943050023, 'reg_lambda': 0.7720386275133758}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  58%|█████▊    | 29/50 [01:35<00:30,  1.47s/it]

[I 2025-08-14 13:04:00,468] Trial 28 finished with value: 0.789904846673409 and parameters: {'n_estimators': 106, 'learning_rate': 0.1844219376220853, 'num_leaves': 66, 'max_depth': 6, 'subsample': 0.8644298712153295, 'colsample_bytree': 0.8957363296486509, 'min_child_samples': 38, 'reg_alpha': 0.09304915225784512, 'reg_lambda': 0.5050191447645311}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  60%|██████    | 30/50 [01:38<00:35,  1.76s/it]

[I 2025-08-14 13:04:02,882] Trial 29 finished with value: 0.785966202812904 and parameters: {'n_estimators': 265, 'learning_rate': 0.07876907548035564, 'num_leaves': 82, 'max_depth': 9, 'subsample': 0.671125095550258, 'colsample_bytree': 0.7367800995167463, 'min_child_samples': 55, 'reg_alpha': 0.36776074566893785, 'reg_lambda': 0.5944434060893281}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  62%|██████▏   | 31/50 [01:40<00:35,  1.88s/it]

[I 2025-08-14 13:04:05,034] Trial 30 finished with value: 0.7957106569360837 and parameters: {'n_estimators': 610, 'learning_rate': 0.04122262130423873, 'num_leaves': 95, 'max_depth': 5, 'subsample': 0.8154925762775763, 'colsample_bytree': 0.8122552945580436, 'min_child_samples': 63, 'reg_alpha': 0.5182338549373522, 'reg_lambda': 0.39542197871224555}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  64%|██████▍   | 32/50 [01:42<00:37,  2.06s/it]

[I 2025-08-14 13:04:07,513] Trial 31 finished with value: 0.7969800208394704 and parameters: {'n_estimators': 709, 'learning_rate': 0.023863007976318906, 'num_leaves': 77, 'max_depth': 5, 'subsample': 0.8189295500027078, 'colsample_bytree': 0.9571844794233147, 'min_child_samples': 48, 'reg_alpha': 0.21784736173571595, 'reg_lambda': 0.27740478739267127}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  66%|██████▌   | 33/50 [01:45<00:37,  2.23s/it]

[I 2025-08-14 13:04:10,152] Trial 32 finished with value: 0.7981742791461379 and parameters: {'n_estimators': 596, 'learning_rate': 0.012920288143591148, 'num_leaves': 69, 'max_depth': 6, 'subsample': 0.7743916340656111, 'colsample_bytree': 0.9796804094500303, 'min_child_samples': 71, 'reg_alpha': 0.3338086164173135, 'reg_lambda': 0.3302209262637042}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  68%|██████▊   | 34/50 [01:46<00:33,  2.07s/it]

[I 2025-08-14 13:04:11,859] Trial 33 finished with value: 0.7919198996836442 and parameters: {'n_estimators': 492, 'learning_rate': 0.06616910451866323, 'num_leaves': 88, 'max_depth': 5, 'subsample': 0.8519967066373266, 'colsample_bytree': 0.959877932728493, 'min_child_samples': 41, 'reg_alpha': 0.2451131576082029, 'reg_lambda': 0.9919346372255972}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 18. Best value: 0.799056:  70%|███████   | 35/50 [01:50<00:36,  2.46s/it]

[I 2025-08-14 13:04:15,225] Trial 34 finished with value: 0.78695318832789 and parameters: {'n_estimators': 301, 'learning_rate': 0.04064515610981585, 'num_leaves': 96, 'max_depth': 15, 'subsample': 0.896410382314923, 'colsample_bytree': 0.9169396419368112, 'min_child_samples': 98, 'reg_alpha': 0.051796599872870885, 'reg_lambda': 0.10132075220695808}. Best is trial 18 with value: 0.7990562408740696.


Best trial: 35. Best value: 0.799082:  72%|███████▏  | 36/50 [01:53<00:36,  2.59s/it]

[I 2025-08-14 13:04:18,115] Trial 35 finished with value: 0.7990818631807087 and parameters: {'n_estimators': 447, 'learning_rate': 0.01093697962475703, 'num_leaves': 82, 'max_depth': 7, 'subsample': 0.8120523878750876, 'colsample_bytree': 0.878047974397901, 'min_child_samples': 58, 'reg_alpha': 0.0007410392379483786, 'reg_lambda': 0.4512054135333452}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  74%|███████▍  | 37/50 [01:55<00:33,  2.54s/it]

[I 2025-08-14 13:04:20,543] Trial 36 finished with value: 0.7814594675038401 and parameters: {'n_estimators': 436, 'learning_rate': 0.09581466828486354, 'num_leaves': 81, 'max_depth': 7, 'subsample': 0.7385902343899571, 'colsample_bytree': 0.8632334978054008, 'min_child_samples': 57, 'reg_alpha': 0.981579145984464, 'reg_lambda': 0.4715161518453461}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  76%|███████▌  | 38/50 [01:57<00:26,  2.20s/it]

[I 2025-08-14 13:04:21,943] Trial 37 finished with value: 0.7827545228300109 and parameters: {'n_estimators': 210, 'learning_rate': 0.12014744147974189, 'num_leaves': 91, 'max_depth': 8, 'subsample': 0.7591484290284615, 'colsample_bytree': 0.9001679095345287, 'min_child_samples': 50, 'reg_alpha': 0.13541256910631325, 'reg_lambda': 0.5506758114785594}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  78%|███████▊  | 39/50 [01:58<00:23,  2.12s/it]

[I 2025-08-14 13:04:23,870] Trial 38 finished with value: 0.7931109171183558 and parameters: {'n_estimators': 363, 'learning_rate': 0.047244893687030776, 'num_leaves': 85, 'max_depth': 7, 'subsample': 0.9346247473128401, 'colsample_bytree': 0.7666991838050932, 'min_child_samples': 68, 'reg_alpha': 0.07155794648051625, 'reg_lambda': 0.7518565607366416}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  80%|████████  | 40/50 [02:00<00:18,  1.89s/it]

[I 2025-08-14 13:04:25,227] Trial 39 finished with value: 0.7760596156585395 and parameters: {'n_estimators': 239, 'learning_rate': 0.22398905934228197, 'num_leaves': 71, 'max_depth': 6, 'subsample': 0.8019908975871468, 'colsample_bytree': 0.8791650421975719, 'min_child_samples': 59, 'reg_alpha': 0.007615875949425024, 'reg_lambda': 0.409610772429809}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  82%|████████▏ | 41/50 [02:02<00:17,  1.98s/it]

[I 2025-08-14 13:04:27,422] Trial 40 finished with value: 0.794269689606742 and parameters: {'n_estimators': 174, 'learning_rate': 0.03170145343537758, 'num_leaves': 96, 'max_depth': 10, 'subsample': 0.7059286107061697, 'colsample_bytree': 0.9230078860203547, 'min_child_samples': 34, 'reg_alpha': 0.06667914337520488, 'reg_lambda': 0.5325502568296991}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  84%|████████▍ | 42/50 [02:05<00:17,  2.22s/it]

[I 2025-08-14 13:04:30,192] Trial 41 finished with value: 0.7984069190404974 and parameters: {'n_estimators': 659, 'learning_rate': 0.010488172643381755, 'num_leaves': 99, 'max_depth': 5, 'subsample': 0.8234599810164661, 'colsample_bytree': 0.9667040492085175, 'min_child_samples': 53, 'reg_alpha': 0.13387699227615965, 'reg_lambda': 0.3013468506017978}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  86%|████████▌ | 43/50 [02:09<00:18,  2.69s/it]

[I 2025-08-14 13:04:33,990] Trial 42 finished with value: 0.7976582852576104 and parameters: {'n_estimators': 678, 'learning_rate': 0.01100557742989864, 'num_leaves': 100, 'max_depth': 6, 'subsample': 0.7883569715012102, 'colsample_bytree': 0.961395273975812, 'min_child_samples': 52, 'reg_alpha': 0.11832956370674198, 'reg_lambda': 0.362098154448774}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  88%|████████▊ | 44/50 [02:11<00:15,  2.59s/it]

[I 2025-08-14 13:04:36,335] Trial 43 finished with value: 0.7968730041251633 and parameters: {'n_estimators': 601, 'learning_rate': 0.03199803757123981, 'num_leaves': 91, 'max_depth': 5, 'subsample': 0.9884977253148527, 'colsample_bytree': 0.9049435895707386, 'min_child_samples': 66, 'reg_alpha': 0.18005930507191173, 'reg_lambda': 0.24010428943928872}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  90%|█████████ | 45/50 [02:17<00:18,  3.75s/it]

[I 2025-08-14 13:04:42,810] Trial 44 finished with value: 0.7671677786807696 and parameters: {'n_estimators': 628, 'learning_rate': 0.05621740262410313, 'num_leaves': 96, 'max_depth': 13, 'subsample': 0.8485100262425053, 'colsample_bytree': 0.9707875693992161, 'min_child_samples': 40, 'reg_alpha': 0.008060582524816314, 'reg_lambda': 0.4511160997169216}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  92%|█████████▏| 46/50 [02:21<00:14,  3.61s/it]

[I 2025-08-14 13:04:46,075] Trial 45 finished with value: 0.7812908409705965 and parameters: {'n_estimators': 566, 'learning_rate': 0.06833946216244667, 'num_leaves': 84, 'max_depth': 7, 'subsample': 0.8145948355524719, 'colsample_bytree': 0.9416187236910389, 'min_child_samples': 45, 'reg_alpha': 0.051920287615246946, 'reg_lambda': 0.874845533356629}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  94%|█████████▍| 47/50 [02:24<00:10,  3.57s/it]

[I 2025-08-14 13:04:49,573] Trial 46 finished with value: 0.7972754175726681 and parameters: {'n_estimators': 678, 'learning_rate': 0.024262549273810052, 'num_leaves': 79, 'max_depth': 5, 'subsample': 0.7682924891757454, 'colsample_bytree': 0.8294315425823675, 'min_child_samples': 54, 'reg_alpha': 0.7041869927138759, 'reg_lambda': 0.3132555615107846}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  96%|█████████▌| 48/50 [02:26<00:06,  3.09s/it]

[I 2025-08-14 13:04:51,529] Trial 47 finished with value: 0.7983995011061319 and parameters: {'n_estimators': 317, 'learning_rate': 0.010170725963453538, 'num_leaves': 93, 'max_depth': 6, 'subsample': 0.8345035624582602, 'colsample_bytree': 0.929425584845916, 'min_child_samples': 61, 'reg_alpha': 0.1290662843949476, 'reg_lambda': 0.20584188876931395}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082:  98%|█████████▊| 49/50 [02:30<00:03,  3.26s/it]

[I 2025-08-14 13:04:55,193] Trial 48 finished with value: 0.7961765898433049 and parameters: {'n_estimators': 448, 'learning_rate': 0.012353819834427373, 'num_leaves': 94, 'max_depth': 8, 'subsample': 0.8367191430943886, 'colsample_bytree': 0.997748439829562, 'min_child_samples': 63, 'reg_alpha': 0.09958619377208774, 'reg_lambda': 0.179041219825358}. Best is trial 35 with value: 0.7990818631807087.


Best trial: 35. Best value: 0.799082: 100%|██████████| 50/50 [02:31<00:00,  3.04s/it]

[I 2025-08-14 13:04:56,709] Trial 49 finished with value: 0.776540137308745 and parameters: {'n_estimators': 323, 'learning_rate': 0.2605887189041688, 'num_leaves': 74, 'max_depth': 5, 'subsample': 0.8771106620608956, 'colsample_bytree': 0.8829379140925231, 'min_child_samples': 75, 'reg_alpha': 0.4781240035875893, 'reg_lambda': 0.2714909669389315}. Best is trial 35 with value: 0.7990818631807087.

Best hyperparameters found by Optuna:
{'n_estimators': 447, 'learning_rate': 0.01093697962475703, 'num_leaves': 82, 'max_depth': 7, 'subsample': 0.8120523878750876, 'colsample_bytree': 0.878047974397901, 'min_child_samples': 58, 'reg_alpha': 0.0007410392379483786, 'reg_lambda': 0.4512054135333452}





# 2. Train and Evaluate the Final Model

In [12]:
best_params = study.best_params
best_model = lgb.LGBMClassifier(**best_params, objective='binary', random_state=42, n_jobs=-1, verbose=-1)
best_model.fit(X_train_transformed, y_train)

In [13]:
# Make predictions
y_pred = best_model.predict(X_test_transformed)
y_pred_proba = best_model.predict_proba(X_test_transformed)[:, 1]

# 3. Comprehensive Evaluation

In [14]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.98      0.95      7310
           1       0.68      0.27      0.39       928

    accuracy                           0.90      8238
   macro avg       0.80      0.63      0.67      8238
weighted avg       0.89      0.90      0.88      8238



In [15]:
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
fig = px.imshow(cm, text_auto=True, color_continuous_scale='Blues',
                labels={'x':'Predicted', 'y':'Actual'},
                x=['No', 'Yes'], y=['No', 'Yes'],
                title='Confusion Matrix')
fig.show()


Confusion Matrix:


In [16]:
# ROC Curve and AUC Score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (area = {roc_auc:.2f})'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Classifier', line=dict(dash='dash')))
fig.update_layout(title='Receiver Operating Characteristic (ROC) Curve',
                  xaxis_title='False Positive Rate', yaxis_title='True Positive Rate',
                  xaxis_range=[0, 1], yaxis_range=[0, 1])
fig.show()

print(f"\nROC AUC Score: {roc_auc:.4f}")


ROC AUC Score: 0.8142


In [17]:
# Save the trained model
joblib.dump(best_model, './models/best_model.pkl')
print("\nTrained model saved as 'best_model.pkl'")


Trained model saved as 'best_model.pkl'


# 4. Feature Importance Analysis

In [19]:
print("\nFeature Importance Analysis:")
feature_importances = best_model.feature_importances_

# Get the feature names from the preprocessor
feature_names = preprocessor.get_feature_names_out()
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False).reset_index(drop=True)

fig_feat_imp = px.bar(importance_df.head(20), x='importance', y='feature',
                       orientation='h',
                       title='Top 20 Feature Importances',
                       labels={'importance': 'Importance', 'feature': 'Feature Name'})
fig_feat_imp.update_layout(yaxis={'categoryorder':'total ascending'})
fig_feat_imp.show()


Feature Importance Analysis:
