# Telco Churn — Optimized Pipeline (Fixed)

Dataset path used: `"C:\Users\Logesh\OneDrive\Desktop\AI_Hybrid_Training\Jupyter\ML_Coding_Practice\Task_For_Cultus_Job_Readiness\WA_Fn-UseC_-Telco-Customer-Churn.csv"`


## Requirements
`pandas`, `numpy`, `scikit-learn`, `matplotlib`, `lightgbm`, `xgboost`, `imblearn`, `shap`, `optuna` (optional), `joblib`, `zipfile`.


In [14]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Optional packages
try:
    import lightgbm as lgb
    LGB_INSTALLED = True
except Exception:
    LGB_INSTALLED = False

try:
    import xgboost as xgb
    XGB_INSTALLED = True
except Exception:
    XGB_INSTALLED = False

try:
    from imblearn.over_sampling import SMOTE
    IMB_INSTALLED = True
except Exception:
    IMB_INSTALLED = False

try:
    import shap
    SHAP_INSTALLED = True
except Exception:
    SHAP_INSTALLED = False

try:
    import optuna
    OPTUNA_INSTALLED = True
except Exception:
    OPTUNA_INSTALLED = False

print('LightGBM:', LGB_INSTALLED, 'XGBoost:', XGB_INSTALLED, 'imblearn(SMOTE):', IMB_INSTALLED, 'SHAP:', SHAP_INSTALLED, 'Optuna:', OPTUNA_INSTALLED)


LightGBM: True XGBoost: True imblearn(SMOTE): True SHAP: True Optuna: True


In [15]:
## Load dataset
path = r'C:\Users\Logesh\OneDrive\Desktop\AI_Hybrid_Training\Jupyter\ML_Coding_Practice\Task_For_Cultus_Job_Readiness\WA_Fn-UseC_-Telco-Customer-Churn.csv'
if not os.path.exists(path):
    raise FileNotFoundError(f"Dataset not found at {path}. Update path or upload the CSV to the environment.")

df = pd.read_csv(path)
print('Loaded dataset shape:', df.shape)
df.head().T


Loaded dataset shape: (7043, 21)


Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [16]:
df = df.copy()
if 'customerID' in df.columns:
    df.drop(columns=['customerID'], inplace=True)

if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

if 'Churn' in df.columns:
    df['Churn'] = df['Churn'].map({'Yes':1, 'No':0})
else:
    raise KeyError('Target `Churn` not found')

print('Missing values per column:')
print(df.isnull().sum())

# Feature/target split
X = df.drop(columns=['Churn'])
y = df['Churn'].copy()

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

print('Numerical cols:', num_cols)
print('Categorical cols:', cat_cols)

# Impute numeric
num_imputer = SimpleImputer(strategy='median')
X_num = pd.DataFrame(num_imputer.fit_transform(X[num_cols]), columns=num_cols)

# Categorical handling
X_cat = X[cat_cols].fillna('missing').copy()
for col in X_cat.columns:
    X_cat[col] = X_cat[col].astype(str).str.strip()
    X_cat[col] = X_cat[col].replace({'No internet service':'No', 'No phone service':'No'})

# Map Yes/No to 1/0 where applicable
binary_map = {'Yes':1,'No':0,'Male':1,'Female':0}
for c in X_cat.columns:
    uniques = set([str(u) for u in X_cat[c].unique()])
    if uniques.issubset({'Yes','No','yes','no','1','0'}):
        X_cat[c] = X_cat[c].map(lambda v: 1 if str(v).lower()=='yes' else 0)

# One-hot remaining
remaining = [c for c in X_cat.columns if X_cat[c].dtype==object]
X_cat_remain = pd.get_dummies(X_cat[remaining], drop_first=True)

# Combine
X_proc = pd.concat([X_num.reset_index(drop=True), X_cat[[c for c in X_cat.columns if c not in remaining]].reset_index(drop=True), X_cat_remain.reset_index(drop=True)], axis=1)

print('Processed features shape:', X_proc.shape)

# Scale numeric features
scaler = StandardScaler()
if num_cols:
    X_proc[num_cols] = scaler.fit_transform(X_proc[num_cols])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_proc, y, test_size=0.2, random_state=42, stratify=y)
print('Train:', X_train.shape, 'Test:', X_test.shape)


Missing values per column:
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64
Numerical cols: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical cols: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Processed features shape: (7043, 23)
Train: (5634, 23) Test: (1409, 23)


## Class balancing — SMOTE (if available) or class_weight fallback


In [17]:
use_smote = IMB_INSTALLED
if use_smote:
    sm = SMOTE(random_state=42)
    X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)
    print('Balanced train shape:', X_train_bal.shape, 'Class proportions:', y_train_bal.value_counts(normalize=True).to_dict())
else:
    X_train_bal, y_train_bal = X_train.copy(), y_train.copy()
    print('SMOTE not available — using original training data')


Balanced train shape: (8278, 23) Class proportions: {0: 0.5, 1: 0.5}


## Tuning and training


In [18]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import early_stopping, log_evaluation
import time

def tune_lightgbm(X, y):
    if OPTUNA_INSTALLED:
        import optuna
        def objective(trial):
            param = {
                'objective':'binary',
                'metric':'auc',
                'verbosity':-1,
                'boosting_type':'gbdt',
                'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.2),
                'num_leaves': trial.suggest_int('num_leaves', 20, 200),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
                'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
                'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 10)
            }
            dtrain = lgb.Dataset(X, label=y)
            cv = lgb.cv(param,dtrain, nfold=5,stratified=True,seed=42,metrics='auc',callbacks=[early_stopping(stopping_rounds=30), log_evaluation(-1)])
            # Detect correct AUC key automatically
            auc_key = None
            for k in cv.keys():
                if "auc" in k and "mean" in k:
                    auc_key = k
                    break
            if auc_key is None:
                raise KeyError(f"No AUC mean key found. Available keys: {list(cv.keys())}")
            return max(cv[auc_key])

        study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
        study.optimize(objective, n_trials=30)
        best = study.best_params
        best['n_estimators'] = 1000
        best['random_state'] = 42
        model = lgb.LGBMClassifier(**best)
        model.fit(X, y)
        return model, study
    else:
        param_dist = {
            'num_leaves':[31, 50, 70, 100, 150],
            'learning_rate':[0.01, 0.03, 0.05, 0.1],
            'n_estimators':[200,400,800,1000],
            'max_depth':[5,8,12,None]
        }
        clf = lgb.LGBMClassifier(random_state=42)
        rs = RandomizedSearchCV(clf, param_dist, n_iter=20, scoring='roc_auc', cv=3, random_state=42, n_jobs=-1)
        rs.fit(X, y)
        return rs.best_estimator_, rs

# Tuning
start = time.time()
if LGB_INSTALLED:
    print('Tuning LightGBM...')
    model_lgb, tuner = tune_lightgbm(X_train_bal, y_train_bal)
    best_candidates = {'lightgbm': model_lgb}
else:
    print('LightGBM not installed — tuning RandomForest')
    param_dist = {
        'n_estimators':[200,300,500],
        'max_depth':[6,8,12,20,None],
        'min_samples_split':[2,5,10]
    }
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    rs = RandomizedSearchCV(rf, param_dist, n_iter=20, scoring='roc_auc', cv=3, random_state=42, n_jobs=-1)
    rs.fit(X_train_bal, y_train_bal)
    best_candidates = {'random_forest': rs.best_estimator_}
end = time.time()
print('Tuning duration (s):', round(end-start,1))


[I 2025-11-18 07:13:44,567] A new study created in memory with name: no-name-838ac6df-df20-4b96-95c0-6bbd6e77f4a3


Tuning LightGBM...
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:45,874] Trial 0 finished with value: 0.8709238151560769 and parameters: {'learning_rate': 0.00727491708802781, 'num_leaves': 192, 'max_depth': 12, 'min_data_in_leaf': 64, 'feature_fraction': 0.5780093202212182, 'bagging_fraction': 0.5779972601681014, 'bagging_freq': 1}. Best is trial 0 with value: 0.8709238151560769.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.870924 + 0.00493854
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:47,853] Trial 1 finished with value: 0.9177133697724212 and parameters: {'learning_rate': 0.09842315738502598, 'num_leaves': 128, 'max_depth': 12, 'min_data_in_leaf': 11, 'feature_fraction': 0.9849549260809971, 'bagging_fraction': 0.9162213204002109, 'bagging_freq': 3}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[99]	valid's auc: 0.917713 + 0.00490213
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:49,059] Trial 2 finished with value: 0.8666764822156194 and parameters: {'learning_rate': 0.002620503255096255, 'num_leaves': 53, 'max_depth': 6, 'min_data_in_leaf': 57, 'feature_fraction': 0.7159725093210578, 'bagging_fraction': 0.645614570099021, 'bagging_freq': 7}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.866676 + 0.00502042
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:50,286] Trial 3 finished with value: 0.8673290447857402 and parameters: {'learning_rate': 0.002094013887393744, 'num_leaves': 72, 'max_depth': 7, 'min_data_in_leaf': 51, 'feature_fraction': 0.8925879806965068, 'bagging_fraction': 0.5998368910791798, 'bagging_freq': 6}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.867329 + 0.00468584
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:51,579] Trial 4 finished with value: 0.8821998272073952 and parameters: {'learning_rate': 0.023076168040117313, 'num_leaves': 28, 'max_depth': 10, 'min_data_in_leaf': 25, 'feature_fraction': 0.5325257964926398, 'bagging_fraction': 0.9744427686266666, 'bagging_freq': 10}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.8822 + 0.00485066
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:52,534] Trial 5 finished with value: 0.8814943936234221 and parameters: {'learning_rate': 0.07246804518258447, 'num_leaves': 75, 'max_depth': 4, 'min_data_in_leaf': 72, 'feature_fraction': 0.7200762468698007, 'bagging_fraction': 0.5610191174223894, 'bagging_freq': 5}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.881494 + 0.00522523
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:53,677] Trial 6 finished with value: 0.8659826065763457 and parameters: {'learning_rate': 0.0011998556988857205, 'num_leaves': 184, 'max_depth': 6, 'min_data_in_leaf': 70, 'feature_fraction': 0.6558555380447055, 'bagging_fraction': 0.7600340105889054, 'bagging_freq': 6}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.865983 + 0.00496884
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:54,949] Trial 7 finished with value: 0.8620646446353625 and parameters: {'learning_rate': 0.0026629048398290438, 'num_leaves': 195, 'max_depth': 13, 'min_data_in_leaf': 95, 'feature_fraction': 0.9474136752138245, 'bagging_fraction': 0.7989499894055425, 'bagging_freq': 10}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.862065 + 0.00579147
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:55,993] Trial 8 finished with value: 0.8494428712968967 and parameters: {'learning_rate': 0.0015981734133214873, 'num_leaves': 55, 'max_depth': 3, 'min_data_in_leaf': 39, 'feature_fraction': 0.6943386448447411, 'bagging_fraction': 0.6356745158869479, 'bagging_freq': 9}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.849443 + 0.0065163
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:57,520] Trial 9 finished with value: 0.8780879811894208 and parameters: {'learning_rate': 0.006620642015198968, 'num_leaves': 70, 'max_depth': 10, 'min_data_in_leaf': 22, 'feature_fraction': 0.9010984903770198, 'bagging_fraction': 0.5372753218398854, 'bagging_freq': 10}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.878088 + 0.0043435
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:13:59,889] Trial 10 finished with value: 0.9171649833843277 and parameters: {'learning_rate': 0.17879358592341904, 'num_leaves': 138, 'max_depth': 15, 'min_data_in_leaf': 11, 'feature_fraction': 0.8259332753890892, 'bagging_fraction': 0.9538323976412588, 'bagging_freq': 2}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.917165 + 0.00675714
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:01,981] Trial 11 finished with value: 0.9169259624564459 and parameters: {'learning_rate': 0.18027155652011362, 'num_leaves': 133, 'max_depth': 15, 'min_data_in_leaf': 11, 'feature_fraction': 0.8329012695800526, 'bagging_fraction': 0.983276824082136, 'bagging_freq': 2}. Best is trial 1 with value: 0.9177133697724212.


Did not meet early stopping. Best iteration is:
[95]	valid's auc: 0.916926 + 0.00487764
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:04,001] Trial 12 finished with value: 0.9178019280636315 and parameters: {'learning_rate': 0.19351320248296228, 'num_leaves': 133, 'max_depth': 15, 'min_data_in_leaf': 10, 'feature_fraction': 0.8166719134050416, 'bagging_fraction': 0.8811321081835723, 'bagging_freq': 3}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[98]	valid's auc: 0.917802 + 0.00478506
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:05,664] Trial 13 finished with value: 0.9084603954134598 and parameters: {'learning_rate': 0.05723191472011819, 'num_leaves': 113, 'max_depth': 13, 'min_data_in_leaf': 33, 'feature_fraction': 0.9901361973666086, 'bagging_fraction': 0.869561503063886, 'bagging_freq': 4}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.90846 + 0.00607585
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:07,160] Trial 14 finished with value: 0.9088647266448986 and parameters: {'learning_rate': 0.07180554850648081, 'num_leaves': 148, 'max_depth': 12, 'min_data_in_leaf': 39, 'feature_fraction': 0.8095839387020464, 'bagging_fraction': 0.8675516528360194, 'bagging_freq': 3}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.908865 + 0.00535136
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:09,525] Trial 15 finished with value: 0.9051323817930083 and parameters: {'learning_rate': 0.030437815990665255, 'num_leaves': 163, 'max_depth': 15, 'min_data_in_leaf': 21, 'feature_fraction': 0.9980084946806067, 'bagging_fraction': 0.8888558017534703, 'bagging_freq': 4}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.905132 + 0.00588443
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:10,845] Trial 16 finished with value: 0.9026218016993564 and parameters: {'learning_rate': 0.11252973377812939, 'num_leaves': 105, 'max_depth': 13, 'min_data_in_leaf': 88, 'feature_fraction': 0.7897830247330323, 'bagging_fraction': 0.8112468150077519, 'bagging_freq': 1}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.902622 + 0.00624249
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:12,681] Trial 17 finished with value: 0.9045604385812748 and parameters: {'learning_rate': 0.03215190750306503, 'num_leaves': 108, 'max_depth': 9, 'min_data_in_leaf': 10, 'feature_fraction': 0.8820547817901825, 'bagging_fraction': 0.9159175729102804, 'bagging_freq': 3}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.90456 + 0.00512692
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:14,526] Trial 18 finished with value: 0.8796433742554173 and parameters: {'learning_rate': 0.0130420401781776, 'num_leaves': 164, 'max_depth': 11, 'min_data_in_leaf': 46, 'feature_fraction': 0.9345465217221584, 'bagging_fraction': 0.6943845973931931, 'bagging_freq': 4}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.879643 + 0.00361744
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:16,272] Trial 19 finished with value: 0.9140099722534545 and parameters: {'learning_rate': 0.12051188402559913, 'num_leaves': 126, 'max_depth': 14, 'min_data_in_leaf': 29, 'feature_fraction': 0.6238624610835978, 'bagging_fraction': 0.8255044162944253, 'bagging_freq': 7}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.91401 + 0.00670037
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:17,732] Trial 20 finished with value: 0.9059698356744613 and parameters: {'learning_rate': 0.049645639523435245, 'num_leaves': 93, 'max_depth': 8, 'min_data_in_leaf': 18, 'feature_fraction': 0.7752847229001597, 'bagging_fraction': 0.7299931044372592, 'bagging_freq': 3}. Best is trial 12 with value: 0.9178019280636315.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.90597 + 0.00434594
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:19,964] Trial 21 finished with value: 0.9179326248528937 and parameters: {'learning_rate': 0.1909825984631866, 'num_leaves': 138, 'max_depth': 15, 'min_data_in_leaf': 11, 'feature_fraction': 0.842591111320221, 'bagging_fraction': 0.9330428032567744, 'bagging_freq': 2}. Best is trial 21 with value: 0.9179326248528937.


Did not meet early stopping. Best iteration is:
[99]	valid's auc: 0.917933 + 0.00549817
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:21,944] Trial 22 finished with value: 0.9172724681605265 and parameters: {'learning_rate': 0.11254087269405172, 'num_leaves': 152, 'max_depth': 14, 'min_data_in_leaf': 17, 'feature_fraction': 0.8612828065511712, 'bagging_fraction': 0.9341135021231856, 'bagging_freq': 2}. Best is trial 21 with value: 0.9179326248528937.


Did not meet early stopping. Best iteration is:
[97]	valid's auc: 0.917272 + 0.00623286
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:23,681] Trial 23 finished with value: 0.9130057071282882 and parameters: {'learning_rate': 0.19322522753860147, 'num_leaves': 123, 'max_depth': 14, 'min_data_in_leaf': 33, 'feature_fraction': 0.7525654089569482, 'bagging_fraction': 0.8972276294870267, 'bagging_freq': 1}. Best is trial 21 with value: 0.9179326248528937.


Did not meet early stopping. Best iteration is:
[96]	valid's auc: 0.913006 + 0.00747483
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:25,797] Trial 24 finished with value: 0.9182223645064106 and parameters: {'learning_rate': 0.10712548540957047, 'num_leaves': 171, 'max_depth': 12, 'min_data_in_leaf': 17, 'feature_fraction': 0.9370758204210594, 'bagging_fraction': 0.8420353271485408, 'bagging_freq': 3}. Best is trial 24 with value: 0.9182223645064106.


Did not meet early stopping. Best iteration is:
[97]	valid's auc: 0.918222 + 0.0045634
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:27,595] Trial 25 finished with value: 0.9076057138784623 and parameters: {'learning_rate': 0.047737796149355374, 'num_leaves': 170, 'max_depth': 11, 'min_data_in_leaf': 27, 'feature_fraction': 0.8471342164535861, 'bagging_fraction': 0.8529116820056706, 'bagging_freq': 5}. Best is trial 24 with value: 0.9182223645064106.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.907606 + 0.00503821
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:29,732] Trial 26 finished with value: 0.9165048596738419 and parameters: {'learning_rate': 0.14343633815870385, 'num_leaves': 176, 'max_depth': 15, 'min_data_in_leaf': 18, 'feature_fraction': 0.931323588421291, 'bagging_fraction': 0.7849527378148702, 'bagging_freq': 2}. Best is trial 24 with value: 0.9182223645064106.


Did not meet early stopping. Best iteration is:
[93]	valid's auc: 0.916505 + 0.00626489
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:31,370] Trial 27 finished with value: 0.908964656630185 and parameters: {'learning_rate': 0.07771113990463191, 'num_leaves': 149, 'max_depth': 14, 'min_data_in_leaf': 40, 'feature_fraction': 0.7720493935446369, 'bagging_fraction': 0.852847613796506, 'bagging_freq': 4}. Best is trial 24 with value: 0.9182223645064106.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.908965 + 0.00641498
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:33,130] Trial 28 finished with value: 0.8902208258293381 and parameters: {'learning_rate': 0.01839823624741898, 'num_leaves': 94, 'max_depth': 13, 'min_data_in_leaf': 32, 'feature_fraction': 0.915293158628819, 'bagging_fraction': 0.9422716447486571, 'bagging_freq': 2}. Best is trial 24 with value: 0.9182223645064106.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.890221 + 0.00454945
Training until validation scores don't improve for 30 rounds


[I 2025-11-18 07:14:35,772] Trial 29 finished with value: 0.8918022377194028 and parameters: {'learning_rate': 0.007892790762890834, 'num_leaves': 184, 'max_depth': 12, 'min_data_in_leaf': 19, 'feature_fraction': 0.8582169678146603, 'bagging_fraction': 0.830438458473901, 'bagging_freq': 1}. Best is trial 24 with value: 0.9182223645064106.


Did not meet early stopping. Best iteration is:
[100]	valid's auc: 0.891802 + 0.00433685
Tuning duration (s): 54.8


## Evaluation and threshold tuning


In [19]:
models_evaluated = {}
for name, mdl in best_candidates.items():
    probs = mdl.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, probs)
    thresholds = np.linspace(0.01, 0.99, 99)
    best_acc = 0
    best_f1 = 0
    best_t_acc = 0.5
    best_t_f1 = 0.5
    for t in thresholds:
        preds = (probs >= t).astype(int)
        acc = accuracy_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        if acc > best_acc:
            best_acc = acc
            best_t_acc = t
        if f1 > best_f1:
            best_f1 = f1
            best_t_f1 = t
    models_evaluated[name] = {
        'model': mdl,
        'auc': auc,
        'best_accuracy': best_acc,
        'best_acc_threshold': best_t_acc,
        'best_f1': best_f1,
        'best_f1_threshold': best_t_f1
    }

import pandas as _pd
_pd.DataFrame([{ 'model':k, **{kk:v for kk,v in d.items() if kk!='model'} } for k,d in models_evaluated.items()])


Unnamed: 0,model,auc,best_accuracy,best_acc_threshold,best_f1,best_f1_threshold
0,lightgbm,0.796483,0.784954,0.9,0.585899,0.01


## Save final model and artifacts


In [20]:
final_name = list(models_evaluated.keys())[0]
final_model = models_evaluated[final_name]['model']
final_metrics = models_evaluated[final_name]

joblib.dump(final_model, 'best_model_optimized.pkl')
joblib.dump(scaler, 'scaler_optimized.pkl')

# Feature importances if available
if hasattr(final_model, 'feature_importances_'):
    fi = pd.DataFrame({'feature': X_train.columns, 'importance': final_model.feature_importances_}).sort_values('importance', ascending=False)
    fi.to_csv('feature_importances_optimized.csv', index=False)

# SHAP
if SHAP_INSTALLED:
    explainer = shap.TreeExplainer(final_model)
    shap_values = explainer.shap_values(X_test)
    sv = shap_values[1] if isinstance(shap_values, list) else shap_values
    plt.figure(figsize=(10,6))
    shap.summary_plot(sv, X_test, show=False)
    plt.tight_layout()
    plt.savefig('shap_summary_optimized.png', dpi=150)
    plt.close()
    # local
    top_idx = np.argmax(final_model.predict_proba(X_test)[:,1])
    try:
        shap.force_plot(explainer.expected_value[1] if isinstance(explainer.expected_value, (list,tuple)) else explainer.expected_value,
                        sv[top_idx], X_test.iloc[top_idx], matplotlib=True, show=False)
        plt.savefig('shap_local_optimized.png', dpi=150)
        plt.close()
    except Exception as e:
        print('Could not save local force_plot as PNG:', e)

# Write summary file (fixed string construction)
summary_lines = [
    f"Final model: {final_name}",
    f"AUC: {final_metrics['auc']:.4f}",
    f"Best accuracy (threshold {final_metrics['best_acc_threshold']:.2f}): {final_metrics['best_accuracy']:.4f}",
    f"Best F1 (threshold {final_metrics['best_f1_threshold']:.2f}): {final_metrics['best_f1']:.4f}"
]
summary = "\n".join(summary_lines)
with open('model_summary_optimized.txt','w') as f:
    f.write(summary)

# Create ZIP with everything
import zipfile
zip_name = 'Telco_Churn_Project_Optimized.zip'
with zipfile.ZipFile(zip_name,'w') as zipf:
    files_to_add = ['best_model_optimized.pkl','scaler_optimized.pkl','model_summary_optimized.txt']
    if os.path.exists('feature_importances_optimized.csv'):
        files_to_add.append('feature_importances_optimized.csv')
    if os.path.exists('shap_summary_optimized.png'):
        files_to_add.append('shap_summary_optimized.png')
    if os.path.exists('shap_local_optimized.png'):
        files_to_add.append('shap_local_optimized.png')
    # add this notebook
    nb_name = 'SHAP_Telco_Optimized.ipynb'
    # To be robust, add by name if exists in cwd
    if os.path.exists(nb_name):
        files_to_add.append(nb_name)
    for fn in files_to_add:
        if os.path.exists(fn):
            zipf.write(fn)
print(f'Saved {zip_name} with available artifacts')


Saved Telco_Churn_Project_Optimized.zip with available artifacts
