In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn.metrics import (roc_auc_score,
                            f1_score,
                            accuracy_score,
                            confusion_matrix,
                            classification_report)
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone
import optuna
SEED = 42

  if entities is not ():


In [2]:
## Load the raw Telco churn table from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

## Carve off a 1000-row hold-out set (`test`), stratified on Churn; the remaining
## rows (`active`) are used for model development. customerID is dropped first.
active, test = train_test_split(
    df.drop(columns=['customerID']),
    test_size=1000,
    stratify=df['Churn'],
    random_state=SEED,
)

## Separate the features from the target and label-encode the target.
X = active.drop(columns=['Churn'])
y = active['Churn']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

## Split the development rows 80/20, again stratified on Churn.
## NOTE: X_test / y_test are the validation part of `active`, not the
## 1000-row hold-out created above.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=active['Churn'],
    random_state=SEED,
)

# Modelling approach
- We'll follow a fully modular approach, with components added through sklearn `Pipeline` objects and the `ColumnTransformer` from `sklearn.compose`.
- We'll follow the standard approach of standardizing numerical columns and one-hot encoding (OHE) categorical columns.
- For changing data types or adding custom features, we'll use custom-made transformers.

## Feature engineering 
- Although the data is raw and already clean, we can engineer some useful features.
- We'll implement cross binning, i.e. combining two or more categorical features into one.

In [3]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """Feature-generation step applied to the raw frame before column-wise encoding.

    ``fit`` learns the mean of the numeric-coerced ``TotalCharges`` column.
    ``transform`` returns a copy of the input in which ``TotalCharges`` is
    numeric and imputed with that mean, plus five engineered columns:

    * ``contract_payment`` -- ``Contract`` and ``PaymentMethod`` joined by ``_``
    * ``internet_support`` -- ``InternetService`` and ``TechSupport`` joined by ``_``
    * ``family_status``    -- ``Partner`` and ``Dependents`` joined by ``_``
    * ``auto_payment``     -- 1 if ``PaymentMethod`` contains ``'automatic'``, else 0
    * ``tenure_group``     -- ``tenure`` binned into ``'New'`` (<= 12),
      ``'Established'`` (<= 48) and ``'Loyal'`` (> 48), stored as strings

    The input must provide the columns ``TotalCharges``, ``Contract``,
    ``PaymentMethod``, ``InternetService``, ``TechSupport``, ``Partner``,
    ``Dependents`` and ``tenure``.
    """

    def __init__(self):
        # Imputation value for TotalCharges; replaced by the training mean in fit().
        self.fill_value = 0
        # Names of the engineered columns, grouped by how they should be encoded downstream.
        self.cat_cols = ['contract_payment','internet_support','family_status','tenure_group']
        self.num_cols = ['auto_payment']

    def fit(self, X, y=None):
        """Learn the TotalCharges imputation value from ``X``; ``y`` is ignored.

        Returns ``self``.
        """
        charges = pd.to_numeric(X['TotalCharges'], errors='coerce')
        mean_charge = charges.mean()
        # If nothing parses as a number the mean is NaN, and filling with NaN
        # would leave the gaps in place -- fall back to the constructor default.
        self.fill_value = 0 if pd.isna(mean_charge) else mean_charge
        return self

    def _get_cat_cols(self):
        """Return the names of the engineered categorical columns."""
        return self.cat_cols

    def _get_num_cols(self):
        """Return the names of the engineered numeric columns."""
        return self.num_cols

    def transform(self, X):
        """Return a copy of ``X`` with cleaned TotalCharges and the engineered columns added."""
        X_copy = X.copy()

        # Unparseable entries become NaN and are then imputed.
        total_charges = pd.to_numeric(X_copy['TotalCharges'], errors='coerce')
        X_copy['TotalCharges'] = total_charges.fillna(self.fill_value)

        # Crossed categorical features.
        X_copy['contract_payment'] = X_copy['Contract'] + '_' + X_copy['PaymentMethod']
        X_copy['internet_support'] = X_copy['InternetService'] + '_' + X_copy['TechSupport']
        X_copy['family_status'] = X_copy['Partner'] + '_' + X_copy['Dependents']

        # Vectorised substring test; na=False maps a missing payment method to 0
        # instead of raising as a Python `in` check on NaN would.
        X_copy['auto_payment'] = (
            X_copy['PaymentMethod']
            .str.contains('automatic', regex=False, na=False)
            .astype(int)
        )

        # Open-ended top bin so very long tenures stay 'Loyal' rather than
        # falling outside the bins and turning into the string 'nan'.
        tenure_bins = pd.cut(
            X_copy['tenure'],
            bins=[-1, 12, 48, float('inf')],
            labels=['New', 'Established', 'Loyal'],
        )
        X_copy['tenure_group'] = tenure_bins.astype(str)

        return X_copy

## Creating Pipelines and Preprocessors

In [4]:
pre = Preprocessor()

# Numeric inputs: every numeric column of the raw training frame, plus
# TotalCharges (added by hand -- presumably because it is not read as a numeric
# column; the Preprocessor coerces it with pd.to_numeric) and the engineered
# numeric features.
numeric_cols = (
    [name for name in X_train.select_dtypes(include='number').columns.tolist() if name not in ['Churn']]
    + ['TotalCharges']
    + pre._get_num_cols()
)

# Categorical inputs: every non-numeric raw column except customerID and
# TotalCharges, plus the engineered categorical features.
categorical_cols = (
    [
        name
        for name in X_train.select_dtypes(exclude='number').columns.tolist()
        if name not in {'customerID', 'TotalCharges'}
    ]
    + pre._get_cat_cols()
)

# Column-wise transformers: standardise numerics, one-hot encode categoricals.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Columns not listed in either group are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

# Candidate classifiers for the baseline comparison, keyed by display name.
models = dict(
    xgboost=XGBClassifier(n_estimators=1000, subsample=0.8, n_jobs=-1, random_state=SEED),
    randomforest=RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=SEED),
    gradientboostingclassifier=GradientBoostingClassifier(n_estimators=500, subsample=0.8, random_state=SEED),
    logisticregression=LogisticRegression(max_iter=1000, n_jobs=-1),
)

## Baseline Training and Model Choice 

In [5]:
## Train each model and evaluate its baseline performance with heuristically
## chosen hyperparameters, using 5-fold stratified cross-validation (ROC AUC).
results = {}

# Cross-validation makes its own splits, so the train and validation parts are
# stacked back together (rows and labels are concatenated in the same order).
X_combine = pd.concat([X_train,X_test],axis=0)
y_combine = np.concatenate([y_train,y_test],axis=0)
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)

for model_name,model in models.items():
    print(f'Training {model_name}')
    model_pipeline = Pipeline(steps=[
        ('feature_generation',pre),
        ('processor',preprocessor),
        ('model',model)
    ])

    scores = cross_val_score(model_pipeline,X_combine,y_combine,cv=cv,scoring='roc_auc',n_jobs=-1)
    results[model_name] = {
        'scores':scores,
        'avg_score':np.mean(scores),
        'std':np.std(scores)
    }

print('\n')
print('Cross Validated Baseline Scores')
# The loop variable must not be called `results`: rebinding that name would
# replace the full results table with the last model's entry and make this
# cell fail when re-run.
for name,model_results in results.items():
    print(f'====={name}=====')
    for metric,res in model_results.items():
        if isinstance(res,np.ndarray):
            print(f'{metric}: {res}')
        else:
            print(f'{metric}: {res:.4f}')
    print('='*10)

Training xgboost
Training randomforest
Training gradientboostingclassifier
Training logisticregression


Cross Validated Baseline Scores
=====xgboost=====
scores: [0.80191231 0.81199833 0.76974755 0.78391751 0.78536458]
avg_score: 0.7906
std: 0.0148
=====randomforest=====
scores: [0.84313695 0.8367275  0.80091248 0.81168708 0.81114014]
avg_score: 0.8207
std: 0.0163
=====gradientboostingclassifier=====
scores: [0.84520326 0.85012349 0.80947244 0.82737542 0.825856  ]
avg_score: 0.8316
std: 0.0146
=====logisticregression=====
scores: [0.85125838 0.85971836 0.82642046 0.83501197 0.84564864]
avg_score: 0.8436
std: 0.0118


#### Logistic Regression seems to outperform the heavier tree-based models, so we'll pick it and move on to hyperparameter tuning.

# Hyperparam Tuning
----
- We'll use Optuna to experiment with various solvers and penalties.
- Optuna follows a Bayesian approach to tuning parameters.
- Along with that, we'll use 5-fold stratified cross-validation on the combined training and validation data.

In [6]:


# Template pipeline for tuning: feature generation, column-wise preprocessing,
# then a LogisticRegression with default settings.
logistic_pipeline = Pipeline([
    ('feature_generation', pre),
    ('processor', preprocessor),
    ('model', LogisticRegression()),
])


def objective(trial):
    """Optuna objective: mean 5-fold stratified CV ROC AUC of the logistic pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the hyperparameters: ``solver``, a solver-specific penalty
        (``penalty_liblinear`` / ``penalty_saga``), ``l1_ratio`` (elastic net
        only), ``C`` and ``tol``.

    Returns
    -------
    float
        Mean ROC AUC over the five folds of ``X_combine`` / ``y_combine``.
    """
    # --- 1. SOLVER & PENALTY SELECTION ---
    solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'saga'])

    # lbfgs is only tried with l2; the other solvers get their own penalty
    # parameter so each search space contains only the choices offered here.
    penalty = 'l2'
    l1_ratio = None

    if solver == 'liblinear':
        penalty = trial.suggest_categorical('penalty_liblinear', ['l1', 'l2'])
    elif solver == 'saga':
        penalty = trial.suggest_categorical('penalty_saga', ['l1', 'l2', 'elasticnet'])
        if penalty == 'elasticnet':
            l1_ratio = trial.suggest_float('l1_ratio', 0, 1)

    # --- 2. NUMERICAL TUNING ---
    c_value = trial.suggest_float('C', 1e-4, 1e2, log=True)

    # Not a very big contributor in performance, optional at best
    tol_value = trial.suggest_float('tol', 1e-6, 1e-3, log=True)

    # --- 3. MODEL SETUP ---
    # Clone so that no trial mutates the shared template pipeline.
    clf = clone(logistic_pipeline)
    clf.set_params(
        model__solver=solver,
        model__penalty=penalty,
        model__C=c_value,
        model__l1_ratio=l1_ratio,
        model__class_weight='balanced',
        model__tol=tol_value,
        model__max_iter=2500,     # hardcoded, illogical to tune
        # saga and liblinear shuffle the data; pin the seed so a given set of
        # hyperparameters always yields the same score.
        model__random_state=SEED,
    )

    # --- 4. RUN ---
    cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)

    scores = cross_val_score(clf,X_combine,y_combine,cv=cv,n_jobs=-1,scoring='roc_auc')
    return np.mean(scores)

# Seed the sampler so the sequence of suggested hyperparameters -- and with it
# the reported best trial -- is the same on every Restart & Run All.
# TPESampler is Optuna's default sampler, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best Validation AUC:", study.best_value)
print("Best Params:", study.best_params)

[32m[I 2025-12-21 10:30:33,042][0m A new study created in memory with name: no-name-c5407175-f14c-4b93-9284-64619b09fbd7[0m
  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

[32m[I 2025-12-21 10:30:34,095][0m Trial 0 finished with value: 0.8437506501617168 and parameters: {'solver': 'saga', 'penalty_saga': 'elasticnet', 'l1_ratio': 0.053684667448489876, 'C': 0.05705446632076725, 'tol': 6.715918498533566e-05}. Best is trial 0 with value: 0.8437506501617168.[0m
[32m[I 2025-12-21 10:30:34,284][0m Trial 1 finished with value: 0.8364034841235387 and parameters: {'solver': 'lbfgs', 'C': 0.0004741569741080096, 'tol': 9.477218619330979e-05}. Best is trial 0 with value: 0.8437506501617168.[0m
[32m[I 2025-12-21 10:30:36,615][0m Trial 2 finished with value: 0.8434282597545943 and parameters: {'solver': 'saga', 'penalty_saga': 'elasticnet', 'l1_ratio': 0.21245876064453995, 'C': 0.6731983306296864, 'tol': 0.00038745785580563546}. Best is trial 0 with value: 0.8437506501617168.[0m
[32m[I 2025-12-21 10:30:36,820][0m Trial 3 finished with value: 0.8434566829155766 and parameters: {'solver': 'lbfgs', 'C': 0.009597204109067569, 'tol': 3.3602898784215007e-06}. Bes



[32m[I 2025-12-21 10:31:12,157][0m Trial 7 finished with value: 0.8431411054804012 and parameters: {'solver': 'saga', 'penalty_saga': 'elasticnet', 'l1_ratio': 0.2507393768282834, 'C': 6.419106850331423, 'tol': 3.1239106714254627e-06}. Best is trial 0 with value: 0.8437506501617168.[0m
[32m[I 2025-12-21 10:31:12,400][0m Trial 8 finished with value: 0.8431446448994195 and parameters: {'solver': 'liblinear', 'penalty_liblinear': 'l2', 'C': 6.013504585129976, 'tol': 0.00045052301171830204}. Best is trial 0 with value: 0.8437506501617168.[0m
[32m[I 2025-12-21 10:31:13,851][0m Trial 9 finished with value: 0.8444439195088727 and parameters: {'solver': 'saga', 'penalty_saga': 'elasticnet', 'l1_ratio': 0.5398731920278216, 'C': 0.08553193458456176, 'tol': 7.078253966985442e-05}. Best is trial 9 with value: 0.8444439195088727.[0m




[32m[I 2025-12-21 10:31:47,309][0m Trial 10 finished with value: 0.8430771316997598 and parameters: {'solver': 'saga', 'penalty_saga': 'l1', 'C': 67.38910334074242, 'tol': 1.0319573658411675e-06}. Best is trial 9 with value: 0.8444439195088727.[0m
[32m[I 2025-12-21 10:31:48,410][0m Trial 11 finished with value: 0.8440545382025713 and parameters: {'solver': 'saga', 'penalty_saga': 'elasticnet', 'l1_ratio': 0.7930061066777586, 'C': 0.04117496481903859, 'tol': 8.721681687202613e-05}. Best is trial 9 with value: 0.8444439195088727.[0m
[32m[I 2025-12-21 10:31:49,835][0m Trial 12 finished with value: 0.8444403795484992 and parameters: {'solver': 'saga', 'penalty_saga': 'elasticnet', 'l1_ratio': 0.8176030471477298, 'C': 0.06291245744857747, 'tol': 0.00013494593033650218}. Best is trial 9 with value: 0.8444439195088727.[0m
[32m[I 2025-12-21 10:31:56,787][0m Trial 13 finished with value: 0.8438418223360715 and parameters: {'solver': 'saga', 'penalty_saga': 'elasticnet', 'l1_ratio': 0

In [7]:
# Report the best cross-validated AUC (three decimals) and the parameters that produced it.
print(
    f'Best Validation AUC: {study.best_value:.3f}',
    f'Best Parameters: {study.best_params}',
    sep='\n',
)

Best Validation AUC: 0.844
Best Parameters: {'solver': 'saga', 'penalty_saga': 'l1', 'C': 0.06337964953011156, 'tol': 2.308645905498455e-05}
