# 1. Modelling Overview

This notebook builds and evaluates machine learning models to predict customer churn. We use multiple algorithms to compare performance and identify the best model for deployment.

**Models included:**
- Logistic Regression
- Random Forest
- Gradient Boosting (scikit-learn's `GradientBoostingClassifier`, stored in the `xgb` variable below)

**Evaluation metrics:**
- Accuracy
- Precision
- Recall
- F1-Score
- ROC-AUC

# 2. Load Libraries & Dataset

We begin by importing the necessary libraries and loading the dataset that was prepared during feature engineering.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score,roc_auc_score

import matplotlib.pyplot

In [25]:
## Load the engineered dataset
# CSV prepared during feature engineering; the path is relative to this notebook.
engineered_csv = '../data/cleaned/engineered_telco_customer_churn.csv'
df = pd.read_csv(engineered_csv)

# Preview the first rows.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,TenureBucket_Mid-Term,TenureBucket_Long-Term,NumServices,MonthlyCharges_Tenure,MonthlyCharges_per_Service
0,0,0,1,0,1,0,1,29.85,29.85,0,...,False,False,False,True,False,False,False,1,29.85,29.849702
1,1,0,0,0,34,1,0,56.95,1889.5,0,...,True,False,False,False,True,False,True,3,1936.3,18.98327
2,1,0,0,0,2,1,1,53.85,108.15,1,...,False,False,False,False,True,False,False,3,107.7,17.94994
3,1,0,0,0,45,0,0,42.3,1840.75,0,...,True,False,False,False,False,False,True,3,1903.5,14.099953
4,0,0,0,0,2,1,1,70.7,151.65,1,...,False,False,False,True,False,False,False,1,141.4,70.699293


In [26]:
# 'Churn' is the target; every other column of df is used as a predictor.
y = df['Churn']
X = df.drop(columns='Churn')

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [7]:
# Show the column labels of df.
# NOTE(review): the saved output of this cell lists 'customerID' and a single
# 'TenureBucket' column, which does not match the df.head() preview above
# (no customerID; TenureBucket_Mid-Term / TenureBucket_Long-Term dummies).
# The output looks stale -- re-run this cell after loading the data.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges',
       'TotalCharges', 'Churn', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_Yes', 'OnlineBackup_Yes',
       'DeviceProtection_Yes', 'TechSupport_Yes', 'StreamingTV_Yes',
       'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'TenureBucket', 'NumServices', 'MonthlyCharges_Tenure',
       'MonthlyCharges_per_Service'],
      dtype='object')

In [27]:
# Numeric columns to standardise; all other columns are left as they are.
numeric_cols = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
    'AvgMonthlyCharge',
    'NumServices',
    'MonthlyCharges_per_Service',
    'MonthlyCharges_Tenure',
]

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the fit.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [28]:
# Helper: fit a classifier and score it on the hold-out split
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is fitted in place as a side effect of this call.
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1-Score"``
        and ``"ROC-AUC"``. The first four are computed from the output of
        ``model.predict``; ROC-AUC is computed from column index 1 of
        ``model.predict_proba``.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # ROC-AUC needs a continuous score rather than hard labels.
    y_score = model.predict_proba(X_test)[:, 1]

    # Metrics that are computed from the hard predictions.
    label_metrics = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1-Score": f1_score,
    }
    scores = {name: metric(y_test, y_pred) for name, metric in label_metrics.items()}
    scores["ROC-AUC"] = roc_auc_score(y_test, y_score)
    return scores

## 6. Train models

In [29]:
# Per-column count of missing values in the training features
# (sanity check before fitting any model).
X_train.isnull().sum()

gender                                   0
SeniorCitizen                            0
Partner                                  0
Dependents                               0
tenure                                   0
PhoneService                             0
PaperlessBilling                         0
MonthlyCharges                           0
TotalCharges                             0
AvgMonthlyCharge                         0
MultipleLines_No phone service           0
MultipleLines_Yes                        0
InternetService_Fiber optic              0
InternetService_No                       0
OnlineSecurity_Yes                       0
OnlineBackup_Yes                         0
DeviceProtection_Yes                     0
TechSupport_Yes                          0
StreamingTV_Yes                          0
StreamingMovies_Yes                      0
Contract_One year                        0
Contract_Two year                        0
PaymentMethod_Credit card (automatic)    0
PaymentMeth

In [30]:
## Logistic Regression
# max_iter is set to 2000 (well above scikit-learn's default) and the seed is fixed.
log_reg = LogisticRegression(max_iter=2000, random_state=42)

# Fit on the training split and score on the test split.
results_log = evaluate_model(log_reg, X_train, X_test, y_train, y_test)
results_log

{'Accuracy': 0.8019872249822569,
 'Precision': 0.6537216828478964,
 'Recall': 0.5401069518716578,
 'F1-Score': 0.5915080527086384,
 'ROC-AUC': 0.8474799142318323}

In [31]:
## Random Forest Classifier
# 300 trees, fixed seed.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
)

# Fit on the training split and score on the test split.
results_rf = evaluate_model(rf, X_train, X_test, y_train, y_test)
results_rf

{'Accuracy': 0.7849538679914834,
 'Precision': 0.6228373702422145,
 'Recall': 0.48128342245989303,
 'F1-Score': 0.5429864253393665,
 'ROC-AUC': 0.827660234054096}

In [None]:
## Gradient Boosting Classifier
# NOTE: this is scikit-learn's GradientBoostingClassifier, not the xgboost
# library. The `xgb` / `results_xgb` names are kept unchanged because
# `results_xgb` is read by the results-table cell.
xgb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,  # each tree is fit on a random 80% sample of the training rows
    random_state=42,
)

# Fit on the training split and score on the test split.
results_xgb = evaluate_model(xgb, X_train, X_test, y_train, y_test)
results_xgb

{'Accuracy': 0.794889992902768,
 'Precision': 0.639344262295082,
 'Recall': 0.5213903743315508,
 'F1-Score': 0.5743740795287187,
 'ROC-AUC': 0.8358831279547392}

: 

In [1]:
# Collect the per-model metric dicts into one table (one column per model).
# `results_xgb` holds the scores of scikit-learn's GradientBoostingClassifier,
# not of the xgboost library, so the table labels it "Gradient Boosting".
results = pd.DataFrame({
    'Logistic Regression': results_log,
    'Random Forest': results_rf,
    'Gradient Boosting': results_xgb,
})

# Transpose to one row per model and rank by F1-Score, best first.
results.T.sort_values(by='F1-Score', ascending=False)

NameError: name 'pd' is not defined