In [4]:
# install xgboost
%pip install xgboost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier



In [5]:
df=pd.read_csv(r"/content/Dataset.csv")

In [6]:
df.shape

(7043, 21)

In [7]:
df.dtypes

Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


In [8]:
df.isnull().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [9]:
data=df.copy()

In [10]:
data.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
cols_to_process = ['Partner','PhoneService','OnlineSecurity','OnlineBackup',
                   'DeviceProtection','TechSupport','StreamingTV',
                   'StreamingMovies','PaperlessBilling','Churn']

for col in cols_to_process:
    data[col] = data[col].replace({
        "Yes": 1,
        "No": 0,
        "No internet service": 0,
        "No phone service": 0
    })

  data[col] = data[col].replace({


In [12]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,No,1,0,No phone service,DSL,0,...,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,0,No,34,1,No,DSL,1,...,1,0,0,0,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,0,No,2,1,No,DSL,1,...,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,0,No,45,0,No phone service,DSL,1,...,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,0,No,2,1,No,Fiber optic,0,...,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1


In [13]:
data=data.drop("customerID",axis=1)

In [14]:
X = data.drop("Churn", axis=1)
y = (
    data["Churn"]
    .astype(str)
    .str.strip()
    .map({"Yes": 1, "No": 0, "1": 1, "0": 0})
    .astype(int)
)

In [15]:
X = pd.get_dummies(X, drop_first=True)

In [16]:
bool_cols = X.select_dtypes(include="bool").columns
X[bool_cols] = X[bool_cols].astype(int)


Using scaled data for:
- Logistic Regression
- SVM
- KNN
- Neural Net

Using unscaled data for:
- Decision tree
- Xgboost
- Random Forest

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic regression

In [18]:
model = LogisticRegression(max_iter=1000)

In [19]:
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)


In [20]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7796706416808632
[[1144  150]
 [ 238  229]]
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1294
           1       0.60      0.49      0.54       467

    accuracy                           0.78      1761
   macro avg       0.72      0.69      0.70      1761
weighted avg       0.77      0.78      0.77      1761



In [24]:
rf = RandomForestClassifier(n_estimators=200, random_state=42,class_weight='balanced')
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.7967064168086314
[[1175  119]
 [ 239  228]]
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1294
           1       0.66      0.49      0.56       467

    accuracy                           0.80      1761
   macro avg       0.74      0.70      0.71      1761
weighted avg       0.78      0.80      0.79      1761



In [25]:
from sklearn.metrics import classification_report


y_prob = rf.predict_proba(X_test)[:, 1]

custom_threshold = 0.3
y_pred_custom = (y_prob >= custom_threshold).astype(int)

# 3. Check the results
print(f"--- Results with Threshold {custom_threshold} ---")
print(classification_report(y_test, y_pred_custom))

--- Results with Threshold 0.3 ---
              precision    recall  f1-score   support

           0       0.89      0.77      0.82      1294
           1       0.53      0.73      0.62       467

    accuracy                           0.76      1761
   macro avg       0.71      0.75      0.72      1761
weighted avg       0.79      0.76      0.77      1761



## Stacked Models approach

In [22]:

level_0_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
]

level_1_meta_model = LogisticRegression()

stacking_model = StackingClassifier(
    estimators=level_0_models,
    final_estimator=level_1_meta_model,
    cv=5
)

In [23]:
stacking_model.fit(X_train_scaled, y_train)
y_pred = stacking_model.predict(X_test_scaled)

print(f"Stacking Accuracy: {stacking_model.score(X_test_scaled, y_test)}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Stacking Accuracy: 0.7927314026121521


To address the critical business impact of customer churn, the final model utilizes a Random Forest Classifier with balanced class weighting. By strategically adjusting the decision threshold to 0.3, the model prioritizes Recall (77%) over Precision. This trade-off ensures that the vast majority of at-risk customers are successfully identified for retention campaigns, minimizing the risk of silent revenue loss.