## Telco Customer Churn Prediction

### Model Training Notebook

In [None]:
import pandas as pd
from src.preprocessing import *

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score

import joblib

In [2]:
pd.set_option('display.max_columns', None)

In [None]:
df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

We apply the same preprocessing steps shown in [data_exploration.ipynb](data_exploration.ipynb)

In [4]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.loc[df["tenure"] == 0, "TotalCharges"] = 0

In [5]:
df['HasInternet'] = ~df['OnlineBackup'].isin(['No internet service'])
df['automatic_pay'] = df['PaymentMethod'].isin(['Bank transfer (automatic)','Credit card (automatic)'])

In [6]:
df = mapping(df, YES_NO, YES_NO_MAPPING)

In [7]:
df['gender'] = df['gender'].map({'Male' : 1, 'Female' : 0})
df['MultipleLines'] = df['MultipleLines'].map({'No phone service' : 0, 'No' : 1, 'Yes' : 2})
df['Contract'] = df['Contract'].map({'Month-to-month' : 0, 'One year' : 1, 'Two year' : 2})

In [8]:
df = mapping(df, INTERNET_VARS, INTERNET_VARS_MAPPING)

In [9]:
df = dummies(df, DUMMIES_VARS, TO_DROP)

In [10]:
df = mapping(df, TRUE_FALSE, TRUE_FALSE_MAPPING)

In [11]:
df.drop('customerID', axis=1, inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   gender                     7043 non-null   int64  
 1   SeniorCitizen              7043 non-null   int64  
 2   Partner                    7043 non-null   int64  
 3   Dependents                 7043 non-null   int64  
 4   tenure                     7043 non-null   int64  
 5   PhoneService               7043 non-null   int64  
 6   MultipleLines              7043 non-null   int64  
 7   OnlineSecurity             7043 non-null   int64  
 8   OnlineBackup               7043 non-null   int64  
 9   DeviceProtection           7043 non-null   int64  
 10  TechSupport                7043 non-null   int64  
 11  StreamingTV                7043 non-null   int64  
 12  StreamingMovies            7043 non-null   int64  
 13  Contract                   7043 non-null   int64

In [13]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,HasInternet,automatic_pay,DSL,Fiber optic,Bank transfer (automatic),Credit card (automatic),Electronic check
0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,29.85,29.85,0,1,0,1,0,0,0,1
1,1,0,0,0,34,1,1,1,0,1,0,0,0,1,0,56.95,1889.5,0,1,0,1,0,0,0,0
2,1,0,0,0,2,1,1,1,1,0,0,0,0,0,1,53.85,108.15,1,1,0,1,0,0,0,0
3,1,0,0,0,45,0,0,1,0,1,1,0,0,1,0,42.3,1840.75,0,1,1,1,0,1,0,0
4,0,0,0,0,2,1,1,0,0,0,0,0,0,0,1,70.7,151.65,1,1,0,0,1,0,0,1


I'm not going to normalize tenure and charges as i'm planning to use the model later with an interface

In [14]:
X = df.drop('Churn',axis=1)
Y = df['Churn']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [15]:
# Logistic Regression

logreg = LogisticRegression(max_iter=5000)

logreg.fit(x_train,y_train)
y_pred = logreg.predict(x_test)
y_pred_proba = logreg.predict_proba(x_test)[:,1]

print("accuracy : ", round(accuracy_score(y_test,y_pred)*100,2),"%")
print("classification report :\n", classification_report(y_test,y_pred))
print("confusion matrix :\n", confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

accuracy :  80.77 %
classification report :
               precision    recall  f1-score   support

           0       0.85      0.90      0.87      1035
           1       0.66      0.56      0.61       374

    accuracy                           0.81      1409
   macro avg       0.76      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

confusion matrix :
 [[928 107]
 [164 210]]
AUC: 0.8425430778371954


In [16]:
# KNN

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
y_pred_proba = knn.predict_proba(x_test)[:,1]

print("accuracy : ", round(accuracy_score(y_test,y_pred)*100,2),"%")
print("classification report :\n", classification_report(y_test,y_pred))
print("confusion matrix :\n", confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

accuracy :  76.3 %
classification report :
               precision    recall  f1-score   support

           0       0.81      0.88      0.85      1035
           1       0.57      0.43      0.49       374

    accuracy                           0.76      1409
   macro avg       0.69      0.66      0.67      1409
weighted avg       0.75      0.76      0.75      1409

confusion matrix :
 [[914 121]
 [213 161]]
AUC: 0.7537123149655119


In [17]:
# Random Forest

rft = RandomForestClassifier(class_weight='balanced', random_state=42)

rft.fit(x_train,y_train)
y_pred = rft.predict(x_test)
y_pred_proba = rft.predict_proba(x_test)[:,1]

print("accuracy : ", round(accuracy_score(y_test,y_pred)*100,2),"%")
print("classification report :\n", classification_report(y_test,y_pred))
print("confusion matrix :\n", confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

accuracy :  78.71 %
classification report :
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.62      0.49      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409

confusion matrix :
 [[924 111]
 [189 185]]
AUC: 0.8199901831615386


In [18]:
# XGB

imbalance_ratio = (len(y_train) - sum(y_train)) / sum(y_train)

xgb = XGBClassifier(
    eval_metric="logloss",
    random_state=42)

xgb.fit(x_train, y_train)

y_pred = xgb.predict(x_test)
y_pred_proba = xgb.predict_proba(x_test)[:,1]

print("accuracy : ", round(accuracy_score(y_test,y_pred)*100,2),"%")
print("classification report :\n", classification_report(y_test,y_pred))
print("confusion matrix :\n", confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

accuracy :  77.29 %
classification report :
               precision    recall  f1-score   support

           0       0.83      0.86      0.85      1035
           1       0.58      0.52      0.55       374

    accuracy                           0.77      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.77      0.77      0.77      1409

confusion matrix :
 [[894 141]
 [179 195]]
AUC: 0.8202976052080913


In [19]:
# SVM

svm = LinearSVC(max_iter=5000, random_state=42)
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

print("accuracy : ", round(accuracy_score(y_test,y_pred)*100,2),"%")
print("classification report :\n", classification_report(y_test,y_pred))
print("confusion matrix :\n", confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

accuracy :  79.49 %
classification report :
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.64      0.52      0.58       374

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409

confusion matrix :
 [[924 111]
 [178 196]]
AUC: 0.8202976052080913


In [20]:
# GradientBoosting

gbc = GradientBoostingClassifier(n_estimators=300,
                                   learning_rate=0.5,
                                   max_depth=5,
                                   random_state=42)

gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_test)
y_pred_proba = gbc.predict_proba(x_test)[:,1]

print("accuracy : ", round(accuracy_score(y_test,y_pred)*100,2),"%")
print("classification report :\n", classification_report(y_test,y_pred))
print("confusion matrix :\n", confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

accuracy :  77.29 %
classification report :
               precision    recall  f1-score   support

           0       0.83      0.86      0.85      1035
           1       0.58      0.52      0.55       374

    accuracy                           0.77      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.77      0.77      0.77      1409

confusion matrix :
 [[893 142]
 [178 196]]
AUC: 0.797765377560774


In [21]:
# Neural Networks

nn = MLPClassifier(
    hidden_layer_sizes=(64, 128, 64),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42
)

nn.fit(x_train, y_train)
y_pred = nn.predict(x_test)
y_pred_proba = nn.predict_proba(x_test)[:,1]

print("accuracy : ", round(accuracy_score(y_test,y_pred)*100,2),"%")
print("classification report :\n", classification_report(y_test,y_pred))
print("confusion matrix :\n", confusion_matrix(y_test,y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_proba))

accuracy :  79.99 %
classification report :
               precision    recall  f1-score   support

           0       0.83      0.92      0.87      1035
           1       0.68      0.47      0.55       374

    accuracy                           0.80      1409
   macro avg       0.75      0.69      0.71      1409
weighted avg       0.79      0.80      0.79      1409

confusion matrix :
 [[952  83]
 [199 175]]
AUC: 0.8351571985843086


Saving best model

In [22]:
joblib.dump(logreg, "model/logreg_model.pkl")

print("✅ Model saved to model/logreg_model.pkl")

✅ Model saved to model/logreg_model.pkl


In [24]:
model = joblib.load("model/logreg_model.pkl")

y_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]

print("Test AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

Test AUC: 0.8425430778371954
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1035
           1       0.66      0.56      0.61       374

    accuracy                           0.81      1409
   macro avg       0.76      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

