## Telco Customer Churn Prediction

### Model Training Notebook

In [1]:
import os

import pandas as pd
from src.preprocessing import *

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score

import joblib

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
# Load the raw Telco churn dataset; the path is relative to the working directory.
RAW_DATA_PATH = 'data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(RAW_DATA_PATH)

We apply the same preprocessing steps shown in [data_exploration.ipynb](data_exploration.ipynb)

In [4]:
# Parse TotalCharges as numbers; entries that cannot be parsed are coerced to NaN.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges

# Customers with zero tenure get an explicit TotalCharges of 0
# (presumably these are the rows left unparsable above — see the exploration notebook).
zero_tenure = df["tenure"] == 0
df.loc[zero_tenure, "TotalCharges"] = 0

In [5]:
# HasInternet: True unless OnlineBackup carries the 'No internet service' label.
no_internet = df['OnlineBackup'].isin(['No internet service'])
df['HasInternet'] = ~no_internet

# automatic_pay: True when the payment method is one of the two automatic options.
AUTOMATIC_METHODS = ['Bank transfer (automatic)', 'Credit card (automatic)']
df['automatic_pay'] = df['PaymentMethod'].isin(AUTOMATIC_METHODS)

In [6]:
# Encode the columns listed in YES_NO with YES_NO_MAPPING.
# NOTE(review): `mapping`, YES_NO and YES_NO_MAPPING are not defined in this notebook;
# presumably they come from the star import of src.preprocessing and turn
# 'Yes'/'No' strings into integers — confirm in that module.
df = mapping(df, YES_NO, YES_NO_MAPPING)

In [7]:
# Hand-written integer codes for the categorical columns that need their own mapping.
MANUAL_ENCODINGS = {
    'gender': {'Male' : 1, 'Female' : 0},
    'MultipleLines': {'No phone service' : 0, 'No' : 1, 'Yes' : 2},
    'Contract': {'Month-to-month' : 0, 'One year' : 1, 'Two year' : 2},
}

# Replace each column by its encoded version, one column at a time.
for column_name, encoding in MANUAL_ENCODINGS.items():
    df[column_name] = df[column_name].map(encoding)

In [8]:
# Encode the columns listed in INTERNET_VARS with INTERNET_VARS_MAPPING.
# NOTE(review): helper and constants are not defined in this notebook (presumably
# from src.preprocessing); the exact mapping values should be checked there.
df = mapping(df, INTERNET_VARS, INTERNET_VARS_MAPPING)

In [9]:
# Apply the `dummies` helper to the DUMMIES_VARS columns, passing TO_DROP.
# NOTE(review): presumably this one-hot encodes DUMMIES_VARS and removes the
# TO_DROP columns afterwards — helper is not defined here, confirm in src.preprocessing.
df = dummies(df, DUMMIES_VARS, TO_DROP)

In [10]:
# Encode the columns listed in TRUE_FALSE with TRUE_FALSE_MAPPING.
# NOTE(review): presumably converts boolean columns to 0/1 integers — the helper
# and constants are not defined in this notebook, confirm in src.preprocessing.
df = mapping(df, TRUE_FALSE, TRUE_FALSE_MAPPING)

In [11]:
# Remove the columns that are not used as model features.
# `df` is rebound to the result instead of mutating the frame with inplace=True:
# the statement then reads like every other preprocessing step in this notebook
# (`df = ...`) and no other reference to the frame is modified behind its back.
df = df.drop(columns=['customerID', 'TotalCharges', 'PhoneService'])

In [12]:
# Print the column names, non-null counts and dtypes of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   gender                     7043 non-null   int64  
 1   SeniorCitizen              7043 non-null   int64  
 2   Partner                    7043 non-null   int64  
 3   Dependents                 7043 non-null   int64  
 4   tenure                     7043 non-null   int64  
 5   MultipleLines              7043 non-null   int64  
 6   OnlineSecurity             7043 non-null   int64  
 7   OnlineBackup               7043 non-null   int64  
 8   DeviceProtection           7043 non-null   int64  
 9   TechSupport                7043 non-null   int64  
 10  StreamingTV                7043 non-null   int64  
 11  StreamingMovies            7043 non-null   int64  
 12  Contract                   7043 non-null   int64  
 13  PaperlessBilling           7043 non-null   int64

In [13]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,MonthlyCharges,Churn,HasInternet,automatic_pay,DSL,Fiber optic,Bank transfer (automatic),Credit card (automatic),Electronic check
0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,29.85,0,1,0,1,0,0,0,1
1,1,0,0,0,34,1,1,0,1,0,0,0,1,0,56.95,0,1,0,1,0,0,0,0
2,1,0,0,0,2,1,1,1,0,0,0,0,0,1,53.85,1,1,0,1,0,0,0,0
3,1,0,0,0,45,0,1,0,1,1,0,0,1,0,42.3,0,1,1,1,0,1,0,0
4,0,0,0,0,2,1,0,0,0,0,0,0,0,1,70.7,1,1,0,0,1,0,0,1


I'm not going to normalize `tenure` and `MonthlyCharges`, as I'm planning to use the model later with an interface.<br>
- `TotalCharges` ≈ `tenure` × `MonthlyCharges`, so it is redundant and has been dropped.
- In the same way, `PhoneService` is already implied by `MultipleLines` (its `No phone service` level), so it has been dropped as well.

In [14]:
# Separate the target column from the feature columns.
Y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [15]:
# Registry of fitted estimators, keyed by a short model name.
models = dict()

In [16]:
# Logistic Regression

# Fit a logistic regression with an iteration cap of 5000.
logreg = LogisticRegression(max_iter=5000)
logreg.fit(x_train, y_train)

# Hard class predictions, plus the probability assigned to class 1 (second column).
y_pred = logreg.predict(x_test)
y_pred_proba = logreg.predict_proba(x_test)[:, 1]

# Compute the test-set metrics first, then report them.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("accuracy : ", accuracy_pct, "%")
print("classification report :\n", report)
print("confusion matrix :\n", matrix)
print("AUC:", auc)

# Keep the fitted estimator so it can be saved later.
models['logreg'] = logreg

accuracy :  79.99 %
classification report :
               precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.65      0.54      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

confusion matrix :
 [[924 111]
 [171 203]]
AUC: 0.8385000904182489


In [19]:
# KNN

# Fit a k-nearest-neighbours classifier that votes over 20 neighbours.
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(x_train, y_train)

# Hard class predictions, plus the probability assigned to class 1 (second column).
y_pred = knn.predict(x_test)
y_pred_proba = knn.predict_proba(x_test)[:, 1]

# Compute the test-set metrics first, then report them.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("accuracy : ", accuracy_pct, "%")
print("classification report :\n", report)
print("confusion matrix :\n", matrix)
print("AUC:", auc)

# Keep the fitted estimator so it can be saved later.
models['knn'] = knn

accuracy :  79.49 %
classification report :
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.66      0.47      0.55       374

    accuracy                           0.79      1409
   macro avg       0.74      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409

confusion matrix :
 [[943  92]
 [197 177]]
AUC: 0.8156617324136506


In [23]:
# Random Forest

# Fit a forest of 200 trees, each limited to depth 15; the seed fixes the randomness.
rft = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    random_state=42,
)
rft.fit(x_train, y_train)

# Hard class predictions, plus the probability assigned to class 1 (second column).
y_pred = rft.predict(x_test)
y_pred_proba = rft.predict_proba(x_test)[:, 1]

# Compute the test-set metrics first, then report them.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("accuracy : ", accuracy_pct, "%")
print("classification report :\n", report)
print("confusion matrix :\n", matrix)
print("AUC:", auc)

# Keep the fitted estimator so it can be saved later.
models['rft'] = rft

accuracy :  79.21 %
classification report :
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.78      0.79      0.79      1409

confusion matrix :
 [[920 115]
 [178 196]]
AUC: 0.826000413340567


In [24]:
# XGB

# Ratio of non-churn (0) to churn (1) labels in the training split.
# NOTE(review): this value is computed but never handed to the classifier below
# (for example as `scale_pos_weight`), so it has no effect on training as written.
# Confirm whether it should be wired in or removed.
positive_count = sum(y_train)
imbalance_ratio = (len(y_train) - positive_count) / positive_count

# Fit 550 boosted trees with a small learning rate; each tree sees 70% of the rows.
xgb = XGBClassifier(
    n_estimators=550,
    learning_rate=0.01,
    max_depth=7,
    subsample=0.7,
    eval_metric="logloss",
    random_state=42,
)
xgb.fit(x_train, y_train)

# Hard class predictions, plus the probability assigned to class 1 (second column).
y_pred = xgb.predict(x_test)
y_pred_proba = xgb.predict_proba(x_test)[:, 1]

# Compute the test-set metrics first, then report them.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("accuracy : ", accuracy_pct, "%")
print("classification report :\n", report)
print("confusion matrix :\n", matrix)
print("AUC:", auc)

# Keep the fitted estimator so it can be saved later.
models['xgb'] = xgb

accuracy :  80.2 %
classification report :
               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.65      0.54      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409

confusion matrix :
 [[928 107]
 [172 202]]
AUC: 0.8400539926115373


In [25]:
# GradientBoosting

# Fit 550 boosting stages of depth-5 trees with a small learning rate.
gbc = GradientBoostingClassifier(
    n_estimators=550,
    learning_rate=0.01,
    max_depth=5,
    random_state=42,
)
gbc.fit(x_train, y_train)

# Hard class predictions, plus the probability assigned to class 1 (second column).
y_pred = gbc.predict(x_test)
y_pred_proba = gbc.predict_proba(x_test)[:, 1]

# Compute the test-set metrics first, then report them.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("accuracy : ", accuracy_pct, "%")
print("classification report :\n", report)
print("confusion matrix :\n", matrix)
print("AUC:", auc)

# Keep the fitted estimator so it can be saved later.
models['gbc'] = gbc

accuracy :  80.48 %
classification report :
               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.54      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.80      0.80      0.80      1409

confusion matrix :
 [[933 102]
 [173 201]]
AUC: 0.8426696633857759


In [26]:
# Neural Networks

# Multi-layer perceptron with three hidden layers (64, 128, 64 units), ReLU
# activations and the Adam solver; early stopping is enabled and the seed is fixed.
nn = MLPClassifier(
    hidden_layer_sizes=(64, 128, 64),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
    early_stopping=True,
)
nn.fit(x_train, y_train)

# Hard class predictions, plus the probability assigned to class 1 (second column).
y_pred = nn.predict(x_test)
y_pred_proba = nn.predict_proba(x_test)[:, 1]

# Compute the test-set metrics first, then report them.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("accuracy : ", accuracy_pct, "%")
print("classification report :\n", report)
print("confusion matrix :\n", matrix)
print("AUC:", auc)

# Keep the fitted estimator so it can be saved later.
models['nn'] = nn

accuracy :  79.91 %
classification report :
               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.65      0.52      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

confusion matrix :
 [[933 102]
 [181 193]]
AUC: 0.840243870934408


Saving all trained models

In [27]:
# Persist every fitted estimator in `models` to model/<name>_model.pkl.
# The output directory is created first so the cell also runs on a fresh
# checkout where model/ does not exist yet (joblib.dump does not create it).
os.makedirs("model", exist_ok=True)

for model_name, model in models.items():
    # Build the path once so the file written and the message printed cannot diverge.
    model_path = f"model/{model_name}_model.pkl"
    joblib.dump(model, model_path)
    print(f'✅ {model_name} saved to {model_path}')

✅ logreg saved to model/logreg_model.pkl
✅ knn saved to model/knn_model.pkl
✅ rft saved to model/rft_model.pkl
✅ xgb saved to model/xgb_model.pkl
✅ gbc saved to model/gbc_model.pkl
✅ nn saved to model/nn_model.pkl


In [28]:
# Reload the saved logistic regression and score it again on the test split,
# so the persisted file can be compared with the in-memory results above.
# NOTE: joblib files are pickles — only load files you created or trust.
model = joblib.load("model/logreg_model.pkl")

# Hard class predictions, plus the probability assigned to class 1 (second column).
y_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]

reloaded_accuracy = accuracy_score(y_test, y_pred)
reloaded_auc = roc_auc_score(y_test, y_proba)
reloaded_report = classification_report(y_test, y_pred)

print("Accuracy:", reloaded_accuracy)
print("Test AUC:", reloaded_auc)
print(reloaded_report)

Accuracy: 0.7998580553584103
Test AUC: 0.8385000904182489
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.65      0.54      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

