# Telco Churn Model Building - Khayyam Khan

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN

## Import Data

In [2]:
df = pd.read_csv('tel_churn.csv', index_col= 0)
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [3]:
x = df.drop('Churn', axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7039,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7040,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7041,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [4]:
y = df['Churn']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

## Train Test Split

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=.8, random_state=20)

## Machine Learning

Decision Tree Classifier

In [6]:
model_dt = DecisionTreeClassifier(criterion = 'gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [7]:
model_dt.fit(x_train, y_train)

In [8]:
y_pred = model_dt.predict(x_test)
y_pred

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [9]:
model_dt.score(x_test, y_test)

0.8017057569296375

In [10]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1042
           1       0.63      0.58      0.60       365

    accuracy                           0.80      1407
   macro avg       0.74      0.73      0.73      1407
weighted avg       0.80      0.80      0.80      1407



Accuracy is low due to imbalanced dataset.

Recall, precision, and f1 score is low for class 1

We can try SMOTEENN

### SMOTEENN

In [11]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [12]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [13]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [14]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9223549488054608
              precision    recall  f1-score   support

           0       0.94      0.89      0.91       530
           1       0.91      0.95      0.93       642

    accuracy                           0.92      1172
   macro avg       0.92      0.92      0.92      1172
weighted avg       0.92      0.92      0.92      1172



In [15]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[471  59]
 [ 32 610]]


These are better results. Accuracy 93% with good precision, recall, and f1-scores

### Random Forest Classifier

In [16]:
model_rf = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [17]:
model_rf.fit(x_train, y_train)

In [18]:
y_pred=model_rf.predict(x_test)

In [19]:
model_rf.score(x_test,y_test)

0.7938877043354655

In [20]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1042
           1       0.64      0.48      0.55       365

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407



We can use SMOTEENN for this as well

In [21]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [22]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2, random_state=42)

In [23]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [24]:
model_rf_smote.fit(xr_train1,yr_train1)

In [25]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [26]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [27]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9182978723404255
              precision    recall  f1-score   support

           0       0.95      0.87      0.91       543
           1       0.90      0.96      0.93       632

    accuracy                           0.92      1175
   macro avg       0.92      0.91      0.92      1175
weighted avg       0.92      0.92      0.92      1175



In [28]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[473  70]
 [ 26 606]]


### Performing PCA

In [29]:
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [30]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [31]:
model.fit(xr_train_pca,yr_train1)

In [32]:
yr_predict_pca = model.predict(xr_test_pca)

In [33]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [34]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.708936170212766
              precision    recall  f1-score   support

           0       0.72      0.60      0.65       543
           1       0.70      0.81      0.75       632

    accuracy                           0.71      1175
   macro avg       0.71      0.70      0.70      1175
weighted avg       0.71      0.71      0.71      1175



We didn't get better results with PCA, so we can finalize the model created by the RF Classifier.

## Pickle The Model

In [35]:
import pickle

In [36]:
filename = 'model.sav'

In [37]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [38]:
load_model = pickle.load(open(filename, 'rb'))

In [39]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [40]:
model_score_r1

0.9182978723404255

I belive this is my best and my most thought out attempt, please let me know if there are any better methodologies

- Khayyam Khan