In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, mean_squared_log_error
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier



In [2]:
df = pd.read_csv("/content/telco_churn_EDA.csv")
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0


In [3]:
df.shape

(7032, 51)

In [4]:
##defining X and y data
X = df.drop('Churn', axis=1)
y = df.Churn


In [5]:
print(X.shape)
print(y.shape)

(7032, 50)
(7032,)


In [6]:
## Splitting data into training and testing dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [8]:
#fitting various models
seed = 21
num_trees = 200

model_dt = DecisionTreeClassifier(criterion = "gini",random_state = seed)
model_br = BaggingClassifier(n_estimators=num_trees, random_state=seed)
model_rf = RandomForestClassifier(n_estimators=num_trees, random_state=seed)
model_abr = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
model_svr = SVC()
model_knr = KNeighborsClassifier()

models = [model_dt, model_br, model_rf, model_abr, model_svr, model_knr]
for model in models:
  model_name = str(model).partition('(')[0] #Partitioning the text and keeping only the head [0], which is model name
  print('\n')
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(f"Accuracy Score of {model_name}: ", accuracy_score(y_test, y_pred))
  print(f"RMSE of {model_name}: ", np.sqrt(mean_squared_error(y_test, y_pred)))
  



Accuracy Score of DecisionTreeClassifier:  0.7299218194740583
RMSE of DecisionTreeClassifier:  0.5196904660718163


Accuracy Score of BaggingClassifier:  0.7882018479033405
RMSE of BaggingClassifier:  0.46021533231375455


Accuracy Score of RandomForestClassifier:  0.7860696517412935
RMSE of RandomForestClassifier:  0.46252605143786923


Accuracy Score of AdaBoostClassifier:  0.7889125799573561
RMSE of AdaBoostClassifier:  0.45944251005174075


Accuracy Score of SVC:  0.7157071783937455
RMSE of SVC:  0.5331911679747279


Accuracy Score of KNeighborsClassifier:  0.7654584221748401
RMSE of KNeighborsClassifier:  0.48429492855610196


Since, this is a highly imbalanced dataset, we might end up with metric trap if we continue checking accuracy_score. 
We shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.


In [9]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84      1007
           1       0.62      0.46      0.53       400

    accuracy                           0.77      1407
   macro avg       0.71      0.67      0.69      1407
weighted avg       0.75      0.77      0.75      1407



As, evident from the classification report above , the precision, recall and f1-score is quite low for class 1. Therefore, we need to balance the dataset.


In [12]:
##Using combination of Oversampling and undersampling for balancing the dataset

from imblearn.combine import SMOTEENN

sme = SMOTEENN(random_state=21)
X_sme, y_sme = sme.fit_sample(X, y)



In [13]:
print(X_sme.shape)
print(y_sme.shape)


(5915, 50)
(5915,)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_sme, y_sme, test_size=0.2, random_state=21)

In [15]:
#fitting various models on balanced dataset
seed = 21
num_trees = 500

model_dt = DecisionTreeClassifier(criterion = "gini",random_state = seed)
model_br = BaggingClassifier(n_estimators=num_trees, random_state=seed)
model_rf = RandomForestClassifier(n_estimators=num_trees, random_state=seed)
model_abr = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
model_svr = SVC()
model_knr = KNeighborsClassifier()
model_xgb = XGBClassifier(n_estimators=num_trees, random_state=seed)

all_models_names =[]
RMSE_values =[]
models = [model_dt, model_br, model_rf, model_xgb, model_abr, model_svr, model_knr]
for model in models:
  model_name = str(model).partition('(')[0] #Partitioning the text and keeping only the head [0], which is model name
  print('\n')
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(f"{model_name}: \n", classification_report(y_test, y_pred, labels=[0,1]))
  RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
  print(f"RMSE of {model_name}: ", RMSE)
  all_models_names.append(model_name)
  RMSE_values.append(RMSE)



DecisionTreeClassifier: 
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       546
           1       0.93      0.95      0.94       637

    accuracy                           0.93      1183
   macro avg       0.93      0.93      0.93      1183
weighted avg       0.93      0.93      0.93      1183

RMSE of DecisionTreeClassifier:  0.25841705530340325


BaggingClassifier: 
               precision    recall  f1-score   support

           0       0.96      0.95      0.96       546
           1       0.96      0.97      0.96       637

    accuracy                           0.96      1183
   macro avg       0.96      0.96      0.96      1183
weighted avg       0.96      0.96      0.96      1183

RMSE of BaggingClassifier:  0.20143189867937758


RandomForestClassifier: 
               precision    recall  f1-score   support

           0       0.97      0.94      0.95       546
           1       0.95      0.97      0.96       637

  

Clearly, XGBoost Classifier has the least RMSE value, so we will proceed with Hyperparameter tuning in XGBoost

In [18]:
##Using XGBoost Classifier, RandomSearchCV, GridSearchCV to get the best parameters

param_grid = {
     'max_depth':range(3,10,2),
     'min_child_weight':range(1,6,2)}

model = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=200, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=seed), 
 param_grid = param_grid, scoring='roc_auc',n_jobs=4,iid=False, cv=5, verbose=2)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("\n RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
model.best_params_, model.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   45.3s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:  1.5min finished



 RMSE:  0.1884222879063983


({'max_depth': 5, 'min_child_weight': 1}, 0.9926993364132934)

In [19]:
param_grid = {
     'subsample':[i/10.0 for i in range(6,10)],
     'colsample_bytree':[i/10.0 for i in range(6,10)],
     'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
     }

model = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=200, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=seed), 
 param_grid = param_grid, scoring='roc_auc',n_jobs=4,iid=False, cv=5, verbose=2)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("\n RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
model.best_params_, model.best_score_

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   45.2s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:  7.3min
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:  8.1min finished



 RMSE:  0.18156825980064073


({'colsample_bytree': 0.8, 'reg_alpha': 1e-05, 'subsample': 0.9},
 0.9931562841473489)

In [28]:
##Using tuned XGboost model

trained_model = XGBClassifier(
 learning_rate =0.1,
 n_estimators=200,
 max_depth=7,
 min_child_weight=0,
 gamma=0,
 subsample=0.9,
 colsample_bytree=0.6,
 reg_alpha=0.1,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=seed)


In [34]:
trained_model.fit(X_train, y_train)
y_pred = trained_model.predict(X_test)

print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print(classification_report(y_test, y_pred, labels=[0,1]))
print("Score: ", trained_model.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

RMSE:  0.17922534538791307
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       546
           1       0.96      0.98      0.97       637

    accuracy                           0.97      1183
   macro avg       0.97      0.97      0.97      1183
weighted avg       0.97      0.97      0.97      1183

Score:  0.9678782755705833
[[522  24]
 [ 14 623]]


# Saving the model

In [35]:
import pickle

filename = 'model.sav'
pickle.dump(trained_model, open(filename, 'wb'))

In [36]:
#load saved model
load_model = pickle.load(open(filename, 'rb'))

In [37]:
model_score = load_model.score(X_test, y_test)

In [38]:
model_score

0.9678782755705833