### Importing Libraries

In [26]:
!pip install imbalanced-learn



In [27]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
import time

#### Reading csv

In [28]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [29]:
df=df.drop('Unnamed: 0',axis=1)

In [30]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [31]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

##### Train Test Split

In [32]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

#### Decision Tree Classifier

In [33]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [34]:
model_dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [35]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [36]:
model_dt.score(x_test,y_test)

0.7825159914712153

In [37]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85      1032
           1       0.60      0.57      0.58       375

    accuracy                           0.78      1407
   macro avg       0.72      0.72      0.72      1407
weighted avg       0.78      0.78      0.78      1407



###### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

###### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

###### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [38]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [39]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [40]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [41]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9421915444348576
              precision    recall  f1-score   support

           0       0.93      0.94      0.94       515
           1       0.95      0.95      0.95       644

    accuracy                           0.94      1159
   macro avg       0.94      0.94      0.94      1159
weighted avg       0.94      0.94      0.94      1159



In [42]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[482  33]
 [ 34 610]]


###### Now we can see quite better results, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.

###### Let's try with some other classifier.

#### Random Forest Classifier

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [45]:
model_rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [46]:
y_pred=model_rf.predict(x_test)

In [47]:
model_rf.score(x_train,y_train)

0.8099555555555555

In [48]:
model_rf.score(x_test,y_test)

0.7853589196872779

In [49]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1032
           1       0.64      0.45      0.53       375

    accuracy                           0.79      1407
   macro avg       0.73      0.68      0.69      1407
weighted avg       0.77      0.79      0.77      1407



### Boosting with imbalanced class

In [51]:
from sklearn.ensemble import GradientBoostingClassifier
boost=GradientBoostingClassifier(n_estimators=100,verbose=1) #Number of iterations try n_estimators=[20,100,200]
##fitting the gradient boost classifier
start_time = time.time()
boost.fit(x_train,y_train)
print("Time taken by GBM "+ str((time.time() - start_time))+ " Seconds")

      Iter       Train Loss   Remaining Time 
         1           1.1061            1.19s
         2           1.0655            1.13s
         3           1.0328            1.03s
         4           1.0046            1.01s
         5           0.9813            0.99s
         6           0.9607            0.99s
         7           0.9429            0.97s
         8           0.9278            0.95s
         9           0.9139            0.94s
        10           0.9021            0.92s
        20           0.8323            0.77s
        30           0.8037            0.66s
        40           0.7867            0.56s
        50           0.7753            0.45s
        60           0.7659            0.37s
        70           0.7581            0.28s
        80           0.7518            0.18s
        90           0.7453            0.09s
       100           0.7391            0.00s
Time taken by GBM 0.9348433017730713 Seconds


In [53]:
from sklearn.metrics import f1_score
###predicting Gradient boosting model on the train Data
boost_predict_train=boost.predict(x_train)
cm1 = confusion_matrix(y_train,boost_predict_train)
print(cm1)

accuracy_train=f1_score(y_train, boost_predict_train, average='micro') 
print("train accuracy", accuracy_train)

[[3803  328]
 [ 628  866]]
train accuracy 0.8300444444444445


In [55]:
###predicting Gradient boosting model on the test Data
boost_predict_test=boost.predict(x_test)
cm1 = confusion_matrix(y_test,boost_predict_test)
print(cm1)

accuracy_test=f1_score(y_test, boost_predict_test, average='micro') 
print("test accuracy", accuracy_test)

[[939  93]
 [181 194]]
test accuracy 0.8052594171997157


### XGB Model

In [56]:
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn import preprocessing

#### With class imbalanced

In [57]:
train_labels_b = y_train.values
train_labels_b = preprocessing.LabelEncoder().fit_transform(train_labels_b)
test_labels_b = y_test.values
test_labels_b = preprocessing.LabelEncoder().fit_transform(test_labels_b)

matrix_train_b = xgboost.DMatrix(x_train,label=train_labels_b)
matrix_test_b = xgboost.DMatrix(x_test,label=test_labels_b)

In [58]:
params = {
    'max_depth': 8, 
    'eta':0.01, #Learning Rate
    'eval_metric':'merror', # Multiclass classification error rate. 
    #'tree_method' : "gpu_hist", # use this with colab in gpu mode for faster training
    'num_class': 9
}

start_time = time.time()

model=xgboost.train(params=params,
                    dtrain=matrix_train_b,
                    num_boost_round=300,    #Number of trees
                    early_stopping_rounds=3, # Stop after 3 rounds, if test error doesn't improve. 
                    evals=[(matrix_test_b,'test')] 
                   )

print("Time taken by XGB "+ str((time.time() - start_time))+ " Seconds")

[0]	test-merror:0.21748
[1]	test-merror:0.22033
[2]	test-merror:0.21890
Time taken by XGB 0.07300877571105957 Seconds


In [59]:
###prediction using XGB on the train Data
boost_predict_train=model.predict(matrix_train_b)
cm1 = confusion_matrix(train_labels_b,boost_predict_train)
print(cm1)

accuracy_train=f1_score(train_labels_b, boost_predict_train, average='micro') 
print("train accuracy", accuracy_train)

[[3793  338]
 [ 630  864]]
train accuracy 0.8279111111111112


In [60]:
###prediction using XGB on the test Data
boost_predict_test=model.predict(matrix_test_b)
cm1 = confusion_matrix(test_labels_b,boost_predict_test)
print(cm1)

accuracy_test=f1_score(test_labels_b, boost_predict_test, average='micro') 
print("test accuracy", accuracy_test)

[[918 114]
 [197 178]]
test accuracy 0.7789623312011372


In [61]:
boost_predict_test
print(boost_predict_test)
print(metrics.classification_report(y_test, y_pred))

[0. 0. 0. ... 0. 0. 0.]
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1032
           1       0.64      0.45      0.53       375

    accuracy                           0.79      1407
   macro avg       0.73      0.68      0.69      1407
weighted avg       0.77      0.79      0.77      1407



## Resampling

In [62]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [63]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [64]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [65]:
model_rf_smote.fit(xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [66]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [67]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [68]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9366980325064157
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       544
           1       0.92      0.97      0.94       625

    accuracy                           0.94      1169
   macro avg       0.94      0.93      0.94      1169
weighted avg       0.94      0.94      0.94      1169



In [69]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[488  56]
 [ 18 607]]


###### With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.

###### We can now further go ahead and create multiple classifiers to see how the model performance is, but that's not covered here, so you can do it by yourself :)

### Boosting after resampling

In [70]:
from sklearn.ensemble import GradientBoostingClassifier
boost=GradientBoostingClassifier(n_estimators=100,verbose=1) #Number of iterations try n_estimators=[20,100,200]
##fitting the gradient boost classifier
start_time = time.time()
boost.fit(xr_train1,yr_train1)
print("Time taken by GBM "+ str((time.time() - start_time))+ " Seconds")

      Iter       Train Loss   Remaining Time 
         1           1.2458            0.89s
         2           1.1352            0.85s
         3           1.0514            0.82s
         4           0.9804            0.81s
         5           0.9089            0.79s
         6           0.8515            0.81s
         7           0.8005            0.78s
         8           0.7574            0.76s
         9           0.7233            0.75s
        10           0.6846            0.73s
        20           0.4724            0.63s
        30           0.3815            0.55s
        40           0.3222            0.46s
        50           0.2830            0.38s
        60           0.2566            0.30s
        70           0.2397            0.23s
        80           0.2246            0.15s
        90           0.2129            0.08s
       100           0.2028            0.00s
Time taken by GBM 0.7621738910675049 Seconds


In [71]:
from sklearn.metrics import f1_score
###predicting Gradient boosting model on the train Data
boost_predict_train=boost.predict(xr_train1)
cm1 = confusion_matrix(yr_train1,boost_predict_train)
print(cm1)

accuracy_train=f1_score(yr_train1, boost_predict_train, average='micro') 
print("train accuracy", accuracy_train)

[[2006   87]
 [  73 2510]]
train accuracy 0.9657827202737382


In [72]:
###predicting Gradient boosting model on the test Data
boost_predict_test=boost.predict(xr_test)
cm1 = confusion_matrix(yr_test,boost_predict_test)
print(cm1)

accuracy_test=f1_score(yr_test, boost_predict_test, average='micro') 
print("test accuracy", accuracy_test)

[[494  21]
 [ 19 625]]
test accuracy 0.9654874892148404


### XG Boost after resampling

In [73]:
#Creating XGB Friendly data and matrices
train_labels = yr_train1.values
train_labels = preprocessing.LabelEncoder().fit_transform(train_labels)
test_labels = yr_test1.values
test_labels = preprocessing.LabelEncoder().fit_transform(test_labels)

matrix_train = xgboost.DMatrix(xr_train1,label=train_labels)
matrix_test = xgboost.DMatrix(xr_test1,label=test_labels)

In [74]:
params = {
    'max_depth': 8, 
    'eta':0.05, #Learning Rate
    'eval_metric':'merror', # Multiclass classification error rate. 
    #'tree_method' : "gpu_hist", # use this with colab in gpu mode for faster training
    'num_class': 9
}

start_time = time.time()

model=xgboost.train(params=params,
                    dtrain=matrix_train,
                    num_boost_round=300,    #Number of trees
                    early_stopping_rounds=3, # Stop after 3 rounds, if test error doesn't improve. 
                    evals=[(matrix_test,'test')] 
                   )

print("Time taken by XGB "+ str((time.time() - start_time))+ " Seconds")

[0]	test-merror:0.05988
[1]	test-merror:0.05731
[2]	test-merror:0.05389
[3]	test-merror:0.05304
[4]	test-merror:0.05475
[5]	test-merror:0.05218
[6]	test-merror:0.05389
[7]	test-merror:0.05218
Time taken by XGB 0.139021635055542 Seconds


In [75]:
###prediction using XGB on the train Data
boost_predict_train=model.predict(matrix_train)
cm1 = confusion_matrix(train_labels,boost_predict_train)
print(cm1)

accuracy_train=f1_score(train_labels, boost_predict_train, average='micro') 
print("train accuracy", accuracy_train)

[[1999   94]
 [  63 2520]]
train accuracy 0.9664242942686057


In [76]:
###prediction using XGB on the test Data
boost_predict_test=model.predict(matrix_test)
cm1 = confusion_matrix(test_labels,boost_predict_test)
print(cm1)

accuracy_test=f1_score(test_labels, boost_predict_test, average='micro') 
print("test accuracy", accuracy_test)

[[507  37]
 [ 24 601]]
test accuracy 0.9478186484174508


In [77]:
boost_predict_test
print(boost_predict_test)
print(metrics.classification_report(yr_test1, yr_predict1))

[1. 0. 0. ... 1. 0. 0.]
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       544
           1       0.92      0.97      0.94       625

    accuracy                           0.94      1169
   macro avg       0.94      0.93      0.94      1169
weighted avg       0.94      0.94      0.94      1169



#### Performing PCA

In [78]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [79]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [80]:
model.fit(xr_train_pca,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [81]:
yr_predict_pca = model.predict(xr_test_pca)

In [82]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [83]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7074422583404619
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       544
           1       0.70      0.79      0.74       625

    accuracy                           0.71      1169
   macro avg       0.71      0.70      0.70      1169
weighted avg       0.71      0.71      0.70      1169



##### With PCA, we couldn't see any better results, hence let's finalise the model which was created by RF Classifier, and save the model so that we can use it in a later stage :)

#### Pickling the model

In [None]:
import pickle

In [None]:
filename = 'model.sav'

In [None]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

In [None]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [None]:
model_score_r1

0.9427350427350427

##### Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.