### 1. Importing Libraries

In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

### 2. Reading Files

In [4]:
# Load the churn dataset CSV from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='cust_churn_dummies.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Call_Failure,Seconds,Frequency of use,Frequency of SMS,Customer Value,Subscription length group_1 - 12,Subscription length group_13 - 24,Subscription length group_25 - 36,Subscription length group_37 - 48,...,Charge Amount_2,Charge Amount_3,Charge Amount_4,Charge Amount_5,Charge Amount_6,Charge Amount_7,Charge Amount_8,Charge Amount_9,Status_0,Status_1
0,0,8,4370,71,5,197.64,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,1,0,318,5,7,46.035,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,2,10,2453,60,359,1536.52,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,3,10,4198,66,1,240.02,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,4,3,2393,58,2,145.805,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# Remove every leftover row-index column ('Unnamed: 0', 'Unnamed: 0.1', ...).
# These appear when a frame is written with its index and read back; they are
# row ids, not customer attributes, so they must not reach the feature matrix.
# Filtering by prefix (instead of dropping one hard-coded name) also makes the
# cell safe to re-run: a second execution simply finds nothing to drop.
index_artifact_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=index_artifact_cols)

In [6]:
# Feature matrix: every column except the two one-hot encoded target columns.
x = df.drop(columns=['Churn_0', 'Churn_1'])
x.head(n=5)

Unnamed: 0,Call_Failure,Seconds,Frequency of use,Frequency of SMS,Customer Value,Subscription length group_1 - 12,Subscription length group_13 - 24,Subscription length group_25 - 36,Subscription length group_37 - 48,Subscription length group_49 - 60,...,Charge Amount_2,Charge Amount_3,Charge Amount_4,Charge Amount_5,Charge Amount_6,Charge Amount_7,Charge Amount_8,Charge Amount_9,Status_0,Status_1
0,8,4370,71,5,197.64,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,318,5,7,46.035,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,10,2453,60,359,1536.52,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,10,4198,66,1,240.02,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,3,2393,58,2,145.805,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


Storing the target variable (`Churn_1`) in a separate Series

In [8]:
# Target vector: the 'Churn_1' indicator column, selected as a Series.
y = df.loc[:, 'Churn_1']
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: Churn_1, dtype: int64

## 3. Splitting the Data into Training and Test Sets

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### 1.Decision Tree Classifier

In [12]:
# Baseline decision tree: gini splits, depth capped at 6, at least 8 samples
# per leaf, seeded so repeated runs build the same tree.
model_dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=42,
)

In [13]:
# Sanity check: training features and labels must have the same number of rows.
print(x_train.shape, y_train.shape, sep='\n')


(2520, 32)
(2520,)


In [14]:
# Fit the baseline tree on the (still imbalanced) training split.
model_dt.fit(X=x_train, y=y_train)

In [15]:
# Predicted labels for the hold-out rows.
y_pred = model_dt.predict(X=x_test)

In [16]:
# Score of the baseline tree on the hold-out split (shown as the cell output).
model_dt.score(X=x_test, y=y_test)

0.9015873015873016

In [17]:
# Per-class precision / recall / f1 for the baseline tree.
print(classification_report(y_true=y_test, y_pred=y_pred, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       520
           1       0.78      0.61      0.68       110

    accuracy                           0.90       630
   macro avg       0.85      0.79      0.81       630
weighted avg       0.90      0.90      0.90       630



In [18]:
# Confusion matrix for the baseline tree (rows: true class, columns: predicted class).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))


[[501  19]
 [ 43  67]]


<b>Although the overall accuracy looks high (about 90%), this is an imbalanced dataset, so we shouldn't use accuracy as the metric to judge the model: accuracy is misleading on imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it is evident that precision (0.78), recall (0.61) & f1 score (0.68) are low for Class 1, i.e. churned customers.
Hence, moving ahead to apply SMOTEENN (SMOTE over-sampling + Edited Nearest Neighbours cleaning).</b>

In [20]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

Training and evaluating with decision tree classifier once again

In [22]:
xr_train, xr_test,yr_train, yr_test = train_test_split(X_resampled,y_resampled, test_size=0.2)


In [23]:
# Same tree configuration as the baseline model. random_state is set (as it is
# for the baseline tree) so that ties between equally good splits are broken
# the same way on every run and the reported scores are reproducible.
model_dt_smote = DecisionTreeClassifier(
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=42,
)


In [24]:
# Fit the tree on the resampled training data and predict the xr_test rows.
model_dt_smote.fit(xr_train, yr_train)
y_dt_pred_smote = model_dt_smote.predict(xr_test)
# Print the score explicitly: a bare expression that is not the last statement
# of a cell is evaluated and discarded, so the accuracy was never displayed.
print(model_dt_smote.score(xr_test, yr_test))

print(classification_report(yr_test, y_dt_pred_smote, labels=[0, 1]))


              precision    recall  f1-score   support

           0       0.96      0.93      0.95       362
           1       0.94      0.97      0.96       433

    accuracy                           0.95       795
   macro avg       0.95      0.95      0.95       795
weighted avg       0.95      0.95      0.95       795



In [25]:
# Confusion matrix for the tree trained on resampled data.
print(confusion_matrix(y_true=yr_test, y_pred=y_dt_pred_smote))

[[337  25]
 [ 13 420]]


<b>Now we can see much better results, i.e. Accuracy: 95%, and very good recall, precision & f1 score for the minority class.
Let's try some other classifiers.</b>

### 2. Random Forest Classifier

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Seeded forest of 100 gini trees with the same depth / leaf-size limits as the
# decision tree.
model_rf_smote = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=42,
)

In [30]:
# Fit the forest on the resampled training data and predict the xr_test rows.
model_rf_smote.fit(xr_train, yr_train)
y_rf_pred_smote = model_rf_smote.predict(xr_test)
# Print the score explicitly: a bare expression that is not the last statement
# of a cell is evaluated and discarded, so the accuracy was never displayed.
print(model_rf_smote.score(xr_test, yr_test))

print(classification_report(yr_test, y_rf_pred_smote, labels=[0, 1]))


              precision    recall  f1-score   support

           0       0.94      0.93      0.93       362
           1       0.94      0.95      0.94       433

    accuracy                           0.94       795
   macro avg       0.94      0.94      0.94       795
weighted avg       0.94      0.94      0.94       795



In [31]:
# Confusion matrix for the random forest.
print(confusion_matrix(y_true=yr_test, y_pred=y_rf_pred_smote))

[[335  27]
 [ 23 410]]


### 3. KNN Classifier

In [33]:
from sklearn.neighbors import KNeighborsClassifier

In [34]:
# K-nearest-neighbours classifier with library-default settings: fit on the
# resampled training data, predict xr_test, and show the score as cell output.
knn_model_smote = KNeighborsClassifier()
knn_model_smote.fit(X=xr_train, y=yr_train)
y_knn_predict_smote = knn_model_smote.predict(X=xr_test)
knn_model_smote.score(X=xr_test, y=yr_test)

0.9622641509433962

In [35]:
# Per-class precision / recall / f1 for the KNN model.
print(classification_report(y_true=yr_test, y_pred=y_knn_predict_smote))

              precision    recall  f1-score   support

           0       0.99      0.93      0.96       362
           1       0.94      0.99      0.97       433

    accuracy                           0.96       795
   macro avg       0.97      0.96      0.96       795
weighted avg       0.96      0.96      0.96       795



In [36]:
# Confusion matrix for the KNN model.
print(confusion_matrix(y_true=yr_test, y_pred=y_knn_predict_smote))

[[336  26]
 [  4 429]]


Best accuracy and other scores so far.

### 4. Naive Bayes

In [39]:
from sklearn.naive_bayes import GaussianNB

In [40]:
# Gaussian Naive Bayes with library-default settings: fit on the resampled
# training data, predict xr_test, and show the score as cell output.
nb_model_smote = GaussianNB()
nb_model_smote.fit(X=xr_train, y=yr_train)
y_nb_predict_smote = nb_model_smote.predict(X=xr_test)
nb_model_smote.score(X=xr_test, y=yr_test)

0.8767295597484277

In [41]:
# Per-class precision / recall / f1 for the Naive Bayes model.
print(classification_report(y_true=yr_test, y_pred=y_nb_predict_smote))

              precision    recall  f1-score   support

           0       0.93      0.79      0.85       362
           1       0.84      0.95      0.89       433

    accuracy                           0.88       795
   macro avg       0.89      0.87      0.87       795
weighted avg       0.88      0.88      0.88       795



In [42]:
# Confusion matrix for the Naive Bayes model.
print(confusion_matrix(y_true=yr_test, y_pred=y_nb_predict_smote))

[[285  77]
 [ 21 412]]


### 5. SVM

In [44]:
from sklearn.svm import SVC

In [45]:
# Support vector classifier with library-default settings: fit on the
# resampled training data, predict xr_test, and show the score as cell output.
svm_model_smote = SVC()
svm_model_smote.fit(X=xr_train, y=yr_train)
y_svm_predict_smote = svm_model_smote.predict(X=xr_test)
svm_model_smote.score(X=xr_test, y=yr_test)

0.7836477987421384

In [46]:
# Per-class precision / recall / f1 for the SVM model.
print(classification_report(y_true=yr_test, y_pred=y_svm_predict_smote))

              precision    recall  f1-score   support

           0       0.83      0.67      0.74       362
           1       0.76      0.88      0.82       433

    accuracy                           0.78       795
   macro avg       0.79      0.77      0.78       795
weighted avg       0.79      0.78      0.78       795



In [47]:
# Confusion matrix for the SVM model.
print(confusion_matrix(y_true=yr_test, y_pred=y_svm_predict_smote))

[[241 121]
 [ 51 382]]


### 6. Logistic Regression

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
# Logistic regression using the liblinear solver with up to 1000 iterations:
# fit on the resampled training data, predict xr_test, and show the score.
lr_model_smote = LogisticRegression(solver='liblinear', max_iter=1000)
lr_model_smote.fit(X=xr_train, y=yr_train)
y_lr_predict_smote = lr_model_smote.predict(X=xr_test)
lr_model_smote.score(X=xr_test, y=yr_test)

0.9018867924528302

In [51]:
# Per-class precision / recall / f1 for the logistic regression model.
print(classification_report(y_true=yr_test, y_pred=y_lr_predict_smote))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       362
           1       0.91      0.91      0.91       433

    accuracy                           0.90       795
   macro avg       0.90      0.90      0.90       795
weighted avg       0.90      0.90      0.90       795



In [52]:
# Confusion matrix for the logistic regression model.
print(confusion_matrix(y_true=yr_test, y_pred=y_lr_predict_smote))

[[323  39]
 [ 39 394]]


As we are predicting the customers who will churn, we wish to <b>catch as many churn risks as possible</b>.

Hence we use the **Recall score** to choose the best model, as maximizing recall will **minimize false negatives.**
Flagging a few customers who won't actually churn doesn't hurt much.

<b>KNN classifier has a recall score of 0.99 for class 1 (customer will churn)</b>

Hence we choose **KNN classifier** as our final model

# Pickling the model

In [55]:
import pickle

In [56]:
# Path (relative to the working directory) where the final model is serialized.
filename = "model.sav"

In [57]:
# Serialize the chosen KNN model. The context manager guarantees the file is
# flushed and closed even if pickling fails; the previous bare open() call left
# the handle to be closed whenever the garbage collector got to it.
with open(filename, 'wb') as model_file:
    pickle.dump(knn_model_smote, model_file)

In [58]:
# Read the model back to verify the round-trip. The context manager closes the
# file handle deterministically instead of leaking it.
# NOTE: pickle.load executes arbitrary code from the file; this is acceptable
# here only because the file was written by this notebook. Never unpickle a
# model file obtained from an untrusted source.
with open(filename, 'rb') as model_file:
    load_model = pickle.load(model_file)

In [59]:
# Score the reloaded model on the same test data to confirm the pickle round-trip.
model_score_r1 = load_model.score(X=xr_test, y=yr_test)

In [60]:
# Display the reloaded model's score as the cell output; it should match the
# score of the in-memory KNN model.
model_score_r1

0.9622641509433962

Our final model, i.e. the KNN Classifier with SMOTEENN, is now ready and saved in model.sav; we will use it to build APIs so that the model can be accessed from the UI.