## INFERENCE

In [1]:
#import Libraries
import warnings
warnings.filterwarnings("ignore")
import pyodbc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
churn_data_raw = pd.read_csv('data/churn_data_fe.csv')
churn_data_raw1 = pd.read_csv('data/churn_data_fe1.csv')

##### Loading the Models

In [3]:
import pickle

with open('models/robust_rfc_pkl' , 'rb') as f:
    rfc_tuned = pickle.load(f)

In [4]:
with open('models/robust_xgb_pkl' , 'rb') as f:
    xgb_tuned = pickle.load(f)

In [5]:
with open('models/rf_under_2_1_pkl' , 'rb') as f:
    rfc = pickle.load(f)

In [6]:
with open('models/xgb_under_2_1_pkl' , 'rb') as f:
    xgb = pickle.load(f)

In [7]:
with open('models/random_forest_pkl' , 'rb') as f:
    random_forest = pickle.load(f)

In [9]:
with open('models/xgboost_pkl' , 'rb') as f:
    xgboost = pickle.load(f)

### Predicting May

In [15]:
churn_pred = churn_data_raw.copy()

In [16]:
churn_pred.head()

Unnamed: 0,customer_id,mnth1,mnth2,mnth3,mnth4,mnth5,mnth6,age,card,ussd,...,customer_segment_freq,generation_freq,occupation_freq,region_name_freq,Active,Inactive,F,M,avg_5mnths_cnt,5mnths_cnt_std
0,71,0,24,0,1,0,0,45,1,0,...,561040,620852,480841,555244,0,1,0,1,5,10.630146
1,112,8,1,14,22,3,0,83,1,0,...,561040,163871,480841,555244,0,1,0,1,9,8.544004
2,194,96,126,68,72,98,128,75,0,0,...,561040,163871,352146,55641,1,0,1,0,92,23.366643
3,378,176,208,166,236,288,184,70,1,0,...,561040,163871,480841,946701,1,0,0,1,214,49.345719
4,378,176,208,166,236,288,184,70,1,0,...,887694,163871,480841,946701,1,0,0,1,214,49.345719


In [17]:
#month 1 which is november will be dropped and month6 which is april will be kept. Unlike what was done in training.
churn_pred.drop(columns=['mnth1', 'customer_id', 'churn'], axis=1, inplace=True)

churn_pred.rename(columns={'mnth2':'mnth1', 'mnth3':'mnth2', 'mnth4':'mnth3', 'mnth5':'mnth4', 'mnth6':'mnth5'},
                  inplace=True)

In [18]:
churn_pred.head()

Unnamed: 0,mnth1,mnth2,mnth3,mnth4,mnth5,age,card,ussd,newmobile,customer_segment_freq,generation_freq,occupation_freq,region_name_freq,Active,Inactive,F,M,avg_5mnths_cnt,5mnths_cnt_std
0,24,0,1,0,0,45,1,0,1,561040,620852,480841,555244,0,1,0,1,5,10.630146
1,1,14,22,3,0,83,1,0,0,561040,163871,480841,555244,0,1,0,1,9,8.544004
2,126,68,72,98,128,75,0,0,1,561040,163871,352146,55641,1,0,1,0,92,23.366643
3,208,166,236,288,184,70,1,0,1,561040,163871,480841,946701,1,0,0,1,214,49.345719
4,208,166,236,288,184,70,1,0,1,887694,163871,480841,946701,1,0,0,1,214,49.345719


#### Using Random Forest to Predict May

In [19]:
#Random Forest Tuned
churn_may = rfc_tuned.predict(churn_pred)

In [20]:
#Random Forest Undersampled 2:1
churn_may_u2 = rfc.predict(churn_pred)
#Random Forest Undersampled 1:1
churn_may_u1 = random_forest.predict(churn_pred)

In [21]:
### put the predictions in a data frame
churn_may = pd.DataFrame(churn_may, index=churn_pred.index, columns=['churn_pred_may'])
churn_may_u2 = pd.DataFrame(churn_may_u2, index=churn_pred.index, columns=['churn_pred_may'])
churn_may_u1 = pd.DataFrame(churn_may_u1, index=churn_pred.index, columns=['churn_pred_may'])

In [23]:
churn_may.head()

Unnamed: 0,churn_pred_may
0,1
1,1
2,0
3,0
4,0


In [24]:
#concat predictions dataframe with churn_data_raw
churn_may_pred = pd.concat([churn_data_raw, churn_may], axis=1)
churn_may_pred_u2 = pd.concat([churn_data_raw, churn_may_u2], axis=1)
churn_may_pred_u1 = pd.concat([churn_data_raw, churn_may_u1], axis=1)

In [26]:
churn_may_pred.head()

Unnamed: 0,customer_id,mnth1,mnth2,mnth3,mnth4,mnth5,mnth6,age,card,ussd,...,generation_freq,occupation_freq,region_name_freq,Active,Inactive,F,M,avg_5mnths_cnt,5mnths_cnt_std,churn_pred_may
0,71,0,24,0,1,0,0,45,1,0,...,620852,480841,555244,0,1,0,1,5,10.630146,1
1,112,8,1,14,22,3,0,83,1,0,...,163871,480841,555244,0,1,0,1,9,8.544004,1
2,194,96,126,68,72,98,128,75,0,0,...,163871,352146,55641,1,0,1,0,92,23.366643,0
3,378,176,208,166,236,288,184,70,1,0,...,163871,480841,946701,1,0,0,1,214,49.345719,0
4,378,176,208,166,236,288,184,70,1,0,...,163871,480841,946701,1,0,0,1,214,49.345719,0


#### Using XGBoost to Predict May

In [27]:
#XG Boost Tuned
churn_mayx = xgb_tuned.predict(churn_pred)

In [28]:
#XG Boost Undersampled 2:1
churn_mayx_u2 = xgb.predict(churn_pred)
#XG Boost Undersampled 1:1
churn_mayx_u1 = xgboost.predict(churn_pred)

In [29]:
### put the predictions in a data frame
churn_mayx = pd.DataFrame(churn_mayx, index=churn_pred.index, columns=['churn_pred_may'])
churn_mayx_u2 = pd.DataFrame(churn_mayx_u2, index=churn_pred.index, columns=['churn_pred_may'])
churn_mayx_u1 = pd.DataFrame(churn_mayx_u1, index=churn_pred.index, columns=['churn_pred_may'])

In [31]:
churn_mayx.head()

Unnamed: 0,churn_pred_may
0,1
1,1
2,0
3,0
4,0


In [32]:
#concat predictions dataframe with churn_data_raw
churn_mayx_pred = pd.concat([churn_data_raw, churn_mayx], axis=1)
churn_mayx_pred_u2 = pd.concat([churn_data_raw, churn_mayx_u2], axis=1)
churn_mayx_pred_u1 = pd.concat([churn_data_raw, churn_mayx_u1], axis=1)

In [34]:
churn_mayx_pred.head()

Unnamed: 0,customer_id,mnth1,mnth2,mnth3,mnth4,mnth5,mnth6,age,card,ussd,...,generation_freq,occupation_freq,region_name_freq,Active,Inactive,F,M,avg_5mnths_cnt,5mnths_cnt_std,churn_pred_may
0,71,0,24,0,1,0,0,45,1,0,...,620852,480841,555244,0,1,0,1,5,10.630146,1
1,112,8,1,14,22,3,0,83,1,0,...,163871,480841,555244,0,1,0,1,9,8.544004,1
2,194,96,126,68,72,98,128,75,0,0,...,163871,352146,55641,1,0,1,0,92,23.366643,0
3,378,176,208,166,236,288,184,70,1,0,...,163871,480841,946701,1,0,0,1,214,49.345719,0
4,378,176,208,166,236,288,184,70,1,0,...,163871,480841,946701,1,0,0,1,214,49.345719,0


#####  Evaluate Prediction

In [35]:
inf_raw = pd.read_csv('data/inf_fe.csv')

In [36]:
inf = inf_raw.copy()

In [37]:
inf.head()

Unnamed: 0,customer_id,mnth1,mnth2,mnth3,mnth4,mnth5,mnth6,age,card,ussd,...,customer_segment_freq,generation_freq,occupation_freq,region_name_freq,Active,Inactive,F,M,avg_5mnths_cnt,5mnths_cnt_std
0,71,24,0,1,0,0,0,45,1,0,...,562876,628607,495865,564708,0,1,0,1,5,10.630146
1,465,19,23,35,59,38,32,60,0,1,...,458070,165729,495865,965667,1,0,0,1,34,15.684387
2,582,38,17,31,18,21,32,81,1,0,...,925096,165729,292515,564708,1,0,0,1,25,9.110434
3,601,53,56,47,70,37,8,71,1,1,...,589170,165729,12964,965667,1,0,0,1,52,12.124356
4,738,1,2,6,10,8,2,57,0,0,...,925096,628607,495865,564708,1,0,0,1,5,3.741657


#### Random Forest Evaluation

In [38]:
#create dataframes for comparing the models
compare_rfc = pd.merge(churn_may_pred[['customer_id', 'churn_pred_may']], inf[['customer_id', 'churn']], on='customer_id')
compare_rfc_u2 = pd.merge(churn_may_pred_u2[['customer_id', 'churn_pred_may']], inf[['customer_id', 'churn']], on='customer_id')
compare_rfc_u1 = pd.merge(churn_may_pred_u1[['customer_id', 'churn_pred_may']], inf[['customer_id', 'churn']],on='customer_id')

In [40]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
#rfc tuned model
pd.DataFrame(confusion_matrix(compare_rfc['churn_pred_may'], compare_rfc['churn']), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,2558646,2
Actual: True,121420,307512


In [41]:
#rfc undersampled 2:1 model
pd.DataFrame(confusion_matrix(compare_rfc_u2['churn_pred_may'], compare_rfc_u2['churn']), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,2558633,0
Actual: True,121433,307514


In [43]:
#rfc undersampled 1:1 model
pd.DataFrame(confusion_matrix(compare_rfc_u1['churn_pred_may'], compare_rfc_u1['churn']), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,2558634,7
Actual: True,121432,307507


In [None]:
print(classification_report(compare_rfc['churn_pred_may'], compare_rfc['churn']))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98   2558648
           1       1.00      0.72      0.84    428932

    accuracy                           0.96   2987580
   macro avg       0.98      0.86      0.91   2987580
weighted avg       0.96      0.96      0.96   2987580



In [45]:
print(classification_report(compare_rfc_u2['churn_pred_may'], compare_rfc_u2['churn']))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98   2558633
           1       1.00      0.72      0.84    428947

    accuracy                           0.96   2987580
   macro avg       0.98      0.86      0.91   2987580
weighted avg       0.96      0.96      0.96   2987580



In [47]:
print(classification_report(compare_rfc_u1['churn_pred_may'], compare_rfc_u1['churn']))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98   2558641
           1       1.00      0.72      0.84    428939

    accuracy                           0.96   2987580
   macro avg       0.98      0.86      0.91   2987580
weighted avg       0.96      0.96      0.96   2987580



The undersampled random forest model of ratio 2:1 seems to perform better than the other random forest models as seen in the confusion matrix.

#### XGBoost Evaluation

In [49]:
compare_xgb = pd.merge(churn_mayx_pred[['customer_id', 'churn_pred_may']], inf[['customer_id', 'churn']], on='customer_id')
compare_xgb_u2 = pd.merge(churn_mayx_pred_u2[['customer_id', 'churn_pred_may']], inf[['customer_id', 'churn']],on='customer_id')
compare_xgb_u1 = pd.merge(churn_mayx_pred_u1[['customer_id', 'churn_pred_may']], inf[['customer_id', 'churn']],on='customer_id')

In [None]:
#xgb tuned model
pd.DataFrame(confusion_matrix(compare_xgb['churn_pred_may'], compare_xgb['churn']), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,2580017,4937
Actual: True,100049,302577


In [51]:
#xgb undersampled 2:1 model
pd.DataFrame(confusion_matrix(compare_xgb_u2['churn_pred_may'], compare_xgb_u2['churn']), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,2563944,625
Actual: True,116122,306889


In [52]:
#xgb undersampled 1:1 model
pd.DataFrame(confusion_matrix(compare_xgb_u1['churn_pred_may'], compare_xgb_u1['churn']), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,2562565,134
Actual: True,117501,307380


In [None]:
print(classification_report(compare_xgb['churn_pred_may'], compare_xgb['churn']))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98   2584954
           1       0.98      0.75      0.85    402626

    accuracy                           0.96   2987580
   macro avg       0.97      0.87      0.92   2987580
weighted avg       0.97      0.96      0.96   2987580



In [55]:
print(classification_report(compare_xgb_u2['churn_pred_may'], compare_xgb_u2['churn']))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98   2564569
           1       1.00      0.73      0.84    423011

    accuracy                           0.96   2987580
   macro avg       0.98      0.86      0.91   2987580
weighted avg       0.96      0.96      0.96   2987580



In [56]:
print(classification_report(compare_xgb_u1['churn_pred_may'], compare_xgb_u1['churn']))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98   2562699
           1       1.00      0.72      0.84    424881

    accuracy                           0.96   2987580
   macro avg       0.98      0.86      0.91   2987580
weighted avg       0.96      0.96      0.96   2987580



The best XGBoost Model is the one tuned without undersampling, with a recall of 75%. In this case we're more particular about the recall than the precision

The random Forest Models seem to do better than the xgboost on inference in classifying true positives as seen in the confusion matrix. So we'll go with the best RFC model, which is the one undersampled in 2:1.

## Conclusion

The Best Model seems to be our second Random Forest Classifier.

The performance of the model on inference will be improved in version 2

#### Best Model at the moment

In [None]:
#rfc undersampled 2:1 model
pd.DataFrame(confusion_matrix(compare_rfc_u2['churn_pred_may'], compare_rfc_u2['churn']), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,2558633,0
Actual: True,121433,307514
