In [1]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
%matplotlib inline 

# preprocessing
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# model tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# ensembles
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier


# class imbalance
from sklearn.dummy import DummyClassifier
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# evaluating models
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('../data/preprocessed_cutomer_data.csv',index_col=0)
extra_features = pd.read_csv('../data/new_features.csv',index_col=0)

In [2]:
# train test split
X = df.drop(columns=['churn'])
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.20,random_state=10)

In [3]:
# scaling
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),columns=[X_train.columns])
X_test_scaled = pd.DataFrame(scaler.transform(X_test),columns=[X_test.columns])

In [4]:
# using SMOTE to reduce class imbalance
sm = SMOTE(random_state=64)
X_train_SM, y_train_SM = sm.fit_sample(X_train_scaled, y_train)
X_train_SM = pd.DataFrame(X_train_SM,columns=X_train_scaled.columns)

## Modeling

In [5]:
models_data = []
def scoreboard(classifier,y_true, y_pred):
    ''' 
    returns classifier name, params and
    calculates and returns Accuracy, F1 score, Precision, and Recall using sklearn
    '''    
    model_summary = {'Model': classifier,
                     'Params': classifier.get_params(),
                     'Accuracy':accuracy_score(y_true, y_pred),
                     'F1_score':f1_score(y_true, y_pred),
                     'Precision':precision_score(y_true, y_pred),
                     'Recall':recall_score(y_true, y_pred)}

    models_data.append(model_summary)
    return model_summary

def justscore(y_true, y_pred):
    ''' 
    
    calculates and returns Accuracy, F1 score, Precision, and Recall using sklearn
    '''    
    model_summary = {
                     
                     'Accuracy':accuracy_score(y_true, y_pred),
                     'F1_score':f1_score(y_true, y_pred),
                     'Precision':precision_score(y_true, y_pred),
                     'Recall':recall_score(y_true, y_pred)}


    return model_summary

### Logistic Regression

In [6]:
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train_SM,y_train_SM)
lr_preds = lr.predict(X_test_scaled)
scoreboard(lr,y_test,lr_preds)

{'Model': LogisticRegression(),
 'Params': {'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 100,
  'multi_class': 'auto',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': None,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
 'Accuracy': 0.7511244377811095,
 'F1_score': 0.45394736842105265,
 'Precision': 0.33014354066985646,
 'Recall': 0.7263157894736842}

### KNN

In [7]:
knn = KNeighborsClassifier()
knn.fit(X_train_SM,y_train_SM)
knn_preds = knn.predict(X_test_scaled)
scoreboard(knn,y_test,knn_preds)

{'Model': KNeighborsClassifier(),
 'Params': {'algorithm': 'auto',
  'leaf_size': 30,
  'metric': 'minkowski',
  'metric_params': None,
  'n_jobs': None,
  'n_neighbors': 5,
  'p': 2,
  'weights': 'uniform'},
 'Accuracy': 0.7916041979010495,
 'F1_score': 0.4908424908424908,
 'Precision': 0.37640449438202245,
 'Recall': 0.7052631578947368}

### Support Vector

In [8]:
svc = SVC()
svc.fit(X_train_SM,y_train_SM)
svc_preds = svc.predict(X_test_scaled)
scoreboard(svc,y_test,svc_preds)

{'Model': SVC(),
 'Params': {'C': 1.0,
  'break_ties': False,
  'cache_size': 200,
  'class_weight': None,
  'coef0': 0.0,
  'decision_function_shape': 'ovr',
  'degree': 3,
  'gamma': 'scale',
  'kernel': 'rbf',
  'max_iter': -1,
  'probability': False,
  'random_state': None,
  'shrinking': True,
  'tol': 0.001,
  'verbose': False},
 'Accuracy': 0.9025487256371814,
 'F1_score': 0.6798029556650247,
 'Precision': 0.6388888888888888,
 'Recall': 0.7263157894736842}

### Decision tree

In [9]:
dt = DecisionTreeClassifier()
dt.fit(X_train_SM,y_train_SM)
dt_preds = dt.predict(X_test_scaled)
scoreboard(dt,y_test,dt_preds)

{'Model': DecisionTreeClassifier(),
 'Params': {'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_impurity_split': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'presort': 'deprecated',
  'random_state': None,
  'splitter': 'best'},
 'Accuracy': 0.8860569715142429,
 'F1_score': 0.6481481481481481,
 'Precision': 0.5785123966942148,
 'Recall': 0.7368421052631579}

## Gridsearch

### Decision Tree

In [10]:
parameters = {'criterion':('gini', 'entropy'), 'max_depth':[1,2,5,10]}
gs = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,verbose=1,cv=5)
gs.fit(X_train_SM,y_train_SM)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    1.1s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': [1, 2, 5, 10]},
             verbose=1)

In [11]:
gsb = gs.best_estimator_

In [12]:
gs_preds = gs.best_estimator_.predict(X_test_scaled)
scoreboard(gs,y_test,gs_preds)

{'Model': GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
              param_grid={'criterion': ('gini', 'entropy'),
                          'max_depth': [1, 2, 5, 10]},
              verbose=1),
 'Params': {'cv': 5,
  'error_score': nan,
  'estimator__ccp_alpha': 0.0,
  'estimator__class_weight': None,
  'estimator__criterion': 'gini',
  'estimator__max_depth': None,
  'estimator__max_features': None,
  'estimator__max_leaf_nodes': None,
  'estimator__min_impurity_decrease': 0.0,
  'estimator__min_impurity_split': None,
  'estimator__min_samples_leaf': 1,
  'estimator__min_samples_split': 2,
  'estimator__min_weight_fraction_leaf': 0.0,
  'estimator__presort': 'deprecated',
  'estimator__random_state': None,
  'estimator__splitter': 'best',
  'estimator': DecisionTreeClassifier(),
  'iid': 'deprecated',
  'n_jobs': None,
  'param_grid': {'criterion': ('gini', 'entropy'), 'max_depth': [1, 2, 5, 10]},
  'pre_dispatch': '2*n_jobs',
  'refit': True,
  'return_train_score': False

### Random Forest

In [13]:
rf = RandomForestClassifier(n_estimators=1000,random_state=23)
rf.fit(X_train_SM,y_train_SM)
rf_preds = rf.predict(X_test_scaled)
scoreboard(rf,y_test,rf_preds)

{'Model': RandomForestClassifier(n_estimators=1000, random_state=23),
 'Params': {'bootstrap': True,
  'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': 'auto',
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_impurity_decrease': 0.0,
  'min_impurity_split': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'n_estimators': 1000,
  'n_jobs': None,
  'oob_score': False,
  'random_state': 23,
  'verbose': 0,
  'warm_start': False},
 'Accuracy': 0.9475262368815592,
 'F1_score': 0.8205128205128205,
 'Precision': 0.8,
 'Recall': 0.8421052631578947}

In [14]:
grf = RandomForestClassifier()
gsrf_params = {'bootstrap':[True, False],
               'criterion':['gini','entropy'],
               'max_depth':[None,1,5,10,25],
               'max_features':['auto',3,8,17],
               'max_leaf_nodes':[None,2,5,10],
               'min_impurity_split':[None,.001,.1,1],
               'min_samples_leaf':[1,5,10] ,
               'min_samples_split':[2,5,10] ,
               'n_estimators':[10,50],
               'random_state':[23],
               'warm_start':[False,True],
               'class_weight':[None,'balanced']}
gs_rf = GridSearchCV(grf,gsrf_params,verbose=1,n_jobs=-1)

In [19]:
gs_rf_result  = RandomForestClassifier(bootstrap=False, class_weight=None,
                       criterion='entropy', max_depth=None, max_features=3,
                       max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=23, verbose=0,
                       warm_start=False)

In [20]:
gs_rf_result.fit(X_train_SM,y_train_SM)
gs_rf_preds = gs_rf_result.predict(X_test_scaled)
scoreboard(gs_rf_result,y_test,gs_rf_preds)

{'Model': RandomForestClassifier(bootstrap=False, criterion='entropy', max_features=3,
                        n_estimators=50, random_state=23),
 'Params': {'bootstrap': False,
  'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 3,
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_impurity_decrease': 0.0,
  'min_impurity_split': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'n_estimators': 50,
  'n_jobs': None,
  'oob_score': False,
  'random_state': 23,
  'verbose': 0,
  'warm_start': False},
 'Accuracy': 0.9565217391304348,
 'F1_score': 0.839779005524862,
 'Precision': 0.8837209302325582,
 'Recall': 0.8}

## Summary

In [21]:
models_df = pd.DataFrame(models_data)

In [22]:
models_df.sort_values(by=['F1_score'],ascending=False)

Unnamed: 0,Model,Params,Accuracy,F1_score,Precision,Recall
6,"(DecisionTreeClassifier(criterion='entropy', m...","{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.956522,0.839779,0.883721,0.8
5,"(DecisionTreeClassifier(max_features='auto', r...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.947526,0.820513,0.8,0.842105
4,"GridSearchCV(cv=5, estimator=DecisionTreeClass...","{'cv': 5, 'error_score': nan, 'estimator__ccp_...",0.934033,0.765957,0.774194,0.757895
2,SVC(),"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.902549,0.679803,0.638889,0.726316
3,DecisionTreeClassifier(),"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.886057,0.648148,0.578512,0.736842
1,KNeighborsClassifier(),"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.791604,0.490842,0.376404,0.705263
0,LogisticRegression(),"{'C': 1.0, 'class_weight': None, 'dual': False...",0.751124,0.453947,0.330144,0.726316
