In [3]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
%matplotlib inline 

# preprocessing
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# model tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# ensembles
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier


# class imbalance
from sklearn.dummy import DummyClassifier
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# evaluating models
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

import warnings
# Install a global 'ignore' filter: with no message/category/module given it
# matches every warning raised through the warnings module from here on.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Load the modelling table and the additional feature table.
# index_col=0 makes pandas use the first column of each CSV as the row index
# instead of reading it as a regular data column.
csv_options = {'index_col': 0}
df = pd.read_csv('../data/preprocessed_cutomer_data.csv', **csv_options)
extra_features = pd.read_csv('../data/new_features.csv', **csv_options)

In [4]:
# train test split
# 'churn' is the target; every other column of df is used as a feature.
y = df['churn']
X = df.drop(columns=['churn'])

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the same split is produced on every run.
# NOTE(review): the split is not stratified on the target - consider
# stratify=y if the class balance of the hold-out set matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.20,
    random_state=10,
)

In [5]:
# scaling
# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics are used during fitting.
scaler = StandardScaler()

# The scaled values are re-wrapped as DataFrames. Two details matter:
#   * columns gets the Index itself - wrapping it in a list makes pandas
#     build a one-level MultiIndex whose labels are 1-tuples;
#   * index keeps the original row labels so the scaled frames stay aligned
#     with y_train / y_test for any later label-based join or concat.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [6]:
# using SMOTE to reduce class imbalance
# Only the (scaled) training split is resampled; the test split is not passed
# to SMOTE and therefore keeps its original class distribution.
sm = SMOTE(random_state=64)

# fit_resample is the supported imbalanced-learn entry point; fit_sample was
# a deprecated alias that has been removed in later releases.
X_train_SM, y_train_SM = sm.fit_resample(X_train_scaled, y_train)

# Re-wrap the resampled features so they are a DataFrame carrying the scaled
# frame's column labels, whether fit_resample returned an array or a frame.
X_train_SM = pd.DataFrame(X_train_SM, columns=X_train_scaled.columns)

## Modeling

In [7]:
# Running log of every summary produced by scoreboard(), in call order.
models_data = []


def scoreboard(classifier, y_true, y_pred):
    '''
    Score one classifier's predictions and record the result.

    Builds a dict holding the classifier object ('Model'), the output of
    classifier.get_params() ('Params') and the sklearn Accuracy, F1 score,
    Precision and Recall of y_pred measured against y_true.

    The dict is appended to the module-level models_data list and is also
    returned to the caller.
    '''
    metric_table = (
        ('Accuracy', accuracy_score),
        ('F1_score', f1_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
    )

    model_summary = {'Model': classifier, 'Params': classifier.get_params()}
    for label, metric in metric_table:
        model_summary[label] = metric(y_true, y_pred)

    models_data.append(model_summary)
    return model_summary

def justscore(y_true, y_pred):
    '''
    Compute Accuracy, F1 score, Precision and Recall using sklearn.

    Returns a dict keyed 'Accuracy', 'F1_score', 'Precision' and 'Recall',
    each value being the corresponding sklearn metric of y_pred against
    y_true. Nothing besides the returned dict is produced.
    '''
    metric_table = {
        'Accuracy': accuracy_score,
        'F1_score': f1_score,
        'Precision': precision_score,
        'Recall': recall_score,
    }
    return {
        label: metric(y_true, y_pred)
        for label, metric in metric_table.items()
    }

### Logistic Regression

In [8]:
# Train on the SMOTE-resampled training data, predict on the scaled test
# split, then log the metrics. fit() returns the estimator, so it can be
# chained onto the constructor.
lr = LogisticRegression(solver='lbfgs').fit(X_train_SM, y_train_SM)
lr_preds = lr.predict(X_test_scaled)

# Last expression of the cell: the returned summary dict is displayed.
scoreboard(classifier=lr, y_true=y_test, y_pred=lr_preds)

{'Model': LogisticRegression(),
 'Params': {'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 100,
  'multi_class': 'auto',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': None,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
 'Accuracy': 0.7511244377811095,
 'F1_score': 0.45394736842105265,
 'Precision': 0.33014354066985646,
 'Recall': 0.7263157894736842}

### KNN

In [9]:
# K-nearest-neighbours with library defaults: fitted on the SMOTE-resampled
# training data and evaluated on the scaled test split.
knn = KNeighborsClassifier().fit(X_train_SM, y_train_SM)
knn_preds = knn.predict(X_test_scaled)

# Record the metrics in models_data and display the summary dict.
scoreboard(classifier=knn, y_true=y_test, y_pred=knn_preds)

{'Model': KNeighborsClassifier(),
 'Params': {'algorithm': 'auto',
  'leaf_size': 30,
  'metric': 'minkowski',
  'metric_params': None,
  'n_jobs': None,
  'n_neighbors': 5,
  'p': 2,
  'weights': 'uniform'},
 'Accuracy': 0.7916041979010495,
 'F1_score': 0.4908424908424908,
 'Precision': 0.37640449438202245,
 'Recall': 0.7052631578947368}

### Support Vector Classifier

In [10]:
# Support vector classifier with library defaults: fitted on the
# SMOTE-resampled training data and evaluated on the scaled test split.
svc = SVC().fit(X_train_SM, y_train_SM)
svc_preds = svc.predict(X_test_scaled)

# Record the metrics in models_data and display the summary dict.
scoreboard(classifier=svc, y_true=y_test, y_pred=svc_preds)

{'Model': SVC(),
 'Params': {'C': 1.0,
  'break_ties': False,
  'cache_size': 200,
  'class_weight': None,
  'coef0': 0.0,
  'decision_function_shape': 'ovr',
  'degree': 3,
  'gamma': 'scale',
  'kernel': 'rbf',
  'max_iter': -1,
  'probability': False,
  'random_state': None,
  'shrinking': True,
  'tol': 0.001,
  'verbose': False},
 'Accuracy': 0.9025487256371814,
 'F1_score': 0.6798029556650247,
 'Precision': 0.6388888888888888,
 'Recall': 0.7263157894736842}

### Decision Tree

In [11]:
# Decision tree with library defaults: fitted on the SMOTE-resampled training
# data and evaluated on the scaled test split.
# NOTE(review): no random_state is set on the tree, so repeated runs may not
# reproduce the exact same scores - confirm whether that is acceptable.
dt = DecisionTreeClassifier().fit(X_train_SM, y_train_SM)
dt_preds = dt.predict(X_test_scaled)

# Record the metrics in models_data and display the summary dict.
scoreboard(classifier=dt, y_true=y_test, y_pred=dt_preds)

{'Model': DecisionTreeClassifier(),
 'Params': {'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_impurity_split': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'presort': 'deprecated',
  'random_state': None,
  'splitter': 'best'},
 'Accuracy': 0.8770614692653673,
 'F1_score': 0.6272727272727273,
 'Precision': 0.552,
 'Recall': 0.7263157894736842}