# 1. Loading the Libraries

In [118]:
import pandas as pd
import numpy as np
from pandas import Series
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_ml import ConfusionMatrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, auc, roc_auc_score
%matplotlib inline

# 2. Loading the Dataset

In [119]:
churn_data = pd.read_csv('df.csv')

# 3. Data Preprocessing, EDA & Feature selection

## a. View Data Attributes

In [120]:
churn_data.head()

Unnamed: 0,age_new,churn_indicator_new,discount_amt_cum,gender_new,gross_amt_cum,marital_status_new,online_count_cum,order_size_cum,recency_new,redemption_count_cum,store_count_cum,weeks_old_new
0,56,1,0,1,0,0,5,7,51,2,1,51
1,38,0,0,0,0,0,1,1,8,0,4,23
2,45,1,0,0,0,0,1,5,12,1,0,12
3,45,1,0,0,0,0,0,7,510,2,2,510
4,45,1,0,0,0,0,0,5,455,0,7,505


In [None]:
churn_data.shape

In [None]:
churn_data.churn_indicator_new.value_counts()

In [None]:
churn_data.describe()

In [124]:
churn_data.dtypes

age_new                 int64
churn_indicator_new     int64
discount_amt_cum        int64
gender_new              int64
gross_amt_cum           int64
marital_status_new      int64
online_count_cum        int64
order_size_cum          int64
recency_new             int64
redemption_count_cum    int64
store_count_cum         int64
weeks_old_new           int64
dtype: object

In [125]:
churn_data['gender_new'] = churn_data['gender_new'].astype('category',copy=False)

In [126]:
y = churn_data.churn_indicator_new

In [127]:
X = churn_data.ix[:, churn_data.columns != 'churn_indicator_new']

In [128]:
X_colnames = X.columns.tolist()

## i. Split the dataset into training and test

In [129]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# 3. Modeling

## a. Logistic Regression

In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)

In [131]:
cm_logistic_train = ConfusionMatrix(y_train, y_train_pred)

In [132]:
y_test_pred = clf.predict(X_test)

In [133]:
cm_logistic_test = ConfusionMatrix(y_test, y_test_pred)

In [134]:
auc_logistic_train = roc_auc_score(y_train, y_train_pred)
auc_logistic_test = roc_auc_score(y_test, y_test_pred)

## b. Decision Tree

In [135]:
from sklearn import tree

In [136]:
dt = tree.DecisionTreeClassifier()

In [137]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [138]:
yhat_dt_train = dt.predict(X_train)

In [139]:
cm_dt_train = ConfusionMatrix(y_train, yhat_dt_train)

In [140]:
yhat_dt_test = dt.predict(X_test)

In [141]:
cm_dt_test = ConfusionMatrix(y_test, yhat_dt_test)

In [142]:
auc_dt_train = roc_auc_score(y_train, yhat_dt_train)
auc_dt_test = roc_auc_score(y_test, yhat_dt_test)

## c. SVC

In [143]:
from sklearn.svm import SVC

In [144]:
svc = SVC()

In [145]:
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [146]:
yhat_svc_train = svc.predict(X_train)

In [147]:
cm_svc_train = ConfusionMatrix(y_train, yhat_svc_train)

In [148]:
yhat_svc_test = svc.predict(X_test)

In [149]:
cm_svc_test = ConfusionMatrix(y_test, yhat_svc_test)

In [150]:
auc_svc_train = roc_auc_score(y_train, yhat_svc_train)
auc_svc_test = roc_auc_score(y_test, yhat_svc_test)

## d. Random Forest

In [151]:
from sklearn.ensemble import RandomForestClassifier

In [152]:
rf = RandomForestClassifier()

In [153]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [154]:
yhat_rf_train = rf.predict(X_train)

In [155]:
cm_rf_train = ConfusionMatrix(y_train, yhat_rf_train)

In [156]:
yhat_rf_test = rf.predict(X_test)

In [157]:
cm_rf_test = ConfusionMatrix(y_test, yhat_rf_test)

In [158]:
auc_rf_train = roc_auc_score(y_train, yhat_rf_train)
auc_rf_test = roc_auc_score(y_test, yhat_rf_test)

# 3. Model Comparison and selection of the best

In [159]:
TPR = pd.Series([cm_logistic_train.TPR, cm_logistic_test.TPR, cm_dt_train.TPR, cm_dt_test.TPR, cm_svc_train.TPR, cm_svc_test.TPR, cm_rf_train.TPR, cm_rf_test.TPR])
TNR = pd.Series([cm_logistic_train.TNR, cm_logistic_test.TNR, cm_dt_train.TNR, cm_dt_test.TNR, cm_svc_train.TNR, cm_svc_test.TNR, cm_rf_train.TNR, cm_rf_test.TNR])
F1 = pd.Series([cm_logistic_train.F1_score, cm_logistic_test.F1_score, cm_dt_train.F1_score, cm_dt_test.F1_score, cm_svc_train.F1_score, cm_svc_test.F1_score, cm_rf_train.F1_score, cm_rf_test.F1_score])
AUC = pd.Series([auc_logistic_train, auc_logistic_test, auc_dt_train, auc_dt_test, auc_svc_train, auc_svc_test, auc_rf_train, auc_rf_test])

index = pd.Series(['Logistic_Train', 'Logistic_Test', 'DT_Train', 'DT_Test', 'SVC_Train', 'SVC_Test', 'RF_Train', 'RF_Test'])
results = pd.concat([index, TPR,TNR, F1, AUC], axis=1)
results.columns = ['Model', 'Sensitivity', 'Specificity', 'F1', 'AUC']
results

Unnamed: 0,Model,Sensitivity,Specificity,F1,AUC
0,Logistic_Train,0.593023,0.481752,0.591304,0.98018
1,Logistic_Test,0.5,0.6,0.555556,0.978154
2,DT_Train,0.581395,0.474453,0.581395,1.0
3,DT_Test,0.45,0.6,0.514286,1.0
4,SVC_Train,0.581395,0.474453,0.581395,1.0
5,SVC_Test,1.0,0.266667,0.784314,0.608696
6,RF_Train,0.581395,0.474453,0.581395,1.0
7,RF_Test,0.45,0.6,0.514286,0.992647
