# Bank Churn Prediction

## Objective:
Given a Bank customer, build a neural network based classifier that can determine whether they will leave or not in the next 6 months

## Context:
Businesses like banks that provide services have to worry about the problem of 'churn', i.e. customers leaving and joining another service provider. It is important to understand which aspects of the service influence a customer's decision in this regard. Management can then concentrate its improvement efforts on these priorities.

### 1. Read the data set

In [7]:
import tensorflow as tf
print(tf.__version__)

2.0.0


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
from tensorflow.keras import optimizers
import warnings
warnings.filterwarnings("ignore")

In [9]:
# Reading the dataset: load bank.csv (looked up relative to the working directory) into a DataFrame
data = pd.read_csv('bank.csv')
# Preview the first five rows
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [10]:
data.shape

(10000, 14)

In [11]:
# Not necessary, but to have a basic understanding of data distribution
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RowNumber,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
CustomerId,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48


In [12]:
# Null values
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

### 2. Drop the columns which are unique for all users like IDs

In [13]:
# Remove the identifier-style columns (row number, customer ID and surname) in a
# single call; they are not used as features when building the model
data.drop(columns=["RowNumber", "CustomerId", "Surname"], inplace=True)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### 3. Distinguish the features and target variable

In [14]:
# Select the target by name rather than by position, so a change in column
# order cannot silently turn a feature into the target
y_data = data["Exited"]

In [15]:
# Remove the target column so that `data` holds only the input features
data.drop("Exited", axis = 1, inplace = True)

### Creating Dummy Variables

In [16]:
# One-hot encode Geography; drop_first=True leaves out the first category so
# the remaining indicator columns are not redundant
gen = pd.get_dummies(data['Geography'], prefix='Geography', drop_first=True)
# Swap the original text column for its indicator columns (appended at the end)
data = pd.concat([data.drop("Geography", axis=1), gen], axis=1)

In [17]:
# One-hot encode Gender; drop_first=True leaves out the first category so a
# single indicator column remains
gen = pd.get_dummies(data['Gender'], prefix='Gender', drop_first=True)
# Swap the original text column for its indicator column (appended at the end)
data = pd.concat([data.drop("Gender", axis=1), gen], axis=1)

In [18]:
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [19]:
# X_data is a second name for the same feature DataFrame (no copy is made)
X_data = data

In [20]:
X_data.shape

(10000, 11)

In [21]:
X_data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


### 4. Divide the data set into training and test sets

In [22]:
# Hold out 20% of the rows as the test set; random_state=0 keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size = 0.2, random_state = 0)

### 5. Normalize the train and test data

In [23]:
#X_train = preprocessing.normalize(X_train)

In [24]:
# For the above given dataset I find standardization to be more beneficial than normalization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then apply the same fitted scaler
# to the test split so no test-set statistics are used during fitting
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train.shape

(8000, 11)

### 6. Initialize & build the model

### The model here reflects the outcome of the hyperparameter tuning. The tuning implementation can be seen in the "Tuning the model" section below

In [25]:
model = Sequential()

In [26]:
# Two hidden layers of 10 ReLU units each; the first one declares the 11 input features
model.add(Dense(units=10, kernel_initializer='uniform', input_shape = (11,), activation = 'relu'))
model.add(Dense(units=10, kernel_initializer='uniform', activation = 'relu'))
# Single sigmoid unit: the output is a value between 0 and 1 for the binary target
model.add(Dense(units=1, kernel_initializer='uniform', activation = 'sigmoid'))

In [27]:
#sgd = optimizers.Adam(lr = 0.001)

In [28]:
# Binary cross-entropy loss for the single sigmoid output; accuracy is the reported metric
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                120       
_________________________________________________________________
dense_1 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 241
Trainable params: 241
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Train for 50 epochs with mini-batches of 10; validation_split=0.33 sets aside part of
# X_train for validation (the recorded run shows 5359 training / 2641 validation samples)
model.fit(X_train, y_train.values, validation_split=0.33, batch_size = 10, epochs = 50, verbose = 1)

Train on 5359 samples, validate on 2641 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1e06ea89408>

In [31]:
results = model.evaluate(X_test, y_test.values)



In [32]:
print(model.metrics_names)
print(results)   

['loss', 'accuracy']
[0.401703532576561, 0.8405]


### Evaluating the model using K-Fold Cross validation

### It is important to ensure that our model is accurate and has low variance. It can be seen that the Exited output variable is not equally distributed among the data. Therefore, stratified K-fold cross-validation (StratifiedKFold) is employed to check whether the model has acceptable variance

In [33]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [34]:
def keras_k_fold_model():
    """Build and compile a fresh copy of the churn classifier.

    The network takes 11 input features, has two hidden layers of 10 ReLU
    units and a single sigmoid output unit. It is compiled with the adam
    optimizer, binary cross-entropy loss and the accuracy metric.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    # Settings shared by both hidden layers
    hidden_kwargs = dict(units=10, kernel_initializer='uniform', activation='relu')
    network = Sequential([
        Dense(input_shape=(11,), **hidden_kwargs),
        Dense(**hidden_kwargs),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

In [35]:
# K-fold (10 fold) cross validation
Kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state = 10)

In [36]:
# model using KerasClassifier
classifier = KerasClassifier(build_fn=keras_k_fold_model, epochs=100, batch_size=10)

In [37]:
# k-fold cross-validation results: one score per fold produced by the Kfold splitter
cross_val_result = cross_val_score(classifier, X_train, y_train.values, cv = Kfold)



In [51]:
# Mean of the per-fold cross-validation scores
print('mean of the model is : ', cross_val_result.mean())
# NOTE: .std() is the standard deviation of the per-fold scores, although the
# printed label calls it "variance"
print('variance in the model is : ', cross_val_result.std())

mean of the model is :  0.8450000047683716
variance in the model is :  0.009503291601344412


### The standard deviation of the score across the 10 folds is very low (about 0.0095), which means the model's performance is stable across different splits of the training data

### 6. Identify the points of improvement and implement the same

### Tuning the model

In [52]:
# Tuning the ANN
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [53]:
def model_tuning(optimizer = 'rmsprop'):
    """Build and compile the churn classifier for hyperparameter search.

    The network takes 11 input features, has two hidden layers of 10 ReLU
    units and a single sigmoid output unit.

    Args:
        optimizer: Optimizer forwarded to ``compile`` (for example
            ``'rmsprop'`` or ``'adam'``). Earlier versions of this function
            ignored the argument and always compiled with ``'adam'``, which
            made every grid-search candidate identical.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    classifier = Sequential()
    classifier.add(Dense(units=10, kernel_initializer='uniform', input_shape = (11,), activation = 'relu'))
    classifier.add(Dense(units=10, kernel_initializer='uniform', activation = 'relu'))
    classifier.add(Dense(units=1, kernel_initializer='uniform', activation = 'sigmoid'))
    # Use the optimizer chosen by the caller so the search actually compares candidates
    classifier.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics=['accuracy'])
    return classifier

In [54]:
model_tune = KerasClassifier(build_fn = model_tuning, epochs = 100, batch_size=10)

In [55]:
# Grid search over the optimizer only
# Here, various parameters can be tuned using GridsearchCV method.
# The candidate list is deliberately not called `optimizers`, so it does not
# shadow the `tensorflow.keras.optimizers` module imported at the top of the notebook.
optimizer_choices = ['rmsprop', 'adam']
#init = ['normal', 'uniform']
#epochs = [50, 100]
#batches = [10, 20]
#param_grid = dict(optimizer=optimizer_choices, nb_epoch=epochs, init=init)
param_grid = dict(optimizer=optimizer_choices)
grid = GridSearchCV(estimator=model_tune, param_grid = param_grid)
grid_result = grid.fit(X_train, y_train)

Train on 8000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

In [56]:
# Printing out the best parameters after parameter tuning
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Per-candidate mean and standard deviation of the cross-validated test score,
# together with the parameter combination that produced them
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

# One line per candidate: mean score (standard deviation) with the parameters used
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.853875 using {'optimizer': 'adam'}
0.848250 (0.012768) with: {'optimizer': 'rmsprop'}
0.853875 (0.008305) with: {'optimizer': 'adam'}


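###  As shown in the hyperparameter tuning above, the adam optimizer scored slightly higher than rmsprop (0.854 vs 0.848 mean test score), so adam is used for the model; note that the gap is smaller than the standard deviation of the scores across folds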

### 7. Predict the results using 0.5 as a threshold

In [57]:
# Test set results Prediction
y_pred = model.predict(X_test)

In [58]:
# Converting y_pred to a dataframe
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.head()

Unnamed: 0,0
0,0.263152
1,0.385279
2,0.150986
3,0.064347
4,0.191073


In [59]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [60]:
# NOTE: y_test_df.index holds the row labels carried over from the original DataFrame,
# not the bank's CustomerId values (that column was dropped earlier). They are kept in a
# column that is nonetheless named 'CustomerId'.
y_test_df['CustomerId'] = y_test_df.index

In [61]:
# Resetting both indexes so the rows line up by position, then concatenating y_test and y_pred column-wise
y_pred_df.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)
y_pred_result = pd.concat([y_test_df, y_pred_df], axis=1)
y_pred_result.head()

Unnamed: 0,Exited,CustomerId,0
0,0,9394,0.263152
1,1,898,0.385279
2,0,2398,0.150986
3,0,5906,0.064347
4,0,2343,0.191073


In [62]:
# Renaming and Rearranging
y_pred_result = y_pred_result.rename(columns={0 : 'Exited_Prob'})
y_pred_result = y_pred_result.reindex(['CustomerId', 'Exited', 'Exited_Prob'], axis = 1)
y_pred_result.head()

Unnamed: 0,CustomerId,Exited,Exited_Prob
0,9394,0,0.263152
1,898,1,0.385279
2,2398,0,0.150986
3,5906,0,0.064347
4,2343,0,0.191073


In [63]:
# Flag a customer as predicted to exit (1) when Exited_Prob is above 0.5, otherwise 0;
# the comparison is done on the whole column at once instead of row by row
y_pred_result['Predicted'] = (y_pred_result['Exited_Prob'] > 0.5).astype('int64')
y_pred_result.head()

Unnamed: 0,CustomerId,Exited,Exited_Prob,Predicted
0,9394,0,0.263152,0
1,898,1,0.385279,0
2,2398,0,0.150986,0
3,5906,0,0.064347,0
4,2343,0,0.191073,0


### 8. Print the Accuracy score and confusion matrix

In [64]:
from sklearn import metrics

In [65]:
# Overall accuracy of the model
metrics.accuracy_score(y_pred_result.Exited, y_pred_result.Predicted)

0.8405

In [66]:
# Confusion matrix: the actual labels are passed first, so rows are the actual
# classes and columns are the predicted classes
confusion = metrics.confusion_matrix(y_pred_result.Exited, y_pred_result.Predicted)
confusion

array([[1538,   57],
       [ 262,  143]], dtype=int64)

In [70]:
# Further scores
# Threshold the sigmoid outputs at 0.5 to obtain class labels. This is what
# `Sequential.predict_classes` did for a single sigmoid output; that method is
# deprecated and no longer available in later TensorFlow 2.x releases.
Y_pred_cls = (model.predict(X_test, batch_size=10, verbose=0) > 0.5).astype("int32")
# NOTE: the "(Dropout)" label is kept as-is, although the model defined above has no Dropout layer
print('Accuracy Model1 (Dropout): '+ str(model.evaluate(X_test,y_test.values)[1]))
print('Recall_score: ' + str(recall_score(y_test.values,Y_pred_cls)))
print('Precision_score: ' + str(precision_score(y_test.values, Y_pred_cls)))
print('F-score: ' + str(f1_score(y_test.values,Y_pred_cls)))
confusion_matrix(y_test.values, Y_pred_cls)

Accuracy Model1 (Dropout): 0.8405
Recall_score: 0.3530864197530864
Precision_score: 0.715
F-score: 0.4727272727272727


array([[1538,   57],
       [ 262,  143]], dtype=int64)

## Conclusion

### Using the model above, the bank can now understand which aspects of the service influence a customer's decision to exit the bank and thereby improve their services