## Installing Required Libraries

In [1]:
pip install eli5

Collecting eli5
  Using cached eli5-0.13.0.tar.gz (216 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting tabulate>=0.7.7 (from eli5)
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py): started
  Building wheel for eli5 (setup.py): finished with status 'done'
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107728 sha256=83c173c911745c189a030b52b30abaeed466864db09392173023d6e65e7e5c5c
  Stored in directory: c:\users\marx\appdata\local\pip\cache\wheels\b8\58\ef\2cf4c306898c2338d51540e0922c8e0d6028e07007085c0004
Successfully built eli5
Installing collected packages: tabulate, eli5
Successfully installed eli5-0.13.0 tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


## Importing the Required Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import confusion_matrix, classification_report

## Loading the data

In [7]:
# Read Churn_Modelling.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

## Choosing the Features

In [10]:
# Features: every column from position 3 up to (but excluding) the last one.
# .copy() makes X an independent DataFrame, so the column assignments made to X
# during encoding do not write into a slice of `data` (avoids pandas'
# SettingWithCopyWarning and leaves `data` untouched).
X = data.iloc[:, 3:-1].copy()

## Encoding the Categorical Features

In [12]:
## Some columns, e.g. Geography, contain words rather than numbers. Because these words can take only a limited number of values, such columns are called categorical features.
## Our ML model is mathematical and needs numbers for its computations, which is why we encode these columns.

In [13]:
# Replace each text column with ordinal codes, fitting a fresh encoder per column.
# After the loop, `encoder` and `value` hold the encoder/codes of the last column (Gender).
for column in ('Geography', 'Gender'):
    encoder = OrdinalEncoder()
    # The encoder expects a 2-D input, hence the reshape to a single-column array.
    column_2d = X[column].values.reshape(-1, 1)
    value = encoder.fit_transform(column_2d)
    X[column] = value

In [14]:
## We've encoded the Geography and Gender columns with OrdinalEncoder()

## Getting the Target Column

In [15]:
## Predicting whether a customer leaves the bank is a supervised learning problem, so we have to train the model to predict the target variable, which is a column of 0s and 1s.

In [16]:
# Target: the last column of the dataset.
y = data.iloc[:, -1]

## Splitting the Data into Train and Test sets

In [17]:
## The function train_test_split will be used to divide our data into training and testing sets

In [18]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [19]:
## Checking the lengths of the train and test sets; following common practice, 70% of the data is used for training and 30% for testing

In [20]:
# Row counts of each split, in the order X_train, X_test, y_train, y_test.
tuple(map(len, (X_train, X_test, y_train, y_test)))

(7000, 3000, 7000, 3000)

## Implementing the Random Forest Classifier Model

In [None]:
## trying the model for customers who will leave the bank

In [22]:
# Shallow random forest: 100 trees, each limited to depth 2, with a fixed seed.
RF = RandomForestClassifier(
    random_state=0,
    max_depth=2,
    n_estimators=100,
)

# Train on the training split; as the cell's last expression, the fitted estimator is displayed.
RF.fit(X_train, y_train)

## Performance Check for Random Forest classifier

In [24]:
# Mean accuracy of the random forest on the training split, rounded to 4 decimals.
round(RF.score(X_train, y_train), ndigits=4)

0.8093

In [25]:
## On the training set we get 80.93% accuracy

In [26]:
# Mean accuracy of the random forest on the held-out test split, rounded to 4 decimals.
round(RF.score(X_test, y_test), ndigits=4)

0.8237

In [27]:
## On the test set we get 82.37% accuracy

## Checking Feature Importance for Random Forest Classifier Model

In [29]:
## Here is where we use eli5 in getting feature importance

In [30]:
# Permutation importance of the fitted random forest: 10 shuffles per feature, fixed seed.
# NOTE(review): this is computed on the full X, y (training and test rows together).
perm = PermutationImportance(RF, n_iter=10, random_state=42)
perm.fit(X, y)

# Render the importance table, labelled with the feature column names.
eli5.show_weights(perm, feature_names=list(X.columns))

Weight,Feature
0.0196  ± 0.0014,NumOfProducts
0.0070  ± 0.0011,Age
0.0012  ± 0.0008,Balance
0.0003  ± 0.0003,Geography
0.0002  ± 0.0006,Gender
0.0002  ± 0.0000,CreditScore
0.0002  ± 0.0005,IsActiveMember
0  ± 0.0000,HasCrCard
0  ± 0.0000,Tenure
-0.0000  ± 0.0002,EstimatedSalary


In [31]:
## This indicates that NumOfProducts, Age and Balance are our top features

## Implementing a MLP Classifier Model

In [32]:
## Creating another training and testing set for another model

In [33]:
# Second 70/30 split for the MLP, using the same seed (42) as the first split.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [34]:
## Checking out with MLP classifier after trying out RandomForest

In [64]:
# Multi-layer perceptron with default architecture, capped at 100 iterations, fixed seed;
# fit() returns the estimator itself, so `clf` is the trained model.
clf = MLPClassifier(
    max_iter=100,
    random_state=1,
).fit(X_train_new, y_train_new)

## Performance check for MLP Classifier model

In [65]:
# Training accuracy of the MLP. Features and labels are both taken from the MLP's own
# split (X_train_new / y_train_new) instead of mixing in X_train from the earlier split,
# which only matched because both splits were created with the same random_state.
clf.score(X_train_new, y_train_new)

0.7382857142857143

In [66]:
# Mean accuracy of the MLP on its held-out test split.
clf.score(X=X_test_new, y=y_test_new)

0.733

In [67]:
## Getting a 73.83% training accuracy and 73.30% testing accuracy

## Checking Feature Importance for MLP Classifier Model

In [68]:
# Permutation importance of the fitted MLP: 10 shuffles per feature, fixed seed.
# NOTE(review): this is computed on the full X, y (training and test rows together).
perm = PermutationImportance(clf, n_iter=10, random_state=42)
perm.fit(X, y)

# Render the importance table, labelled with the feature column names.
eli5.show_weights(perm, feature_names=list(X.columns))

Weight,Feature
0.0113  ± 0.0074,Balance
0.0037  ± 0.0024,Age
0.0002  ± 0.0006,IsActiveMember
0.0001  ± 0.0011,Tenure
0.0000  ± 0.0003,Gender
0  ± 0.0000,HasCrCard
0  ± 0.0000,NumOfProducts
-0.0000  ± 0.0006,Geography
-0.0012  ± 0.0020,CreditScore
-0.0016  ± 0.0051,EstimatedSalary


In [69]:
## This model indicates that Balance, Age and IsActiveMember are our top features

## Implementing a NeuralNetwork Model and Checking the Importance

In [70]:
## Trying a neural network model after the Random Forest and MLP classifiers; for that we'll use Keras

In [75]:
# Feed-forward binary classifier built layer by layer:
# 10 input features -> Dense(10, relu) -> Dense(25, relu) -> Dense(1, sigmoid).
model = keras.Sequential()
model.add(keras.layers.Dense(10, activation='relu', input_shape=(10,)))
model.add(keras.layers.Dense(25, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [76]:
## We have two hidden layers with 10 and 25 nodes. You can tweak and try out other combinations. We are using the 'adam' optimizer and 'binary_crossentropy' loss

In [77]:
## Fitting the model with 50 epochs

In [81]:
# Train the network on the training split for 50 epochs; the returned History object is displayed.
model.fit(x=X_train, y=y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x292e1c42dd0>

In [82]:
## After training the 50 epochs, I got a 79.24% accuracy.

In [83]:
# Evaluate on the held-out test split; returns the loss followed by the compiled metric (accuracy).
model.evaluate(x=X_test, y=y_test)



[0.4936392307281494, 0.8053333163261414]

In [84]:
## After testing the model gives an 80.53% accuracy

In [85]:
## Printing the classification report and checking the performance

In [98]:
# Sigmoid outputs of the network for the test split (one value per sample).
yp = model.predict(X_test)

# Threshold at 0.5 in a single vectorised step instead of a Python loop.
# ravel() flattens the per-sample rows so the comparison no longer relies on the
# truthiness of one-element arrays; tolist() keeps y_pred a plain list of 0/1 ints.
y_pred = (yp.ravel() > 0.5).astype(int).tolist()

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89      2416
           1       0.00      0.00      0.00       584

    accuracy                           0.81      3000
   macro avg       0.40      0.50      0.45      3000
weighted avg       0.65      0.81      0.72      3000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
## Based on these metrics, precision and recall are 0.00 for class 1 while recall is 1.00 for class 0, i.e. the model predicts class 0 for every test customer