In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('Churn_Modelling.csv')
df.head(7)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0


In [3]:
df.drop(['CustomerId', 'RowNumber', 'Surname'], axis='columns', inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
df['Gender'].replace({'Female': 1, 'Male': 0}, inplace=True)
df['Gender'].unique()

array([1, 0])

In [5]:
df = pd.get_dummies(data=df, columns=['Geography'])
df.sample(5)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
9721,560,1,38,5,83714.41,1,1,1,33245.97,0,1,0,0
4869,669,0,50,4,149713.61,3,1,1,124872.42,1,1,0,0
9925,694,1,38,5,195926.39,1,1,1,85522.84,0,1,0,0
9982,655,1,46,7,137145.12,1,1,0,115146.4,1,0,1,0
9228,699,0,39,2,109724.38,1,1,1,180022.39,0,1,0,0


In [6]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()

col_to_scale = ['Balance', 'EstimatedSalary', 'CreditScore', 'Age', 'Tenure', 'NumOfProducts']

df[col_to_scale] = sc.fit_transform(df[col_to_scale])

In [7]:
df.Balance.unique()

array([0.        , 0.33403148, 0.63635718, ..., 0.22865702, 0.29922631,
       0.51870777])

In [8]:
class_distribution = df.Exited.value_counts()

imbalance_ratio = class_distribution[0] / class_distribution[1]
print('Imbalance Ratio:', np.round(imbalance_ratio, 1))

Imbalance Ratio: 3.9


In [9]:
count_class_0, count_class_1 = df.Exited.value_counts()
count_class_0, count_class_1

(7963, 2037)

In [10]:
df_class_0 = df[df['Exited']==0]
df_class_1 = df[df['Exited']==1]

In [11]:
df.isnull().sum().sum()

0

In [12]:
df_test_under = pd.concat([df_class_0.sample(count_class_1), df_class_1], axis=0)
df_test_under.sample(7)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
5398,0.962,0,0.351351,0.3,0.442813,0.0,1,1,0.140676,1,0,1,0
8301,0.53,0,0.540541,0.3,0.288202,0.0,1,1,0.428418,1,0,1,0
161,0.668,0,0.162162,0.2,0.0,0.333333,1,0,0.417351,0,1,0,0
5903,0.44,1,0.148649,0.4,0.609969,0.0,1,1,0.656823,1,1,0,0
3317,0.858,0,0.716216,0.3,0.0,0.333333,1,1,0.734489,1,1,0,0
674,0.358,0,0.22973,0.5,0.0,0.333333,1,1,0.936474,0,0,0,1
9505,0.602,0,0.135135,1.0,0.317113,0.0,1,1,0.373415,0,1,0,0


In [13]:
df_test_under.Exited.value_counts()

0    2037
1    2037
Name: Exited, dtype: int64

In [15]:
X = df.drop('Exited', axis='columns')
y = df['Exited']

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [17]:
X_train.shape

(8000, 12)

In [18]:
X_test.shape

(2000, 12)

In [19]:
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Dense(26, input_shape=(12,), activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')
])
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)
model.fit(X_train, y_train, epochs = 50, batch_size = 8)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f0405d2f340>

In [20]:
model.evaluate(X_test, y_test)



[0.3442649841308594, 0.8659999966621399]

In [25]:
y_pred = model.predict(X_test).reshape(-1)
y_pred = np.round(y_pred)
print(y_pred[:10])

[0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]


In [27]:
y_test[:10]

9394    0
898     1
2398    0
5906    0
2343    0
8225    1
5506    0
6451    0
2670    1
3497    1
Name: Exited, dtype: int64

In [28]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.87      0.98      0.92      1595
           1       0.82      0.43      0.56       405

    accuracy                           0.87      2000
   macro avg       0.85      0.70      0.74      2000
weighted avg       0.86      0.87      0.85      2000

