In [1]:
import pandas as pd

In [2]:
# Read the raw churn dataset from disk, then preview its first five rows.
csv_path = "datasets/churn-modelling.csv"
data = pd.read_csv(filepath_or_buffer = csv_path)

data.head(n = 5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Resample imbalanced label

In [3]:
# Class balance of the target column before any resampling.
exited_counts = data["Exited"].value_counts()
exited_counts

Exited
0    7963
1    2037
Name: count, dtype: int64

In [4]:
from imblearn.over_sampling import RandomOverSampler

# NOTE(review): the whole dataset is oversampled here, before the train/test
# split performed further down. Duplicated minority rows can therefore end up
# in both partitions, which makes the test metrics optimistic. Consider
# splitting first and resampling only the training portion.
label = data["Exited"]
features = data.drop(columns = "Exited")

resampler = RandomOverSampler(random_state = 0)
resampled_features, resampled_label = resampler.fit_resample(X = features, y = label)

# Re-attach the label as the last column so later cells keep working on `data`.
data = pd.concat([resampled_features, resampled_label], axis = "columns")

# Both classes should now have the same number of rows.
data["Exited"].value_counts()

Exited
1    7963
0    7963
Name: count, dtype: int64

In [5]:
# Features are the contiguous block CreditScore..EstimatedSalary; the identifier
# columns (RowNumber, CustomerId, Surname) and the label (Exited) are left out.
# Selecting by column name avoids relying on absolute column positions, and
# .copy() gives x its own data so the later in-place encoding of x["Gender"]
# neither raises SettingWithCopyWarning nor writes through to `data`.
x = data.loc[:, "CreditScore":"EstimatedSalary"].copy()
y = data["Exited"]

Encode categorical features

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the Gender classes first, then overwrite the column with their integer codes.
gender_encoder = LabelEncoder().fit(x["Gender"])
x["Gender"] = gender_encoder.transform(x["Gender"])

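Label-encoding a multi-class feature imposes an artificial ordering on its classes, which can mislead model training, so features with more than two classes get special treatment: they are one-hot encoded instead.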

In [7]:
# Distinct Geography classes; each one becomes its own one-hot column.
geography_values = x["Geography"].unique()
geography_values

array(['France', 'Spain', 'Germany'], dtype=object)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Geography is addressed by its positional index (1) within x.
geography_step = ("encoder", OneHotEncoder(), [1])
column_transformer = ColumnTransformer(
  transformers = [geography_step],
  remainder = "passthrough"
)

# The encoded column is removed and replaced by one column per Geography class
# (three here), placed at the front; all other columns pass through unchanged.
x = column_transformer.fit_transform(x)

# First transformed row, as a quick sanity check.
x[0]

array([1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.1900000e+02,
       0.0000000e+00, 4.2000000e+01, 2.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0134888e+05])

In [9]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing (scikit-learn's default ratio,
# spelled out here); the fixed random_state keeps the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size = 0.25,
  random_state = 0
)

Scale after splitting: fit the scaler on x_train only, then apply the same calculated mean and standard deviation to x_test.

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit on the training rows only, then apply those same statistics to both
# partitions so nothing about the test rows influences the scaling.
scaler = StandardScaler().fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

Ensure results are reproducible

In [None]:
import logging

import numpy as np
import tensorflow as tf

# Only surface TensorFlow errors; hides info/warning log noise in cell outputs.
tf.get_logger().setLevel(logging.ERROR)

# 42 is common for this usage, many different stories about it
SEED = 42

# Seeds Python's `random`, NumPy and TensorFlow in a single call, so every
# source of randomness used during training starts from the same state.
tf.keras.utils.set_random_seed(SEED)

In [12]:
from keras.models import Sequential
from keras.layers import Dense

In [13]:
# Baseline network: two small hidden layers and a sigmoid output for the
# binary Exited label. The input width equals the number of feature columns.
n_features = x_train.shape[1]

model = Sequential([
  Dense(6, activation = "relu", input_dim = n_features),
  Dense(6, activation = "relu"),
  Dense(1, activation = "sigmoid"),
])

model.compile(
  optimizer = "adam",
  loss = "binary_crossentropy",
  metrics = ["accuracy"]
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 6)                 78        
                                                                 
 dense_1 (Dense)             (None, 6)                 42        
                                                                 
 dense_2 (Dense)             (None, 1)                 7         
                                                                 
Total params: 127 (508.00 Byte)
Trainable params: 127 (508.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Train the baseline for 10 passes over the training rows, 10 rows per batch.
model.fit(x = x_train, y = y_train, epochs = 10, batch_size = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x202f567bb80>

In [15]:
# evaluate() yields the loss followed by the compiled metrics (accuracy only).
evaluation = model.evaluate(x = x_test, y = y_test)
loss, accuracy = evaluation

(loss, accuracy)



(0.4796296954154968, 0.7543947696685791)

In [16]:
# Turn the sigmoid probabilities into boolean class predictions at 0.5.
y_predict = model.predict(x_test) > 0.5

y_predict



array([[False],
       [False],
       [ True],
       ...,
       [False],
       [False],
       [ True]])

Low score

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each metric is stored under its own name; previously every score was written
# to `accuracy`, which left that variable holding the F1 score afterwards.
accuracy = accuracy_score(y_test, y_predict)
print(f"Accuracy: {accuracy}")

precision = precision_score(y_test, y_predict)
print(f"Precision: {precision}")

recall = recall_score(y_test, y_predict)
print(f"Recall: {recall}")

f1 = f1_score(y_test, y_predict)
print(f"F1: {f1}")

Accuracy: 0.754394776494224
Precision: 0.7574232511323603
Recall: 0.7521239380309845
F1: 0.7547642928786359


Restructure DNN model

In [18]:
# Deeper network: a 128-unit input layer followed by progressively narrower
# hidden layers, ending in a single sigmoid unit for the binary label.
n_features = x_train.shape[1]
narrowing_widths = (64, 32, 16, 8)

model = Sequential()
model.add(Dense(128, activation = "relu", input_dim = n_features))
for units in narrowing_widths:
  model.add(Dense(units, activation = "relu"))
model.add(Dense(1, activation = "sigmoid"))

model.compile(
  optimizer = "adam",
  loss = "binary_crossentropy",
  metrics = ["accuracy"]
)

# Same batch size as the baseline, but twice as many epochs.
model.fit(x = x_train, y = y_train, epochs = 20, batch_size = 10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x202f6d3d1b0>

Much better score

In [19]:
# Score the restructured model on the held-out rows.
y_predict = model.predict(x_test)

# Probabilities above 0.5 count as a positive (Exited) prediction.
y_predict = (y_predict > 0.5)

# Each metric is stored under its own name; previously every score was written
# to `accuracy`, which left that variable holding the F1 score afterwards.
accuracy = accuracy_score(y_test, y_predict)
print(f"Accuracy: {accuracy}")

precision = precision_score(y_test, y_predict)
print(f"Precision: {precision}")

recall = recall_score(y_test, y_predict)
print(f"Recall: {recall}")

f1 = f1_score(y_test, y_predict)
print(f"F1: {f1}")

Accuracy: 0.8515821195379206
Precision: 0.8133333333333334
Recall: 0.9145427286356822
F1: 0.8609738884968243


Save the model to reuse

In [25]:
from pathlib import Path

# Create the output folder first so saving also works on a fresh checkout
# where "05-dumps" does not exist yet.
Path("05-dumps").mkdir(parents = True, exist_ok = True)

model.save("05-dumps/model.keras")

In [None]:
from joblib import dump

# Persist the fitted preprocessing objects next to the model so that new rows
# can be encoded and scaled exactly like the training data.
dump(value = gender_encoder, filename = "05-dumps/gender-encoder.pkl")
dump(value = column_transformer, filename = "05-dumps/column-transformer.pkl")
dump(value = scaler, filename = "05-dumps/scaler.pkl")

In [27]:
# One hand-written customer, in the same column order as the training features:
# CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts,
# HasCrCard, IsActiveMember, EstimatedSalary.
sample_row = [600, "France", "Male", 40, 3, 60000, 2, 1, 1, 50000]

# Wrapped in an outer list so the result is a 2-D array with a single row.
sample_data = np.array([sample_row])

In [28]:
from joblib import load

# Restore the saved Gender encoder and replace column 2 (Gender) of the sample
# with its encoded value.
gender_encoder = load("05-dumps/gender-encoder.pkl")

gender_column = sample_data[:, 2]
sample_data[:, 2] = gender_encoder.transform(gender_column)

In [None]:
# Restore the saved Geography one-hot transformer and apply it to the sample row.
transformer_path = "05-dumps/column-transformer.pkl"
column_transformer = load(transformer_path)

sample_data = column_transformer.transform(X = sample_data)

In [30]:
# Restore the saved scaler and standardise the sample with the statistics it
# was fitted with.
scaler_path = "05-dumps/scaler.pkl"
scaler = load(scaler_path)

sample_data = scaler.transform(X = sample_data)

In [31]:
# Display the fully preprocessed sample row (one-hot encoded, then scaled).
sample_data

array([[ 1.07479237, -0.6660016 , -0.54458213, -0.49084051,  0.98454672,
        -0.10120986, -0.67592971, -0.35207194,  0.73346722,  0.64407345,
         1.08132784, -0.88180893]])

In [33]:
from keras.models import load_model

# Reload the saved network and score the preprocessed sample row.
model = load_model("05-dumps/model.keras")

# A probability above 0.5 is reported as True, i.e. the model predicts Exited = 1.
y_predict = model.predict(sample_data) > 0.5

y_predict



array([[False]])