In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# standardization and encoding
# standardization is a technique to rescale the features of data to have a mean of 0 and a standard deviation of 1
# label encoding is a technique to convert categorical labels into numerical format
#  and the type of encoding that we will use is label encoding or ordinal encoding
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# import pickle for saving the model and providing persistence
import pickle

In [64]:
# now let's load the dataset
data = pd.read_csv("Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [65]:
# Let's do some preprocessing on the data
# dropping unnecessary columns
data = data.drop(["RowNumber", "CustomerId", "Surname"], axis=1)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [22]:
print(data["Geography"].value_counts())

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64


In [66]:
le = LabelEncoder()
data["Gender"] = le.fit_transform(data["Gender"])
print(data["Gender"].value_counts())
print("+-----------------------+")
data.head()

Gender
1    5457
0    4543
Name: count, dtype: int64
+-----------------------+


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [67]:
# encoding the 'Geography' column using OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)
geography_encoded = ohe.fit_transform(data[["Geography"]])
geography_encoded_df = pd.DataFrame(
    geography_encoded, columns=ohe.get_feature_names_out(["Geography"])
)
data = pd.concat([data.drop("Geography", axis=1), geography_encoded_df], axis=1)
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        10000 non-null  int64  
 1   Gender             10000 non-null  int64  
 2   Age                10000 non-null  int64  
 3   Tenure             10000 non-null  int64  
 4   Balance            10000 non-null  float64
 5   NumOfProducts      10000 non-null  int64  
 6   HasCrCard          10000 non-null  int64  
 7   IsActiveMember     10000 non-null  int64  
 8   EstimatedSalary    10000 non-null  float64
 9   Exited             10000 non-null  int64  
 10  Geography_France   10000 non-null  float64
 11  Geography_Germany  10000 non-null  float64
 12  Geography_Spain    10000 non-null  float64
dtypes: float64(5), int64(8)
memory usage: 1015.8 KB


In [26]:
with open("label_encoder_gender.pkl", "wb") as f:
    pickle.dump(le, f)

with open("onehot_encoder_geography.pkl", "wb") as f:
    pickle.dump(ohe, f)

In [27]:
# splitting the data into features and target variable
X = data.drop("Exited", axis=1)
y = data["Exited"]
# splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [28]:
# standardization and encoding
# standardization is a technique to rescale the features of data to have a mean of 0 and a standard deviation of 1
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
X_train

array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]], shape=(8000, 12))

In [30]:
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

### ANN

##### Artificial Neural Networks

- ANN are a subset of machine learning and are at the heart of deep learning algorithms.
- They are inspired by the structure and function of the human brain, mimicking the way that biological neurons signal to one another.
- An ANN is composed of layers of interconnected nodes, or "neurons", where each connection has an associated weight.
- These networks are capable of learning complex patterns from data through a process called training, where the network adjusts its weights based on the input data and the corresponding output.
- ANNs are widely used in various applications, including image and speech recognition, natural language processing, and game playing, due to their ability to model complex relationships and make accurate predictions


In [48]:
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

import datetime

In [49]:
X_train.shape[1]

12

In [50]:
# Build our ANN model

model = Sequential(
    [
        Dense(
            64, activation="relu", input_shape=(X_train.shape[1],)
        ),  # Hidden layer 1 connected to input layer
        Dense(32, activation="relu"),  # Hidden layer 2 connected to hidden layer 1
        Dense(1, activation="sigmoid"),  # Output layer connected to hidden layer 2
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [51]:
model.summary()

In [52]:
# Compile the model using Adam optimizer and binary crossentropy loss function
# Adam optimizer is an optimization algorithm
# that can be used instead of the classical stochastic gradient descent
# procedure to update network weights iteratively based on training data.
# Binary crossentropy is used as a loss function for binary classification problems,
# where the output is either 0 or 1.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [53]:
# set TensorBoard log directory
# TensorBoard is a visualization tool provided with TensorFlow
# that allows you to monitor and visualize various aspects of your machine learning models,
# such as training progress, loss and accuracy metrics, computational graphs, and more.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [55]:
# set up early stopping to prevent overfitting
early_stopping = EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)

In [56]:
# Training the model
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, tensorboard_callback],
)

Epoch 1/100


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8012 - loss: 0.4562 - val_accuracy: 0.8240 - val_loss: 0.3974
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8372 - loss: 0.3936 - val_accuracy: 0.8475 - val_loss: 0.3620
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8518 - loss: 0.3614 - val_accuracy: 0.8575 - val_loss: 0.3513
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8583 - loss: 0.3481 - val_accuracy: 0.8585 - val_loss: 0.3432
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8583 - loss: 0.3431 - val_accuracy: 0.8610 - val_loss: 0.3437
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8575 - loss: 0.3375 - val_accuracy: 0.8555 - val_loss: 0.3444
Epoch 7/100
[1m250/250[0m [32m━

In [57]:
model.save("ann_model.h5")



In [58]:
# Load tensorboard extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [61]:
%tensorboard --logdir logs/fit/20260107-065104

Reusing TensorBoard on port 6007 (pid 90062), started 0:01:04 ago. (Use '!kill 90062' to kill it.)