In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the churn dataset into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv("Churn_Modelling.csv")

In [3]:
# Preview the first rows to check which columns were read from the CSV.
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Data Preprocessing

In [4]:
# Remove the row number, customer id and surname columns before modelling.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [5]:
# Column dtypes and non-null counts; Geography and Gender are still strings (object dtype) at this point.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [6]:
# Number of customers per country; these distinct values become the one-hot encoded columns.
data.Geography.value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [7]:
#Encode Gender as integer labels (Geography is one-hot encoded separately)
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Fit first, then transform, so the fitted encoder `le` can be reused on new data later.
le = LabelEncoder().fit(data['Gender'])
data['Gender'] = le.transform(data['Gender'])


In [8]:
#One-hot encode Geography: one indicator column per distinct country

from sklearn.preprocessing import OneHotEncoder

# Learn the categories, then encode the column; the result is a sparse matrix.
ohe = OneHotEncoder().fit(data[['Geography']])
geo_encoded = ohe.transform(data[['Geography']])

# Display the generated column names for the encoded features.
ohe.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [9]:
# Check the container type returned by the encoder: it is a SciPy sparse matrix,
# so it has to be converted to a dense array before building a DataFrame from it.
type(geo_encoded)

scipy.sparse._csr.csr_matrix

In [10]:
# Build a dense DataFrame of the one-hot columns and attach it to `data`.
# geo_encoded.toarray() converts the sparse matrix to a dense array.
# index=data.index keeps the rows aligned: pd.concat(axis=1) joins on index labels,
# so a fresh RangeIndex would misalign (and introduce NaN rows) if `data` ever
# carried a non-default index, e.g. after rows were filtered out.
geo_encoded_df = pd.DataFrame(
    geo_encoded.toarray(),
    columns=ohe.get_feature_names_out(['Geography']),
    index=data.index,
)
data = pd.concat([data, geo_encoded_df], axis=1)

# The original string column is redundant once its indicator columns are present.
data = data.drop(['Geography'], axis=1)

In [11]:
data.duplicated().sum()  # Number of rows that exactly duplicate an earlier row (0 in the recorded output)

0

In [12]:
# Confirm the encoded layout: Gender is numeric and Geography is replaced by three indicator columns.
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [13]:
#Persist the fitted label and one-hot encoders to pkl files

import pickle

# (output file, fitted encoder) pairs; each encoder is written to its own pickle file.
for pkl_path, encoder in (('label_encoder.pkl', le), ('onehot_encoder.pkl', ohe)):
    with open(pkl_path, 'wb') as file:
        pickle.dump(encoder, file)

In [14]:
#Separate the target column (Exited) from the feature columns

y = data['Exited']
X = data.drop(columns=['Exited'])

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)


In [16]:
#Standardization: statistics are learned from the training split only and then applied to both splits

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
# Inspect the standardized test features; after scaling, X_test is a NumPy array rather than a DataFrame.
X_test

array([[-0.32241481,  0.91232714, -0.95108677, ...,  1.00300451,
        -0.58254653, -0.5744635 ],
       [ 0.66125561,  0.91232714,  2.2842259 , ...,  1.00300451,
        -0.58254653, -0.5744635 ],
       [-0.2395794 ,  0.91232714,  2.18906965, ..., -0.99700449,
         1.71660109, -0.5744635 ],
       ...,
       [ 0.75444544,  0.91232714, -0.09468047, ..., -0.99700449,
        -0.58254653,  1.74075464],
       [ 0.8994074 ,  0.91232714,  0.47625706, ...,  1.00300451,
        -0.58254653, -0.5744635 ],
       [-0.59162987,  0.91232714, -0.665618  , ..., -0.99700449,
        -0.58254653,  1.74075464]])

In [18]:
X_train.shape[1]            # number of feature columns, i.e. the input size of the ANN's first layer

12

In [19]:
#Persist the fitted standard scaler to a pkl file

with open('scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
    

ANN Implementation


In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

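`Sequential` stacks the layers into a feed-forward network, `Dense` provides the fully connected layers (the neurons), `EarlyStopping` halts training when the monitored metric stops improving, and `TensorBoard` records the training logs so they can be visualized.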

In [21]:
# Feed-forward binary classifier: 2 hidden ReLU layers and a single sigmoid output.
# The input size is declared with an explicit Input object as the first entry instead of
# passing `input_shape=` to the first Dense layer; the latter is deprecated in current
# Keras and emits a UserWarning when the layer is constructed.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # one input value per feature column
    Dense(64, activation='relu'),               # hidden layer 1 with 64 neurons
    Dense(32, activation='relu'),               # hidden layer 2 with 32 neurons
    Dense(1, activation='sigmoid'),             # output layer: one sigmoid unit for the binary target
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
# Print the layer-by-layer architecture together with the parameter counts.
model.summary()

Total params is the total number of trainable weights and biases: 12 \* 64 + 64 = 832 for the first hidden layer, 64 \* 32 + 32 = 2080 for the second hidden layer, and 32 \* 1 + 1 = 33 for the output layer, i.e. 2945 parameters in total.

In [23]:
#Compile model

# Adam with an explicitly chosen learning rate rather than the optimizer's default.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric.
model.compile(
    loss="binary_crossentropy",
    optimizer=opt,
    metrics=['accuracy'],
)

In [24]:
#Setup TensorBoard

import datetime

# Every run logs into its own timestamped sub-directory of logs/fit.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp

# histogram_freq=1 requests histogram logging every epoch.
tf_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)

In [25]:
#Setup for early stopping -- if the validation loss does not improve for certain epochs, training will stop

early_stopping_callback = EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True)     
#patience=10: training stops once val_loss has gone 10 epochs without improving
#restore_best_weights=True: after stopping, the model is rolled back to the weights of the best epoch
#val_loss - Validation loss


In [26]:
#Model training

# Train for at most 100 epochs; the early-stopping callback may end the run sooner.
# NOTE(review): the held-out test split is passed as validation data, so early stopping
# (which monitors val_loss and restores the best weights) selects the final weights using
# the test set. Scores later reported on X_test/y_test will therefore be optimistic;
# consider carving a separate validation split out of the training data.
history = model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,
                    callbacks=[tf_callback, early_stopping_callback])

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.8354 - loss: 0.3977 - val_accuracy: 0.8415 - val_loss: 0.3861
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8565 - loss: 0.3531 - val_accuracy: 0.8580 - val_loss: 0.3539
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8579 - loss: 0.3479 - val_accuracy: 0.8500 - val_loss: 0.3622
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8587 - loss: 0.3435 - val_accuracy: 0.8500 - val_loss: 0.3563
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8616 - loss: 0.3371 - val_accuracy: 0.8610 - val_loss: 0.3481
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8616 - loss: 0.3375 - val_accuracy: 0.8525 - val_loss: 0.3552
Epoch 7/100
[1m250/25

In [27]:
# Save the trained model to a single file on disk.
# NOTE(review): the .h5 suffix selects the HDF5 format, which recent Keras releases treat as
# legacy in favour of the native .keras format - confirm what the consuming code loads
# before changing the file name.
model.save('churn_prediction_model.h5')



In [28]:
#Load the TensorBoard notebook extension, which registers the %tensorboard magic

%load_ext tensorboard

In [41]:
%tensorboard --logdir logs/fit/20250809-130938
# This will start TensorBoard and you can view it in your browser at http://localhost:6006

Reusing TensorBoard on port 6008 (pid 11756), started 0:00:27 ago. (Use '!kill 11756' to kill it.)