In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [17]:
#load dataset
data = pd.read_csv("Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Feature Engineering

In [18]:
#preprocess data
#drop irrelevant features
#axis = 1 means colums
data=data.drop(['RowNumber','CustomerId','Surname'],axis=1)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [19]:
#encode categorical variables
data['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [20]:
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


We won't use LabelEncoder for Geography column, as it can have many places name, but here we have 3, so if we assign Germany as 2 then ANN is all about numerical calculations, so it will consider Germany as more impt than Spain or France which we don't want here or which is not intutive, so we won't go with LabelEncoder

In [24]:
print(data['Geography'].unique())

['France' 'Spain' 'Germany']


In [28]:
# One hot Encoder for Geography column
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
encoder_res = encoder.fit_transform(data[['Geography']])
encoder_res

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

#Create a DataFrame with the one-hot encoded columns
#We use get_feature_names_out() to get the column names for the encoded data

In [29]:
encoder.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [31]:
geo_data = pd.DataFrame(encoder_res,columns=encoder.get_feature_names_out(['Geography']))

In [32]:
geo_data

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [None]:
data_encoded = pd.concat([data, geo_data],axis=1)
data_encoded

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,France,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,France,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [46]:
data_encoded = data_encoded.drop('Geography',axis=1)

In [93]:
#save the encoder in pickle
with open('label_encoder_gender.pkl', 'wb') as file:
    pickle.dump(label_encoder_gender, file)

with open('one_hot_encoder_geo.pkl','wb') as file:
    pickle.dump(encoder, file)



Splitting data into train and test

In [47]:
# divide data into independent and dependent features
data_encoded.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [48]:
X=data_encoded.drop('Exited', axis=1)
y=data_encoded['Exited']

In [49]:
#split into train and test
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [50]:
X_train


Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
9254,686,1,32,6,0.00,2,1,1,179093.26,1.0,0.0,0.0
1561,632,1,42,4,119624.60,2,1,1,195978.86,0.0,1.0,0.0
1670,559,1,24,3,114739.92,1,1,0,85891.02,0.0,0.0,1.0
6087,561,0,27,9,135637.00,1,1,0,153080.40,1.0,0.0,0.0
6669,517,1,56,9,142147.32,1,0,0,39488.04,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5734,768,1,54,8,69712.74,1,1,1,69381.05,1.0,0.0,0.0
5191,682,0,58,1,0.00,1,1,1,706.50,1.0,0.0,0.0
5390,735,0,38,1,0.00,3,0,0,92220.12,1.0,0.0,0.0
860,667,1,43,8,190227.46,1,1,0,97508.04,1.0,0.0,0.0


In [51]:
#scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train

array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]])

In [52]:
# save scaler in pickle
with open('scaler.pkl','wb') as file:
    pickle.dump(scaler, file)

ANN Implementation
1. Sequential Network
2. Dense 
3. Activation function - Sigmoid, softmax, Relu, Leaky Relu
4. Optimizer - Back propogation - updating wieights
5. Loss function - minimize
6. Metric - accuracy score, for regression - MSE,RMSE
7. Training info - logs - store to display them via tensorboard

In [56]:
import tensorflow as tf

In [59]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [60]:
import datetime

In [61]:
X_train.shape[1]

12

In [None]:
#converting the 12 col into row
(X_train.shape[1],)

(12,)

In [64]:
# Build our Model
model = Sequential(
    [
        Dense(64, activation="relu", input_shape=(X_train.shape[1],)), #Hidden layer1 with 64 hidden nodes 
        Dense(32, activation="relu"), #hidden layer 2 with 32 hidden nodes
        Dense(1, activation="sigmoid"), #ouput layer 1 node
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


in model summarry we can see, Param coulmn

so each row in our dataset will be treated as single I/P which has 12 columns => (12,1) matrix
and in Hidden layer 1 we have defined 64 nodes, each 64 nodes has bias 
so total weights = 12*64 = 768 + 64(bias) = 832

for hidden layer 2, we have defined 32 hidden nodes
total weights = 64 * 32 = 2048 + 32(bias) = 2080

for output layer, it's usally 1 node
so total weight = 32 * 1 = 32 + 1 (bias for oputput node) = 33

so total Trainable weights/params = 832 + 2080 + 33 = 2945

In [65]:
model.summary()

In [80]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


In [81]:
# compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy',metrics=['accuracy'])

In [82]:
# setup tensorboard
log_dir = "logs/fit" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [83]:
tensorflow_callback=TensorBoard(log_dir=log_dir,histogram_freq=1)

In [84]:
# Set up Early Stopping
early_stopping_callback = EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

In [85]:
#Train the model
history=model.fit(
    X_train, y_train, validation_data=[X_test, y_test], epochs=100,
    callbacks = [tensorflow_callback, early_stopping_callback] 
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 38ms/step - accuracy: 0.8118 - loss: 0.4448 - val_accuracy: 0.8490 - val_loss: 0.3576
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.8526 - loss: 0.3620 - val_accuracy: 0.8550 - val_loss: 0.3493
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.8544 - loss: 0.3603 - val_accuracy: 0.8590 - val_loss: 0.3481
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.8600 - loss: 0.3354 - val_accuracy: 0.8610 - val_loss: 0.3503
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.8565 - loss: 0.3450 - val_accuracy: 0.8615 - val_loss: 0.3352
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8616 - loss: 0.3375 - val_accuracy: 0.8575 - val_loss: 0.3445
Epoch 7/100
[1

In [86]:
model.save('model.h5')



In [89]:
# Launch TensorBoard
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs/fit20241201-141511



Reusing TensorBoard on port 6008 (pid 29388), started 0:02:30 ago. (Use '!kill 29388' to kill it.)

Load the pickle file

In [None]:
## Load the pickle file


In [102]:
!kill 29388

'kill' is not recognized as an internal or external command,
operable program or batch file.
