# Load and Preprocess the data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [2]:
## Load the dataset
data = pd.read_csv("Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
## Preprocessing
## Dropping irrelevant features

data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis = 1)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
#geo_enc = LabelEncoder()
gen_enc = LabelEncoder()

#data['Geography'] = geo_enc.fit_transform(data['Geography'])
data['Gender'] = gen_enc.fit_transform(data['Gender'])
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [5]:
## Onehot encoder
from sklearn.preprocessing import OneHotEncoder
geo_enc = OneHotEncoder()

geo_encoded = geo_enc.fit_transform(data[['Geography']])
geo_encoded

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [6]:
geo_enc.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [7]:
geo_df = pd.DataFrame(geo_encoded.toarray(), columns=geo_enc.get_feature_names_out(['Geography']))
geo_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [8]:
## Combine the one hot encoded data into the original data
data = pd.concat([data.drop('Geography', axis=1), geo_df], axis=1)
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [9]:
## Save the encoders and Sscaler

with open('gen_enc.pkl', 'wb') as file:
    pickle.dump(gen_enc, file)
    
with open('geo_enc.pkl', 'wb') as file:
    pickle.dump(geo_enc, file)

In [10]:
## Splitting data
x = data.drop('Exited', axis=1)
y = data['Exited']
x.head(), y.head()

(   CreditScore  Gender  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
 0          619       0   42       2       0.00              1          1   
 1          608       0   41       1   83807.86              1          0   
 2          502       0   42       8  159660.80              3          1   
 3          699       0   39       1       0.00              2          0   
 4          850       0   43       2  125510.82              1          1   
 
    IsActiveMember  EstimatedSalary  Geography_France  Geography_Germany  \
 0               1        101348.88               1.0                0.0   
 1               1        112542.58               0.0                0.0   
 2               0        113931.57               1.0                0.0   
 3               0         93826.63               1.0                0.0   
 4               1         79084.10               0.0                0.0   
 
    Geography_Spain  
 0              0.0  
 1              1.0  
 2          

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=43)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((8000, 12), (8000,), (2000, 12), (2000,))

In [12]:
##Scaling the data

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [13]:
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

2025-03-06 20:12:42.712905: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-06 20:12:42.722949: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741288362.734805   37364 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741288362.738281   37364 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-06 20:12:42.750930: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [15]:
## The ANN Model
model = Sequential([
    Input(shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),         ## 1st hidden layer connected to inp_layer
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

I0000 00:00:1741288364.466606   37364 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1768 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [16]:
opt = tf.keras.optimizers.Adam(learning_rate = 0.01)

In [17]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
## Setting up tensorboard for logging
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir= log_dir, histogram_freq=1)

In [19]:
## Set up Early Stopping
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [20]:
## Training the model
history = model.fit(
    x_train, y_train, validation_data=(x_test, y_test), epochs=100,
    callbacks = [tensorflow_callback, early_stopping_callback]
)

Epoch 1/100


I0000 00:00:1741288365.842015   37532 service.cc:148] XLA service 0x7f9d440172c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1741288365.842085   37532 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2025-03-06 20:12:45.858016: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1741288365.952289   37532 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 53/250[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.6328 - loss: 0.6310

I0000 00:00:1741288366.626842   37532 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7523 - loss: 0.5096 - val_accuracy: 0.8305 - val_loss: 0.4071
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8459 - loss: 0.3778 - val_accuracy: 0.8560 - val_loss: 0.3668
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8583 - loss: 0.3497 - val_accuracy: 0.8540 - val_loss: 0.3590
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8562 - loss: 0.3453 - val_accuracy: 0.8550 - val_loss: 0.3537
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8550 - loss: 0.3445 - val_accuracy: 0.8525 - val_loss: 0.3574
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8631 - loss: 0.3386 - val_accuracy: 0.8605 - val_loss: 0.3491
Epoch 7/100
[1m250/250[0m [32m━

In [21]:
model.save('model.h5')
model.save('model.keras')



In [22]:
## Launch tensorboard
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs/fit20250306-201245