In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle # it is used to save the model

In [75]:
# Load the dataset
df = pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [76]:
## data preprocessing
# Drop unnecessary columns
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1) # axis=1 means drop columns, axis=0 means drop rows 
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [77]:
## encode categorical variables
lable_encoder = LabelEncoder()
df['Gender'] = lable_encoder.fit_transform(df['Gender']) # fit_transform is used to fit the data and transform it at the same time
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [91]:
# one hot encoding for categorical variables
from sklearn.preprocessing import OneHotEncoder
onehot_encoder_geo = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
geo_encoder = onehot_encoder_geo.fit(df[['Geography']]) # fit is used to fit the data only 

In [92]:
print("Encoded Geography Data:", geo_encoder)

Encoded Geography Data: OneHotEncoder(handle_unknown='ignore', sparse_output=False)


In [93]:
columns=geo_encoder.get_feature_names_out(['Geography']) # get_feature_names_out is used to get the feature names after one hot encoding
columns

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [94]:
geo_encoder_df = pd.DataFrame(geo_encoder, columns=onehot_encoder_geo.get_feature_names_out(['Geography'])) # get_feature_names_out is used to get the feature names after one hot encoding
geo_encoder_df.head()

ValueError: DataFrame constructor not properly called!

In [95]:
# drop the original Geography column
df = df.drop(['Geography'], axis=1) # axis=1 means drop columns, axis=0 means drop rows
## combine the dataframes
df = pd.concat([df, geo_encoder_df], axis=1) # axis=1 means combine columns, axis=0 means combine rows
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [96]:
# save the encoders and scaler
with open('label_encoder_gender.pkl', 'wb') as f:
    pickle.dump(lable_encoder, f) # dump is used to save the model
with open('onehot_encoder_geo.pkl', 'wb') as f:
    pickle.dump(onehot_encoder_geo, f) # dump is used to save the model

In [97]:
# divide the data into dependent and independent variables
X = df.drop(['Exited'], axis=1) # axis=1 means drop columns, axis=0 means drop rows
y= df['Exited'] # Exited is the dependent variable

# split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # random_state is used to get the same result every time

# scale the data
scaler = StandardScaler() # StandardScaler is used to scale the data, it is used to standardize the features by removing the mean and scaling to unit variance
X_train = scaler.fit_transform(X_train) # fit_transform is used to fit the data and transform it at the same time
X_test = scaler.transform(X_test) # transform is used to transform the data only

In [98]:
# save the scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f) # dump is used to save the model

In [16]:
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


### ANN Implementation

In [18]:
# import the libraries for the model
import tensorflow as tf # used to create the model
from tensorflow.keras.models import Sequential # used to create the model
from tensorflow.keras.layers import Dense # used to create the layers of the model
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard # used to stop the training if the model is not improving
import datetime # used to get the current time

In [20]:
# How many collumns are there in the dataset
X_train.shape[1] # shape[1] is used to get the number of columns in the dataset

12

In [21]:
# build the model
model = Sequential() # used to create the 
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],))) # add the first layer with 64 neurons and relu activation function, input_shape is used to define the shape of the input data
model.add(Dense(32, activation='relu')) # add the second layer with 32 neurons and relu activation function
model.add(Dense(1, activation='sigmoid')) # add the third output layer with 1 neurons and sigmoid activation function
model.summary() # used to get the summary of the model


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                832       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # adam is the optimizer, binary_crossentropy is the loss function, accuracy is the metric to be used

In [22]:
# optimizer learning rate can be changed by using the following code
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
opt = Adam(learning_rate=0.01) # learning rate is used to change the learning rate of the optimizer
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy']) # adam is the optimizer, binary_crossentropy is the loss function, accuracy is the metric to be used

In [23]:
# Set up the tensorboard callback
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping # used to create the tensorboard callback
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # used to get the current time
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1) # used to create the tensorboard callback

In [25]:
#set up early stopping
# used to create the early stopping callback and patience is used to define the number of epochs to wait before stopping the training if the model is not improving and restore_best_weights is used to restore the best weights of the model
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) 

In [69]:
# train the model
history = model.fit(
    X_train, y_train, validation_data =(X_test, y_test),
    epochs=100, # used to define the number of epochs to train the model
    callbacks=[tensorboard_callback, early_stopping_callback]) # used to train the model, epochs is used to define the number of epochs to train the model, 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [70]:
# save the model
model.save('model.h5') # h5 is used to save the model in h5 format, h5 is a file format used to save the model

  saving_api.save_model(


In [34]:
# load tensorboard extension in jupyter notebook
%load_ext tensorboard

In [37]:
%tensorboard --logdir logs/fit/20250408-064141 # used to load the tensorboard in jupyter notebook


Reusing TensorBoard on port 6007 (pid 17348), started 0:01:17 ago. (Use '!kill 17348' to kill it.)