In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [9]:
# Load the raw churn dataset from a CSV file in the current working directory.
data = pd.read_csv("Churn_Modelling.csv")
# Preview the first five rows to sanity-check the columns that were read.
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [10]:
# Preprocessing, step 1: drop irrelevant features.
# RowNumber, CustomerId and Surname are removed before any encoding is applied.
# `columns=[...]` is the keyword form of `axis=1`
# (in pandas, axis=0 targets rows and axis=1 targets columns).
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"])

In [11]:
# Geography and Gender are categorical features, so they must be mapped to
# numerical values before they can be used as model inputs.
#
# Gender is encoded with LabelEncoder, which converts categories to integers
# (one number per unique label). The fitted encoder is kept in
# `label_encoder_gender` so the learned mapping stays available after this cell.
label_encoder_gender = LabelEncoder().fit(data["Gender"])
data["Gender"] = label_encoder_gender.transform(data["Gender"])

data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [12]:
# Geography is NOT label-encoded. LabelEncoder would map the categories to
# integers (e.g. France = 0, Germany = 1, Spain = 2), which introduces a false
# ordinal relationship (Spain > Germany > France). During training the ANN may
# wrongly interpret a higher label as carrying more importance.
#
# One-hot encoding avoids this by representing each category as an independent
# vector, e.g. France = [1, 0, 0], Germany = [0, 1, 0], Spain = [0, 0, 1].

# One-hot encoding - Geography column

from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes the encoder return a dense array rather than a sparse matrix.
onehot_encoder_geo = OneHotEncoder(sparse_output=False)
# Double brackets select a one-column DataFrame (2-D input) instead of a Series.
onehot_encoder_geo.fit(data[["Geography"]])
geo_encoder = onehot_encoder_geo.transform(data[["Geography"]])

print(geo_encoder)
print(onehot_encoder_geo.get_feature_names_out(["Geography"]))

[[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]
['Geography_France' 'Geography_Germany' 'Geography_Spain']


In [13]:
# Wrap the one-hot array in a DataFrame with readable column names.
# Reusing data.index labels every encoded row exactly like the row it was computed
# from, so the column-wise pd.concat that follows (which aligns on index labels,
# not on position) cannot misalign rows or introduce NaNs if `data` ever carries
# a non-default index (e.g. after filtering rows).
geo_encoded_df = pd.DataFrame(
    geo_encoder,
    columns=onehot_encoder_geo.get_feature_names_out(["Geography"]),
    index=data.index,
)

In [14]:
# Combining the one-hot encoded columns with the original data:
# the raw Geography column is dropped and the encoded columns are appended
# side by side.
data = pd.concat(
    [data.drop(columns=["Geography"]), geo_encoded_df],
    axis="columns",
)
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [15]:
# Saving the encoders
# Both fitted encoders are pickled to disk, one file per encoder, so the exact
# category mappings learned above can be loaded again instead of being re-fitted.
for pkl_path, fitted_encoder in (
    ("label_encoder_gender.pkl", label_encoder_gender),
    ("onehot_encoder_geo.pkl", onehot_encoder_geo),
):
    with open(pkl_path, "wb") as file:
        pickle.dump(fitted_encoder, file)

In [16]:
# Separate the dependent feature (Exited) from the independent features.
X, y = data.drop(columns=["Exited"]), data["Exited"]

# Hold out 20% of the rows as the test set; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training rows only, and
# those fitted statistics are then applied to both the training and test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
# Show the scaled training features followed by the scaled test features.
print(X_train, X_test, sep="\n")

[[ 0.35649971  0.91324755 -0.6557859  ...  1.00150113 -0.57946723
  -0.57638802]
 [-0.20389777  0.91324755  0.29493847 ... -0.99850112  1.72572313
  -0.57638802]
 [-0.96147213  0.91324755 -1.41636539 ... -0.99850112 -0.57946723
   1.73494238]
 ...
 [ 0.86500853 -1.09499335 -0.08535128 ...  1.00150113 -0.57946723
  -0.57638802]
 [ 0.15932282  0.91324755  0.3900109  ...  1.00150113 -0.57946723
  -0.57638802]
 [ 0.47065475  0.91324755  1.15059039 ... -0.99850112  1.72572313
  -0.57638802]]
[[-0.57749609  0.91324755 -0.6557859  ... -0.99850112  1.72572313
  -0.57638802]
 [-0.29729735  0.91324755  0.3900109  ...  1.00150113 -0.57946723
  -0.57638802]
 [-0.52560743 -1.09499335  0.48508334 ... -0.99850112 -0.57946723
   1.73494238]
 ...
 [ 0.81311987 -1.09499335  0.77030065 ...  1.00150113 -0.57946723
  -0.57638802]
 [ 0.41876609  0.91324755 -0.94100321 ...  1.00150113 -0.57946723
  -0.57638802]
 [-0.24540869  0.91324755  0.00972116 ... -0.99850112  1.72572313
  -0.57638802]]


In [18]:
# Pickle the fitted scaler to disk so the same scaling can be re-applied later
# without re-fitting.
with open("scaler.pkl", mode="wb") as file:
    pickle.dump(scaler, file)

# ANN Implementation

### Steps in ANN Implementation

1. **Sequential Network**  
   Build the model using a sequential architecture.

2. **Dense - Hidden Neuron Creation**  
   Add dense layers to create hidden neurons.

3. **Activation Function**  
   Choose activation functions such as `sigmoid`, `tanh`, `relu`, or `leakyrelu`.

4. **Optimizer - Backpropagation**  
   Select an optimizer (e.g., Adam, SGD) responsible for updating the weights during backpropagation.

5. **Loss Function**  
   Define the loss function to measure model error.

6. **Metrics**  
   - Classification: Accuracy  
   - Regression: MSE (Mean Squared Error), MAE (Mean Absolute Error)

7. **Training & Logging**  
   Train the model and log results using TensorBoard for visualization.

In [19]:
import tensorflow as tf

# Report the installed TensorFlow version and how many GPU devices TensorFlow can see.
print(f"TensorFlow version: {tf.__version__}")
print(f"Num GPUs Available: {len(tf.config.list_physical_devices('GPU'))}")

TensorFlow version: 2.16.2
Num GPUs Available: 1


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [None]:
# Build ANN Model

# Seed the Python, NumPy and TensorFlow random number generators before any layer
# is created, so weight initialisation starts from the same values on every run.
# 42 mirrors the random_state already used for train_test_split.
# NOTE(review): some GPU kernels can still be non-deterministic - confirm if
# bit-exact reproducibility is required.
tf.keras.utils.set_random_seed(42)

# One input unit per preprocessed feature column.
input_dim = X_train.shape[1]
model = Sequential([
    Input(shape=(input_dim,)),
    Dense(64, activation="relu"),    # Hidden layer 1 - connected to the input layer
    Dense(32, activation="relu"),    # Hidden layer 2
    Dense(1, activation="sigmoid"),  # Output layer - binary classification
])

In [26]:
# Print a per-layer overview of the network built above.
model.summary()

In [27]:
# Configure how the model will be trained. compile() itself does not run any
# forward or backward propagation; it only wires up:
#   - the optimizer that applies the weight updates (Adam, learning rate 1e-3)
#   - the loss to minimise (binary cross-entropy, matching the sigmoid output)
#   - the metrics to report (accuracy, plus an AUC metric logged as "auc")
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        tf.keras.metrics.AUC(name="auc"),
    ],
)

In [35]:
# Set up TensorBoard
# Every run logs into its own timestamped sub-directory under logs/fit/.
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"
# histogram_freq=1 asks the callback to record histograms every epoch.
tensorflow_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)

In [32]:
# Set up early stopping
# Training stops once val_loss has gone 10 epochs without improving, and the
# weights from the best epoch are restored.
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)

In [36]:
# Train the model
# NOTE(review): the held-out test split doubles as the validation set that drives
# early stopping and best-weight restoration, so scores later reported on
# X_test / y_test will be optimistic. Consider carving a separate validation
# split out of the training data.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,  # upper bound; the early-stopping callback may end training sooner
    validation_data=(X_test, y_test),
    callbacks=[tensorflow_callback, early_stopping_callback],
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.8086 - auc: 0.7571 - loss: 0.4360 - val_accuracy: 0.8135 - val_auc: 0.7769 - val_loss: 0.4174
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.8101 - auc: 0.7602 - loss: 0.4347 - val_accuracy: 0.8140 - val_auc: 0.7788 - val_loss: 0.4159
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.8098 - auc: 0.7582 - loss: 0.4356 - val_accuracy: 0.8110 - val_auc: 0.7789 - val_loss: 0.4192
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.8086 - auc: 0.7572 - loss: 0.4360 - val_accuracy: 0.8110 - val_auc: 0.7787 - val_loss: 0.4176
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.8086 - auc: 0.7549 - loss: 0.4378 - val_accuracy: 0.8115 - val_auc: 0.7700 - val_loss: 0.4219
Epoch 6/100
[1m250/

In [37]:
# Save the trained model to an HDF5 file in the working directory.
# NOTE(review): with the TensorFlow 2.16 / Keras 3 stack reported above, ".h5" is
# presumably treated as a legacy format and ".keras" is recommended - confirm, and
# check what loads "model.h5" before renaming it.
model.save("model.h5")



In [39]:
%load_ext tensorboard
%tensorboard --logdir logs/fit

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 76410), started 0:01:19 ago. (Use '!kill 76410' to kill it.)