In [1]:
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,LabelEncoder

# Loading The Dataset

In [2]:
 
# Read the churn dataset; the CSV is expected in the current working directory.
data = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

# Preview the first five rows to sanity-check the columns.
data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Preprocess the Data
### Dropping irrelevant features

In [3]:
# Remove the columns this notebook treats as irrelevant features.
# Reassigning (rather than mutating with inplace=True) together with
# errors='ignore' keeps the cell idempotent: running it a second time, when the
# columns are already gone, is a no-op instead of a KeyError.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [4]:
# Show a single row to confirm which columns remain.
data.head(n=1)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1


### Encoding categorical data

In [5]:
# Learn the Gender categories, then overwrite the column with their integer codes.
# The fitted encoder is kept in `le` so it can be pickled later.
le = LabelEncoder()
le.fit(data['Gender'])
data['Gender'] = le.transform(data['Gender'])

In [6]:

# Inspect the first four rows: Gender should now hold integer codes.
data.head(n=4)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0


### One-hot encoding

In [7]:
# One-hot encode the Geography column.
# OneHotEncoder is imported with the other dependencies in the notebook's first
# cell, so all imports live in one place.
onehot_geo = OneHotEncoder()

# Double brackets pass a one-column DataFrame (2-D input) to the encoder.
geo_en = onehot_geo.fit_transform(data[['Geography']])
geo_en

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [8]:
# List the generated column names, one per Geography category seen during fit.
onehot_geo.get_feature_names_out()

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [9]:
# Convert the sparse encoder output to a dense array so the values can be inspected.
geo_en.toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [10]:
# Wrap the one-hot matrix in a DataFrame with readable column names.
# Reusing data.index guarantees the rows line up with `data` when the two frames
# are concatenated column-wise, even if `data` does not carry a default 0..n-1 index.
geo_en_df = pd.DataFrame(
    data=geo_en.toarray(),
    columns=onehot_geo.get_feature_names_out(),
    index=data.index,
)
geo_en_df.head(3)

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0


In [11]:
# Swap the raw Geography column for its one-hot encoded columns (aligned on index).
data = pd.concat(
    objs=[data.drop(columns='Geography'), geo_en_df],
    axis='columns',
)

In [12]:
# Check that the Geography_* columns were appended and Geography is gone.
data.head(n=4)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0


### Save the encoders (the scaler is saved below, after it has been fitted)

In [13]:
# Persist both fitted encoders so the same transformations can be reapplied later.
for encoder, pkl_path in (
    (le, 'label_encoder_gender.pkl'),
    (onehot_geo, 'onehot_encoder_geo.pkl'),
):
    with open(pkl_path, mode='wb') as fh:
        pickle.dump(encoder, fh)

### Splitting the dataset into features/target and train/test sets

In [14]:
# Quick look at the final column layout before separating features and target.
data.head(n=1)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0


In [15]:
# Target: the Exited column. Features: every other column.
y = data['Exited']
X = data.drop(columns=['Exited'])

In [16]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Shapes of the training features and training target.
(X_train.shape, y_train.shape)

((8000, 12), (8000,))

In [18]:
# Standardise the features.
# The scaler learns its statistics from the training split only and those same
# statistics are applied to the test split. Fitting on X_test as well would
# compute scaling parameters from test data and leak test-set information into
# the model, defeating the purpose of a held-out test set.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [37]:
# Shape of the scaled test features.
X_test.shape

(2000, 12)

In [19]:
# Confirm that scaling left the training shapes unchanged.
(X_train.shape, y_train.shape)

((8000, 12), (8000,))

In [20]:
# Persist the fitted scaler so the same scaling can be reapplied later.
with open('scaler.pkl', mode='wb') as scaler_fh:
    pickle.dump(scaler, scaler_fh)

# ANN Implementation

In [21]:
import tensorflow as tf

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [23]:
# Number of feature columns, as the one-element tuple used for the model's input shape.
(X_train.shape[1],)

(12,)

In [24]:
# Feed-forward binary classifier.
# The input is declared with an explicit Input object instead of passing
# `input_shape=` to the first Dense layer, which Keras flags with a warning
# for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
    Dense(64, activation='relu'),               # hidden layer 1
    Dense(32, activation='relu'),               # hidden layer 2
    Dense(1, activation='sigmoid'),             # output layer: single sigmoid unit
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [25]:
# Print the layer-by-layer summary of the model.
model.summary()

In [26]:
# Binary cross-entropy pairs with the single sigmoid output unit.
loss = tf.keras.losses.BinaryCrossentropy()

# Adam optimizer built as an object so the learning rate can be set explicitly.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

In [27]:
# Compile the model with the optimizer and loss objects, tracking accuracy.
model.compile(
    optimizer=opt,
    loss=loss,
    metrics=['accuracy'],
)

In [28]:
# Set up TensorBoard logging.
# Each run writes to its own timestamped directory under logs/fit, which is the
# directory the `%tensorboard --logdir logs/fit` cell reads from. The timestamp
# avoids '/' and ':' so it is a single directory name that is valid on every
# platform (':' is not allowed in Windows paths).
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tf_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [29]:
# Early stopping: halt once val_loss has not improved for 10 epochs and
# roll the model back to the best weights seen.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [30]:
# Train for up to 150 epochs; early stopping may end the run sooner.
# NOTE(review): the test split doubles as validation data here, so early
# stopping is tuned on the same rows later used for evaluation. Consider
# carving a separate validation split out of the training data.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=150,
    validation_data=(X_test, y_test),
    callbacks=[tf_callback, early_stopping_callback],
)

Epoch 1/150
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8169 - loss: 0.4345 - val_accuracy: 0.8525 - val_loss: 0.3583
Epoch 2/150
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8548 - loss: 0.3539 - val_accuracy: 0.8610 - val_loss: 0.3468
Epoch 3/150
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8622 - loss: 0.3396 - val_accuracy: 0.8520 - val_loss: 0.3540
Epoch 4/150
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8684 - loss: 0.3342 - val_accuracy: 0.8510 - val_loss: 0.3565
Epoch 5/150
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8556 - loss: 0.3415 - val_accuracy: 0.8635 - val_loss: 0.3450
Epoch 6/150
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8664 - loss: 0.3318 - val_accuracy: 0.8555 - val_loss: 0.3526
Epoch 7/150
[1m250/25

In [31]:
# Persist the trained model to disk in HDF5 format (.h5).
model.save(filepath='model.h5')



In [32]:
# Load the TensorBoard notebook extension (reload_ext also works if it is already loaded).
%reload_ext tensorboard

In [33]:
# Launch TensorBoard inline, reading event files from the logs/fit directory.
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6008 (pid 5824), started 20:06:59 ago. (Use '!kill 5824' to kill it.)

In [None]:
# loading pickle files
