In [33]:
import pandas as pd
from altair import layer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import pickle

from sklearn.utils.tests.test_pprint import GridSearchCV

In [34]:
import tensorflow as tf
print(f'GPUs Available : {tf.config.list_physical_devices}')

GPUs Available : <function list_physical_devices at 0x7a778e3fd260>


In [35]:
# read the dataset

dataset = pd.read_csv('Churn_Modelling.csv')
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [36]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [37]:
dataset.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [38]:
# checking for null values in the dataset

print(f'Null values present in the dataset : {dataset.isnull().sum().any()}')
dataset[dataset.isnull().any(axis=1)]

Null values present in the dataset : False


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited


In [39]:
# preprocessing

# drop irrelevant columns
df = dataset.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [40]:
gender_label_encoder = LabelEncoder()

df['Gender'] = gender_label_encoder.fit_transform(df['Gender'])

ohe_encoder = OneHotEncoder(sparse_output=False)
geo_ohe_encoder = ohe_encoder.fit_transform(df[['Geography']])

geo_encoded_df = pd.DataFrame(data=geo_ohe_encoder, columns=ohe_encoder.get_feature_names_out())

df = pd.concat([df.drop(['Geography'], axis=1), geo_encoded_df], axis=1)

In [41]:
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [42]:
# separate features and labels

X = df.drop(['Exited'], axis=1)
y = df['Exited']

print(f'Shape of X : {X.shape}, Shape of y : {y.shape}')

Shape of X : (10000, 12), Shape of y : (10000,)


In [43]:
# train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
with open('gender_label_encoder.pkl', 'wb') as file:
    pickle.dump(gender_label_encoder, file)

with open('geo_ohe_encoder.pkl', 'wb') as file:
    pickle.dump(geo_ohe_encoder, file)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

## ANN model implementation also finding the best hidden layers using Grid Search

In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime
from sklearn.pipeline import Pipeline
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [46]:
# define a function to create the model and try different parameters using Keras Classifier
# Fixed: Pass input_dim as parameter instead of referencing X_train directly

def create_model(neurons=32, layers=1):
    model = Sequential()
    model.add(Dense(neurons, activation='relu', input_shape=(X_train.shape[1],)))
    
    # This for loop is NOT infinite - it runs (layers-1) times
    # With layers=1, runs 0 times; with layers=2, runs 1 time
    for _ in range(layers - 1):
        model.add(Dense(neurons, activation='relu'))
    
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [47]:
# create keras classifier
# Fixed: use 'model' parameter instead of deprecated 'build_fn'
# Don't pass build parameters here - they'll be passed via param_grid
model = KerasClassifier(model=create_model, verbose=0, layers=2, neurons=16)

In [48]:
# define the grid search params

param_grid = {
    'neurons': [16, 32, 64, 128],
    'layers': [1, 2],
    'epochs': [50, 100]
}

In [49]:
# # perform grid search
# # MEMORY FIX: Changed n_jobs from -1 to 2 to reduce RAM usage
# # n_jobs=-1 creates too many parallel processes, each consuming RAM
# # Using n_jobs=2 will be slower but won't deplete RAM
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=2, cv=3, verbose=2)
# grid_result = grid.fit(X_train_scaled, y_train)  # Fixed: use X_train_scaled instead of X_train
#
# # get the best model score and best model params
# print(f'Best params: {grid_result.best_params_}')
# print(f'Best score: {grid_result.best_score_}')

In [52]:
optimized_model = Sequential([
    Dense(units=16, activation='relu', input_shape=(X_train.shape[1], )),
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

optimized_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [54]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir)

In [55]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [56]:
history = optimized_model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping_callback, tensorflow_callback]
)

Epoch 1/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.6201 - loss: 1318.9006 - val_accuracy: 0.5592 - val_loss: 37.2286
Epoch 2/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6973 - loss: 27.8178 - val_accuracy: 0.7364 - val_loss: 11.8831
Epoch 3/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6911 - loss: 23.7058 - val_accuracy: 0.7680 - val_loss: 9.0402
Epoch 4/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6936 - loss: 18.1651 - val_accuracy: 0.6404 - val_loss: 22.2542
Epoch 5/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6867 - loss: 18.8313 - val_accuracy: 0.7984 - val_loss: 27.2257
Epoch 6/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6831 - loss: 18.8738 - val_accuracy: 0.7972 - val_loss: 26.6285
Epoch 7/1

In [57]:
optimized_model.save('model.keras')

In [58]:
%load_ext tensorboard

In [59]:
# Set TensorBoard binary path for WSL environment
import os
os.environ['TENSORBOARD_BINARY'] = '/home/hashaneranga/.virtualenvs/MachineLearning_and_datamining_fundamentals/bin/tensorboard'

In [60]:
%tensorboard --logdir logs/fit

In [61]:
# evaluate model on the test data

test_loss, test_mae = optimized_model.evaluate(X_test_scaled, y_test)
print(f'test loss : {test_loss}')
print(f'test mae : {test_mae}')

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5568 - loss: 0.7486
test loss : 0.7486270666122437
test mae : 0.5568000078201294
