In [325]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [326]:
## Load the dataset
df = pd.read_csv("artifacts/cleaned_preprocessed_data.csv")
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


# Split Train Test Sets

In [327]:
## Divide the dataset into independent and dependent features
X = df.drop('Exited',axis=1)
y = df['Exited']

## Split the data in training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Scale these features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [328]:
(pd.DataFrame(X_train).shape, pd.DataFrame(X_test).shape)

((8000, 12), (2000, 12))

In [329]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.356500,0.913248,-0.655786,0.345680,-1.218471,0.808436,0.649203,0.974817,1.367670,1.001501,-0.579467,-0.576388
1,-0.203898,0.913248,0.294938,-0.348369,0.696838,0.808436,0.649203,0.974817,1.661254,-0.998501,1.725723,-0.576388
2,-0.961472,0.913248,-1.416365,-0.695393,0.618629,-0.916688,0.649203,-1.025834,-0.252807,-0.998501,-0.579467,1.734942
3,-0.940717,-1.094993,-1.131148,1.386753,0.953212,-0.916688,0.649203,-1.025834,0.915393,1.001501,-0.579467,-0.576388
4,-1.397337,0.913248,1.625953,1.386753,1.057449,-0.916688,-1.540351,-1.025834,-1.059600,1.001501,-0.579467,-0.576388
...,...,...,...,...,...,...,...,...,...,...,...,...
7995,1.207474,0.913248,1.435808,1.039728,-0.102301,-0.916688,0.649203,0.974817,-0.539860,1.001501,-0.579467,-0.576388
7996,0.314989,-1.094993,1.816097,-1.389442,-1.218471,-0.916688,0.649203,0.974817,-1.733882,1.001501,-0.579467,-0.576388
7997,0.865009,-1.094993,-0.085351,-1.389442,-1.218471,2.533560,-1.540351,-1.025834,-0.142765,1.001501,-0.579467,-0.576388
7998,0.159323,0.913248,0.390011,1.039728,1.827259,-0.916688,0.649203,-1.025834,-0.050826,1.001501,-0.579467,-0.576388


In [330]:
y_train

9254    0
1561    0
1670    1
6087    1
6669    1
       ..
5734    0
5191    0
5390    1
860     1
7270    0
Name: Exited, Length: 8000, dtype: int64

In [331]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.577496,0.913248,-0.655786,-0.695393,0.329937,0.808436,-1.540351,-1.025834,-1.019605,-0.998501,1.725723,-0.576388
1,-0.297297,0.913248,0.390011,-1.389442,-1.218471,0.808436,0.649203,0.974817,0.798883,1.001501,-0.579467,-0.576388
2,-0.525607,-1.094993,0.485083,-0.348369,-1.218471,0.808436,0.649203,-1.025834,-0.727980,-0.998501,-0.579467,1.734942
3,-1.511492,0.913248,1.911170,1.039728,0.689272,0.808436,0.649203,0.974817,1.221387,-0.998501,1.725723,-0.576388
4,-0.951094,-1.094993,-1.131148,0.692704,0.782839,-0.916688,0.649203,0.974817,0.247560,-0.998501,-0.579467,1.734942
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.515230,-1.094993,1.340735,-0.001345,0.354911,-0.916688,-1.540351,0.974817,-0.963100,-0.998501,1.725723,-0.576388
1996,-0.442586,0.913248,-1.321293,1.733777,-1.218471,-0.916688,-1.540351,0.974817,0.164545,1.001501,-0.579467,-0.576388
1997,0.813120,-1.094993,0.770301,0.692704,-1.218471,-0.916688,0.649203,-1.025834,-1.165916,1.001501,-0.579467,-0.576388
1998,0.418766,0.913248,-0.941003,-0.348369,-1.218471,-0.916688,0.649203,-1.025834,-0.411635,1.001501,-0.579467,-0.576388


In [332]:
y_test

6252    0
4684    0
1731    0
4742    0
4521    0
       ..
6412    1
8285    0
7853    1
1095    1
6929    1
Name: Exited, Length: 2000, dtype: int64

In [333]:
with open('artifacts/classification_scaler.pkl','wb') as file:
    pickle.dump(scaler,file)

# Deep Learning ANN Classification Implementation

In [336]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras import Input
import datetime

In [337]:
## Build Our ANN Sequential Model
model = Sequential([
    Input(shape=(X_train.shape[1],)), # Explicitly define the input shape
    Dense(64, activation='relu'), ## HL1 Connected with input layer
    Dense(32, activation='relu'), ## HL2
    Dense(1,activation='sigmoid')  ## output layer
])

In [338]:
model.summary()

In [339]:
log_dir = "logs/fit/classification/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [340]:
## Set up the Tensorboard
tensorboard_callback = TensorBoard(log_dir= log_dir, histogram_freq= 1)

In [341]:
## Set up Early Stopping
early_stopping_callback = EarlyStopping(monitor= 'val_loss', patience= 10, restore_best_weights= True)

In [342]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
#loss = tf.keras.losses.BinaryCrossentropy()

# Compile the Model
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=['accuracy'])

### Train the model
history = model.fit(
    X_train,y_train,validation_data=(X_test,y_test),epochs=100,
    callbacks=[tensorboard_callback,early_stopping_callback]
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8030 - loss: 0.4340 - val_accuracy: 0.8560 - val_loss: 0.3521
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 980us/step - accuracy: 0.8535 - loss: 0.3523 - val_accuracy: 0.8615 - val_loss: 0.3461
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8546 - loss: 0.3512 - val_accuracy: 0.8615 - val_loss: 0.3519
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 990us/step - accuracy: 0.8601 - loss: 0.3426 - val_accuracy: 0.8635 - val_loss: 0.3415
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 933us/step - accuracy: 0.8640 - loss: 0.3312 - val_accuracy: 0.8670 - val_loss: 0.3382
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 874us/step - accuracy: 0.8624 - loss: 0.3323 - val_accuracy: 0.8590 - val_loss: 0.3457
Epoch 7/100
[

In [343]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [344]:
%tensorboard --logdir logs/fit/classification

Reusing TensorBoard on port 6007 (pid 78873), started 1 day, 1:07:54 ago. (Use '!kill 78873' to kill it.)

# Training ANN Sequential Model By Hyperparameter Tuning

In [345]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras import Input

In [346]:
# Define the custom class
class CustomKerasClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, neurons=32, layers=1, learning_rate=0.01, epochs=50, batch_size=32, verbose=1, callbacks=None, validation_split=0.2):
        self.neurons = neurons
        self.layers = layers
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.callbacks = callbacks
        self._estimator_type = "classifier"
        self.validation_split = validation_split

    def build_model(self):
        model = Sequential()
        model.add(Input(shape=(self.input_shape_,)))  # Explicitly define the input shape
        model.add(Dense(self.neurons, activation='relu'))
        
        for _ in range(self.layers - 1):
            model.add(Dense(self.neurons, activation='relu'))
        
        model.add(Dense(1, activation='sigmoid'))
        opt = Adam(learning_rate=self.learning_rate)
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def fit(self, X, y):
        self.input_shape_ = X.shape[1]
        self.model_ = self.build_model()
        self.model_.fit(
            X, 
            y, 
            epochs=self.epochs, 
            batch_size=self.batch_size, 
            verbose=self.verbose, 
            callbacks=self.callbacks,
            validation_split = self.validation_split
        )
        return self

    def predict(self, X):
        predictions = (self.model_.predict(X) > 0.5).astype("int32")
        return np.squeeze(predictions)

    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)


In [347]:
# Define the grid search parameters
param_grid = {
    'neurons': [16, 32, 64, 128],
    'layers': [1, 2],
    'learning_rate': [0.01, 0.001],
    'epochs': [50, 100],
    'batch_size': [16, 32]
}

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model = CustomKerasClassifier(callbacks=[early_stopping_callback])

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, verbose=1)
grid_result = grid.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

best_model = grid_result.best_estimator_

Fitting 3 folds for each of 64 candidates, totalling 192 fits
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6973 - loss: 0.5763 - val_accuracy: 0.8463 - val_loss: 0.3789
Epoch 2/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8116 - loss: 0.4505 - val_accuracy: 0.8604 - val_loss: 0.3659
Epoch 2/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7590 - loss: 0.5005 - val_accuracy: 0.8529 - val_loss: 0.3687
Epoch 2/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8070 - loss: 0.4563 - val_accuracy: 0.8351 - val_loss: 0.3701
Epoch 2/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7533 - loss: 0.4948 - val_accuracy: 0.8229 - val_loss: 0.4081
Epoch 2/50
[1m267/267[0m [

In [348]:
best_model

In [349]:
grid_result.best_score_

np.float64(0.8581241673372973)

In [350]:
grid_result.best_params_

{'batch_size': 32,
 'epochs': 50,
 'layers': 2,
 'learning_rate': 0.01,
 'neurons': 64}

In [351]:
# Evaluate on the test set
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 676us/step
Test Accuracy: 0.8575


In [352]:
y_pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int32)

In [353]:
class CustomData:
    def __init__(  self,
        credit_score: int,
        geography: str,
        gender: str,
        age: int,
        tenure: int,
        balance: int,
        number_of_products: int,
        has_cr_card: bool,
        is_active_member: bool,
        estimated_salary: int):

        self.credit_score = credit_score
        self.geography = geography
        self.gender = gender
        self.age = age
        self.tenure = tenure
        self.balance = balance
        self.number_of_products = number_of_products
        self.has_cr_card = has_cr_card
        self.is_active_member = is_active_member
        self.estimated_salary = estimated_salary

    def get_data_as_data_frame(self):
        custom_data_input_dict = {
            "CreditScore": [self.credit_score],
            "Geography": [self.geography],
            "Gender": [self.gender],
            "Age": [self.age],
            "Tenure": [self.tenure],
            "Balance": [self.balance],
            "NumOfProducts": [self.number_of_products],
            "HasCrCard": [self.has_cr_card],
            "IsActiveMember": [self.is_active_member],
            "EstimatedSalary": [self.estimated_salary]
        }
        return pd.DataFrame(custom_data_input_dict)
        
    def __str__(self):
        return f"CreditScore={self.credit_score}, Geography = {self.geography}, Gender = {self.gender},\nAge = {self.age}, Tenure = {self.tenure}, Balance = {self.balance},\nNumOfProducts = {self.number_of_products}, HasCrCard = {self.has_cr_card}, IsActiveMember = {self.is_active_member},\nEstimatedSalary = {self.estimated_salary})"


In [354]:
# Example input data
input_data = {
    'CreditScore': 600,
    'Geography': 'France',
    'Gender': 'Male',
    'Age': 40,
    'Tenure': 3,
    'Balance': 60000,
    'NumOfProducts': 2,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 50000
}

In [355]:
# load the encoder and scaler
with open('artifacts/one_hot_encoder_geo.pkl','rb') as file:
    label_encoder_geo=pickle.load(file)

with open('artifacts/label_encoder_gender.pkl', 'rb') as file:
    label_encoder_gender = pickle.load(file)

with open('artifacts/classification_scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

In [356]:
# One-hot encode 'Geography'
geo_encoded = label_encoder_geo.transform([[input_data['Geography']]]).toarray()
geo_encoded_df = pd.DataFrame(geo_encoded, columns=label_encoder_geo.get_feature_names_out(['Geography']))
geo_encoded_df



Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0


In [357]:
input_df=pd.DataFrame([input_data])

input_df['Gender']=label_encoder_gender.transform(input_df['Gender'])
input_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,600,France,1,40,3,60000,2,1,1,50000


In [358]:
## concatination with one hot encoded 
input_df=pd.concat([input_df.drop("Geography",axis=1),geo_encoded_df],axis=1)
input_df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,600,1,40,3,60000,2,1,1,50000,1.0,0.0,0.0


In [359]:
# Scaling the input data
input_scaled=scaler.transform(input_df)
input_scaled

array([[-0.53598516,  0.91324755,  0.10479359, -0.69539349, -0.25781119,
         0.80843615,  0.64920267,  0.97481699, -0.87683221,  1.00150113,
        -0.57946723, -0.57638802]])

In [360]:
# Prediction
prediction=best_model.predict(input_scaled)
prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step


array(0, dtype=int32)

In [361]:
prediction_probability = prediction
prediction_probability

array(0, dtype=int32)

In [362]:
if prediction_probability > 0.5:
    print('The customer is likely to churn.')
else:
    print('The customer is not likely to churn.')

The customer is not likely to churn.


In [364]:
best_model.model_.save("artifacts/classifier_model.keras")