In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [5]:
## Load the dataset
df = pd.read_csv("artifacts/cleaned_preprocessed_data.csv")
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


# Split Train Test Sets

In [6]:
## Divide the dataset into independent and dependent features
X = df.drop('EstimatedSalary', axis=1)
y = df['EstimatedSalary']

## Split the data in training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Scale these features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
(pd.DataFrame(X_train).shape, pd.DataFrame(X_test).shape)

((8000, 12), (2000, 12))

In [8]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.356500,0.913248,-0.655786,0.345680,-1.218471,0.808436,0.649203,0.974817,-0.50858,1.001501,-0.579467,-0.576388
1,-0.203898,0.913248,0.294938,-0.348369,0.696838,0.808436,0.649203,0.974817,-0.50858,-0.998501,1.725723,-0.576388
2,-0.961472,0.913248,-1.416365,-0.695393,0.618629,-0.916688,0.649203,-1.025834,1.96626,-0.998501,-0.579467,1.734942
3,-0.940717,-1.094993,-1.131148,1.386753,0.953212,-0.916688,0.649203,-1.025834,1.96626,1.001501,-0.579467,-0.576388
4,-1.397337,0.913248,1.625953,1.386753,1.057449,-0.916688,-1.540351,-1.025834,1.96626,1.001501,-0.579467,-0.576388
...,...,...,...,...,...,...,...,...,...,...,...,...
7995,1.207474,0.913248,1.435808,1.039728,-0.102301,-0.916688,0.649203,0.974817,-0.50858,1.001501,-0.579467,-0.576388
7996,0.314989,-1.094993,1.816097,-1.389442,-1.218471,-0.916688,0.649203,0.974817,-0.50858,1.001501,-0.579467,-0.576388
7997,0.865009,-1.094993,-0.085351,-1.389442,-1.218471,2.533560,-1.540351,-1.025834,1.96626,1.001501,-0.579467,-0.576388
7998,0.159323,0.913248,0.390011,1.039728,1.827259,-0.916688,0.649203,-1.025834,1.96626,1.001501,-0.579467,-0.576388


In [9]:
y_train

9254    179093.26
1561    195978.86
1670     85891.02
6087    153080.40
6669     39488.04
          ...    
5734     69381.05
5191       706.50
5390     92220.12
860      97508.04
7270     53581.14
Name: EstimatedSalary, Length: 8000, dtype: float64

In [10]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.577496,0.913248,-0.655786,-0.695393,0.329937,0.808436,-1.540351,-1.025834,-0.50858,-0.998501,1.725723,-0.576388
1,-0.297297,0.913248,0.390011,-1.389442,-1.218471,0.808436,0.649203,0.974817,-0.50858,1.001501,-0.579467,-0.576388
2,-0.525607,-1.094993,0.485083,-0.348369,-1.218471,0.808436,0.649203,-1.025834,-0.50858,-0.998501,-0.579467,1.734942
3,-1.511492,0.913248,1.911170,1.039728,0.689272,0.808436,0.649203,0.974817,-0.50858,-0.998501,1.725723,-0.576388
4,-0.951094,-1.094993,-1.131148,0.692704,0.782839,-0.916688,0.649203,0.974817,-0.50858,-0.998501,-0.579467,1.734942
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.515230,-1.094993,1.340735,-0.001345,0.354911,-0.916688,-1.540351,0.974817,1.96626,-0.998501,1.725723,-0.576388
1996,-0.442586,0.913248,-1.321293,1.733777,-1.218471,-0.916688,-1.540351,0.974817,-0.50858,1.001501,-0.579467,-0.576388
1997,0.813120,-1.094993,0.770301,0.692704,-1.218471,-0.916688,0.649203,-1.025834,1.96626,1.001501,-0.579467,-0.576388
1998,0.418766,0.913248,-0.941003,-0.348369,-1.218471,-0.916688,0.649203,-1.025834,1.96626,1.001501,-0.579467,-0.576388


In [11]:
y_test

6252     41788.37
4684    146379.30
1731     58561.31
4742    170679.74
4521    114669.79
          ...    
6412     45038.29
8285    109895.16
7853     33373.26
1095     76755.99
6929    107674.30
Name: EstimatedSalary, Length: 2000, dtype: float64

In [12]:
with open('artifacts/regression_scaler.pkl','wb') as file:
    pickle.dump(scaler,file)

# Deep Learning ANN Regression Implementation

In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras import Input
import datetime

In [14]:
# Build the model
model = Sequential([
    Input(shape=(X_train.shape[1],)), # Explicitly define the input shape
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')  # Output layer for regression
])

model.summary()

In [15]:
log_dir = "logs/fit/regression/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [16]:
# Set up the Tensorboard
tensorboard_callback = TensorBoard(log_dir= log_dir, histogram_freq= 1)

In [17]:
# Set up Early Stopping
early_stopping_callback = EarlyStopping(monitor= 'val_loss', patience= 10, restore_best_weights= True)

In [18]:
# compile the model
model.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mae'])

In [19]:
# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=[early_stopping_callback, tensorboard_callback]
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 99911.7891 - mae: 99911.7891 - val_loss: 98503.1250 - val_mae: 98503.1250
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 99897.1797 - mae: 99897.1797 - val_loss: 96862.5469 - val_mae: 96862.5469
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 97281.7578 - mae: 97281.7578 - val_loss: 92673.5859 - val_mae: 92673.5859
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 913us/step - loss: 93655.2266 - mae: 93655.2266 - val_loss: 85758.3594 - val_mae: 85758.3594
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 865us/step - loss: 85960.4688 - mae: 85960.4688 - val_loss: 77036.4297 - val_mae: 77036.4297
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 978us/step - loss: 76638.3672 - mae: 76638.3672 - val_loss: 68153.9062 - 

In [20]:
%load_ext tensorboard

In [22]:
%tensorboard --logdir logs/fit/regression

Reusing TensorBoard on port 6009 (pid 16280), started 0:00:02 ago. (Use '!kill 16280' to kill it.)

# Training ANN Sequential Model By Hyperparameter Tuning

In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.base import BaseEstimator, RegressorMixin
from tensorflow.keras import Input

In [24]:

class CustomKerasRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, neurons=32, layers=1, learning_rate=0.01, epochs=50, batch_size=32, verbose=1, callbacks=None, validation_split = 0.2):
        self.neurons = neurons
        self.layers = layers
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.callbacks = callbacks
        self._estimator_type = "regressor"
        self.validation_split = validation_split

    def build_model(self):
        model = Sequential()
        model.add(Input(shape=(self.input_shape_,)))  # Explicitly define the input shape
        model.add(Dense(self.neurons, activation='relu'))
        
        for _ in range(self.layers - 1):
            model.add(Dense(self.neurons, activation='relu'))
        
        model.add(Dense(1, activation='linear'))  # Output layer for regression
        
        model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
        return model

    def fit(self, X, y):
        self.input_shape_ = X.shape[1]
        self.model_ = self.build_model()
        self.model_.fit(
            X, 
            y, 
            epochs=self.epochs, 
            batch_size=self.batch_size, 
            verbose=self.verbose, 
            callbacks=self.callbacks,
            validation_split = self.validation_split
        )
        return self

    def predict(self, X):
        predictions = self.model_.predict(X)
        return np.squeeze(predictions)

    def score(self, X, y):
        predictions = self.predict(X)
        mse = mean_squared_error(y, predictions)
        return -mse  # Negative MSE because GridSearchCV maximizes the score

In [25]:

param_grid = {
    'neurons': [16, 32, 64, 128],
    'layers': [1, 2, 3],
    'learning_rate': [0.01, 0.001],
    'epochs': [50, 100],
    'batch_size': [16, 32]
}

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model = CustomKerasRegressor(callbacks=[early_stopping_callback])

# Custom scorer for GridSearchCV
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Use the custom scorer in GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, verbose=1, scoring=mse_scorer)
grid_result = grid.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best MSE: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

best_model = grid_result.best_estimator_


Fitting 3 folds for each of 96 candidates, totalling 288 fits
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
Epoch 1/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13384513536.0000 - mean_squared_error: 13384513536.0000 - val_loss: 13553456128.0000 - val_mean_squared_error: 13553456128.0000
Epoch 2/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14079483904.0000 - mean_squared_error: 14079483904.0000 - val_loss: 13552257024.0000 - val_mean_squared_error: 13552257024.0000
Epoch 2/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13813696512.0000 - mean_squared_error: 13813696512.0000 - val_loss: 13553411072.0000 - val_mean_squared_error: 13553411072.0000
Epoch 2/50
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13271208960.0000 - mean_squared_error: 13271208960.0000 - val_loss: 13551961088

2024-12-20 20:44:50.362019: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: INVALID_ARGUMENT: Input to reshape is a tensor with 1 values, but the requested shape has 16
	 [[{{function_node __inference_one_step_on_data_1183537}}{{node gradient_tape/compile_loss/mean_squared_error/sub/Reshape}}]]


[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 934us/step - loss: 3236616192.0000 - mean_squared_error: 3236616192.00
[1m249/267[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 2ms/step - loss: 3320547584.0000 - mean_squared_error: 3320547584.0000Epoch 1/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3195336448.0000 - mean_squared_error: 3195336448.0000 - val_loss: 3358906880.0000 - val_mean_squared_error: 3358906880.0000
Epoch 18/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3279387136.0000 - mean_squared_error: 3279387136.0000 - val_loss: 3412474624.0000 - val_mean_squared_error: 3412474624.0000
Epoch 33/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3318595072.0000 - mean_squared_error: 3318595072.0000 - val_loss: 3373685248.0000 - val_mean_squared_error: 3373685248.0000
Epoch 38/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

2024-12-20 20:45:43.870547: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: INVALID_ARGUMENT: Input to reshape is a tensor with 1 values, but the requested shape has 16
	 [[{{function_node __inference_one_step_on_data_1251054}}{{node gradient_tape/compile_loss/mean_squared_error/sub/Reshape}}]]


[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3250725376.0000 - mean_squared_error: 3250725376.0000 - val_loss: 3357733632.0000 - val_mean_squared_error: 3357733632.0000
[1m259/267[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 3212888064.0000 - mean_squared_error: 3212888064.0000Epoch 33/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3231932928.0000 - mean_squared_error: 3231932928.0000 - val_loss: 3424888064.0000 - val_mean_squared_error: 3424888064.0000
Epoch 32/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3246408192.0000 - mean_squared_error: 3246408192.0000 - val_loss: 3328324352.0000 - val_mean_squared_error: 3328324352.0000
Epoch 37/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3262288640.0000 - mean_squared_error: 3262288640.0000 - val_loss: 3418170368.0000 - val_mean_squared_error: 3418170368.00

2 fits failed out of a total of 288.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mansysoroush/Documents/Others/Learnings/GitHub Projects/DataAnalysis_MachineLearning_Python_Projects/DeepLearning_ANN/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/var/folders/z2/9xbbtsl54ys8nj3clmn8qg4m0000gn/T/ipykernel_10618/404231644.py", line 29, in fit
  File "/Users/mansysoroush/Documents/Others/Learnings/GitHub Projects/DataAnalysis_MachineLearning_Python_Projects/DeepLearning_ANN/venv/lib/python3.11/site-packages/keras/src/utils/traceback_util

Epoch 1/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 834us/step - loss: 13268669440.0000 - mean_squared_error: 13268669440.0000 - val_loss: 13470534656.0000 - val_mean_squared_error: 13470534656.0000
Epoch 2/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 647us/step - loss: 12781689856.0000 - mean_squared_error: 12781689856.0000 - val_loss: 11771529216.0000 - val_mean_squared_error: 11771529216.0000
Epoch 3/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step - loss: 10621574144.0000 - mean_squared_error: 10621574144.0000 - val_loss: 8497578496.0000 - val_mean_squared_error: 8497578496.0000
Epoch 4/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 633us/step - loss: 7258897408.0000 - mean_squared_error: 7258897408.0000 - val_loss: 5357261824.0000 - val_mean_squared_error: 5357261824.0000
Epoch 5/100
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 745us/step - loss: 476044800

In [26]:
best_model

In [27]:
grid_result.best_score_

np.float64(-3331447501.5494018)

In [28]:
grid_result.best_params_

{'batch_size': 16,
 'epochs': 100,
 'layers': 2,
 'learning_rate': 0.01,
 'neurons': 64}

In [29]:
# Evaluate on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 721us/step
Test MSE: 3349692776.8764334


In [30]:
y_pred

array([ 94604.18 ,  95172.625,  98339.02 , ..., 105562.58 , 100163.23 ,
        91731.   ], dtype=float32)

In [31]:
class CustomData:
    def __init__(  self,
        credit_score: int,
        geography: str,
        gender: str,
        age: int,
        tenure: int,
        balance: int,
        number_of_products: int,
        has_cr_card: bool,
        is_active_member: bool,
        exited: bool):

        self.credit_score = credit_score
        self.geography = geography
        self.gender = gender
        self.age = age
        self.tenure = tenure
        self.balance = balance
        self.number_of_products = number_of_products
        self.has_cr_card = has_cr_card
        self.is_active_member = is_active_member
        self.exited = exited

    def get_data_as_data_frame(self):
        custom_data_input_dict = {
            "CreditScore": [self.credit_score],
            "Geography": [self.geography],
            "Gender": [self.gender],
            "Age": [self.age],
            "Tenure": [self.tenure],
            "Balance": [self.balance],
            "NumOfProducts": [self.number_of_products],
            "HasCrCard": [self.has_cr_card],
            "IsActiveMember": [self.is_active_member],
            "Exited": [self.exited]
        }
        return pd.DataFrame(custom_data_input_dict)
        
    def __str__(self):
        return f"CreditScore={self.credit_score}, Geography = {self.geography}, Gender = {self.gender},\nAge = {self.age}, Tenure = {self.tenure}, Balance = {self.balance},\nNumOfProducts = {self.number_of_products}, HasCrCard = {self.has_cr_card}, IsActiveMember = {self.is_active_member},\Exited = {self.exited})"


In [32]:
# Example input data
input_data = {
    'CreditScore': 600,
    'Geography': 'France',
    'Gender': 'Male',
    'Age': 40,
    'Tenure': 3,
    'Balance': 60000,
    'NumOfProducts': 2,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'Exited': 0
}

In [33]:
# load the encoder and scaler
with open('artifacts/one_hot_encoder_geo.pkl','rb') as file:
    label_encoder_geo=pickle.load(file)

with open('artifacts/label_encoder_gender.pkl', 'rb') as file:
    label_encoder_gender = pickle.load(file)

with open('artifacts/regression_scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

In [34]:
# One-hot encode 'Geography'
geo_encoded = label_encoder_geo.transform([[input_data['Geography']]]).toarray()
geo_encoded_df = pd.DataFrame(geo_encoded, columns=label_encoder_geo.get_feature_names_out(['Geography']))
geo_encoded_df



Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0


In [35]:
input_df=pd.DataFrame([input_data])

input_df['Gender']=label_encoder_gender.transform(input_df['Gender'])
input_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,600,France,1,40,3,60000,2,1,1,0


In [36]:
# concatination with one hot encoded 
input_df=pd.concat([input_df.drop("Geography",axis=1),geo_encoded_df],axis=1)
input_df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited,Geography_France,Geography_Germany,Geography_Spain
0,600,1,40,3,60000,2,1,1,0,1.0,0.0,0.0


In [37]:
# Scaling the input data
input_scaled=scaler.transform(input_df)
input_scaled

array([[-0.53598516,  0.91324755,  0.10479359, -0.69539349, -0.25781119,
         0.80843615,  0.64920267,  0.97481699, -0.50857963,  1.00150113,
        -0.57946723, -0.57638802]])

In [38]:
# Prediction
prediction=best_model.predict(input_scaled)
prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step


array(94855.81, dtype=float32)

In [39]:
print(f'The estimated salary is approximately {prediction}.')

The estimated salary is approximately 94855.8125.


In [40]:
best_model.model_.save("artifacts/regressor_model.keras")