Data Preparation for All

In [None]:
from __future__ import print_function
import os
import math
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from tensorflow.keras.layers import LSTM, Dense, GRU
import tensorflow as tf
import random

In [None]:
from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/My Drive/Colab Notebooks/dataset/Part1_house_price.csv"
df = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Convert the 'Date' column to datetime format
df['date'] = pd.to_datetime(df['date'], format='%Y%m%dT%H%M%S')

# Extract year and quarter from the 'Date' column and add them as new columns
df['year'] = df['date'].dt.year
df['quarter'] = df['date'].dt.quarter

In [None]:
df.drop(['id','date'], axis=1, inplace=True)
label_col = 'price'

In [None]:
# Identify the zipcode with the highest count
most_common_zipcode = df['zipcode'].value_counts().idxmax()

# Create dummy variables
df_zip = pd.get_dummies(df['zipcode'], prefix='zc')

# Drop the column corresponding to the most common zipcode
df_zip = df_zip.drop('zc_' + str(most_common_zipcode), axis=1)

# Concatenate the dummy variables with the other numerical columns
df_num = df.select_dtypes(include='number')
df = pd.concat([df_num, df_zip], axis=1, join='inner')

Data Preparation for first, second model

In [None]:
# Set a seed value
seed_value = 1234
# Set `PYTHONHASHSEED` environment variable at a fixed value
os.environ['PYTHONHASHSEED']=str(seed_value)
#Set Training and Testing data
train_size, valid_size, test_size = (0.7, 0.3, 0.0)
df_train, df_valid = train_test_split(df,
                                      test_size=valid_size,
                                      random_state=seed_value)

In [None]:
df_y_train = df_train[[label_col]]
df_x_train = df_train.drop(label_col, axis=1)
df_y_valid = df_valid[[label_col]]
df_x_valid = df_valid.drop(label_col, axis=1)

print('Size of training set: ', len(df_x_train))
print('Size of validation set: ', len(df_x_valid))

Size of training set:  14000
Size of validation set:  6000


In [None]:
scaler = MinMaxScaler(feature_range=(0, 1), copy=True).fit(df_x_train)
df_x_train = pd.DataFrame(scaler.transform(df_x_train),
                            columns = df_x_train.columns, index = df_x_train.index)
df_x_valid = pd.DataFrame(scaler.transform(df_x_valid),
                            columns = df_x_valid.columns, index = df_x_valid.index)

print('X train min =', round(df_x_train.min().min(),4), '; max =', round(df_x_train.max().max(), 4))
print('X valid min =', round(df_x_valid.min().min(),4), '; max =', round(df_x_valid.max().max(), 4))

X train min = 0.0 ; max = 1.0
X valid min = -0.0 ; max = 1.0175


In [None]:
from tensorflow.keras import metrics
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Nadam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import Adam

In [None]:
arr_x_train = np.array(df_x_train)
arr_y_train = np.array(df_y_train)
arr_x_valid = np.array(df_x_valid)
arr_y_valid = np.array(df_y_valid)

print('Training shape:', arr_x_train.shape)
print('Training samples: ', arr_x_train.shape[0])
print('Validation samples: ', arr_x_valid.shape[0])

Training shape: (14000, 89)
Training samples:  14000
Validation samples:  6000


Linear Regression model

In [None]:
# Load the dataset
X = df.drop('price', axis=1).values # Features
y = df['price'].values # Target

# Normalize features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Step 3: Build the Linear Regression Model
model_lr = LinearRegression()

# Step 4: Train the Model
model_lr.fit(X_train, y_train)

# Step 5: # Evaluate the Model
y_pred_lr = model_lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
corrcoef_lr = np.corrcoef(y_test, y_pred_lr)

print('Mean Squared Error:', mse_lr)
print('Mean Absolute Error:', mae_lr)
print('Root Mean Squared Error:', rmse_lr)
print('Correlation coefficient:', corrcoef_lr[0, 1])

Mean Squared Error: 26349266013.266335
Mean Absolute Error: 95567.65133333333
Root Mean Squared Error: 162324.56996174774
Correlation coefficient: 0.9007515634535999


first model

In [None]:
# Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)
# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

In [None]:
def model_1(x_size, y_size):
    t_model = Sequential()
    t_model.add(Dense(100, activation="tanh", input_shape=(x_size,)))
    t_model.add(Dropout(0.2))
    t_model.add(Dense(180, activation="relu"))
    t_model.add(Dense(20, activation="relu"))
    t_model.add(Dense(y_size))
    t_model.compile(
        loss='mean_squared_error',
        optimizer=RMSprop(learning_rate=0.005, rho=0.9, momentum=0.0, epsilon=1e-07, weight_decay=0.0,),
        metrics=[metrics.mae])
    return(t_model)

In [None]:
model1 = model_1(arr_x_train.shape[1], arr_y_train.shape[1])
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               9000      
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 180)               18180     
                                                                 
 dense_2 (Dense)             (None, 20)                3620      
                                                                 
 dense_3 (Dense)             (None, 1)                 21        
                                                                 
Total params: 30821 (120.39 KB)
Trainable params: 30821 (120.39 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model1.fit(arr_x_train, arr_y_train,batch_size=64,epochs=500,shuffle=True,verbose=2,validation_data=(arr_x_valid, arr_y_valid),callbacks=[early_stopping])

Epoch 1/500
219/219 - 4s - loss: 317089579008.0000 - mean_absolute_error: 423237.3438 - val_loss: 162158084096.0000 - val_mean_absolute_error: 231353.0000 - 4s/epoch - 19ms/step
Epoch 2/500
219/219 - 1s - loss: 103644643328.0000 - mean_absolute_error: 180811.4062 - val_loss: 73600352256.0000 - val_mean_absolute_error: 139137.9375 - 935ms/epoch - 4ms/step
Epoch 3/500
219/219 - 1s - loss: 57586053120.0000 - mean_absolute_error: 126240.1172 - val_loss: 52826116096.0000 - val_mean_absolute_error: 119960.3594 - 921ms/epoch - 4ms/step
Epoch 4/500
219/219 - 1s - loss: 43968851968.0000 - mean_absolute_error: 114803.9609 - val_loss: 44574535680.0000 - val_mean_absolute_error: 118867.7578 - 691ms/epoch - 3ms/step
Epoch 5/500
219/219 - 1s - loss: 36351025152.0000 - mean_absolute_error: 111185.2266 - val_loss: 34723274752.0000 - val_mean_absolute_error: 95298.0156 - 549ms/epoch - 3ms/step
Epoch 6/500
219/219 - 1s - loss: 33621934080.0000 - mean_absolute_error: 109444.2969 - val_loss: 31566839808.0

<keras.src.callbacks.History at 0x7dce1afda500>

In [None]:
# After training the model and making predictions
y_pred1 = model1.predict(arr_x_valid)

# Calculate Mean Squared Error
mse1 = mean_squared_error(arr_y_valid, y_pred1)
print('Mean Squared Error:', mse1)

# Calculate Root Mean Squared Error
rmse1 = np.sqrt(mse1)
print('Root Mean Squared Error:', rmse1)

# Calculate Mean Absolute Error
mae1 = mean_absolute_error(arr_y_valid, y_pred1)
print('Mean Absolute Error:', mae1)

# Calculate Correlation Coefficient
corrcoef1 = np.corrcoef(arr_y_valid.flatten(), y_pred1.flatten())
print('Correlation coefficient:', corrcoef1[0, 1])

Mean Squared Error: 15630847198.03376
Root Mean Squared Error: 125023.38660440198
Mean Absolute Error: 71609.40765104166
Correlation coefficient: 0.9428185525395911


Second Model

In [None]:
# Before defining and training your second model
tf.keras.backend.clear_session()
# Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)

In [None]:
def model_2(x_size, y_size):
    model = Sequential()
    model.add(Dense(200, activation="relu", input_shape=(x_size,)))
    model.add(Dropout(0.1))
    model.add(Dense(100, activation="relu"))
    model.add(Dense(y_size))
    model.compile(loss='mean_squared_error', optimizer=Adam(), metrics=[metrics.mae])
    return model

In [None]:
model2 = model_2(arr_x_train.shape[1], arr_y_train.shape[1])
model2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 200)               18000     
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense_1 (Dense)             (None, 100)               20100     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 38201 (149.22 KB)
Trainable params: 38201 (149.22 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model2.fit(arr_x_train, arr_y_train,batch_size=64,epochs=500,shuffle=True,verbose=2,validation_data=(arr_x_valid, arr_y_valid),callbacks=[early_stopping])

Epoch 1/500
219/219 - 2s - loss: 416981614592.0000 - mean_absolute_error: 533945.1875 - val_loss: 421590990848.0000 - val_mean_absolute_error: 531845.3125 - 2s/epoch - 9ms/step
Epoch 2/500
219/219 - 1s - loss: 396021432320.0000 - mean_absolute_error: 514993.7500 - val_loss: 379163475968.0000 - val_mean_absolute_error: 492708.4375 - 594ms/epoch - 3ms/step
Epoch 3/500
219/219 - 1s - loss: 328160772096.0000 - mean_absolute_error: 447363.8438 - val_loss: 287113216000.0000 - val_mean_absolute_error: 395472.3750 - 571ms/epoch - 3ms/step
Epoch 4/500
219/219 - 1s - loss: 225859354624.0000 - mean_absolute_error: 324696.2500 - val_loss: 183957880832.0000 - val_mean_absolute_error: 258736.2812 - 541ms/epoch - 2ms/step
Epoch 5/500
219/219 - 1s - loss: 142315569152.0000 - mean_absolute_error: 215277.5469 - val_loss: 124453298176.0000 - val_mean_absolute_error: 195546.9375 - 575ms/epoch - 3ms/step
Epoch 6/500
219/219 - 1s - loss: 107550113792.0000 - mean_absolute_error: 189009.7969 - val_loss: 10724

<keras.src.callbacks.History at 0x7dce0b5c69e0>

In [None]:
# After training the model and making predictions
y_pred2 = model2.predict(arr_x_valid)

# Calculate Mean Squared Error
mse2 = mean_squared_error(arr_y_valid, y_pred2)
print('Mean Squared Error:', mse2)

# Calculate Root Mean Squared Error
rmse2 = np.sqrt(mse2)
print('Root Mean Squared Error:', rmse2)

# Calculate Mean Absolute Error
mae2 = mean_absolute_error(arr_y_valid, y_pred2)
print('Mean Absolute Error:', mae2)

# Calculate Correlation Coefficient
corrcoef2 = np.corrcoef(arr_y_valid.flatten(), y_pred2.flatten())
print('Correlation coefficient:', corrcoef2[0, 1])

Mean Squared Error: 15166711417.69289
Root Mean Squared Error: 123153.2030346466
Mean Absolute Error: 72016.57014160157
Correlation coefficient: 0.9439545367166841


# Third Model

In [None]:
# Before defining and training your second model
tf.keras.backend.clear_session()
# Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)

In [None]:
def model_3(x_size, y_size):
    model = Sequential()
    model.add(Dense(128, activation="relu", input_shape=(x_size,)))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(y_size))
    model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.001), metrics=[metrics.mae])
    return model

In [None]:
model3 = model_3(arr_x_train.shape[1], arr_y_train.shape[1])
model3.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               11520     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 19841 (77.50 KB)
Trainable params: 19841 (77.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model3.fit(arr_x_train, arr_y_train,batch_size=64,epochs=500,shuffle=True,verbose=2,validation_data=(arr_x_valid, arr_y_valid),callbacks=[early_stopping])

Epoch 1/500
219/219 - 1s - loss: 417700511744.0000 - mean_absolute_error: 534574.3750 - val_loss: 424714633216.0000 - val_mean_absolute_error: 534617.8750 - 1s/epoch - 6ms/step
Epoch 2/500
219/219 - 1s - loss: 408561057792.0000 - mean_absolute_error: 526431.0000 - val_loss: 405993816064.0000 - val_mean_absolute_error: 517785.0938 - 501ms/epoch - 2ms/step
Epoch 3/500
219/219 - 1s - loss: 376666259456.0000 - mean_absolute_error: 496599.5000 - val_loss: 359639252992.0000 - val_mean_absolute_error: 473640.9688 - 520ms/epoch - 2ms/step
Epoch 4/500
219/219 - 0s - loss: 317012082688.0000 - mean_absolute_error: 436043.7500 - val_loss: 288308559872.0000 - val_mean_absolute_error: 396871.5625 - 461ms/epoch - 2ms/step
Epoch 5/500
219/219 - 1s - loss: 240939155456.0000 - mean_absolute_error: 345394.5000 - val_loss: 210778980352.0000 - val_mean_absolute_error: 296431.5312 - 511ms/epoch - 2ms/step
Epoch 6/500
219/219 - 1s - loss: 171649056768.0000 - mean_absolute_error: 250937.8906 - val_loss: 15136

<keras.src.callbacks.History at 0x7dce1b611f30>

In [None]:
# After training the model and making predictions
y_pred3 = model3.predict(arr_x_valid)

# Calculate Mean Squared Error
mse3 = mean_squared_error(arr_y_valid, y_pred3)
print('Mean Squared Error:', mse3)

# Calculate Root Mean Squared Error
rmse3 = np.sqrt(mse3)
print('Root Mean Squared Error:', rmse3)

# Calculate Mean Absolute Error
mae3 = mean_absolute_error(arr_y_valid, y_pred3)
print('Mean Absolute Error:', mae3)

# Calculate Correlation Coefficient
corrcoef3 = np.corrcoef(arr_y_valid.flatten(), y_pred3.flatten())
print('Correlation coefficient:', corrcoef3[0, 1])

Mean Squared Error: 15632696661.17599
Root Mean Squared Error: 125030.78285436747
Mean Absolute Error: 73246.22023144532
Correlation coefficient: 0.9421820738150992


# Fourth Model

In [None]:
# Before defining and training your second model
tf.keras.backend.clear_session()
# Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)

In [None]:
def model_4(x_size, y_size):
    model = Sequential()
    model.add(Dense(256, activation="relu", input_shape=(x_size,)))
    model.add(Dropout(0.3))
    model.add(Dense(128, activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(y_size))
    model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.0005), metrics=[metrics.mae])
    return model

In [None]:
model4 = model_4(arr_x_train.shape[1], arr_y_train.shape[1])
model4.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               23040     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 64257 (251.00 KB)
Trainable params: 64257 (251.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model4.fit(arr_x_train, arr_y_train,batch_size=64,epochs=500,shuffle=True,verbose=2,validation_data=(arr_x_valid, arr_y_valid),callbacks=[early_stopping])

Epoch 1/500
219/219 - 2s - loss: 379836203008.0000 - mean_absolute_error: 497756.4375 - val_loss: 235117051904.0000 - val_mean_absolute_error: 329999.6875 - 2s/epoch - 8ms/step
Epoch 2/500
219/219 - 1s - loss: 114763907072.0000 - mean_absolute_error: 206496.8750 - val_loss: 98147581952.0000 - val_mean_absolute_error: 196008.9531 - 711ms/epoch - 3ms/step
Epoch 3/500
219/219 - 1s - loss: 86941933568.0000 - mean_absolute_error: 181694.7188 - val_loss: 85520826368.0000 - val_mean_absolute_error: 171681.2344 - 679ms/epoch - 3ms/step
Epoch 4/500
219/219 - 1s - loss: 74323197952.0000 - mean_absolute_error: 162891.3594 - val_loss: 71826530304.0000 - val_mean_absolute_error: 157718.0625 - 711ms/epoch - 3ms/step
Epoch 5/500
219/219 - 1s - loss: 60676325376.0000 - mean_absolute_error: 141129.5156 - val_loss: 58250190848.0000 - val_mean_absolute_error: 128132.9609 - 719ms/epoch - 3ms/step
Epoch 6/500
219/219 - 1s - loss: 48713240576.0000 - mean_absolute_error: 119248.6094 - val_loss: 48125718528.0

<keras.src.callbacks.History at 0x7dce1b577d60>

In [None]:
# After training the model and making predictions
y_pred4 = model4.predict(arr_x_valid)

# Calculate Mean Squared Error
mse4 = mean_squared_error(arr_y_valid, y_pred4)
print('Mean Squared Error:', mse4)

# Calculate Root Mean Squared Error
rmse4 = np.sqrt(mse4)
print('Root Mean Squared Error:', rmse4)

# Calculate Mean Absolute Error
mae4 = mean_absolute_error(arr_y_valid, y_pred4)
print('Mean Absolute Error:', mae4)

# Calculate Correlation Coefficient
corrcoef4 = np.corrcoef(arr_y_valid.flatten(), y_pred4.flatten())
print('Correlation coefficient:', corrcoef4[0, 1])

Mean Squared Error: 14605136860.969461
Root Mean Squared Error: 120851.7143484918
Mean Absolute Error: 70621.40319661458
Correlation coefficient: 0.9462410257418002


# Fifth model

In [None]:
# Before defining and training your second model
tf.keras.backend.clear_session()
# Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)

In [None]:
def model_5(x_size, y_size):
    model = Sequential()
    model.add(Dense(100, activation="relu", input_shape=(x_size,)))
    model.add(Dense(50, activation="relu"))
    model.add(Dense(25, activation="relu"))
    model.add(Dense(y_size))
    model.compile(loss='mean_squared_error', optimizer=RMSprop(), metrics=[metrics.mae])
    return model

In [None]:
model5 = model_5(arr_x_train.shape[1], arr_y_train.shape[1])
model5.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               9000      
                                                                 
 dense_1 (Dense)             (None, 50)                5050      
                                                                 
 dense_2 (Dense)             (None, 25)                1275      
                                                                 
 dense_3 (Dense)             (None, 1)                 26        
                                                                 
Total params: 15351 (59.96 KB)
Trainable params: 15351 (59.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model5.fit(arr_x_train, arr_y_train,batch_size=64,epochs=500,shuffle=True,verbose=2,validation_data=(arr_x_valid, arr_y_valid),callbacks=[early_stopping])

Epoch 1/500
219/219 - 1s - loss: 417530806272.0000 - mean_absolute_error: 534426.5625 - val_loss: 424181432320.0000 - val_mean_absolute_error: 534147.3750 - 1s/epoch - 5ms/step
Epoch 2/500
219/219 - 1s - loss: 406090350592.0000 - mean_absolute_error: 524204.9375 - val_loss: 399731752960.0000 - val_mean_absolute_error: 512042.0312 - 674ms/epoch - 3ms/step
Epoch 3/500
219/219 - 1s - loss: 359046021120.0000 - mean_absolute_error: 479207.9688 - val_loss: 324484136960.0000 - val_mean_absolute_error: 437367.9062 - 671ms/epoch - 3ms/step
Epoch 4/500
219/219 - 1s - loss: 254135402496.0000 - mean_absolute_error: 360771.0625 - val_loss: 194131083264.0000 - val_mean_absolute_error: 272565.2188 - 681ms/epoch - 3ms/step
Epoch 5/500
219/219 - 1s - loss: 133780799488.0000 - mean_absolute_error: 211472.9844 - val_loss: 109221486592.0000 - val_mean_absolute_error: 200056.5156 - 780ms/epoch - 4ms/step
Epoch 6/500
219/219 - 1s - loss: 99557572608.0000 - mean_absolute_error: 197232.2031 - val_loss: 102347

<keras.src.callbacks.History at 0x7dce1b378be0>

In [None]:
# After training the model and making predictions
y_pred5 = model5.predict(arr_x_valid)

# Calculate Mean Squared Error
mse5 = mean_squared_error(arr_y_valid, y_pred5)
print('Mean Squared Error:', mse5)

# Calculate Root Mean Squared Error
rmse5 = np.sqrt(mse5)
print('Root Mean Squared Error:', rmse5)

# Calculate Mean Absolute Error
mae5 = mean_absolute_error(arr_y_valid, y_pred5)
print('Mean Absolute Error:', mae5)

# Calculate Correlation Coefficient
corrcoef5 = np.corrcoef(arr_y_valid.flatten(), y_pred5.flatten())
print('Correlation coefficient:', corrcoef5[0, 1])

Mean Squared Error: 14795300432.342829
Root Mean Squared Error: 121635.93396830901
Mean Absolute Error: 70846.21560416666
Correlation coefficient: 0.9454205602627136


Prepare for GRU and LSTM model

In [None]:
X_train_3d = np.reshape(df_x_train.values, (df_x_train.shape[0], 1, df_x_train.shape[1]))
X_test_3d = np.reshape(df_x_valid.values, (df_x_valid.shape[0], 1, df_x_valid.shape[1]))

# Model GRU 1

In [None]:
# Before defining and training the GRU model
tf.keras.backend.clear_session()
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [None]:
def model_gru1(x_size, y_size):
  model = Sequential()
  model.add(GRU(100, return_sequences=True, activation='relu', input_shape=(1, X_train_3d.shape[2])))
  model.add(Dropout(0.2))
  model.add(GRU(50, activation='relu'))
  model.add(Dense(1))
  model.compile(optimizer='adam', loss='mse')
  return model

In [None]:
# Create your GRU model
model_gru1 = model_gru1(X_train_3d.shape[2], arr_y_train.shape[1])  # Assuming y_size is 1
model_gru1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 1, 100)            57300     
                                                                 
 dropout (Dropout)           (None, 1, 100)            0         
                                                                 
 gru_1 (GRU)                 (None, 50)                22800     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 80151 (313.09 KB)
Trainable params: 80151 (313.09 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model_gru1.fit(X_train_3d, arr_y_train, batch_size=64, epochs=500, shuffle=True, verbose=2, validation_data=(X_test_3d, arr_y_valid), callbacks=[early_stopping])

Epoch 1/500
219/219 - 4s - loss: 417936900096.0000 - val_loss: 425610641408.0000 - 4s/epoch - 19ms/step
Epoch 2/500
219/219 - 1s - loss: 411789262848.0000 - val_loss: 412872966144.0000 - 1s/epoch - 5ms/step
Epoch 3/500
219/219 - 1s - loss: 390121553920.0000 - val_loss: 381294280704.0000 - 1s/epoch - 5ms/step
Epoch 4/500
219/219 - 2s - loss: 348547252224.0000 - val_loss: 329836560384.0000 - 2s/epoch - 8ms/step
Epoch 5/500
219/219 - 2s - loss: 289799471104.0000 - val_loss: 265634824192.0000 - 2s/epoch - 9ms/step
Epoch 6/500
219/219 - 1s - loss: 225589870592.0000 - val_loss: 202865623040.0000 - 1s/epoch - 6ms/step
Epoch 7/500
219/219 - 1s - loss: 169769762816.0000 - val_loss: 154667515904.0000 - 1s/epoch - 6ms/step
Epoch 8/500
219/219 - 1s - loss: 132482375680.0000 - val_loss: 126966538240.0000 - 1s/epoch - 5ms/step
Epoch 9/500
219/219 - 1s - loss: 113606000640.0000 - val_loss: 114438193152.0000 - 1s/epoch - 6ms/step
Epoch 10/500
219/219 - 1s - loss: 105882533888.0000 - val_loss: 10939758

<keras.src.callbacks.History at 0x7dce1b1f6bf0>

In [None]:
# After training the model and making predictions
y_pred_gru1 = model_gru1.predict(X_test_3d)

# Calculate Mean Squared Error
mse_gru1 = mean_squared_error(arr_y_valid, y_pred_gru1)
print('Mean Squared Error:', mse_gru1)

# Calculate Root Mean Squared Error
rmse_gru1 = np.sqrt(mse_gru1)
print('Root Mean Squared Error:', rmse_gru1)

# Calculate Mean Absolute Error
mae_gru1 = mean_absolute_error(arr_y_valid, y_pred_gru1)
print('Mean Absolute Error:', mae_gru1)

# Calculate Correlation Coefficient
corrcoef_gru1 = np.corrcoef(arr_y_valid.flatten(), y_pred_gru1.flatten())
print('Correlation coefficient:', corrcoef_gru1[0, 1])

Mean Squared Error: 14682533324.373922
Root Mean Squared Error: 121171.50376377246
Mean Absolute Error: 71583.78283072916
Correlation coefficient: 0.9458406357648705


# Model GRU 2

In [None]:
# Before defining and training the GRU model
tf.keras.backend.clear_session()
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [None]:
def model_gru2(x_size, y_size):
    model = Sequential()
    model.add(GRU(100, return_sequences=True, activation="tanh", input_shape=(1, X_train_3d.shape[2])))
    model.add(Dropout(0.2))
    model.add(GRU(180, return_sequences=True, activation="relu"))
    model.add(GRU(20, activation="relu"))
    model.add(Dense(y_size))
    model.compile(loss='mean_squared_error', optimizer=RMSprop(learning_rate=0.005, rho=0.9, momentum=0.0, epsilon=1e-07, weight_decay=0.0,), metrics=[metrics.mae])
    return model

In [None]:
# Create your GRU model
model_gru2 = model_gru2(X_train_3d.shape[2], arr_y_train.shape[1])  # Assuming y_size is 1
model_gru2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 1, 100)            57300     
                                                                 
 dropout (Dropout)           (None, 1, 100)            0         
                                                                 
 gru_1 (GRU)                 (None, 1, 180)            152280    
                                                                 
 gru_2 (GRU)                 (None, 20)                12120     
                                                                 
 dense (Dense)               (None, 1)                 21        
                                                                 
Total params: 221721 (866.10 KB)
Trainable params: 221721 (866.10 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model_gru2.fit(X_train_3d, arr_y_train, batch_size=64, epochs=500, shuffle=True, verbose=2, validation_data=(X_test_3d, arr_y_valid), callbacks=[early_stopping])

Epoch 1/500
219/219 - 10s - loss: 331350736896.0000 - mean_absolute_error: 440548.1250 - val_loss: 188031795200.0000 - val_mean_absolute_error: 256009.0312 - 10s/epoch - 45ms/step
Epoch 2/500
219/219 - 2s - loss: 113451016192.0000 - mean_absolute_error: 189497.8594 - val_loss: 77223419904.0000 - val_mean_absolute_error: 141643.3906 - 2s/epoch - 9ms/step
Epoch 3/500
219/219 - 2s - loss: 55725506560.0000 - mean_absolute_error: 119119.3516 - val_loss: 46923612160.0000 - val_mean_absolute_error: 110640.1250 - 2s/epoch - 9ms/step
Epoch 4/500
219/219 - 2s - loss: 35796353024.0000 - mean_absolute_error: 99568.9141 - val_loss: 34113574912.0000 - val_mean_absolute_error: 95436.1172 - 2s/epoch - 9ms/step
Epoch 5/500
219/219 - 4s - loss: 27189147648.0000 - mean_absolute_error: 92670.3203 - val_loss: 26717677568.0000 - val_mean_absolute_error: 84494.5781 - 4s/epoch - 17ms/step
Epoch 6/500
219/219 - 2s - loss: 23971821568.0000 - mean_absolute_error: 88784.3594 - val_loss: 23915286528.0000 - val_mea

<keras.src.callbacks.History at 0x7dce186dae00>

In [None]:
# After training the model and making predictions
y_pred_gru2 = model_gru2.predict(X_test_3d)

# Calculate Mean Squared Error
mse_gru2 = mean_squared_error(arr_y_valid, y_pred_gru2)
print('Mean Squared Error:', mse_gru1)

# Calculate Root Mean Squared Error
rmse_gru2 = np.sqrt(mse_gru2)
print('Root Mean Squared Error:', rmse_gru2)

# Calculate Mean Absolute Error
mae_gru2 = mean_absolute_error(arr_y_valid, y_pred_gru2)
print('Mean Absolute Error:', mae_gru2)

# Calculate Correlation Coefficient
corrcoef_gru2 = np.corrcoef(arr_y_valid.flatten(), y_pred_gru2.flatten())
print('Correlation coefficient:', corrcoef_gru2[0, 1])

Mean Squared Error: 14682533324.373922
Root Mean Squared Error: 127018.69472448181
Mean Absolute Error: 73228.558609375
Correlation coefficient: 0.9445191057344307


# Model LSTM 1

In [None]:
# Before defining and training the GRU model
tf.keras.backend.clear_session()
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [None]:
def model_lstm1(x_size, y_size):
  model = Sequential()
  model.add(LSTM(100, return_sequences=True, activation='relu', input_shape=(1, X_train_3d.shape[2])))
  model.add(Dropout(0.2))
  model.add(LSTM(50, activation='relu'))
  model.add(Dense(1))
  model.compile(optimizer='adam', loss='mse')
  return model

In [None]:
# Create your LSTM model
model_lstm1 = model_lstm1(X_train_3d.shape[2], arr_y_train.shape[1])
model_lstm1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 100)            76000     
                                                                 
 dropout (Dropout)           (None, 1, 100)            0         
                                                                 
 lstm_1 (LSTM)               (None, 50)                30200     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 106251 (415.04 KB)
Trainable params: 106251 (415.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model_lstm1.fit(X_train_3d, arr_y_train, batch_size=64, epochs=500, shuffle=True, verbose=2, validation_data=(X_test_3d, arr_y_valid), callbacks=[early_stopping])

Epoch 1/500
219/219 - 4s - loss: 417976942592.0000 - val_loss: 425736339456.0000 - 4s/epoch - 19ms/step
Epoch 2/500
219/219 - 1s - loss: 412157018112.0000 - val_loss: 413712908288.0000 - 1s/epoch - 6ms/step
Epoch 3/500
219/219 - 2s - loss: 392038776832.0000 - val_loss: 384701595648.0000 - 2s/epoch - 9ms/step
Epoch 4/500
219/219 - 2s - loss: 353960722432.0000 - val_loss: 337535205376.0000 - 2s/epoch - 8ms/step
Epoch 5/500
219/219 - 3s - loss: 299622825984.0000 - val_loss: 277474148352.0000 - 3s/epoch - 12ms/step
Epoch 6/500
219/219 - 2s - loss: 238271348736.0000 - val_loss: 216018829312.0000 - 2s/epoch - 11ms/step
Epoch 7/500
219/219 - 3s - loss: 181642379264.0000 - val_loss: 165013356544.0000 - 3s/epoch - 14ms/step
Epoch 8/500
219/219 - 4s - loss: 140002312192.0000 - val_loss: 132069236736.0000 - 4s/epoch - 18ms/step
Epoch 9/500
219/219 - 3s - loss: 116057841664.0000 - val_loss: 115062824960.0000 - 3s/epoch - 13ms/step
Epoch 10/500
219/219 - 1s - loss: 105027207168.0000 - val_loss: 107

<keras.src.callbacks.History at 0x7dce1ac3f880>

In [None]:
# After training the model and making predictions
y_pred_lstm1 = model_lstm1.predict(X_test_3d)

# Calculate Mean Squared Error
mse_lstm1 = mean_squared_error(arr_y_valid, y_pred_lstm1)
print('Mean Squared Error:', mse_lstm1)

# Calculate Root Mean Squared Error
rmse_lstm1 = np.sqrt(mse_lstm1)
print('Root Mean Squared Error:', rmse_lstm1)

# Calculate Mean Absolute Error
mae_lstm1 = mean_absolute_error(arr_y_valid, y_pred_lstm1)
print('Mean Absolute Error:', mae_lstm1)

# Calculate Correlation Coefficient
corrcoef_lstm1 = np.corrcoef(arr_y_valid.flatten(), y_pred_lstm1.flatten())
print('Correlation coefficient:', corrcoef_lstm1[0, 1])

Mean Squared Error: 14642451069.219534
Root Mean Squared Error: 121005.99600523742
Mean Absolute Error: 70960.46025
Correlation coefficient: 0.9459655935121667


# Model LSTM 2

In [None]:
# Before defining and training the GRU model
tf.keras.backend.clear_session()
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [None]:
def model_lstm2(x_size, y_size):
    model = Sequential()
    model.add(LSTM(100, return_sequences=True, activation="tanh", input_shape=(1, X_train_3d.shape[2])))
    model.add(Dropout(0.2))
    model.add(LSTM(180, return_sequences=True, activation="relu"))
    model.add(LSTM(20, activation="relu"))
    model.add(Dense(y_size))
    model.compile(loss='mean_squared_error', optimizer=RMSprop(learning_rate=0.005, rho=0.9, momentum=0.0, epsilon=1e-07, weight_decay=0.0,), metrics=[metrics.mae])
    return model


In [None]:
# Create your LSTM model
model_lstm2 = model_lstm2(X_train_3d.shape[2], arr_y_train.shape[1])  # Assuming y_size is 1
model_lstm2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 100)            76000     
                                                                 
 dropout (Dropout)           (None, 1, 100)            0         
                                                                 
 lstm_1 (LSTM)               (None, 1, 180)            202320    
                                                                 
 lstm_2 (LSTM)               (None, 20)                16080     
                                                                 
 dense (Dense)               (None, 1)                 21        
                                                                 
Total params: 294421 (1.12 MB)
Trainable params: 294421 (1.12 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model_lstm2.fit(X_train_3d, arr_y_train, batch_size=64, epochs=500, shuffle=True, verbose=2, validation_data=(X_test_3d, arr_y_valid), callbacks=[early_stopping])

Epoch 1/500
219/219 - 9s - loss: 338253905920.0000 - mean_absolute_error: 448978.3125 - val_loss: 200861073408.0000 - val_mean_absolute_error: 271145.2500 - 9s/epoch - 41ms/step
Epoch 2/500
219/219 - 2s - loss: 122472529920.0000 - mean_absolute_error: 201555.1406 - val_loss: 84233240576.0000 - val_mean_absolute_error: 148885.2188 - 2s/epoch - 10ms/step
Epoch 3/500
219/219 - 2s - loss: 61157924864.0000 - mean_absolute_error: 124479.9531 - val_loss: 50828824576.0000 - val_mean_absolute_error: 112446.5391 - 2s/epoch - 10ms/step
Epoch 4/500
219/219 - 4s - loss: 39246733312.0000 - mean_absolute_error: 101911.8828 - val_loss: 37662621696.0000 - val_mean_absolute_error: 99535.7031 - 4s/epoch - 19ms/step
Epoch 5/500
219/219 - 2s - loss: 29370861568.0000 - mean_absolute_error: 93067.0000 - val_loss: 28847020032.0000 - val_mean_absolute_error: 85630.9062 - 2s/epoch - 10ms/step
Epoch 6/500
219/219 - 2s - loss: 25377658880.0000 - mean_absolute_error: 89159.1562 - val_loss: 25794088960.0000 - val_m

<keras.src.callbacks.History at 0x7dce1a7e4100>

In [None]:
# After training the model and making predictions
y_pred_lstm2 = model_lstm2.predict(X_test_3d)

# Calculate Mean Squared Error
mse_lstm2 = mean_squared_error(arr_y_valid, y_pred_lstm2)
print('Mean Squared Error:', mse_lstm2)

# Calculate Root Mean Squared Error
rmse_lstm2 = np.sqrt(mse_lstm2)
print('Root Mean Squared Error:', rmse_lstm2)

# Calculate Mean Absolute Error
mae_lstm2 = mean_absolute_error(arr_y_valid, y_pred_lstm2)
print('Mean Absolute Error:', mae_lstm2)

# Calculate Correlation Coefficient
corrcoef_lstm2 = np.corrcoef(arr_y_valid.flatten(), y_pred_lstm2.flatten())
print('Correlation coefficient:', corrcoef_lstm2[0, 1])

Mean Squared Error: 15074461532.298344
Root Mean Squared Error: 122778.09874850785
Mean Absolute Error: 70879.35404427083
Correlation coefficient: 0.9445071752685175


In [None]:
# Create a dictionary with the metrics
data = {
    'Model': ['Model MLP 1', 'Model MLP 2', 'Model MLP 3', 'Model MLP 4', 'Model MLP 5', 'Linear Regression', 'Model GRU 1', 'Model GRU 2', 'Model LSTM 1', 'Model LSTM 2'],
    'MSE': [format(mse1, '.5f'), format(mse2, '.5f'), format(mse3, '.5f'), format(mse4, '.5f'), format(mse5, '.5f'), format(mse_lr, '.5f'), format(mse_gru1, '.5f'), format(mse_gru2, '.5f'), format(mse_lstm1, '.5f'), format(mse_lstm2, '.5f')],
    'RMSE': [rmse1, rmse2, rmse3, rmse4, rmse5, rmse_lr, rmse_gru1, rmse_gru2, rmse_lstm1, rmse_lstm2],
    'MAE': [mae1, mae2, mae3, mae4, mae5, mae_lr, mae_gru1, mae_gru2, mae_lstm1, mae_lstm2],
    'Correlation Coefficient': [corrcoef1[0, 1], corrcoef2[0, 1], corrcoef3[0, 1], corrcoef4[0, 1], corrcoef5[0, 1], corrcoef_lr[0, 1], corrcoef_gru1[0, 1], corrcoef_gru2[0, 1], corrcoef_lstm1[0, 1], corrcoef_lstm2[0, 1]]
}

# Convert the dictionary to a pandas DataFrame
deptrai = pd.DataFrame(data)

# Print the DataFrame
deptrai

Unnamed: 0,Model,MSE,RMSE,MAE,Correlation Coefficient
0,Model MLP 1,15630847198.03376,125023.386604,71609.407651,0.942819
1,Model MLP 2,15166711417.69289,123153.203035,72016.570142,0.943955
2,Model MLP 3,15632696661.17599,125030.782854,73246.220231,0.942182
3,Model MLP 4,14605136860.96946,120851.714348,70621.403197,0.946241
4,Model MLP 5,14795300432.34283,121635.933968,70846.215604,0.945421
5,Linear Regression,26349266013.26633,162324.569962,95567.651333,0.900752
6,Model GRU 1,14682533324.37392,121171.503764,71583.782831,0.945841
7,Model GRU 2,16133748809.5111,127018.694724,73228.558609,0.944519
8,Model LSTM 1,14642451069.21953,121005.996005,70960.46025,0.945966
9,Model LSTM 2,15074461532.29834,122778.098749,70879.354044,0.944507
