In [1]:
# Standard libraries
import os
import math
import time

# Data manipulation and analysis
import numpy as np
import pandas as pd

#Data Visualization
import matplotlib.pyplot as plt
from datetime import datetime,date,timedelta


# Machine Learning - Scikit-learn

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, ParameterGrid, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import  MinMaxScaler, StandardScaler


# Deep Learning - TensorFlow and Keras
import keras
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError


#Grid search 
import itertools
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasRegressor
import warnings

warnings.filterwarnings('ignore')


In [2]:
## Read in the data set kepping date as index
df=pd.read_csv('weighted_sum.csv')

# make 'date' column as index replacing default numerical index
df.index = df['date']

# drop the column labeled as date
df.drop(columns='date', inplace=True)


# Replace negative values with zero in 'weighted_sum'
df['weighted_sum'] = df['weighted_sum'].clip(lower=0)

# filter out the specified data range containing zero
df=df.drop(df.loc['2023-05-12':'2023-07-27'].index)

# filter out the first two months assuming machine unstability
df=df.drop(df.loc['2019-10-24':'2020-01-31'].index)

# replace 7 irregular data points with 8th highest value.
df[df['weighted_sum']>570]=570
print(df)

            weighted_sum
date                    
2020-02-01    239.210369
2020-02-02     28.280318
2020-02-03    199.366981
2020-02-04    423.067131
2020-02-05    409.692107
...                  ...
2023-12-02    114.049422
2023-12-03      0.000000
2023-12-04    226.893157
2023-12-05    390.659238
2023-12-06    425.322937

[1317 rows x 1 columns]


In [3]:
# Define the train-test split parameters (70:30)
initial_train_size = int(0.7 * len(df)) # Start with 70% training data
initial_train_data= df[:initial_train_size]
initial_test_data=df[initial_train_size:]


# Now, split the initial training data into 70% training and 30% validation
train_size = int(0.7 * len(initial_train_data))
train_data=initial_train_data[:train_size]
validation_set = initial_train_data[train_size:]

print(initial_train_data, initial_test_data , train_data ,validation_set)

            weighted_sum
date                    
2020-02-01    239.210369
2020-02-02     28.280318
2020-02-03    199.366981
2020-02-04    423.067131
2020-02-05    409.692107
...                  ...
2022-08-06    105.431713
2022-08-07      0.000000
2022-08-08    256.128624
2022-08-09    394.169786
2022-08-10    438.000989

[921 rows x 1 columns]             weighted_sum
date                    
2022-08-11    408.674950
2022-08-12    311.163507
2022-08-13    114.743957
2022-08-15    273.145554
2022-08-16    382.488038
...                  ...
2023-12-02    114.049422
2023-12-03      0.000000
2023-12-04    226.893157
2023-12-05    390.659238
2023-12-06    425.322937

[396 rows x 1 columns]             weighted_sum
date                    
2020-02-01    239.210369
2020-02-02     28.280318
2020-02-03    199.366981
2020-02-04    423.067131
2020-02-05    409.692107
...                  ...
2021-11-02      0.000000
2021-11-03    119.047367
2021-11-04    278.944609
2021-11-05    342.369322
20

In [4]:

# Initialize parameters
time_step = 1
features = 1

# Build a function for the LSTM model
def create_lstm_model(n_layers, n_neurons, dropout_rate, time_step):
    model = Sequential()
    model.add(keras.Input(shape=(time_step, features)))
    model.add(LSTM(n_neurons, activation='relu',return_sequences=True))
    model.add(LSTM(n_neurons,  dropout=dropout_rate, activation='relu'))
    model.add(Dense(1)) 
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['root_mean_squared_error'])
    return model 


# function to Transform Data to Supervised Learning Format
def create_dataset(dataset, time_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - time_step - 1):
        dataX.append(dataset.iloc[i:(i + time_step), 0])  # Corrected indexing for Pandas
        dataY.append(dataset.iloc[i + time_step, 0])
    return np.array(dataX), np.array(dataY)

# Now, split the initial training data into 70% training and 30% validation
train_size = int(0.7 * len(initial_train_data))  # 70% of the initial train size
train_data = initial_train_data[:train_size]  # 70% for actual training
test_data = initial_train_data[train_size:]  # Remaining 30% for validation

# Transform to supervised learning
X_train, y_train = create_dataset(train_data, time_step=time_step)
X_test, y_test = create_dataset(test_data, time_step=time_step)

# Reshape input to be [samples, time steps, features] for LSTM
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))



# Define the parameter grid
param_grid = {
    'n_layers': [2, 3, 4, 5],
    'n_neurons': [10, 20, 50, 100],
    'dropout_rate': [0.0, 0.1, 0.5],
    'time_step': [1, 7, 14, 30]
}

# Initialize variables to store the best model and the lowest RMSE
best_rmse = float('inf')  # Start with a very high value for RMSE
best_model = None
best_params = None
best_iteration= None

# total number of combinations
total_combinations= len(list(itertools.product(*param_grid.values())))
print(f'Total number of combination:{total_combinations}')

# define the number of split
num_splits =3
split_size=total_combinations/num_splits
print(f'Split size:{split_size}')

Total number of combination:192
Split size:64.0


In [9]:
# first split(i=1 to 64)
print('\nFirst split:')
for i, params in enumerate(itertools.product(*param_grid.values()), start=1):
    if i > split_size:
        break
    n_layers, n_neurons, dropout_rate, time_step = params  # Unpack the parameters
    
    print(f"Iteration {i}: {n_layers} layers, {n_neurons} neurons, {dropout_rate} dropout, {time_step} time steps.")
    
    # Create your LSTM model with the current parameter combination
    model = create_lstm_model(n_layers=n_layers, n_neurons=n_neurons, dropout_rate=dropout_rate, time_step=time_step)
    
    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=False)
    
    # Predict on validation set
    y_pred = model.predict(X_test)
    
    # Reshape y_pred to be 1D for RMSE calculation
    y_pred = y_pred.reshape(-1)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"\nRMSE for current model: {rmse}")
    
    # Update best model if the current one is better
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_params = params
        best_iteration=i

# Output the best model and parameters
print(f'\nBest model found in Iteration: {best_iteration}')
print(f"\nBest model RMSE: {best_rmse}")
print(f"\nBest parameters: Layers={best_params[0]}, Neurons={best_params[1]}, Dropout={best_params[2]}, Time step={best_params[3]}")
    


First split:
Iteration 1: 2 layers, 10 neurons, 0.0 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

RMSE for current model: 172.36502306438786
Iteration 2: 2 layers, 10 neurons, 0.0 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 165.71475434797313
Iteration 3: 2 layers, 10 neurons, 0.0 dropout, 14 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

RMSE for current model: 176.9894297595102
Iteration 4: 2 layers, 10 neurons, 0.0 dropout, 30 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

RMSE for current model: 168.3333276535485
Iteration 5: 2 layers, 10 neurons, 0.1 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 172.58473967169857
Iteration 6: 2 layers, 10 neurons, 0.1 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━

In [6]:
396*25
9900/60
165/60

2.75

In [13]:
# second split(i= 65 to 128)

print('\nSecond split:')
for i, params in enumerate (itertools.product(*param_grid.values()), start=1):
    if i<= split_size: # Continue to the next iteration until i > 64
        continue

    if i > 2*split_size: 
        break   
    n_layers, n_neurons, dropout_rate, time_step = params  # Unpack the parameters
    
    # Print iteration number, parameter combination, and total combinations
    print(f"\nIteration {i}: {n_layers} layers, {n_neurons} neurons, {dropout_rate} dropout, {time_step} time steps.")
    
    # Create your LSTM model with the current parameter combination
    model = create_lstm_model(n_layers=n_layers, n_neurons=n_neurons, dropout_rate=dropout_rate, time_step=time_step)
    
    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=False)
    
    # Predict on validation set
    y_pred = model.predict(X_test)
    
    # Reshape y_pred to be 1D for RMSE calculation
    y_pred = y_pred.reshape(-1)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"\nRMSE for current model: {rmse}")
    
    # Update the best model 
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_params = params
        best_iteration = i

# Output the best model and parameters
print(f"\nBest model RMSE: {best_rmse}")
print(f"\nBest parameters: Layers={best_params[0]}, Neurons={best_params[1]}, Dropout={best_params[2]}, Time step={best_params[3]}")
print(f'\nBest model found in Iteration: {best_iteration}')


Second split:

Iteration 65: 3 layers, 20 neurons, 0.1 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 159.94118176844702

Iteration 66: 3 layers, 20 neurons, 0.1 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 160.6529906085544

Iteration 67: 3 layers, 20 neurons, 0.1 dropout, 14 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 163.01341873758872

Iteration 68: 3 layers, 20 neurons, 0.1 dropout, 30 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step

RMSE for current model: 162.9034995264752

Iteration 69: 3 layers, 20 neurons, 0.5 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 218.3556635152147

Iteration 70: 3 layers, 20 neurons, 0.5 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━

In [15]:
# Third split
print('\nThird split')
for i, params in enumerate(itertools.product(*param_grid.values()), start=1):
    if i <= 2*split_size:
        continue     
        
    if i > total_combinations:
        break

    n_layers, n_neurons, dropout_rate, time_step = params  # Unpack the parameters
    
    # Print iteration number, parameter combination, and total combinations
    print(f"\nIteration {i}: {n_layers} layers, {n_neurons} neurons, {dropout_rate} dropout, {time_step} time steps.")
    
    
    # Create your LSTM model with the current parameter combination
    model = create_lstm_model(n_layers=n_layers, n_neurons=n_neurons, dropout_rate=dropout_rate, time_step=time_step)
    
    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=False)
    
    # Predict on validation set
    y_pred = model.predict(X_test)
    
    # Reshape y_pred to be 1D for RMSE calculation
    y_pred = y_pred.reshape(-1)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"\nRMSE for current model: {rmse}")
    
    # Update best model if the current one is better
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_params = params
        best_iteration = i
# Output the best model and parameters
print(f"\nBest model RMSE: {best_rmse}")
print(f"\nBest parameters:Layers={best_params[0]}, Neurons={best_params[1]}, Dropout={best_params[2]}, Time step={best_params[3]}")
print(f'\nBest model found in Iteration: {best_iteration}')


Third split

Iteration 129: 4 layers, 50 neurons, 0.5 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 191.29059060126602

Iteration 130: 4 layers, 50 neurons, 0.5 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 169.56865944386297

Iteration 131: 4 layers, 50 neurons, 0.5 dropout, 14 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 176.99043622366048

Iteration 132: 4 layers, 50 neurons, 0.5 dropout, 30 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step

RMSE for current model: 182.22664008086016

Iteration 133: 4 layers, 100 neurons, 0.0 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step

RMSE for current model: 151.81628345323426

Iteration 134: 4 layers, 100 neurons, 0.0 dropout, 7 time steps.
[1m9/9[0m 

In [8]:
from sklearn.model_selection import GridSearchCV, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasRegressor
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


# Initialize parameters
time_step = 1
features = 1

# function to Transform Data to Supervised Learning Format
def create_dataset(dataset, time_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - time_step - 1):
        dataX.append(dataset.iloc[i:(i + time_step), 0])  # Corrected indexing for Pandas
        dataY.append(dataset.iloc[i + time_step, 0])
    return np.array(dataX), np.array(dataY)

# Now, split the initial training data into 70% training and 30% validation
train_size = int(0.7 * len(initial_train_data))  # 70% of the initial train size
train_data = initial_train_data[:train_size]  # 70% for actual training
test_data = initial_train_data[train_size:]  # Remaining 30% for validation

# Transform to supervised learning
X_train, y_train = create_dataset(train_data, time_step=time_step)
X_test, y_test = create_dataset(test_data, time_step=time_step)

# Reshape input to be [samples, time steps, features] for LSTM
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))


# Build the LSTM model
def create_lstm_model(n_layers, n_neurons, dropout_rate, time_step, input_shape=(1,1)):
    model = Sequential()
    model.add(keras.Input(shape=(time_step, 1)))
    model.add(LSTM(n_neurons, activation='relu',return_sequences=True))
    model.add(LSTM(n_neurons,  dropout=dropout_rate, activation='relu'))
    model.add(Dense(1)) 
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['root_mean_squared_error'])
    return model 


param_grid = {
    'model__n_layers': [3, 4, 5],
    'model__n_neurons': [10, 20],
    'model__dropout_rate': [0.1, 0.5],
    'model__time_step':[1,7]
}

    
# Define EarlyStopping callback
callback = keras.callbacks.EarlyStopping(monitor='root_mean_squared_error', min_delta = 0.01, mode='min', patience = 5)
    
# Use KerasRegressor with the build_fn
model = KerasRegressor(model=create_lstm_model, epochs=20, batch_size=32, input_shape=(time_step, features), callbacks=[callback])
    
# Perform grid search
for n_nlayers in 
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)
grid_result = grid_search.fit( X_train, y_train)

# Extract results from GridSearchCV
results = pd.DataFrame(grid_result.cv_results_)
    
# Print the best results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")


Epoch 1/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 115684.6172 - root_mean_squared_error: 340.1140
Epoch 2/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 118210.8594 - root_mean_squared_error: 343.7699 
Epoch 3/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 118796.0938 - root_mean_squared_error: 344.6325 
Epoch 4/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 111685.6328 - root_mean_squared_error: 334.1299 
Epoch 5/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 113498.7812 - root_mean_squared_error: 336.7443 
Epoch 6/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 115137.9609 - root_mean_squared_error: 339.2903 
Epoch 7/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 112876.2969 - root_mean_squared_error: 335.9185 
Epoch 8/20
[1