In [6]:
# Standard libraries
import os
import math
import time

# Data manipulation and analysis
import numpy as np
import pandas as pd

#Data Visualization
import matplotlib.pyplot as plt
from datetime import datetime,date,timedelta


# Machine Learning - Scikit-learn

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, ParameterGrid, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import  MinMaxScaler, StandardScaler


# Deep Learning - TensorFlow and Keras
import keras
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError


#Grid search 
import itertools
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasRegressor
import warnings

warnings.filterwarnings('ignore')


In [7]:
# Readin the data set kepping date as index
df=pd.read_csv('weighted_sum.csv')

# Make the 'date' column as index of the DataFrame
df.index = df["date"].values

df.index = pd.to_datetime(df.index)

df.drop(columns='date', inplace=True)

# Reindexing DataFrame with pd.date_range, ensure df spans a specified period, filling in any gaps with NaN values.
idx = pd.date_range('2019-10-24', '2023-12-06')

df = df.reindex(idx)

# filter out the specified data range contaning zero
df=df.drop(df.loc['2023-05-12':'2023-07-27'].index)

# Filter out the initial date range considering machine unstability
df=df.drop(df.loc['2019-10-24':'2020-01-31'].index)


rows_to_fill = np.where(df["weighted_sum"].isna())

for row in rows_to_fill:

    df.iloc[row] = df.iloc[row + 7]

# Replace negative values with zero in 'weighted_sum'
df['weighted_sum'] = df['weighted_sum'].clip(lower=0)

# replace 7 irregular data points with 8th highest value.
df[df['weighted_sum']>570]=570

print(df)
df.describe()

            weighted_sum
2020-02-01    239.210369
2020-02-02     28.280318
2020-02-03    199.366981
2020-02-04    423.067131
2020-02-05    409.692107
...                  ...
2023-12-02    114.049422
2023-12-03      0.000000
2023-12-04    226.893157
2023-12-05    390.659238
2023-12-06    425.322937

[1328 rows x 1 columns]


Unnamed: 0,weighted_sum
count,1328.0
mean,294.511578
std,164.458734
min,0.0
25%,149.168497
50%,356.291042
75%,428.762568
max,570.0


#### Train test split

In [8]:
# Define the train-test split parameters (70:30)
initial_train_size = int(0.7 * len(df)) # Start with 70% training data
initial_train_data= df[:initial_train_size]
initial_test_data=df[initial_train_size:]


# Now, split the initial training data into 70% training and 30% validation
train_size = int(0.7 * len(initial_train_data))
train_data=initial_train_data[:train_size]
validation_set = initial_train_data[train_size:]

print(initial_train_data, initial_test_data , train_data ,validation_set)

            weighted_sum
2020-02-01    239.210369
2020-02-02     28.280318
2020-02-03    199.366981
2020-02-04    423.067131
2020-02-05    409.692107
...                  ...
2022-08-13    114.743957
2022-08-14      0.000000
2022-08-15    273.145554
2022-08-16    382.488038
2022-08-17    276.267078

[929 rows x 1 columns]             weighted_sum
2022-08-18    193.901423
2022-08-19    432.053452
2022-08-20    134.239967
2022-08-21      0.000000
2022-08-22    293.099946
...                  ...
2023-12-02    114.049422
2023-12-03      0.000000
2023-12-04    226.893157
2023-12-05    390.659238
2023-12-06    425.322937

[399 rows x 1 columns]             weighted_sum
2020-02-01    239.210369
2020-02-02     28.280318
2020-02-03    199.366981
2020-02-04    423.067131
2020-02-05    409.692107
...                  ...
2021-11-07      0.000000
2021-11-08    128.806820
2021-11-09    416.624904
2021-11-10    436.489289
2021-11-11    444.859699

[650 rows x 1 columns]             weighted_sum
202

#### Create lstm model function, create function for transforming data to supervised learning and define tuning parameter space for lstm model.

In [13]:

# Initialize parameters
time_step = 1
features = 1

# Build a function for the LSTM model
def create_lstm_model(n_layers, n_neurons, dropout_rate, time_step):
    model = Sequential()
    model.add(keras.Input(shape=(time_step, features)))
    model.add(LSTM(n_neurons, activation='relu',return_sequences=True))
    model.add(LSTM(n_neurons,  dropout=dropout_rate, activation='relu'))
    model.add(Dense(1)) 
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['root_mean_squared_error'])
    return model 


# function to Transform Data to Supervised Learning Format
def create_dataset(dataset, time_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - time_step - 1):
        dataX.append(dataset.iloc[i:(i + time_step), 0])  # Corrected indexing for Pandas
        dataY.append(dataset.iloc[i + time_step, 0])
    return np.array(dataX), np.array(dataY)

# Now, split the initial training data into 70% training and 30% validation
train_size = int(0.7 * len(initial_train_data))  # 70% of the initial train size
train_data = initial_train_data[:train_size]  # 70% for actual training
test_data = initial_train_data[train_size:]  # Remaining 30% for validation

# Transform to supervised learning
X_train, y_train = create_dataset(train_data, time_step=time_step)
X_test, y_test = create_dataset(test_data, time_step=time_step)

# Reshape input to be [samples, time steps, features] for LSTM
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))


# Define the parameter grid
param_grid = {
    'n_layers': [2, 3, 4, 5],
    'n_neurons': [10, 20, 50, 100],
    'dropout_rate': [0.0, 0.1, 0.5],
    'time_step': [1, 7, 14, 30]
}

# Initialize variables to store the best model and the lowest RMSE
best_rmse = float('inf')  # Start with a very high value for RMSE
best_model = None
best_params = None
best_iteration= None

# total number of combinations
total_combinations= len(list(itertools.product(*param_grid.values())))
print(f'Total number of combination:{total_combinations}')

# define the number of split
num_splits =3
split_size=total_combinations/num_splits
print(f'Split size:{split_size}')

Total number of combination:192
Split size:64.0


### We split the total combination of parameter in three equal parts to execute reducing running complexity

In [13]:
# first split(i=1 to 64)

print('\nFirst split:')
for i, params in enumerate(itertools.product(*param_grid.values()), start=1):
    if i > split_size:
        break
    n_layers, n_neurons, dropout_rate, time_step = params  # Unpack the parameters
    
    print(f"Iteration {i}: {n_layers} layers, {n_neurons} neurons, {dropout_rate} dropout, {time_step} time steps.")
    
    # Transform to supervised learning
    X_train, y_train = create_dataset(train_data, time_step=time_step)
    X_test, y_test = create_dataset(test_data, time_step=time_step)
    
    # Reshape input to be [samples, time steps, features] for LSTM
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
        
    # Create your LSTM model with the current parameter combination
    model = create_lstm_model(n_layers=n_layers, n_neurons=n_neurons, dropout_rate=dropout_rate, time_step=time_step)
    
    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=False)
    
    # Predict on validation set
    y_pred = model.predict(X_test)
    
    # Reshape y_pred to be 1D for RMSE calculation
    y_pred = y_pred.reshape(-1)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"\nRMSE for current model: {rmse}")
    
    # Update best model if the current one is better
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_params = params
        best_iteration=i

# Output the best model and parameters
print(f'\nBest model found in Iteration: {best_iteration}')
print(f"\nBest model RMSE: {best_rmse}")
print(f"\nBest parameters: Layers={best_params[0]}, Neurons={best_params[1]}, Dropout={best_params[2]}, Time step={best_params[3]}")
    


First split:
Iteration 1: 2 layers, 10 neurons, 0.0 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step

RMSE for current model: 169.87734745394823
Iteration 2: 2 layers, 10 neurons, 0.0 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

RMSE for current model: 101.52814145860907
Iteration 3: 2 layers, 10 neurons, 0.0 dropout, 14 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

RMSE for current model: 101.82280778071348
Iteration 4: 2 layers, 10 neurons, 0.0 dropout, 30 time steps.
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step

RMSE for current model: 162.8440214046701
Iteration 5: 2 layers, 10 neurons, 0.1 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

RMSE for current model: 166.27531126293442
Iteration 6: 2 layers, 10 neurons, 0.1 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━

In [17]:
# second split(i= 65 to 128)

print('\nSecond split:')
for i, params in enumerate (itertools.product(*param_grid.values()), start=1):
    if i<= split_size: # Continue to the next iteration until i > 64
        continue

    if i > 2*split_size: 
        break   
    n_layers, n_neurons, dropout_rate, time_step = params  # Unpack the parameters
    
    # Print iteration number, parameter combination, and total combinations
    print(f"\nIteration {i}: {n_layers} layers, {n_neurons} neurons, {dropout_rate} dropout, {time_step} time steps.")
    
    # Transform to supervised learning
    X_train, y_train = create_dataset(train_data, time_step=time_step)
    X_test, y_test = create_dataset(test_data, time_step=time_step)
    
    # Reshape input to be [samples, time steps, features] for LSTM
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
    
    # Create your LSTM model with the current parameter combination
    model = create_lstm_model(n_layers=n_layers, n_neurons=n_neurons, dropout_rate=dropout_rate, time_step=time_step)
    
    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=False)
    
    # Predict on validation set
    y_pred = model.predict(X_test)
    
    # Reshape y_pred to be 1D for RMSE calculation
    y_pred = y_pred.reshape(-1)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"\nRMSE for current model: {rmse}")
    
    # Update the best model 
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_params = params
        best_iteration = i

# Output the best model and parameters
print(f"\nBest model RMSE: {best_rmse}")
print(f"\nBest parameters: Layers={best_params[0]}, Neurons={best_params[1]}, Dropout={best_params[2]}, Time step={best_params[3]}")
print(f'\nBest model found in Iteration: {best_iteration}')


Second split:

Iteration 65: 3 layers, 20 neurons, 0.1 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

RMSE for current model: 161.11560492378376

Iteration 66: 3 layers, 20 neurons, 0.1 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

RMSE for current model: 104.46573043874612

Iteration 67: 3 layers, 20 neurons, 0.1 dropout, 14 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

RMSE for current model: 146.52803772281413

Iteration 68: 3 layers, 20 neurons, 0.1 dropout, 30 time steps.
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step

RMSE for current model: 180.9519356120199

Iteration 69: 3 layers, 20 neurons, 0.5 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step

RMSE for current model: 191.99443728364406

Iteration 70: 3 layers, 20 neurons, 0.5 dropout, 7 time steps.
[1m9/9[0m [32m━━━

In [19]:
# Third split (i= 129 to 192)

print('\nThird split')
for i, params in enumerate(itertools.product(*param_grid.values()), start=1):
    if i <= 2*split_size:
        continue     
        
    if i > total_combinations:
        break

    n_layers, n_neurons, dropout_rate, time_step = params  # Unpack the parameters
    
    # Print iteration number, parameter combination, and total combinations
    print(f"\nIteration {i}: {n_layers} layers, {n_neurons} neurons, {dropout_rate} dropout, {time_step} time steps.")

    # Transform to supervised learning
    X_train, y_train = create_dataset(train_data, time_step=time_step)
    X_test, y_test = create_dataset(test_data, time_step=time_step)
    
    # Reshape input to be [samples, time steps, features] for LSTM
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
    
    
    # Create your LSTM model with the current parameter combination
    model = create_lstm_model(n_layers=n_layers, n_neurons=n_neurons, dropout_rate=dropout_rate, time_step=time_step)
    
    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=False)
    
    # Predict on validation set
    y_pred = model.predict(X_test)
    
    # Reshape y_pred to be 1D for RMSE calculation
    y_pred = y_pred.reshape(-1)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"\nRMSE for current model: {rmse}")
    
    # Update best model if the current one is better
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_params = params
        best_iteration = i
# Output the best model and parameters
print(f"\nBest model RMSE: {best_rmse}")
print(f"\nBest parameters:Layers={best_params[0]}, Neurons={best_params[1]}, Dropout={best_params[2]}, Time step={best_params[3]}")
print(f'\nBest model found in Iteration: {best_iteration}')


Third split

Iteration 129: 4 layers, 50 neurons, 0.5 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

RMSE for current model: 197.9824438025524

Iteration 130: 4 layers, 50 neurons, 0.5 dropout, 7 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

RMSE for current model: 169.01118191053186

Iteration 131: 4 layers, 50 neurons, 0.5 dropout, 14 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step

RMSE for current model: 173.8722172373733

Iteration 132: 4 layers, 50 neurons, 0.5 dropout, 30 time steps.
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step

RMSE for current model: 188.2651719567988

Iteration 133: 4 layers, 100 neurons, 0.0 dropout, 1 time steps.
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step

RMSE for current model: 150.81468005243715

Iteration 134: 4 layers, 100 neurons, 0.0 dropout, 7 time steps.
[1m9/9[0m [32