In [131]:
# Libaries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Flatten
from keras import backend as K
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error as mape
#from keras.callbacks import EarlyStopping
from keras.layers import ConvLSTM2D

import sys
sys.path.append('/Users/ludwigbaunach/Documents/Studium/PhD/Alaiko/Paper_1_Project/Main/src')
from utils.data_split import ml_data_date_split

In [91]:
df = pd.read_pickle("../data/processed/L_6_test.pkl", )

In [35]:
# Set seed
np.random.seed(42)
# Horizon 
time_horizon = 9
# LSTM
n_steps_in = 30
n_steps_out = time_horizon

In [30]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [None]:
# define cusstom function 

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out-1
        # check if we are beyond the dataset
        if out_end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1:out_end_ix, -1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

# LSTM Forecast 
def LSTM_forecast(train, test, model, drop_columns):

    # drop the first two columns (assuming they're date and target variable) and get np array
    features = train.drop(train.columns[drop_columns], axis=1).values
    test_features = train.iloc[-n_steps_in:].drop(train.columns[drop_columns], axis=1).values  
    
    # reshape the target variable
    out_seq = np.array(train.quantity).reshape((len(train), 1))

    # horizontally stack features and target variable
    dataset = np.hstack((features, out_seq))

    # choose a number of time steps
    n_steps_in, n_steps_out = 30, 9

    # covert into input/output
    X, y = split_sequences(dataset, n_steps_in, n_steps_out)

    # the dataset knows the number of features, e.g. 2
    n_features = X.shape[2]
    
    # fit model
    model.fit(X, y, epochs=200, verbose=0)
    
    # forecast
    
    # reshape test_features
    x_test_input = test_features.reshape((1, n_steps_in, n_features))
    #predict
    yhat = model.predict(x_test_input, verbose=0)
    # make yhat 1-dimensional
    yhat_1d = yhat.flatten()

    return yhat_1d

In [151]:
# define model
    model = Sequential()
    model.add(LSTM(100, activation='relu', return_sequences=True, input_shape=(n_steps_in, n_features)))
    model.add(LSTM(100, activation='relu'))
    model.add(Dense(n_steps_out))
    model.compile(optimizer='adam', loss='mse')

<keras.callbacks.History at 0x7ff11af7c970>

In [164]:
test_features = train.iloc[-n_steps_in:].drop(train.columns[[0, 1]], axis=1).values
test_features.shape

(30, 54)

In [169]:
# Get all features from the third column onward, for the last 30 entries
test_features = train.iloc[-n_steps_in:].drop(train.columns[[0, 1]], axis=1).values

# reshape it to fit the LSTM input shape
x_test_input = test_features.reshape((1, n_steps_in, n_features))
yhat = model.predict(x_test_input, verbose=0)
yhat_1d = yhat.flatten()
mape(test.quantity, yhat_1d)

0.350144006640273

## 1. Tuning

In [None]:
from keras_tuner import RandomSearch

def build_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=512, step=32),
                   activation='relu', return_sequences=True, 
                   input_shape=(n_steps_in, n_features)))
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=512, step=32),
                   activation='relu'))
    model.add(Dense(n_steps_out))
    model.compile(optimizer='adam', loss='mse')
    return model



# drop the first two columns (assuming they're date and target variable) and get np array
features = train.drop(train.columns[drop_columns], axis=1).values
test_features = train.iloc[-n_steps_in:].drop(train.columns[drop_columns], axis=1).values  

# reshape the target variable
out_seq = np.array(train.quantity).reshape((len(train), 1))

# horizontally stack features and target variable
dataset = np.hstack((features, out_seq))

# choose a number of time steps
n_steps_in, n_steps_out = 30, 9

# covert into input/output
X, y = split_sequences(dataset, n_steps_in, n_steps_out)

# the dataset knows the number of features, e.g. 2
n_features = X.shape[2]

# Keras Tuner's hyperparameter tuning
tuner = RandomSearch(
    build_model,
    objective='loss',
    max_trials=5,  # how many model variations to test
    executions_per_trial=3,  # how many trials per variation
    directory='random_search',
    project_name='lstm_forecasting'
)

tuner.search_space_summary()

# Perform hyperparameter search
tuner.search(X, y, epochs=20, validation_split=0.2)

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the LSTM layer is {best_hps.get('units')}
""")

# Build the model with the optimal hyperparameters
model = tuner.hypermodel.build(best_hps)

# fit model
model.fit(X, y, epochs=200, verbose=0)

# forecast

# reshape test_features
x_test_input = test_features.reshape((1, n_steps_in, n_features))
#predict
yhat = model.predict(x_test_input, verbose=0)
# make yhat 1-dimensional
yhat_1d = yhat.flatten()



## 2. Backtesting

## 3. Forecast

## STRATEGY

1. Search for hyperparameters in training data
2. Take best parameters and perform backtesting 
3. Forecast using the parameters

## OLD

In [None]:
from sklearn.model_selection import TimeSeriesSplit


# Define LSTM model structure
def create_lstm_model(n_steps, n_features):
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(n_steps, n_features)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error', run_eagerly=True)
    return model

# Function to reshape data for LSTM
def reshape_data(data, time_steps):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps), 1:])  # Use all but the first column as features
        y.append(data[i + time_steps, 0])  # Use the first column as target
    return np.array(X), np.array(y)

time_steps = 9
results = {}
tscv = TimeSeriesSplit(gap=0, max_train_size=None, 
                                   n_splits=5, 
                                   test_size=time_horizon)


# Define Data
train_df, test_df = ml_data_date_split(df, 8) # split data with custom function
# Sort values (important for time series split)
train_sorted_df = train_df.sort_values(by=['date'])
test_sorted_df = test_df.sort_values(by=['date'])

train_sorted_df = train_sorted_df.set_index("date")
test_sorted_df = test_sorted_df.set_index("date")

data = train_sorted_df.values

for train_index, test_index in tscv.split(data):
    train_data, test_data = data[train_index], data[test_index]

    # Reshape data for LSTM
    X_train, y_train = reshape_data(train_data, time_steps)
    X_test, y_test = reshape_data(test_data, time_steps)

    # Initialize and fit LSTM model
    model = create_lstm_model(time_steps, X_train.shape[2])
    model.fit(X_train, y_train, epochs=30, verbose=0)

    # Backtest on train data
    y_train_pred = model.predict(X_train)
    backtest_df = pd.DataFrame({
        "date": train_sorted_df.index[time_steps:len(y_train_pred)+time_steps],
        "actual": y_train,
        "pred": y_train_pred.flatten()
    })

In [145]:
X_train.shape

(917, 9, 54)

In [56]:
backtest_df

Unnamed: 0,date,actual,pred
0,2020-04-09,133.0,59.722706
1,2020-04-10,99.0,65.757957
2,2020-04-11,102.0,107.461853
3,2020-04-12,100.0,188.496429
4,2020-04-13,90.0,187.498001
...,...,...,...
948,2022-11-13,31567.0,23244.406250
949,2022-11-14,16820.0,16320.597656
950,2022-11-15,13479.0,12720.399414
951,2022-11-16,16334.0,13939.563477


In [50]:
test_data.shape

(9, 55)

In [33]:
# Frame the supervised learning problem
n_days = 9  # number of days to forecast
n_features = df.shape[1]  # number of features in the data
n_obs = n_days * n_features
reframed = series_to_supervised(df, n_days, n_days)
values = reframed.values

# Split into train and test sets
n_train_days = 365  # number of days to use for training
train = values[:n_train_days, :]
test = values[n_train_days:, :]
# Split into input and outputs
train_X, train_y = train[:, :n_obs], train[:, -n_features]
test_X, test_y = test[:, :n_obs], test[:, -n_features]
# Reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_days, n_features))
test_X = test_X.reshape((test_X.shape[0], n_days, n_features))

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss=root_mean_squared_error)

# Fit the model
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False)

# Make a prediction
yhat = model.predict(test_X)

Epoch 1/50
6/6 - 4s - loss: 2947.3850 - val_loss: 9954.5234 - 4s/epoch - 715ms/step
Epoch 2/50
6/6 - 0s - loss: 2947.3096 - val_loss: 9954.4443 - 89ms/epoch - 15ms/step
Epoch 3/50
6/6 - 0s - loss: 2947.1707 - val_loss: 9954.2998 - 85ms/epoch - 14ms/step
Epoch 4/50
6/6 - 0s - loss: 2947.0928 - val_loss: 9954.2471 - 84ms/epoch - 14ms/step
Epoch 5/50
6/6 - 0s - loss: 2947.0278 - val_loss: 9954.1934 - 92ms/epoch - 15ms/step
Epoch 6/50
6/6 - 0s - loss: 2946.9619 - val_loss: 9954.1387 - 71ms/epoch - 12ms/step
Epoch 7/50
6/6 - 0s - loss: 2946.8948 - val_loss: 9954.0850 - 70ms/epoch - 12ms/step
Epoch 8/50
6/6 - 0s - loss: 2946.8254 - val_loss: 9954.0322 - 77ms/epoch - 13ms/step
Epoch 9/50
6/6 - 0s - loss: 2946.7527 - val_loss: 9953.9775 - 70ms/epoch - 12ms/step
Epoch 10/50
6/6 - 0s - loss: 2946.6724 - val_loss: 9953.8857 - 72ms/epoch - 12ms/step
Epoch 11/50
6/6 - 0s - loss: 2946.4285 - val_loss: 9953.6338 - 70ms/epoch - 12ms/step
Epoch 12/50
6/6 - 0s - loss: 2946.2908 - val_loss: 9953.3818 - 7

In [20]:
yhat

array([[16.599987],
       [16.599985],
       [16.599976],
       [16.59997 ],
       [16.59997 ],
       [16.59997 ],
       [16.59998 ],
       [16.59998 ],
       [16.59998 ],
       [16.6     ],
       [16.600025],
       [16.600075],
       [16.600092],
       [16.600113],
       [16.600136],
       [16.60017 ],
       [16.600563],
       [16.600552],
       [16.600569],
       [16.600538],
       [16.600536],
       [16.600506],
       [16.60042 ],
       [16.600191],
       [16.60005 ],
       [16.600014],
       [16.599993],
       [16.600428],
       [16.601051],
       [16.601183],
       [16.60244 ],
       [16.610025],
       [16.610346],
       [16.611515],
       [16.611193],
       [16.610655],
       [16.610048],
       [16.608297],
       [16.603857],
       [16.601175],
       [16.600372],
       [16.599981],
       [16.599968],
       [16.599968],
       [16.599968],
       [16.599968],
       [16.599968],
       [16.599968],
       [16.599968],
       [16.599968],


In [5]:
scale_cols = ['q_roll_mean_7d', 'q_roll_std_7d', 'q_roll_mean_14d', 'q_roll_std_14d', 'q_lag_1d',
       'q_lag_7d', 'q_lag_14d', 'q_lag_28d', 'q_mean_lag_7_14_28',
       'precipitation_height', 'sunshine_duration', 'temperature_air_mean_200',
       'sunshine_duration_h', 'suns_classes', 'temp_classes', "rain_classes"]

       
encode_cols = ["tm_y"]

In [6]:
# Make year a catagory
df[encode_cols] = df[encode_cols].astype('category',copy=False)

# Get dummies
just_dummies = pd.get_dummies(df[encode_cols])
encoded_df = pd.concat([df, just_dummies], axis=1)      
encoded_df.drop(encode_cols, inplace=True, axis=1)

In [7]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder


# Split the data into training and testing sets

# splot data
x_train, x_test = ml_data_split(encoded_df.reset_index(), 10)

# get y that has to be predicted
y_train = x_train['quantity'].copy()
y_test = x_test['quantity'].copy()

# clean data: drop y variable and date from dataset
x_train.drop(["quantity", 'date'], axis=1, inplace=True)
x_test.drop(["quantity", 'date'], axis=1, inplace=True)

# Scale the selected columns using MinMaxScaler
scaler = MinMaxScaler()
scaled_x_train = x_train.copy()
scaled_x_test = x_test.copy()

scaled_x_train[scale_cols] = scaler.fit_transform(x_train[scale_cols])
scaled_x_test[scale_cols] = scaler.transform(x_test[scale_cols])

In [19]:
# ensure all data is float
train_values = scaled_x_train.values.astype('float32')
test_values = scaled_x_test.values.astype('float32')
#train_values.info()

In [30]:
scaled_x_train.shape

(969, 54)

In [42]:
train_test = scaled_x_train.to_numpy()

# Set up array

n_future = 10
n_past = 60

#shape of train_test
print("Shape of train_test: ",scaled_x_train.shape)

data = train_test.reshape(969, 10, 54)

Shape of train_test:  (969, 54)


ValueError: cannot reshape array of size 52326 into shape (969,10,54)

In [None]:
#Empty lists to be populated using formatted training data
trainX = []
trainY = []

n_future = 1   # Number of days we want to look into the future based on the past days.
n_past = 14  # Number of past days we want to use to predict the future.

#Reformat input data into a shape: (n_samples x timesteps x n_features)
#In my example, my df_for_training_scaled has a shape (12823, 5)
#12823 refers to the number of data points and 5 refers to the columns (multi-variables).
for i in range(n_past, len(df_for_training_scaled) - n_future +1):
    trainX.append(df_for_training_scaled[i - n_past:i, 0:df_for_training.shape[1]])
    trainY.append(df_for_training_scaled[i + n_future - 1:i + n_future, 0])

trainX, trainY = np.array(trainX), np.array(trainY)

print('trainX shape == {}.'.format(trainX.shape))
print('trainY shape == {}.'.format(trainY.shape))

In [31]:
#Empty lists to be populated using formatted training data
trainX = []
trainY = []

n_future = 10   # Number of days we want to look into the future based on the past days.
n_past = 60  # Number of past days we want to use to predict the future.

#Reformat input data into a shape: (n_samples x timesteps x n_features)
#In my example, my df_for_training_scaled has a shape (12823, 5)
#12823 refers to the number of data points and 5 refers to the columns (multi-variables).
for i in range(n_past, len(scaled_x_train) - n_future +1):
    trainX.append(scaled_x_train[i - n_past:i, 0:scaled_x_train.shape[1]])
    trainY.append(scaled_x_train[i + n_future - 1:i + n_future, 0])

trainX, trainY = np.array(trainX), np.array(trainY)

print('trainX shape == {}.'.format(trainX.shape))
print('trainY shape == {}.'.format(trainY.shape))

InvalidIndexError: (slice(0, 60, None), slice(0, 54, None))

In [17]:
train_values.values

(969, 54)

In [26]:
x_train.shape[1]

54

In [25]:
train_values.shape

(969, 54)

In [28]:
x_train = x_train.values
x_test = x_test.values

n_features = x_train.shape[1]

# Reshape the input data to be 3-dimensional
test_X_train = x_train.reshape((x_train.shape[0], n_steps, n_features))
test_X_test = x_test.reshape((x_test.shape[0], n_steps, n_features))

test_X_train.shape

ValueError: cannot reshape array of size 52326 into shape (969,10,54)

In [23]:
# Define the number of time steps and features
n_steps = 10
n_features = x_train.shape[1]

# Reshape the input data to be 3-dimensional
test_X_train = train_values.reshape((train_values.shape[0], n_steps, n_features))
test_X_test = test_values.reshape((test_values.shape[0], n_steps, n_features))

test_x_train.shape

ValueError: cannot reshape array of size 52326 into shape (969,10,54)

In [88]:
#Empty lists to be populated using formatted training data
x_train_array = []
y_train_array = []

n_future = 10   # Number of days we want to look into the future based on the past days.
n_past = 30  # Number of past days we want to use to predict the future.

#Reformat input data into a shape: (n_samples x timesteps x n_features)
#In my example, my df_for_training_scaled has a shape (12823, 5)
#12823 refers to the number of data points and 5 refers to the columns (multi-variables).
for i in range(n_past, len(x_train) - n_future +1):
    x_train_array (x_train[i - n_past:i, 0:x_train.shape[0]])
    y_train_array.append(y_train[i + n_future - 1:i + n_future, 0])


InvalidIndexError: (slice(0, 30, None), slice(0, 969, None))

In [82]:
# define the Autoencoder model
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(x_train.shape[0], x_train.shape[1]), return_sequences=True))
model.add(LSTM(32, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[0]))

model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 969, 64)           30464     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 969)               31977     
                                                                 
Total params: 74,857
Trainable params: 74,857
Non-trainable params: 0
_________________________________________________________________


In [83]:
history = model.fit(x_train, y_train, epochs=5, batch_size=16, validation_split=0.1, verbose=1)

plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()


Epoch 1/5


ValueError: in user code:

    File "/opt/miniconda3/envs/snowflakes/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/opt/miniconda3/envs/snowflakes/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/miniconda3/envs/snowflakes/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/opt/miniconda3/envs/snowflakes/lib/python3.10/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/opt/miniconda3/envs/snowflakes/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/miniconda3/envs/snowflakes/lib/python3.10/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_5" is incompatible with the layer: expected shape=(None, 969, 54), found shape=(None, 54)
