In [1]:
# NOTE: Script to run LSTM over selection of df's

# Libaries
import pandas as pd
import numpy as np
import pickle

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Dropout
from sklearn.model_selection import TimeSeriesSplit

import sys
sys.path.append('/Users/ludwigbaunach/Documents/Studium/PhD/Alaiko/Paper_1_Project/Main/src')
from utils.data_split import ml_data_date_split

import warnings
warnings.filterwarnings('ignore')

2023-06-29 09:22:15.059814: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [31]:
# Custom Functions

# Split a univariate sequence into samples
def split_sequences(sequence, n_steps_in, n_steps_out):
    X, y = list(), list()
    for i in range(len(sequence)):
        # Find the end of this pattern
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        # Check if we are beyond the sequence
        if out_end_ix > len(sequence):
            break
        # Gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix:out_end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)


# LSTM Forecast
def LSTM_forecast(train_data, n_steps_in, n_steps_out, epochs, batch_size):
    
    # Set up data
    # Get quantity column as np array
    sequence = train_data['quantity'].values
    # Split into input/output
    X, y = split_sequences(sequence, n_steps_in, n_steps_out)

    # Reshape X to fit the LSTM input shape (samples, timesteps, features)
    X = X.reshape((X.shape[0], X.shape[1], 1))
    
    # Define Model 
    
    # Model
    model = Sequential()
    model.add(LSTM(200, activation='relu', return_sequences=True, input_shape=(n_steps_in, n_features)))
    model.add(LSTM(100, activation='relu'))
    model.add(Dense(n_steps_out))
    model.compile(optimizer='adam', loss='mse')

    # Train
    
    # Fit model
    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=0)

    # Forecast
    
    # Get last n_steps_in quantities
    x_input = sequence[-n_steps_in:]
    x_input = x_input.reshape((1, n_steps_in, 1))

    # Predict
    yhat = model.predict(x_input, verbose=0)
    # Flatten yhat
    yhat_1d = yhat.flatten()

    return yhat_1d


def LSTM_backtesting(df, tscv):
    # Initialize an empty lists to store predictions, actuals and dates
    preds = []
    actuals = []
    dates = []

    # loop over pre-defined time series split / time series cross validation (tscv)
    for i, (train_index, test_index) in enumerate(tscv.split(df)):
        # Train and test data
        train_fold, test_fold = df.iloc[train_index], df.iloc[test_index]

        # Append Dates
        dates_fold = df.date.iloc[test_index].tolist()

        # Track
        print(f"Fold Nr. = {i}")

        # Predict on the current test fold
        y_pred_fold = LSTM_forecast(train_fold, n_steps_in, n_steps_out, epochs, batch_size)

        # append scores and predictions
        preds.append(y_pred_fold.tolist())
        actuals.append(test_fold.quantity.tolist())
        dates.append(dates_fold)

    # make df that holds predictions and actuals
    # flatten nested lists
    dates = np.concatenate(dates).tolist()
    actuals = np.concatenate(actuals).tolist()
    preds = np.concatenate(preds).tolist()

    # Error analysis
    error = pd.DataFrame({
        "date": dates,
        "actual": actuals,
        "pred": preds
    }).reset_index(drop=True)

    return error

In [34]:
%%time
# Set Modelling Parameters

# Set seed
np.random.seed(42)

# Horizon
time_horizon = 9
n_steps_in = 30
n_steps_out = time_horizon
n_features = 1

# TimeSeriesSplit
tscv = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=time_horizon)

# Model parameters
batch_size = 32
epochs = 150

# define DFs and groups

df_list = ["L_3"]
#df_list = ["L_3", "L_4", "L_6"]
group_list = ["warehouse_chain"]
#group_list = ["warehouse_chain", "new_customer_id", "empty"]

# Load data
test_data = {}
for i in df_list:
    test_data[i] = pd.read_pickle(f"../data/processed/{i}_test.pkl")

# Initialize an empty dictionary to store the results
results = {}

# lightGBM modelling
for level, group in zip(df_list, group_list):

    # Initialize DataFrames to store the backtest and forecasted values
    backtest_values = pd.DataFrame(columns=["date", "actual", "pred", "level", "group"])
    forecast_values = pd.DataFrame(columns=["date", "actual", "pred", "level", "group"])

    # Select the right level
    df = test_data[level].copy()

    # Select the right group

    if level == "L_6":

        print("Group:", "L_6")

        # selected group
        group_df = df.copy()

        # Define Data
        train_df, test_df = ml_data_date_split(group_df, 8)  # split data with custom function

        print("Start backtest:")
        # Backtest
        temp_backtest_df = LSTM_backtesting(train_df, tscv)
        temp_backtest_df["level"] = level
        temp_backtest_df["group"] = "empty"

        backtest_values = pd.concat([backtest_values, temp_backtest_df])

        print("Start forecast:")
        y_pred = LSTM_forecast(train_df, n_steps_in, n_steps_out, epochs, batch_size)

        temp_forecast_df = pd.DataFrame({'date': test_df.date,
                                         'actual': test_df.quantity,
                                         'pred': y_pred})
        temp_forecast_df["level"] = level
        temp_forecast_df["group"] = "empty"

        forecast_values = pd.concat([forecast_values, temp_forecast_df])

    else:

        for i_ts in df[group].unique():
            print("Group:", i_ts)

            # selected group
            group_df = df[df[group] == i_ts]

            # Define Data
            train_df, test_df = ml_data_date_split(group_df, 8)  # split data with custom function

            print("Start backtest:")
            # Backtest
            temp_backtest_df = LSTM_backtesting(train_df, tscv)
            temp_backtest_df["level"] = level
            temp_backtest_df["group"] = i_ts

            backtest_values = pd.concat([backtest_values, temp_backtest_df])

            print("Start forecast:")
            y_pred = LSTM_forecast(train_df, model, n_steps_in, n_steps_out, epochs, batch_size)

            temp_forecast_df = pd.DataFrame({'date': test_df.date,
                                             'actual': test_df.quantity,
                                             'pred': y_pred})
            temp_forecast_df["level"] = level
            temp_forecast_df["group"] = i_ts

            forecast_values = pd.concat([forecast_values, temp_forecast_df])

    # Store the backtest and prediction data frames in the results dictionary
    # with the corresponding name including the category_name
    results[f"{level}"] = {
        'backtest': backtest_values,
        'pred': forecast_values
    }

Group: WH-Chain-1
Start backtest:
Fold Nr. = 0
Fold Nr. = 1
Fold Nr. = 2
Fold Nr. = 3
Fold Nr. = 4
Start forecast:


TypeError: LSTM_forecast() takes 5 positional arguments but 6 were given

In [35]:
results["L_3"]

KeyError: 'L_3'

In [30]:
%%time





error3 =  LSTM_backtesting(train, model, tscv)
error3

Fold Nr. = 0
Fold Nr. = 1
Fold Nr. = 2
Fold Nr. = 3
Fold Nr. = 4
CPU times: user 30min, sys: 6min 11s, total: 36min 12s
Wall time: 8min 25s


Unnamed: 0,date,actual,pred
0,2022-10-13,3142.0,2043.555908
1,2022-10-14,2648.0,2021.061035
2,2022-10-15,1954.0,2061.219482
3,2022-10-16,2442.0,1988.887695
4,2022-10-17,2012.0,2045.393433
5,2022-10-18,3468.0,2060.311768
6,2022-10-19,2732.0,2084.658447
7,2022-10-20,2922.0,2161.358887
8,2022-10-21,2467.0,2042.018311
9,2022-10-22,2098.0,1806.27417


In [29]:
%%time
error2 =  LSTM_backtesting(train, model, tscv)
error2

Fold Nr. = 0
Fold Nr. = 1
Fold Nr. = 2
Fold Nr. = 3
Fold Nr. = 4
CPU times: user 29min 49s, sys: 6min 7s, total: 35min 56s
Wall time: 7min 45s


Unnamed: 0,date,actual,pred
0,2022-10-13,3142.0,
1,2022-10-14,2648.0,
2,2022-10-15,1954.0,
3,2022-10-16,2442.0,
4,2022-10-17,2012.0,
5,2022-10-18,3468.0,
6,2022-10-19,2732.0,
7,2022-10-20,2922.0,
8,2022-10-21,2467.0,
9,2022-10-22,2098.0,


In [26]:
test_12 = LSTM_backtesting(train_df, model, tscv)
test_12

Fold Nr. = 0
Fold Nr. = 1
Fold Nr. = 2
Fold Nr. = 3
Fold Nr. = 4


Unnamed: 0,date,actual,pred
0,2022-10-13,3142.0,
1,2022-10-14,2648.0,
2,2022-10-15,1954.0,
3,2022-10-16,2442.0,
4,2022-10-17,2012.0,
5,2022-10-18,3468.0,
6,2022-10-19,2732.0,
7,2022-10-20,2922.0,
8,2022-10-21,2467.0,
9,2022-10-22,2098.0,


In [14]:
# LSTM Forecast
def LSTM_forecast(train_data, model, n_steps_in, n_steps_out, epochs, batch_size):
    # Set up data
    # Get quantity column as np array
    sequence = train_data['quantity'].values

    # Split into input/output
    X, y = split_sequences(sequence, n_steps_in, n_steps_out)

    # Reshape X to fit the LSTM input shape (samples, timesteps, features)
    X = X.reshape((X.shape[0], X.shape[1], 1))

    # Train
    # Fit model
    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=0)

    # Forecast
    # Get last n_steps_in quantities
    x_input = sequence[-n_steps_in:]
    x_input = x_input.reshape((1, n_steps_in, 1))

    # Predict
    yhat = model.predict(x_input, verbose=0)
    # Flatten yhat
    yhat_1d = yhat.flatten()

    return yhat_1d

df_test = test_data["L_3"]
df_test = df_test[df_test["warehouse_chain"] == "WH-Chain-3"]

In [15]:
%%time
train,test = ml_data_date_split(df_test, 8)
yhat2 = LSTM_forecast(train, model, n_steps_in, n_steps_out, epochs, batch_size)

CPU times: user 5min 39s, sys: 1min 7s, total: 6min 46s
Wall time: 1min 28s


In [17]:
yhat2

array([6096.3574, 6169.8594, 6185.2563, 6346.7397, 6415.6606, 6488.465 ,
       6739.0264, 6854.357 , 6686.984 ], dtype=float32)

In [28]:
train_df

Unnamed: 0,date,warehouse_chain,quantity,tm_w_end,tm_dy_sin,tm_dy_cos,tm_dm_sin,tm_dm_cos,tm_wy_sin,tm_wy_cos,...,holiday_Pfingstmontag,holiday_Reformationstag,holiday_Tag_der_Deutschen_Einheit,holiday_Zweiter_Weihnachtstag,blackweek,blackweekend,aftercyberweek,tm_y_0,tm_y_1,tm_y_2
379,2021-03-30,WH-Chain-3,1.0,0,-0.952574,-0.304308,-2.012985e-01,0.979530,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
382,2021-03-31,WH-Chain-3,0.0,0,-0.966457,-0.256827,-2.449294e-16,1.000000,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
385,2021-04-01,WH-Chain-3,10.0,0,-0.977976,-0.208718,2.012985e-01,0.979530,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
388,2021-04-02,WH-Chain-3,0.0,0,-0.987101,-0.160098,3.943559e-01,0.918958,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
391,2021-04-03,WH-Chain-3,0.0,1,-0.993811,-0.111087,5.712682e-01,0.820763,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2022-11-22,WH-Chain-3,11340.0,0,-0.316066,-0.948737,-9.680771e-01,-0.250653,-0.652822,0.757511,...,0,0,0,0,1,0,0,0,0,1
2188,2022-11-23,WH-Chain-3,9387.0,0,-0.362598,-0.931946,-9.987165e-01,-0.050649,-0.652822,0.757511,...,0,0,0,0,1,0,0,0,0,1
2191,2022-11-24,WH-Chain-3,10888.0,0,-0.408243,-0.912873,-9.884683e-01,0.151428,-0.652822,0.757511,...,0,0,0,0,1,0,0,0,0,1
2194,2022-11-25,WH-Chain-3,21906.0,0,-0.452888,-0.891567,-9.377521e-01,0.347305,-0.652822,0.757511,...,0,0,0,0,1,0,0,0,0,1


In [27]:
train

Unnamed: 0,date,warehouse_chain,quantity,tm_w_end,tm_dy_sin,tm_dy_cos,tm_dm_sin,tm_dm_cos,tm_wy_sin,tm_wy_cos,...,holiday_Pfingstmontag,holiday_Reformationstag,holiday_Tag_der_Deutschen_Einheit,holiday_Zweiter_Weihnachtstag,blackweek,blackweekend,aftercyberweek,tm_y_0,tm_y_1,tm_y_2
379,2021-03-30,WH-Chain-3,1.0,0,-0.952574,-0.304308,-2.012985e-01,0.979530,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
382,2021-03-31,WH-Chain-3,0.0,0,-0.966457,-0.256827,-2.449294e-16,1.000000,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
385,2021-04-01,WH-Chain-3,10.0,0,-0.977976,-0.208718,2.012985e-01,0.979530,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
388,2021-04-02,WH-Chain-3,0.0,0,-0.987101,-0.160098,3.943559e-01,0.918958,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
391,2021-04-03,WH-Chain-3,0.0,1,-0.993811,-0.111087,5.712682e-01,0.820763,0.999561,0.029633,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2022-11-22,WH-Chain-3,11340.0,0,-0.316066,-0.948737,-9.680771e-01,-0.250653,-0.652822,0.757511,...,0,0,0,0,1,0,0,0,0,1
2188,2022-11-23,WH-Chain-3,9387.0,0,-0.362598,-0.931946,-9.987165e-01,-0.050649,-0.652822,0.757511,...,0,0,0,0,1,0,0,0,0,1
2191,2022-11-24,WH-Chain-3,10888.0,0,-0.408243,-0.912873,-9.884683e-01,0.151428,-0.652822,0.757511,...,0,0,0,0,1,0,0,0,0,1
2194,2022-11-25,WH-Chain-3,21906.0,0,-0.452888,-0.891567,-9.377521e-01,0.347305,-0.652822,0.757511,...,0,0,0,0,1,0,0,0,0,1


In [19]:
%%time
error =  LSTM_backtesting(train, model, tscv)
error

Fold Nr. = 0
Fold Nr. = 1
Fold Nr. = 2
Fold Nr. = 3
Fold Nr. = 4
CPU times: user 28min 18s, sys: 5min 49s, total: 34min 7s
Wall time: 7min 27s


Unnamed: 0,date,actual,pred
0,2022-10-13,3142.0,2157.958984
1,2022-10-14,2648.0,2190.646973
2,2022-10-15,1954.0,2155.380127
3,2022-10-16,2442.0,2183.671875
4,2022-10-17,2012.0,2187.176758
5,2022-10-18,3468.0,2193.703369
6,2022-10-19,2732.0,2285.228027
7,2022-10-20,2922.0,2245.639648
8,2022-10-21,2467.0,2275.179688
9,2022-10-22,2098.0,1872.910889
