1. Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import os, glob
import keras
from keras.layers import LSTM, Dense
from keras.models import Sequential

import numpy as np
import pandas as pd
import datetime as dt

from sklearn.preprocessing import MinMaxScaler

plt.rcParams['figure.figsize'] = [20, 10]

2. Reading

In [2]:
# Two candidate inputs: ../data/public/train.csv and ../data/public/clean_train.csv

# Load the table, promote the parsed "Day" column to the index, and conform
# the frame to a daily calendar (asfreq leaves NaN rows for absent dates).
DT_train = (
    pd.read_csv("../data/public/train.csv", parse_dates=True)
    .assign(Day=lambda frame: pd.to_datetime(frame["Day"], format="%Y-%m-%d"))
    .set_index("Day")
    .asfreq("D")
)

In [3]:
# Fill missing values when using the original (uncleaned) data: every NaN
# takes the next valid observation of its own series.
# `DataFrame.bfill()` replaces `fillna(method="backfill")`, whose `method`
# argument is deprecated in pandas 2.1+.
DT_train = DT_train.bfill()
# Sanity check (expected output: False). A back-fill cannot fill NaNs at the
# very end of a series because there is no later value to copy from.
DT_train.isna().any().any()

False

3. Function for kaggle submission

In [4]:
# From Dataframe (HORIZON X SERIES) to Kaggle format (s001h3, ...)
def kaggle_forecasts(fcts):
    """Reshape a wide forecast table into the long Kaggle submission format.

    Parameters
    ----------
    fcts : pandas.DataFrame
        One row per forecast step (in chronological order) and one column per
        series. The row labels themselves are ignored; only the row order is
        used, the first row being horizon 1.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Id`` (``"<series>h<horizon>"``, e.g. ``"s001h3"``) and
        ``Forecasts``. Rows are ordered by horizon, then by series column
        order. The input frame is not modified.
    """
    submission = fcts.copy()
    # Name both axes explicitly so the columns produced by reset_index() are
    # predictable. Relying on the auto-generated "level_0"/"level_1" names
    # breaks as soon as the incoming frame carries a name on its columns.
    submission.index = pd.RangeIndex(1, len(submission) + 1, name="horizon")
    # Cast labels to str so non-string series ids can be concatenated below.
    submission.columns = submission.columns.astype(str).rename("series")

    long_format = submission.stack().rename("Forecasts").reset_index()
    long_format["Id"] = long_format["series"] + "h" + long_format["horizon"].astype(str)
    return long_format[["Id", "Forecasts"]]

4. Define the Horizon of forecast

In [5]:
# Number of future daily steps to forecast: 4 weeks of 7 days (28 steps).
HORIZON = 7 * 4

`forecast_for_kaggle` has to be set to True for the Kaggle competition.

Setting it to False holds out the last `HORIZON` days of `DT_train` as a local test set (`DT_test`), so forecasts can be evaluated without submitting to Kaggle.

In [6]:
# First calendar day of the validation window and of the test window.
valid_start_dt = "1997-09-20"

forecast_for_kaggle = True
if not forecast_for_kaggle:
    # Local evaluation: the test window is moved back by HORIZON days and the
    # final HORIZON rows of DT_train are split off into DT_test.
    test_start_dt = (
        dt.datetime.strptime("1998-03-23", "%Y-%m-%d") - dt.timedelta(days=HORIZON)
    ).strftime("%Y-%m-%d")
    DT_test = DT_train.tail(HORIZON).copy()
    DT_train = DT_train.head(-HORIZON).copy()
else:
    test_start_dt = "1998-03-23"

# The forecast dates are the HORIZON days right after the last training day.
last_day_train = DT_train.index[-1]
test_dates = pd.date_range(start=last_day_train + dt.timedelta(days=1), periods=HORIZON)


# Set `use_subset_of_series` to True to keep only the first 2 series, which
# makes every model cell below run much faster.
use_subset_of_series = False
id_series_all = DT_train.columns[:2] if use_subset_of_series else DT_train.columns

## Seasonal naive

In [8]:
# Seasonal naive: the forecast for each future step is the most recent
# observation that falls on the same position within the season.
period = 28
T = len(DT_train)

# Row position (0-based) of the observation reused for each forecast step h.
# The season offset is computed per step (h // period) so that horizons longer
# than one period keep repeating the LAST observed season instead of reaching
# further back into older data. For HORIZON <= period this is simply the last
# HORIZON-aligned slice of the final season.
snaive_positions = [T + h - period * (h // period + 1) for h in range(HORIZON)]

fcts_snaive_list = list()

for id_series in id_series_all:
    series_train = DT_train[id_series]
    # .iloc makes the positional lookup explicit; integer keys passed to
    # Series[...] on a datetime index are deprecated positional access.
    f_snaive = pd.Series(
        series_train.iloc[snaive_positions].to_numpy(),
        index=test_dates,
        name=id_series,
    )
    fcts_snaive_list.append(f_snaive)

fcts_snaive = pd.concat(fcts_snaive_list, axis=1)

kaggle_submission_naive = kaggle_forecasts(fcts_snaive)
kaggle_submission_naive.to_csv("../work/submission_snaive.csv", index=False)

## Auto Arima

In [17]:
from pmdarima.arima import auto_arima

# Solver names considered: 'lbfgs', 'newton', 'bfgs', 'nm', 'cg', 'ncg'
solver = 'newton'

# Settings shared by every per-series search: no differencing (d=0, D=0),
# AR/MA orders and their seasonal counterparts searched from 0 to 2, with a
# seasonal period of 7 observations.
# NOTE(review): `solver` is not a named auto_arima parameter and is forwarded
# as an extra fit argument; confirm it actually selects the optimizer.
arima_search_kwargs = dict(
    d=0,
    start_p=0,
    max_p=2,
    start_q=0,
    max_q=2,
    D=0,
    start_P=0,
    max_P=2,
    start_Q=0,
    max_Q=2,
    m=7,
    trace=False,
    error_action="ignore",
    suppress_warnings=True,
    seasonal=True,
    solver=solver,
)

fcts_arima_list = list()
for id_series in id_series_all:
    print("======", id_series, "======")
    y = DT_train[id_series]
    # Select and fit the best model for this series, then forecast HORIZON steps.
    model = auto_arima(y, **arima_search_kwargs)
    f_arima = model.predict(HORIZON)
    f_arima.name = id_series
    fcts_arima_list.append(f_arima)

# One column per series, one row per forecast date.
fcts_arima = pd.concat(fcts_arima_list, axis=1)

kaggle_submission_arima = kaggle_forecasts(fcts_arima)
kaggle_submission_arima.to_csv("../work/submission_arima.csv", index=False)



## Exponential Smoothing 


In [7]:
import statsmodels.api as sm

fcts_expsmooth_list = list()
for id_series in id_series_all:
    print("======", id_series, "======")
    y = DT_train[id_series]
    # Fit the model: additive seasonality with a period of 7 observations;
    # no trend argument is passed, so no trend component is fitted.
    model = sm.tsa.ExponentialSmoothing(y, seasonal='add', seasonal_periods=7).fit()

    # Make forecasts for the next HORIZON steps
    f_expsmooth = model.forecast(HORIZON)
    f_expsmooth.name = id_series
    fcts_expsmooth_list.append(f_expsmooth)

# One column per series, one row per forecast date.
fcts_expsmooth = pd.concat(fcts_expsmooth_list, axis=1)

# Stored under its own name: the previous `kaggle_submission_arima` was a
# copy-paste leftover that silently overwrote the ARIMA submission frame.
kaggle_submission_expsmooth = kaggle_forecasts(fcts_expsmooth)
kaggle_submission_expsmooth.to_csv("../work/submission_expsmooth.csv", index=False)



## AI method

### LSTM without extracting any features

In [19]:
# Length of the look-back window fed to the LSTM (one window -> next value).
prediction_days = HORIZON
# Normalize data to [0, 1]; the scaler is re-fitted on each series in the loop.
scaler = MinMaxScaler(feature_range=(0, 1))
fcts_lstm_list = list()
for id_series in id_series_all:
    print("======", id_series, "======")
    data = DT_train[id_series]

    scaled_data = scaler.fit_transform(data.values.reshape(-1, 1))

    # Supervised samples: each window of `prediction_days` consecutive values
    # is paired with the value that immediately follows it.
    x_train = np.array([
        scaled_data[end - prediction_days:end, 0]
        for end in range(prediction_days, len(scaled_data))
    ])
    y_train = scaled_data[prediction_days:, 0]
    # Keras LSTM expects (samples, timesteps, features).
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

    # Define the model (input length follows prediction_days, not a literal 28)
    model = Sequential()
    model.add(LSTM(256, input_shape=(prediction_days, 1)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    model.build()

    # Fit the model to the training data
    model.fit(x_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

    # Recursive multi-step forecast. The model only predicts one step ahead,
    # so start from the last observed window, predict the next value, slide
    # the window forward over that prediction, and repeat HORIZON times.
    # (Feeding windows taken from the training data, as before, only
    # reproduces in-sample predictions for the last training days.)
    window = scaled_data[-prediction_days:, 0].copy()
    scaled_forecast = []
    for _ in range(HORIZON):
        next_value = model.predict(window.reshape(1, prediction_days, 1), verbose=0)[0, 0]
        scaled_forecast.append(next_value)
        window = np.append(window[1:], next_value)

    # Back to the original scale of the series.
    predicted_prices = scaler.inverse_transform(np.array(scaled_forecast).reshape(-1, 1))

    # Flat float Series indexed by the forecast dates derived from the
    # training data, so the hold-out mode is labelled correctly as well.
    f_lstm = pd.Series(predicted_prices.ravel(), index=test_dates, name=id_series)
    fcts_lstm_list.append(f_lstm)

# One column per series, one row per forecast date.
fcts_lstm = pd.concat(fcts_lstm_list, axis=1)

# Stored under its own name instead of the copy-pasted `kaggle_submission_arima`.
kaggle_submission_lstm = kaggle_forecasts(fcts_lstm)
kaggle_submission_lstm.to_csv("../work/submission_lstm.csv", index=False)

