## Meteo Bakery - Darts 
This notebook serves as an initial draft for getting acquainted with the Darts time series forecasting library.

### import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from darts import TimeSeries
from darts.utils import missing_values
from darts.models import RegressionModel
from darts.metrics import rmse, mape
from sklearn.linear_model import LinearRegression

### load data

In [None]:
# load the combined CSV and parse its 'date' column into pandas datetimes in one chained expression
df_stacked = (
    pd.read_csv('../data/data_combined.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# keep only the rows whose 'year' is before 2020, i.e. data from before Covid19
pre_covid_mask = df_stacked['year'] < 2020
df_stacked = df_stacked.loc[pre_covid_mask]

# preview the first rows of the filtered frame
df_stacked.head()

### generate a multiple time series object
Using darts.TimeSeries class, we generate a multiple time series object from a stacked / grouped time series df, essentially representing a list of all individual time series.

In [None]:
# convert df into a multiple time series object: one TimeSeries per (branch, product) group,
# with 'turnover' as the value column and 'date' as the time index.
# 'D' is the canonical pandas alias for daily frequency; the lowercase spelling 'd' is
# deprecated in recent pandas versions.
series_multi = TimeSeries.from_group_dataframe(
    df=df_stacked,
    group_cols=['branch', 'product'],
    time_col='date',
    value_cols=['turnover'],
    freq='D',
)

In [None]:
# impute missing values separately in every series using darts' imputation helper
# (fill='auto' interpolates the gaps); note that, despite its '_nan' suffix,
# the resulting list holds the *filled* series
series_multi_nan = [
    missing_values.fill_missing_values(ts, fill='auto')
    for ts in series_multi
]

In [None]:
# grouping IDs can be accessed through the static_covariates attribute (a pandas DataFrame)
branch = series_multi_nan[0].static_covariates

# Use purely positional .iloc[row, column] access: chaining `[0]` onto a selected column is a
# label lookup that only works through pandas' deprecated positional fallback when the
# index is not integer-based.
print(branch.iloc[0, 0])  # first group column (presumably 'branch', following group_cols order)
print(branch.iloc[0, 1])  # second group column (presumably 'product')

### access grouping IDs for each time series using static_covariates attribute

In [None]:
# print number of series in multiple series object
print(f"\n{len(series_multi)} series were extracted from the input DataFrame\n")

# iterate over each (NaN-filled) time series, print its static_covariates (i.e. group IDs)
# and plot the individual series, labelled with both group IDs
for ts in series_multi_nan:
    group_ids = ts.static_covariates
    print('\n', group_ids)
    # .iloc[0, k] is fully positional and avoids the deprecated label-based `[0]` fallback
    ts['turnover'].plot(label=f'{group_ids.iloc[0, 0]} | {group_ids.iloc[0, 1]}')

### Train-Validation split

In [None]:
# train-validation split: every series is cut at the same distance from its end
HOLDOUT_DAYS = 364  # number of trailing time points withheld from training (the last year)
HORIZON_DAYS = 7    # length of the validation window directly following the training set

# training part: all time points except for the held-out tail
train = [ts[:-HOLDOUT_DAYS] for ts in series_multi_nan]
# validation part: the first 7 time points of the held-out tail, i.e. slice [-364:-357]
val = [ts[-HOLDOUT_DAYS:-(HOLDOUT_DAYS - HORIZON_DAYS)] for ts in series_multi_nan]

### fit a linear regression model w/o covariates and predict
We first fit a linear regression model to the whole training data, using lags of -7 days and -364 days, consistent with peaks in the partial autocorrelation plot.

In [None]:
# linear regression on two lagged target values (7 and 364 steps back), no covariates
model = RegressionModel(model=LinearRegression(), lags=[-7, -364])

# fit on the whole list of training series
model.fit(series=train)

# forecast the 7 steps that follow the end of each training series
y_pred = model.predict(series=train, n=7)

### fit a linear regression model w/ covariates and predict

In [None]:
# generate a multiple covariates time series object: one multivariate TimeSeries per
# (branch, product) group, holding the calendar, holiday and weather columns.
# 'D' is the canonical pandas alias for daily frequency; the lowercase spelling 'd' is
# deprecated in recent pandas versions.
covariates_multi = TimeSeries.from_group_dataframe(
    df=df_stacked,
    group_cols=['branch', 'product'],
    time_col='date',
    value_cols=[
        'month', 'day_of_week', 'school_holiday', 'public_holiday',
        'temp_mean', 'clouds_mean', 'wind_speed_mean', 'rain_1h_mean', 'snow_1h_mean',
    ],
    freq='D',
)

In [None]:
# impute missing values in every covariate series (fill='auto'); despite its '_nan' suffix,
# the resulting list holds the *filled* covariates
covariates_multi_nan = [
    missing_values.fill_missing_values(cov, fill='auto')
    for cov in covariates_multi
]

In [None]:
# fit a regression model to series in the list including future covariates and predict
model_cov = RegressionModel(
    lags=[-7, -364],
    model=LinearRegression(),
    lags_future_covariates=[0],  # future covariates without lag
)

# Use the NaN-filled covariates: the raw `covariates_multi` may still contain the gaps that
# the imputation step removes, and the backtest below also runs on `covariates_multi_nan`,
# so training and evaluation now see the same covariate values.
model_cov.fit(train, future_covariates=covariates_multi_nan)

# forecast the 7 steps after each training series; the covariates must cover that horizon
y_pred_cov = model_cov.predict(n=7, series=train, future_covariates=covariates_multi_nan)

### plot actual sales vs. predicted sales from models w/ and w/o covariates for each time series

In [None]:
# plot predicted vs. actual values for each time series object (one figure per series)
for pred_wo, pred_cov, actual in zip(y_pred, y_pred_cov, val):
    pred_wo.plot(label='pred')
    pred_cov.plot(label='pred_covariates')
    actual.plot(label='actual')

    # title from the group IDs; .iloc[0, k] is fully positional and avoids the deprecated
    # label-based `[0]` fallback on a non-integer index
    group_ids = actual.static_covariates
    plt.title(f'{group_ids.iloc[0, 0]} | {group_ids.iloc[0, 1]}')
    plt.show()

### use historical forecasts to compare models w/ and w/o covariates
In order to assess changes in prediction accuracy following inclusion of covariates, we will design a utility function that iterates over a multiple time series object and for each time series performs a historical forecast for a model w/o covariates and w/ covariates. Here, we will specifically implement future covariates, such as months, day_of_week, holidays and weather forecasts.

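The forecasting horizon will be set to 7 days by default. Similarly, the stride will be set to 7 days by default, so the starting point of each successive historical forecast moves forward by 7 days. Note that the models are not retrained between forecasts (`retrain=False`); the already fitted models are simply applied at each forecast origin.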

For comparison, we will compute both RMSE and MAPE from those forecasts.

In [None]:
# define utility function for comparing historical forecasts on multiple time series between models w/ and w/o covariates
def validate_historically(model, model_cov, multiple_series, future_covariates=None, forecast_horizon=7, stride=7):
    """Backtest a model w/o covariates against a model w/ future covariates on every series.

    For each series, both models produce historical forecasts starting at 50% of the
    series (``start=0.5``). The actual series and both forecasts are plotted in one
    figure, and RMSE and MAPE of both forecasts are printed.

    Parameters
    ----------
    model
        Forecasting model used without covariates. It is called with ``retrain=False``,
        so it is expected to be fitted already.
    model_cov
        Forecasting model used with future covariates. It is also called with
        ``retrain=False``, so it is expected to be fitted already.
    multiple_series
        Iterable of target series to backtest on.
    future_covariates
        Sequence of future covariate series; entry ``i`` is used for the ``i``-th
        series in ``multiple_series``. Required, although it defaults to ``None``.
    forecast_horizon
        Forecast horizon passed to ``historical_forecasts`` (default 7).
    stride
        Number of time steps between two consecutive forecasts (default 7).

    Raises
    ------
    ValueError
        If ``future_covariates`` is ``None``.
    """
    # fail early with a clear message instead of an opaque TypeError on `future_covariates[i]`
    if future_covariates is None:
        raise ValueError(
            'future_covariates is required: provide one covariate series per entry of multiple_series'
        )

    # settings shared by both backtests: start at 50% of the series, no retraining
    shared_kwargs = dict(
        past_covariates=None,
        start=0.5,
        stride=stride,
        retrain=False,
        verbose=True,
        forecast_horizon=forecast_horizon,
    )

    # iterate over every series in multiple time series object
    for i, series in enumerate(multiple_series):
        # perform historical forecasts on model w/o covariates and save result
        backtest_wo = model.historical_forecasts(series=series,
                                                 future_covariates=None,
                                                 **shared_kwargs)
        # perform historical forecasts on model w/ covariates and save result
        backtest_cov = model_cov.historical_forecasts(series=series,
                                                      future_covariates=future_covariates[i],
                                                      **shared_kwargs)

        # plot actual series values
        series.plot()
        # plot predicted values from historical model forecasts w/o covariates on top;
        # the label reflects the forecast horizon actually used
        backtest_wo.plot(label=f'fh={forecast_horizon} - w/o covariates')
        # plot predicted values from historical model forecasts w/ covariates on top
        backtest_cov.plot(label=f'fh={forecast_horizon} - w/ covariates')
        # give title based on respective time series group IDs; .iloc[0, k] is fully
        # positional and avoids the deprecated label-based `[0]` fallback
        group_ids = series.static_covariates
        plt.title(f'{group_ids.iloc[0, 0]} | {group_ids.iloc[0, 1]}')
        plt.show()

        # calculate RMSE and MAPE for predictions w/ and w/o covariates
        print(f'Backtest w/o covariates: RMSE = {rmse(series, backtest_wo).round(2)}, MAPE = {mape(series, backtest_wo).round(2)}')
        print(f'Backtest w/ covariates: RMSE = {rmse(series, backtest_cov).round(2)}, MAPE = {mape(series, backtest_cov).round(2)}')


In [None]:
# backtest both fitted models on the training series, using the NaN-filled covariates
validate_historically(
    model=model,
    model_cov=model_cov,
    multiple_series=train,
    future_covariates=covariates_multi_nan,
)