# Introduction #

Run this cell to set everything up!

In [None]:
# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex6 import *

# Setup notebook
import warnings
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from statsmodels.tsa.tsatools import lagmat
from xgboost import XGBRegressor

# Silence library warnings so the notebook output stays readable.
warnings.simplefilter("ignore")

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer ship the old names. Prefer the renamed style when
# it is available and fall back to the legacy name on older installations.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
# Figure-level defaults: wide, short figures with bold titles.
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 4),
    titlesize=18,
    titleweight='bold',
)
# Axes-level defaults: bold labels and titles with a little title padding.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Shared keyword arguments for the pandas `.plot(...)` calls that draw the
# observed series: grey line with dark point markers and no legend.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'


def plot_multistep(y, every=1, ax=None, palette_kwargs=None):
    """Draw each row of ``y`` as its own forecast line.

    Every ``every``-th row of ``y`` is plotted as a separate line. The row's
    index label is used as the start of a period range with one entry per
    column, and the row's values are plotted against that range.

    Args:
        y: DataFrame whose rows are forecasts; each row label must be usable
            as the ``start`` of ``pd.period_range``.
        every: Step between the rows that get plotted.
        ax: Axes to draw on; a new figure and axes are created when omitted.
        palette_kwargs: Optional overrides for the default seaborn palette
            settings (``palette='husl'``, ``n_colors=16``, ``desat=None``).

    Returns:
        The axes that were drawn on.
    """
    palette_options = {'palette': 'husl', 'n_colors': 16, 'desat': None}
    if palette_kwargs is None:
        palette_kwargs = {}
    palette_options.update(palette_kwargs)
    colors = sns.color_palette(**palette_options)

    if ax is None:
        _, ax = plt.subplots()
    # Successive forecast lines cycle through the palette colours.
    ax.set_prop_cycle(plt.cycler('color', colors))

    for origin, forecast in y[::every].iterrows():
        # Re-label the forecast steps with consecutive periods from the row label.
        horizon = pd.period_range(start=origin, periods=len(forecast))
        forecast.index = horizon
        forecast.plot(ax=ax)
    return ax


# Directory holding the competition files, relative to the notebook.
comp_dir = Path('../input/store-sales-time-series-forecasting')

# Read the training table, convert the dates to daily periods, then index the
# rows by (store_nbr, family, date) and sort that index.
store_sales = (
    pd.read_csv(
        comp_dir / 'train.csv',
        usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
        dtype=dict(
            store_nbr='category',
            family='category',
            sales='float32',
            onpromotion='float32',
        ),
        parse_dates=['date'],
        infer_datetime_format=True,
    )
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)

# Per-date mean over every store and family, restricted to 2017.
mean_sales = (
    store_sales
    .groupby('date')
    .mean()
    .squeeze()
    .loc['2017']
)

-------------------------------------------------------------------------------

# 1) Define a forecasting task

Here are some forecasting situations:
- competing in a Kaggle forecasting competition
- a manager forecasting call volume at a customer service center in order to schedule operators
- a supervisor of a medical lab forecasting sample volume in order to schedule time on a shared computer cluster
- an economist forecasting retail sales of household goods
- a manager of an ML team forecasting market prices of real estate, where data availability is delayed

For each situation, identify the parts of the forecasting task (for example, the forecast origin, the forecast horizon, and the lead time).


In [None]:
# NOTE(review): these look like placeholder values for the exercise — confirm.
a, b, c = 1, 2, 3

-------------------------------------------------------------------------------

Run the next cell to see three datasets, each belonging to one of the tasks above.

# 2) Match task to data

Which dataset belongs with which task?

In [None]:
# NOTE(review): these look like placeholder values for the exercise — confirm.
a, b, c = 1, 2, 3

# 3) Prepare *Store Sales* for multistep forecasting

The test set comprises the 16 days between `2017-08-16` and `2017-08-31`, while the last day of the training set is `2017-08-15`. Prepare *Store Sales* for a multistep forecast using 4 days of lags.

In [None]:
def make_lag_features(y, lags):
    """Build lagged copies of ``y`` to use as model features.

    Column ``y_lag_k`` holds, at each index label, the value ``y`` had ``k``
    periods earlier, for ``k = 1 .. lags``. Only index labels for which every
    lag is available are kept (inner join).

    Args:
        y: Series or DataFrame whose index carries a frequency, or one that
            pandas can infer.
        lags: Number of lag columns (or column groups) to create.

    Returns:
        DataFrame with one top-level column key ``y_lag_k`` per lag.

    Raises:
        ValueError: If ``lags`` is less than 1 (nothing to concatenate) or the
            index frequency cannot be inferred.
    """
    # Start at 1, not 0: a shift of 0 is the current observation itself, which
    # is also the first forecast target, so using it as a feature would leak
    # the target and mislabel every lag by one step.
    shifts = range(1, lags + 1)
    return pd.concat(
        [y.shift(i, freq='infer') for i in shifts],
        axis=1,
        join='inner',
        keys=[f'y_lag_{i}' for i in shifts],
    )


def make_multistep_target(y, steps):
    """Build a multi-step forecasting target from ``y``.

    Column ``y_k`` holds, at each index label, the value of ``y`` that lies
    ``k - 1`` periods ahead, for ``k = 1 .. steps``. Only index labels for
    which every step is available are kept (inner join).

    Args:
        y: Series or DataFrame whose index carries a frequency, or one that
            pandas can infer.
        steps: Number of forecast steps.

    Returns:
        DataFrame with one top-level column key ``y_k`` per step.
    """
    # A negative shift moves the index back, so each label lines up with the
    # value observed `step` periods later.
    horizons = {
        f'y_{step + 1}': y.shift(-step, freq='infer')
        for step in range(steps)
    }
    return pd.concat(horizons, axis=1, join='inner')


# NOTE(review): `family_sales` is not defined in the visible cells of this
# notebook — presumably it is supplied by the setup imports; confirm.
y = family_sales.copy()
# Use 4 days of lags, as the task statement for this step asks (was 30).
X = make_lag_features(y, lags=4)
# One target per day of the 16-day forecast horizon.
y = make_multistep_target(y, steps=16)
# Keep only the index labels that have both features and targets.
y, X = y.align(X, join='inner', axis=0)

# 4) Compare forecasting strategies
### MultiOutput Strategy


In [None]:
# Hold out only the final row for validation; shuffle=False keeps the rows in
# their existing order.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=1, shuffle=False)

# A single linear regression fitted against all target columns at once.
model = LinearRegression()
model.fit(X_train, y_train)

# Wrap the raw prediction arrays in frames labelled like the targets.
y_fit = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=y.columns)
y_pred = pd.DataFrame(model.predict(X_valid), index=X_valid.index, columns=y.columns)
# Replace negative predictions with 0 — presumably so the log-based metric
# below is defined.
y_fit = y_fit.clip(0.0)
y_pred = y_pred.clip(0.0)

# Evaluate: root of the mean squared log error on each split.
rmsle_train = mean_squared_log_error(y_train, y_fit) ** 0.5
rmsle_valid = mean_squared_log_error(y_valid, y_pred) ** 0.5
print(f'Training RMSLE: {rmsle_train:.5f}')
print(f'Validation RMSLE: {rmsle_valid:.5f}')

# Plot one store/family series: the top panel shows all but the last 16 rows
# of 2017 with the fitted forecasts, the bottom panel the last 16 rows with
# the validation forecast.
STORE_NBR, FAMILY = '1', 'BEVERAGES'
y_ = store_sales.loc[:, 'sales'].unstack(['store_nbr', 'family']).loc['2017']
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 6))
ax1 = y_.iloc[:-16].loc(axis=1)[STORE_NBR, FAMILY].plot(**plot_params, ax=ax1)
# The leading `:` keeps every entry of the first column level (assumed to be
# the forecast-step level — confirm against how `y` is built).
ax1 = plot_multistep(y_fit.loc(axis=1)[:, STORE_NBR, FAMILY], ax=ax1)
# Legend labels are matched to the lines in the order they were drawn.
_ = ax1.legend(['Store Sales (train)', 'Forecast'])
ax2 = y_.iloc[-16:].loc(axis=1)[STORE_NBR, FAMILY].plot(**plot_params, ax=ax2)
ax2 = plot_multistep(y_pred.loc(axis=1)[:, STORE_NBR, FAMILY], ax=ax2)
_ = ax2.legend(['Store Sales (valid)', 'Forecast'])


### DirRec Strategy


In [None]:
# Move the store_nbr and family column levels into the row index (wide to long).
y_long = y.stack(['store_nbr', 'family'])
X_long = X.stack(['store_nbr', 'family'])

# Turn row labels into categorical feature columns with a label encoding
X_long = X_long.reset_index(['store_nbr', 'family'])
for colname in X_long.select_dtypes(["object", "category"]):
    # factorize() returns (integer codes, uniques); only the codes are kept.
    X_long[colname], _ = X_long[colname].factorize()

# Split on the index labels of the wide target so that the final label is the
# only one held out, then select the matching long-format rows.
idx_train, idx_valid = train_test_split(y.index, test_size=1, shuffle=False)
X_train, X_valid = X_long.loc[idx_train], X_long.loc[idx_valid]
y_train, y_valid = y_long.loc[idx_train], y_long.loc[idx_valid]

# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled XGBoost
# runtime — confirm the notebook environment provides one.
model = RegressorChain(XGBRegressor(n_estimators=1000, tree_method='gpu_hist'))
model.fit(X_train, y_train)
# Wrap the raw prediction arrays in frames labelled like the long targets.
y_fit = pd.DataFrame(model.predict(X_train), index=y_train.index, columns=y_long.columns)
y_pred = pd.DataFrame(model.predict(X_valid), index=y_valid.index, columns=y_long.columns)
# Replace negative predictions with 0 — presumably so the log-based metric
# below is defined.
y_fit = y_fit.clip(0.0)
y_pred = y_pred.clip(0.0)

# Evaluate: root of the mean squared log error on each split.
rmsle_train = mean_squared_log_error(y_train, y_fit) ** 0.5
rmsle_valid = mean_squared_log_error(y_valid, y_pred) ** 0.5
print(f'Training RMSLE: {rmsle_train:.5f}')
print(f'Validation RMSLE: {rmsle_valid:.5f}')

# Plot one store/family series: the top panel shows all but the last 16 rows
# of 2017 with the fitted forecasts, the bottom panel the last 16 rows with
# the validation forecast.
STORE_NBR = '2'
FAMILY = 'PRODUCE'
key, level = (STORE_NBR, FAMILY), ['store_nbr', 'family']
# Cross-section of the long frames for the chosen store and family.
y_fit_ = y_fit.xs(key, level=level, axis=0)
y_pred_ = y_pred.xs(key, level=level, axis=0)
y_ = store_sales.loc[(STORE_NBR, FAMILY), 'sales']['2017']
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 6))
ax1 = y_.iloc[:-16].plot(**plot_params, ax=ax1)
ax1 = plot_multistep(y_fit_, ax=ax1)
# Legend labels are matched to the lines in the order they were drawn.
_ = ax1.legend(['Store Sales (train)', 'Forecast'])
ax2 = y_.iloc[-16:].plot(**plot_params, ax=ax2)
ax2 = plot_multistep(y_pred_, ax=ax2)
_ = ax2.legend(['Store Sales (valid)', 'Forecast'])


# (Optional) 
# Next Steps #

# References #

Here are some great resources you might like to consult for more on time series and forecasting. They all played a part in shaping this course:

- *Learnings from Kaggle's forecasting competitions*, an article by Casper Solheim Bojer and Jens Peder Meldgaard.
- *Forecasting: Principles and Practice*, a book by Rob J Hyndman and George Athanasopoulos.
- *Practical Time Series Forecasting with R*, a book by Galit Shmueli and Kenneth C. Lichtendahl Jr.
- *Time Series Analysis and Its Applications*, a book by Robert H. Shumway and David S. Stoffer.
- *Machine learning strategies for time series forecasting*, an article by Gianluca Bontempi, Souhaib Ben Taieb, and Yann-Aël Le Borgne.
- *On the use of cross-validation for time series predictor evaluation*, an article by Christoph Bergmeir and José M. Benítez.
