In [127]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error

from skforecast.recursive._forecaster_recursive_multiseries import ForecasterRecursiveMultiSeries
import skforecast
from skforecast.model_selection import backtesting_forecaster_multiseries
from skforecast.model_selection import grid_search_forecaster_multiseries

In [121]:
backtesting_forecaster_multiseries



In [89]:
# Data load
# ==============================================================================
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [90]:
train['date'] = pd.to_datetime(train['date'], format='%Y-%m-%d')

In [91]:
train = train.set_index('date')

In [92]:
train.head()

Unnamed: 0_level_0,id,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,0,Canada,Discount Stickers,Holographic Goose,
2010-01-01,1,Canada,Discount Stickers,Kaggle,973.0
2010-01-01,2,Canada,Discount Stickers,Kaggle Tiers,906.0
2010-01-01,3,Canada,Discount Stickers,Kerneler,423.0
2010-01-01,4,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [93]:
grouped = train.groupby(['country','store','product'])
grouped_dataframes = {key: group for key, group in grouped}

In [94]:
grouped_dataframes_selected = dict()
for key, dataset in grouped_dataframes.items():
    if dataset.isnull().any().sum() == 0:
        grouped_dataframes_selected[key] = dataset

In [95]:
final_df = pd.DataFrame({'_'.join(key):df['num_sold'] for key, df in grouped_dataframes_selected.items()})

In [96]:
final_df

Unnamed: 0_level_0,Canada_Discount Stickers_Kaggle,Canada_Discount Stickers_Kaggle Tiers,Canada_Discount Stickers_Kerneler Dark Mode,Canada_Premium Sticker Mart_Kaggle,Canada_Premium Sticker Mart_Kaggle Tiers,Canada_Premium Sticker Mart_Kerneler,Canada_Premium Sticker Mart_Kerneler Dark Mode,Canada_Stickers for Less_Kaggle,Canada_Stickers for Less_Kaggle Tiers,Canada_Stickers for Less_Kerneler,...,Singapore_Premium Sticker Mart_Holographic Goose,Singapore_Premium Sticker Mart_Kaggle,Singapore_Premium Sticker Mart_Kaggle Tiers,Singapore_Premium Sticker Mart_Kerneler,Singapore_Premium Sticker Mart_Kerneler Dark Mode,Singapore_Stickers for Less_Holographic Goose,Singapore_Stickers for Less_Kaggle,Singapore_Stickers for Less_Kaggle Tiers,Singapore_Stickers for Less_Kerneler,Singapore_Stickers for Less_Kerneler Dark Mode
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,973.0,906.0,491.0,2212.0,2013.0,950.0,1213.0,1837.0,1659.0,807.0,...,317.0,2112.0,1992.0,1045.0,1255.0,301.0,1981.0,1565.0,841.0,1080.0
2010-01-02,881.0,854.0,437.0,2183.0,1953.0,895.0,1217.0,1972.0,1541.0,783.0,...,364.0,2322.0,1938.0,1010.0,1224.0,268.0,1763.0,1689.0,781.0,971.0
2010-01-03,1003.0,839.0,495.0,2459.0,1938.0,1015.0,1188.0,1936.0,1770.0,832.0,...,388.0,2253.0,2315.0,1040.0,1314.0,289.0,1801.0,1690.0,813.0,987.0
2010-01-04,744.0,609.0,441.0,1714.0,1567.0,802.0,1046.0,1382.0,1243.0,646.0,...,269.0,1760.0,1656.0,846.0,927.0,256.0,1543.0,1205.0,602.0,839.0
2010-01-05,707.0,640.0,372.0,1593.0,1590.0,773.0,894.0,1416.0,1319.0,612.0,...,264.0,1536.0,1410.0,770.0,962.0,226.0,1364.0,1352.0,629.0,795.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-27,694.0,577.0,372.0,1611.0,1316.0,775.0,935.0,1518.0,1104.0,612.0,...,351.0,2169.0,1906.0,1079.0,1179.0,267.0,1948.0,1483.0,853.0,1016.0
2016-12-28,810.0,601.0,424.0,1812.0,1418.0,802.0,927.0,1702.0,1249.0,733.0,...,371.0,2558.0,1742.0,1050.0,1177.0,316.0,2127.0,1580.0,962.0,1062.0
2016-12-29,856.0,687.0,442.0,2157.0,1604.0,937.0,1110.0,1607.0,1137.0,746.0,...,406.0,2833.0,2238.0,1133.0,1421.0,357.0,2269.0,1788.0,980.0,1178.0
2016-12-30,939.0,704.0,498.0,2218.0,1597.0,971.0,1169.0,1851.0,1465.0,836.0,...,420.0,2822.0,2109.0,1253.0,1518.0,359.0,2357.0,1872.0,1094.0,1357.0


In [100]:
end_train = '2015-06-17'
final_df_train = final_df.loc[:end_train].copy()
final_df_test = final_df.loc[end_train:].copy()

print(
    f"Train dates : {final_df_train.index.min()} --- {final_df_train.index.max()}   "
    f"(n={len(final_df_train)})"
)
print(
    f"Test dates  : {final_df_test.index.min()} --- {final_df_test.index.max()}   "
    f"(n={len(final_df_test)})"
)

Train dates : 2010-01-01 00:00:00 --- 2015-06-17 00:00:00   (n=1994)
Test dates  : 2015-06-17 00:00:00 --- 2016-12-31 00:00:00   (n=564)


In [101]:
final_df_train

Unnamed: 0_level_0,Canada_Discount Stickers_Kaggle,Canada_Discount Stickers_Kaggle Tiers,Canada_Discount Stickers_Kerneler Dark Mode,Canada_Premium Sticker Mart_Kaggle,Canada_Premium Sticker Mart_Kaggle Tiers,Canada_Premium Sticker Mart_Kerneler,Canada_Premium Sticker Mart_Kerneler Dark Mode,Canada_Stickers for Less_Kaggle,Canada_Stickers for Less_Kaggle Tiers,Canada_Stickers for Less_Kerneler,...,Singapore_Premium Sticker Mart_Holographic Goose,Singapore_Premium Sticker Mart_Kaggle,Singapore_Premium Sticker Mart_Kaggle Tiers,Singapore_Premium Sticker Mart_Kerneler,Singapore_Premium Sticker Mart_Kerneler Dark Mode,Singapore_Stickers for Less_Holographic Goose,Singapore_Stickers for Less_Kaggle,Singapore_Stickers for Less_Kaggle Tiers,Singapore_Stickers for Less_Kerneler,Singapore_Stickers for Less_Kerneler Dark Mode
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,973.0,906.0,491.0,2212.0,2013.0,950.0,1213.0,1837.0,1659.0,807.0,...,317.0,2112.0,1992.0,1045.0,1255.0,301.0,1981.0,1565.0,841.0,1080.0
2010-01-02,881.0,854.0,437.0,2183.0,1953.0,895.0,1217.0,1972.0,1541.0,783.0,...,364.0,2322.0,1938.0,1010.0,1224.0,268.0,1763.0,1689.0,781.0,971.0
2010-01-03,1003.0,839.0,495.0,2459.0,1938.0,1015.0,1188.0,1936.0,1770.0,832.0,...,388.0,2253.0,2315.0,1040.0,1314.0,289.0,1801.0,1690.0,813.0,987.0
2010-01-04,744.0,609.0,441.0,1714.0,1567.0,802.0,1046.0,1382.0,1243.0,646.0,...,269.0,1760.0,1656.0,846.0,927.0,256.0,1543.0,1205.0,602.0,839.0
2010-01-05,707.0,640.0,372.0,1593.0,1590.0,773.0,894.0,1416.0,1319.0,612.0,...,264.0,1536.0,1410.0,770.0,962.0,226.0,1364.0,1352.0,629.0,795.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-06-13,799.0,573.0,386.0,1825.0,1410.0,663.0,870.0,1431.0,1173.0,626.0,...,235.0,2051.0,1665.0,886.0,1191.0,220.0,1743.0,1495.0,832.0,997.0
2015-06-14,811.0,629.0,411.0,1707.0,1476.0,714.0,1062.0,1647.0,1234.0,637.0,...,289.0,2502.0,1700.0,1028.0,1276.0,227.0,1885.0,1740.0,827.0,1186.0
2015-06-15,626.0,473.0,358.0,1484.0,1216.0,675.0,861.0,1282.0,1110.0,547.0,...,226.0,1926.0,1361.0,830.0,1093.0,187.0,1536.0,1453.0,687.0,807.0
2015-06-16,637.0,519.0,345.0,1519.0,1299.0,615.0,757.0,1226.0,1038.0,533.0,...,261.0,1856.0,1611.0,844.0,1090.0,179.0,1661.0,1309.0,740.0,774.0


In [103]:
# Create and fit forecaster multi series
# ==============================================================================
forecaster = ForecasterRecursiveMultiSeries(
                 regressor          = Ridge(random_state=123),
                 lags               = 564,
                 transformer_series = None,
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None
             )

forecaster.fit(series=final_df_train)
forecaster



In [123]:
# Predict and predict_interval
# ==============================================================================
steps = 564

# Predictions for item_1
predictions_item_1 = forecaster.predict(steps=steps, levels='Canada_Discount Stickers_Kaggle')
display(predictions_item_1.head(5))

Unnamed: 0,Canada_Discount Stickers_Kaggle
1994,657.781253
1995,693.992923
1996,742.703195
1997,804.70685
1998,665.23072


In [130]:
error_mae = mean_absolute_error(final_df_test['Canada_Discount Stickers_Kaggle'], predictions_item_1)
error_rmse = root_mean_squared_error(
                y_true = final_df_test['Canada_Discount Stickers_Kaggle'],
                y_pred = predictions_item_1
            )
error_mape = mean_absolute_percentage_error(
                y_true = final_df_test['Canada_Discount Stickers_Kaggle'],
                y_pred = predictions_item_1
            )

In [131]:
print('mae: ', error_mae)
print('rmse: ', error_rmse)
print('mape: ', error_mape)

mae:  80.0536624141604
rmse:  91.9966675552214
mape:  0.12661847859062314


In [118]:
# Backtesting Multi Series
# ==============================================================================
metrics_levels, backtest_predictions = backtesting_forecaster_multiseries(
                                           forecaster         = forecaster,
                                           series             = final_df,
                                           levels             = None,
                                           steps              = 564,
                                           metric             = 'mean_absolute_error',
                                           initial_train_size = len(final_df_train),
                                           refit              = True,
                                           fixed_train_size   = True,
                                           verbose            = False
                                       )

print("Backtest metrics")
display(metrics_levels)
print("")
print("Backtest predictions")
backtest_predictions.head(4)

TypeError: `cv` must be a TimeSeriesFold object. Got str.