In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [2]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries

# Data download
# ==============================================================================
url = (
       'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/'
       'data/simulated_items_sales.csv'
)
raw = pd.read_csv(url, sep=',')

# Data preparation
# ==============================================================================
# Parse the date column, use it as a daily-frequency index and keep the frame
# sorted chronologically. `exog` is a renamed copy of the same columns.
raw['date'] = pd.to_datetime(raw['date'], format='%Y-%m-%d')
data = raw.set_index('date').asfreq('D').sort_index()
exog = data.copy()
exog.columns = [f'exog_{pos}' for pos, _ in enumerate(exog.columns)]

# Split data into train-val-test
# ==============================================================================
# Both slices use the same cut-off label; it is not a midnight timestamp, so a
# daily index cannot contain it and the two partitions do not overlap.
end_train = '2014-07-15 23:59:00'
data_train, exog_train = (
    frame.loc[:end_train, :].copy() for frame in (data, exog)
)
data_test, exog_test = (
    frame.loc[end_train:, :].copy() for frame in (data, exog)
)

train_start, train_end = data_train.index.min(), data_train.index.max()
test_start, test_end = data_test.index.min(), data_test.index.max()
print(
    f"Train dates : {train_start} --- {train_end}   "
    f"(n={len(data_train)})"
)
print(
    f"Test dates  : {test_start} --- {test_end}   "
    f"(n={len(data_test)})"
)

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00   (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00   (n=170)


In [3]:
# Preview the first two rows of the training partition.
data_train.head(2)

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863


In [4]:
# Single series used in the mask experiments below: the 'item_1' column of
# the training partition.
y = data_train['item_1']
y

date
2012-01-01     8.253175
2012-01-02    22.777826
2012-01-03    27.549099
2012-01-04    25.895533
2012-01-05    21.379238
                ...    
2014-07-11    25.662128
2014-07-12    23.773923
2014-07-13    22.609388
2014-07-14    23.307307
2014-07-15    25.980745
Freq: D, Name: item_1, Length: 927, dtype: float64

In [10]:
# Boolean NumPy array flagging the non-missing observations of `y`
# (the pandas result of `notna()` is converted with `.to_numpy()`).
mask = y.notna().to_numpy()
mask

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [11]:
# Select the observations of `y` flagged by the boolean mask.
y.loc[mask]

date
2012-01-01     8.253175
2012-01-02    22.777826
2012-01-03    27.549099
2012-01-04    25.895533
2012-01-05    21.379238
                ...    
2014-07-11    25.662128
2014-07-12    23.773923
2014-07-13    22.609388
2014-07-14    23.307307
2014-07-15    25.980745
Freq: D, Name: item_1, Length: 927, dtype: float64

In [9]:
# Keep the rows of data_train flagged by the boolean mask. `mask` is already a
# NumPy array when the notebook is run top to bottom (it is built with
# `.to_numpy()`), so calling `.to_numpy()` on it again raises AttributeError.
# np.asarray accepts both an ndarray and a pandas Series.
data_train.iloc[np.asarray(mask)]

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737
...,...,...,...
2014-07-11,25.662128,11.002083,10.396751
2014-07-12,23.773923,11.008333,16.139173
2014-07-13,22.609388,8.100000,13.028927
2014-07-14,23.307307,10.895833,9.315334


In [37]:
# Series passed as a dict of pandas Series. 'item_2' is restricted to the
# first 10 of its last 20 training observations, so the three series do not
# share the same length.
item_2_window = data_train['item_2'].tail(20).head(10)
series = dict(
    item_1=data_train['item_1'],
    item_2=item_2_window,
    item_3=data_train['item_3'],
)
series

{'item_1': date
 2012-01-01     8.253175
 2012-01-02    22.777826
 2012-01-03    27.549099
 2012-01-04    25.895533
 2012-01-05    21.379238
                 ...    
 2014-07-11    25.662128
 2014-07-12    23.773923
 2014-07-13    22.609388
 2014-07-14    23.307307
 2014-07-15    25.980745
 Freq: D, Name: item_1, Length: 927, dtype: float64,
 'item_2': date
 2014-06-26    13.377083
 2014-06-27    13.177083
 2014-06-28    12.260417
 2014-06-29    10.689583
 2014-06-30     8.402083
 2014-07-01    10.772917
 2014-07-02    12.397917
 2014-07-03    12.133333
 2014-07-04     9.637500
 2014-07-05    11.062500
 Freq: D, Name: item_2, dtype: float64,
 'item_3': date
 2012-01-01    19.429739
 2012-01-02    28.009863
 2012-01-03    32.078922
 2012-01-04    27.252276
 2012-01-05    20.357737
                 ...    
 2014-07-11    10.396751
 2014-07-12    16.139173
 2014-07-13    13.028927
 2014-07-14     9.315334
 2014-07-15     9.908915
 Freq: D, Name: item_3, Length: 927, dtype: float64}

In [40]:
levels = ['item_1', 'item_3']

# Keep only the requested levels whose last observation is not NaN.
b = {
    name: series[name]
    for name in series
    if not (np.isnan(series[name].iat[-1]) or name not in levels)
}

In [41]:
# Display the filtered dictionary of series.
b

{'item_1': date
 2012-01-01     8.253175
 2012-01-02    22.777826
 2012-01-03    27.549099
 2012-01-04    25.895533
 2012-01-05    21.379238
                 ...    
 2014-07-11    25.662128
 2014-07-12    23.773923
 2014-07-13    22.609388
 2014-07-14    23.307307
 2014-07-15    25.980745
 Freq: D, Name: item_1, Length: 927, dtype: float64,
 'item_3': date
 2012-01-01    19.429739
 2012-01-02    28.009863
 2012-01-03    32.078922
 2012-01-04    27.252276
 2012-01-05    20.357737
                 ...    
 2014-07-11    10.396751
 2014-07-12    16.139173
 2014-07-13    13.028927
 2014-07-14     9.315334
 2014-07-15     9.908915
 Freq: D, Name: item_3, Length: 927, dtype: float64}

In [36]:
# Combine the dict of series into one DataFrame and keep its last 15 rows.
a = pd.DataFrame(series).tail(15)
a

Unnamed: 0_level_0,item_1,item_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-07-01,25.643211,10.772917
2014-07-02,25.39251,12.397917
2014-07-03,24.527493,12.133333
2014-07-04,24.8644,9.6375
2014-07-05,23.672876,11.0625
2014-07-06,21.449169,
2014-07-07,22.473733,
2014-07-08,24.323068,
2014-07-09,24.367445,
2014-07-10,26.067678,


In [33]:
# Identity (`is`) comparison between the first cell of `a` and the first value
# of series['item_1']; the recorded output of this cell is False.
# NOTE(review): `is` tests object identity, not value equality, and the two
# operands are read at position 0 of different objects (`a` is a 15-row tail).
# Confirm whether a value or memory-sharing check was intended.
a.iloc[0,0] is series['item_1'].iloc[0]

False

In [25]:
forecaster = ForecasterAutoregMultiSeries(
                 regressor          = LGBMRegressor(random_state=123, verbose=-1),
                 lags               = 24,
                 encoding           = 'ordinal_category',
                 transformer_series = None,
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None,
                 forecaster_id      = None,
                 #fit_kwargs={'categorical_feature':'auto'}
             )

# A series shorter than the maximum lag is rejected with a ValueError. The
# error is the demonstration here, so it is caught and printed instead of
# aborting a Restart & Run All.
short_series = {
    'item_1': data_train['item_1'],
    'item_2': data_train['item_2'].head(6),
}
try:
    forecaster.create_train_X_y(series=short_series)
except ValueError as exc:
    print(f"ValueError: {exc}")

# The cells below inspect `results`, so build it from the complete training
# frame (all series long enough for 24 lags) rather than leaving it undefined.
results = forecaster.create_train_X_y(series=data_train)

ValueError: The maximum lag (24) must be less than the length of the series 'item_2', (6).

In [13]:
# First element of the tuple returned by create_train_X_y (unpacked as
# X_train elsewhere in this notebook).
results[0]

Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,...,lag_16,lag_17,lag_18,lag_19,lag_20,lag_21,lag_22,lag_23,lag_24,_level_skforecast
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-25,28.018830,23.981037,20.794986,22.503533,24.018768,24.772249,29.245869,26.636444,20.228468,18.976196,...,20.006161,20.069327,20.533871,21.106643,21.379238,25.895533,27.549099,22.777826,8.253175,0
2012-01-26,28.747482,28.018830,23.981037,20.794986,22.503533,24.018768,24.772249,29.245869,26.636444,20.228468,...,21.620184,20.006161,20.069327,20.533871,21.106643,21.379238,25.895533,27.549099,22.777826,0
2012-01-27,23.908368,28.747482,28.018830,23.981037,20.794986,22.503533,24.018768,24.772249,29.245869,26.636444,...,21.717691,21.620184,20.006161,20.069327,20.533871,21.106643,21.379238,25.895533,27.549099,0
2012-01-28,21.423930,23.908368,28.747482,28.018830,23.981037,20.794986,22.503533,24.018768,24.772249,29.245869,...,21.751748,21.717691,21.620184,20.006161,20.069327,20.533871,21.106643,21.379238,25.895533,0
2012-01-29,24.786455,21.423930,23.908368,28.747482,28.018830,23.981037,20.794986,22.503533,24.018768,24.772249,...,21.758617,21.751748,21.717691,21.620184,20.006161,20.069327,20.533871,21.106643,21.379238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-07-11,9.677730,12.199832,14.072343,15.243359,12.280077,14.214074,4.360624,6.939518,17.807199,11.165200,...,16.153380,13.442053,12.734172,11.130242,12.040121,9.422098,13.621199,15.499619,9.453137,2
2014-07-12,10.396751,9.677730,12.199832,14.072343,15.243359,12.280077,14.214074,4.360624,6.939518,17.807199,...,15.183975,16.153380,13.442053,12.734172,11.130242,12.040121,9.422098,13.621199,15.499619,2
2014-07-13,16.139173,10.396751,9.677730,12.199832,14.072343,15.243359,12.280077,14.214074,4.360624,6.939518,...,15.332012,15.183975,16.153380,13.442053,12.734172,11.130242,12.040121,9.422098,13.621199,2
2014-07-14,13.028927,16.139173,10.396751,9.677730,12.199832,14.072343,15.243359,12.280077,14.214074,4.360624,...,15.207243,15.332012,15.183975,16.153380,13.442053,12.734172,11.130242,12.040121,9.422098,2


In [21]:
# True if any cell of the training matrix is missing.
results[0].isnull().any().any()

False

In [23]:
# Row-wise flag: True where every column of the training matrix is non-missing.
results[0].notna().all(axis=1)

date
2012-01-25    True
2012-01-26    True
2012-01-27    True
2012-01-28    True
2012-01-29    True
              ... 
2014-07-11    True
2014-07-12    True
2014-07-13    True
2014-07-14    True
2014-07-15    True
Length: 2709, dtype: bool

In [14]:
# Non-missing flags for the second element of the create_train_X_y result
# (unpacked as y_train elsewhere in this notebook). Unlike the earlier `mask`,
# this one is kept as a pandas object rather than converted to NumPy.
mask = results[1].notna()
mask

date
2012-01-25    True
2012-01-26    True
2012-01-27    True
2012-01-28    True
2012-01-29    True
              ... 
2014-07-11    True
2014-07-12    True
2014-07-13    True
2014-07-14    True
2014-07-15    True
Name: y, Length: 2709, dtype: bool

In [18]:
# Positional boolean selection: the mask is converted to a NumPy array before
# being passed to `.iloc`.
results[1].iloc[mask.to_numpy()]

date
2012-01-25    28.747482
2012-01-26    23.908368
2012-01-27    21.423930
2012-01-28    24.786455
2012-01-29    24.615778
                ...    
2014-07-11    10.396751
2014-07-12    16.139173
2014-07-13    13.028927
2014-07-14     9.315334
2014-07-15     9.908915
Name: y, Length: 2709, dtype: float64

In [5]:
# Multi-series forecaster: LightGBM, 'ordinal_category' encoding, with exog
# ==============================================================================
forecaster = ForecasterAutoregMultiSeries(
    regressor=LGBMRegressor(random_state=123, verbose=-1),
    lags=24,
    encoding='ordinal_category',
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=None,
    forecaster_id=None,
    # fit_kwargs={'categorical_feature': 'auto'}
)
forecaster.fit(series=data_train, exog=exog_train)

# Mapping from series name to its encoded value
print(forecaster.encoding_mapping)

# Show categorical features if present: translate the booster's
# 'categorical_column' positions into feature names.
booster = forecaster.regressor.booster_
cat_index = booster.params.get('categorical_column')
if cat_index is not None:
    features = booster.feature_name()
    print([features[pos] for pos in cat_index])

# Rebuild the training matrices to inspect the dtype of every feature
train_outputs = forecaster.create_train_X_y(data_train, exog_train)
(X_train, y_train, series_indexes,
 series_col_names, exog_col_names, exog_dtypes) = train_outputs
print(X_train.dtypes)

forecaster.predict(steps=10, exog=exog_test.head(10))

{'item_1': 0, 'item_2': 1, 'item_3': 2}
['_level_skforecast']
lag_1                 float64
lag_2                 float64
lag_3                 float64
lag_4                 float64
lag_5                 float64
lag_6                 float64
lag_7                 float64
lag_8                 float64
lag_9                 float64
lag_10                float64
lag_11                float64
lag_12                float64
lag_13                float64
lag_14                float64
lag_15                float64
lag_16                float64
lag_17                float64
lag_18                float64
lag_19                float64
lag_20                float64
lag_21                float64
lag_22                float64
lag_23                float64
lag_24                float64
_level_skforecast    category
exog_0                float64
exog_1                float64
exog_2                float64
dtype: object


Unnamed: 0,item_1,item_2,item_3
2014-07-16,26.423579,10.961637,9.464633
2014-07-17,25.265638,11.744702,15.212656
2014-07-18,26.357317,9.460588,14.011157
2014-07-19,24.172871,10.151893,9.039376
2014-07-20,21.998063,10.781902,11.771893
2014-07-21,22.788777,8.765123,10.84002
2014-07-22,26.474959,8.563326,8.126697
2014-07-23,26.769765,8.184993,9.81339
2014-07-24,26.383186,9.361715,9.286824
2014-07-25,26.320084,12.638158,10.58183


In [16]:
# Multi-series forecaster: LightGBM, 'ordinal' encoding, no exog
# ==============================================================================
forecaster = ForecasterAutoregMultiSeries(
    regressor=LGBMRegressor(random_state=123, verbose=-1),
    lags=24,
    encoding='ordinal',
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=None,
    forecaster_id=None,
    # fit_kwargs={'categorical_feature': 'auto'}
)
forecaster.fit(series=data_train)

# Mapping from series name to its encoded value
print(forecaster.encoding_mapping)

# Show categorical features if present: translate the booster's
# 'categorical_column' positions into feature names.
booster = forecaster.regressor.booster_
cat_index = booster.params.get('categorical_column')
if cat_index is not None:
    features = booster.feature_name()
    print([features[pos] for pos in cat_index])

# Rebuild the training matrices to inspect the dtype of every feature
train_outputs = forecaster.create_train_X_y(data_train)
(X_train, y_train, series_indexes,
 series_col_names, exog_col_names, exog_dtypes) = train_outputs
print(X_train.dtypes)

forecaster.predict(steps=10)

{'item_1': 0, 'item_2': 1, 'item_3': 2}
lag_1                float64
lag_2                float64
lag_3                float64
lag_4                float64
lag_5                float64
lag_6                float64
lag_7                float64
lag_8                float64
lag_9                float64
lag_10               float64
lag_11               float64
lag_12               float64
lag_13               float64
lag_14               float64
lag_15               float64
lag_16               float64
lag_17               float64
lag_18               float64
lag_19               float64
lag_20               float64
lag_21               float64
lag_22               float64
lag_23               float64
lag_24               float64
_level_skforecast      int64
dtype: object


Unnamed: 0,item_1,item_2,item_3
2014-07-16,25.906323,10.522491,12.034587
2014-07-17,25.807194,10.623789,10.503966
2014-07-18,25.127355,11.299802,12.206434
2014-07-19,23.902609,11.441606,12.61874
2014-07-20,21.660527,11.658107,12.148873
2014-07-21,22.756076,11.377895,10.66693
2014-07-22,25.064381,10.869464,10.115581
2014-07-23,25.070926,12.064482,11.608842
2014-07-24,25.149565,10.882162,11.782225
2014-07-25,25.121728,10.755604,10.307792


In [6]:
# Multi-series forecaster: LightGBM, 'onehot' encoding, no exog
# ==============================================================================
forecaster = ForecasterAutoregMultiSeries(
    regressor=LGBMRegressor(random_state=123, verbose=-1),
    lags=24,
    encoding='onehot',
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=None,
    forecaster_id=None,
    # fit_kwargs={'categorical_feature': 'auto'}
)
forecaster.fit(series=data_train)

# Mapping from series name to its encoded value
print(forecaster.encoding_mapping)

# Show categorical features if present: translate the booster's
# 'categorical_column' positions into feature names.
booster = forecaster.regressor.booster_
cat_index = booster.params.get('categorical_column')
if cat_index is not None:
    features = booster.feature_name()
    print([features[pos] for pos in cat_index])

# Rebuild the training matrices to inspect the dtype of every feature
train_outputs = forecaster.create_train_X_y(data_train)
(X_train, y_train, series_indexes,
 series_col_names, exog_col_names, exog_dtypes) = train_outputs
print(X_train.dtypes)

forecaster.predict(steps=10)

{'item_1': 0, 'item_2': 1, 'item_3': 2}
lag_1     float64
lag_2     float64
lag_3     float64
lag_4     float64
lag_5     float64
lag_6     float64
lag_7     float64
lag_8     float64
lag_9     float64
lag_10    float64
lag_11    float64
lag_12    float64
lag_13    float64
lag_14    float64
lag_15    float64
lag_16    float64
lag_17    float64
lag_18    float64
lag_19    float64
lag_20    float64
lag_21    float64
lag_22    float64
lag_23    float64
lag_24    float64
item_1      int32
item_2      int32
item_3      int32
dtype: object


Unnamed: 0,item_1,item_2,item_3
2014-07-16,25.860322,10.589852,11.821867
2014-07-17,25.710671,11.249642,10.875323
2014-07-18,25.25406,11.214945,12.383875
2014-07-19,24.135732,11.39277,11.752806
2014-07-20,21.717562,11.092514,11.108611
2014-07-21,22.869126,11.367802,10.028264
2014-07-22,25.297145,10.610384,9.839174
2014-07-23,25.480385,12.443352,12.550434
2014-07-24,25.52458,11.56397,11.957162
2014-07-25,25.376043,11.425697,10.186801


In [8]:
# Multi-series forecaster: HistGradientBoosting, 'ordinal_category' encoding
# ==============================================================================
hgb_regressor = HistGradientBoostingRegressor(
    random_state=123,
    categorical_features=['_level_skforecast'],
)
forecaster = ForecasterAutoregMultiSeries(
    regressor=hgb_regressor,
    lags=24,
    encoding='ordinal_category',
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=None,
    forecaster_id=None,
)
forecaster.fit(series=data_train)

# Mapping from series name to its encoded value
print(forecaster.encoding_mapping)

# Show categorical features if present: `is_categorical_` is used to index
# the fitted regressor's input feature names.
fitted_regressor = forecaster.regressor
cat_index = fitted_regressor.is_categorical_
if cat_index is not None:
    features = fitted_regressor.feature_names_in_
    print(features[cat_index])

# Rebuild the training matrices to inspect the dtype of every feature
train_outputs = forecaster.create_train_X_y(data_train)
(X_train, y_train, series_indexes,
 series_col_names, exog_col_names, exog_dtypes) = train_outputs
print(X_train.dtypes)

forecaster.predict(steps=10)

{'item_1': 0, 'item_2': 1, 'item_3': 2}
['_level_skforecast']
lag_1                 float64
lag_2                 float64
lag_3                 float64
lag_4                 float64
lag_5                 float64
lag_6                 float64
lag_7                 float64
lag_8                 float64
lag_9                 float64
lag_10                float64
lag_11                float64
lag_12                float64
lag_13                float64
lag_14                float64
lag_15                float64
lag_16                float64
lag_17                float64
lag_18                float64
lag_19                float64
lag_20                float64
lag_21                float64
lag_22                float64
lag_23                float64
lag_24                float64
_level_skforecast    category
dtype: object


Unnamed: 0,item_1,item_2,item_3
2014-07-16,25.792675,11.173958,12.077978
2014-07-17,25.546403,10.943413,10.099237
2014-07-18,25.233413,11.594754,12.391873
2014-07-19,23.950772,11.603061,12.225715
2014-07-20,21.515123,11.582226,11.18319
2014-07-21,22.310733,11.005379,10.95011
2014-07-22,24.708569,11.527696,10.707694
2014-07-23,25.092788,11.724573,12.762186
2014-07-24,25.394611,10.466541,11.711561
2014-07-25,25.516455,10.938599,11.257149


In [22]:
# Series weights ('ordinal_category' encoding)
# Each series is given a weight equal to its encoded level, so the sample
# weight of every training row must match that row's '_level_skforecast'.
weights_by_series = {'item_1': 0, 'item_2': 1, 'item_3': 2}
hgb_regressor = HistGradientBoostingRegressor(
    random_state=123,
    categorical_features=['_level_skforecast'],
)
forecaster = ForecasterAutoregMultiSeries(
    regressor=hgb_regressor,
    lags=24,
    encoding='ordinal_category',
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=weights_by_series,
    forecaster_id=None,
)

train_outputs = forecaster.create_train_X_y(data_train)
(X_train, y_train, series_indexes,
 series_col_names, exog_col_names, exog_dtypes) = train_outputs
sample_weights = forecaster.create_sample_weights(
    series_col_names=series_col_names,
    X_train=X_train,
)

# Put level and weight side by side and check they agree on every row
results = X_train[['_level_skforecast']].copy()
results['sample_weights'] = sample_weights
weights_match = results['_level_skforecast'] == results['sample_weights']
weights_match.all()

True

In [None]:

# Series weights ('ordinal' encoding)
# Each series is given a weight equal to its encoded level, so the sample
# weight of every training row must match that row's '_level_skforecast'.
weights_by_series = {'item_1': 0, 'item_2': 1, 'item_3': 2}
hgb_regressor = HistGradientBoostingRegressor(
    random_state=123,
    categorical_features=['_level_skforecast'],
)
forecaster = ForecasterAutoregMultiSeries(
    regressor=hgb_regressor,
    lags=24,
    encoding='ordinal',
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=weights_by_series,
    forecaster_id=None,
)

train_outputs = forecaster.create_train_X_y(data_train)
(X_train, y_train, series_indexes,
 series_col_names, exog_col_names, exog_dtypes) = train_outputs
sample_weights = forecaster.create_sample_weights(
    series_col_names=series_col_names,
    X_train=X_train,
)

# Put level and weight side by side and check they agree on every row
results = X_train[['_level_skforecast']].copy()
results['sample_weights'] = sample_weights
weights_match = results['_level_skforecast'] == results['sample_weights']
weights_match.all()

In [None]:
# Series weights ('onehot' encoding)
# With one-hot encoding X_train has one indicator column per series and no
# '_level_skforecast' column, so selecting that column raises KeyError.
# Instead, the series of each row is recovered from the indicator columns and
# mapped to its configured weight before comparing with the sample weights.
series_weights = {'item_1': 0, 'item_2': 1, 'item_3': 2}

forecaster = ForecasterAutoregMultiSeries(
                 # No `categorical_features` here: the '_level_skforecast'
                 # column it referred to does not exist under one-hot encoding.
                 regressor          = HistGradientBoostingRegressor(random_state=123),
                 lags               = 24,
                 encoding           = 'onehot',
                 transformer_series = None,
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = series_weights,
                 forecaster_id      = None
             )

X_train, y_train, series_indexes, series_col_names, exog_col_names, exog_dtypes = forecaster.create_train_X_y(data_train)
sample_weights = forecaster.create_sample_weights(series_col_names=series_col_names, X_train=X_train)

# Each row has exactly one active indicator; idxmax returns its column name,
# i.e. the series the row belongs to.
row_series = X_train[list(series_col_names)].idxmax(axis=1)
expected_weights = row_series.map(series_weights)

results = expected_weights.to_frame(name='expected_weight').assign(sample_weights=sample_weights)
(results['expected_weight'] == results['sample_weights']).all()