In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [39]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

from skforecast.datasets import fetch_dataset
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries
from skforecast.model_selection_multiseries import bayesian_search_forecaster_multiseries


# Data download
# ==============================================================================
#data = fetch_dataset(name="items_sales")
#data.to_parquet('items_sales.parquet', index=True)
data = pd.read_parquet('items_sales.parquet')
data = data.asfreq('D')
data.head()
exog = pd.DataFrame({
    'exog_1': np.random.normal(loc=0, scale=1, size=data.shape[0]),
    'exog_2': np.random.normal(loc=0, scale=1, size=data.shape[0]),
}, index=data.index)

end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train, :].copy()
data_test  = data.loc[end_train:, :].copy()


# Create and train ForecasterAutoregMultiSeries
# ==============================================================================
forecaster = ForecasterAutoregMultiSeries(
                 regressor          = LGBMRegressor(random_state=123, verbose=-1),
                 lags               = 5,
                 encoding           = 'onehot',
                 transformer_series = StandardScaler(),
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None,
                 differentiation    = None,
                 dropna_from_series = False,
                 fit_kwargs         = None,
                 forecaster_id      = None
             )

forecaster.fit(series=data_train, exog=exog.loc[data_train.index])
forecaster

ForecasterAutoregMultiSeries 
Regressor: LGBMRegressor(random_state=123, verbose=-1) 
Lags: [1 2 3 4 5] 
Transformer for series: StandardScaler() 
Transformer for exog: None 
Series encoding: onehot 
Window size: 5 
Series levels (names): ['item_1', 'item_2', 'item_3'] 
Series weights: None 
Weight function included: False 
Differentiation order: None 
Exogenous included: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: ["'item_1': ['2012-01-01', '2014-07-15']", "'item_2': ['2012-01-01', '2014-07-15']", "'item_3': ['2012-01-01', '2014-07-15']"] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: boosting_type: gbdt, class_weight: None, colsample_bytree: 1.0, importance_type: split, learning_rate: 0.1, ... 
fit_kwargs: {} 
Creation date: 2024-08-28 17:35:36 
Last fit date: 2024-08-28 17:35:36 
Skforecast version: 0.13.0 
Python version: 3.12.4 
Forecaster id: None

In [40]:
forecaster._create_predict_inputs_new(steps=5, exog=exog.loc[data_test.index])[0]

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-07-11,1.243388,-1.082748,-1.247452
2014-07-12,0.506499,-1.081498,-0.209864
2014-07-13,0.05203,-1.662996,-0.771849
2014-07-14,0.324398,-1.103992,-1.442852
2014-07-15,1.367731,-1.185218,-1.335598


In [41]:
exog_values = exog.loc[data_test.index].to_numpy()[:5]
exog_values

array([[-1.16086064,  1.13869053],
       [ 1.14521952, -0.02551577],
       [-0.45105887, -0.73163628],
       [ 0.73607151,  0.10636474],
       [ 1.23016068,  2.58262341]])

In [44]:
exog_values_all_levels = []
for level in forecaster.series_col_names:
    exog_values_all_levels.append(exog_values)

exog_values_all_levels

[array([[-1.16086064,  1.13869053],
        [ 1.14521952, -0.02551577],
        [-0.45105887, -0.73163628],
        [ 0.73607151,  0.10636474],
        [ 1.23016068,  2.58262341]]),
 array([[-1.16086064,  1.13869053],
        [ 1.14521952, -0.02551577],
        [-0.45105887, -0.73163628],
        [ 0.73607151,  0.10636474],
        [ 1.23016068,  2.58262341]]),
 array([[-1.16086064,  1.13869053],
        [ 1.14521952, -0.02551577],
        [-0.45105887, -0.73163628],
        [ 0.73607151,  0.10636474],
        [ 1.23016068,  2.58262341]])]

In [45]:
exog_values_all_levels = np.concatenate(exog_values_all_levels)
exog_values_all_levels

array([[-1.16086064,  1.13869053],
       [ 1.14521952, -0.02551577],
       [-0.45105887, -0.73163628],
       [ 0.73607151,  0.10636474],
       [ 1.23016068,  2.58262341],
       [-1.16086064,  1.13869053],
       [ 1.14521952, -0.02551577],
       [-0.45105887, -0.73163628],
       [ 0.73607151,  0.10636474],
       [ 1.23016068,  2.58262341],
       [-1.16086064,  1.13869053],
       [ 1.14521952, -0.02551577],
       [-0.45105887, -0.73163628],
       [ 0.73607151,  0.10636474],
       [ 1.23016068,  2.58262341]])

In [42]:
forecaster._create_predict_inputs_new(steps=5, exog=exog.loc[data_test.index])[1]

{1: array([[-1.16086064,  1.13869053],
        [-1.16086064,  1.13869053],
        [-1.16086064,  1.13869053]]),
 2: array([[ 1.14521952, -0.02551577],
        [ 1.14521952, -0.02551577],
        [ 1.14521952, -0.02551577]]),
 3: array([[-0.45105887, -0.73163628],
        [-0.45105887, -0.73163628],
        [-0.45105887, -0.73163628]]),
 4: array([[0.73607151, 0.10636474],
        [0.73607151, 0.10636474],
        [0.73607151, 0.10636474]]),
 5: array([[1.23016068, 2.58262341],
        [1.23016068, 2.58262341],
        [1.23016068, 2.58262341]])}

In [30]:
levels = ['item_1', 'item_2', 'item_3']
n_levels = len(levels)
lags = np.array([1, 2, 3, 4, 5])
steps = 2
lags_shape = len(lags)
exog_shape = 2
encoding = 'onehot'
series_col_names = levels
encoding_mapping = {'item_1': 0, 'item_2': 1, 'item_3': 2}
# Exog es un diccionario donde las claves son el step y los son numpy arrays en los
# que cada fila es un level y cada columna es una variable exogena.
exog = {
    1:np.full(shape=(steps, exog_shape), fill_value=99, dtype=float),
    2:np.full(shape=(steps, exog_shape), fill_value=999, dtype=float),
}
exog

{1: array([[99., 99.],
        [99., 99.]]),
 2: array([[999., 999.],
        [999., 999.]])}

In [4]:
if encoding is not None:
    if encoding == 'onehot':
        levels_encoded = np.zeros((n_levels, len(series_col_names)), dtype=float)
        for idx, level in enumerate(levels):
            if level in series_col_names:
                levels_encoded[idx, series_col_names.index(level)] = 1.
    else:
        levels_encoded = np.array([encoding_mapping.get(level, None) for level in levels], dtype='float64').reshape(-1, 1)
    levels_encoded_shape = levels_encoded.shape[1]
else:
    levels_encoded_shape = 0
levels_encoded

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [52]:
last_window = np.full(shape=(lags_shape, n_levels), fill_value=5, dtype=float)
predictions = np.full(shape=(steps, n_levels), fill_value=np.nan, dtype=float)
print(last_window)
print(predictions)
last_window = np.concatenate((last_window, predictions), axis=0)
last_window

[[5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]]
[[nan nan nan]
 [nan nan nan]]


array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [nan, nan, nan],
       [nan, nan, nan]])

In [6]:
features_shape = lags_shape + levels_encoded_shape + exog_shape
features = np.full(shape=(n_levels, features_shape), fill_value=np.nan, dtype=float)
features

array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]])

In [7]:
if encoding is not None:
    features[:, lags_shape:lags_shape + levels_encoded_shape] = levels_encoded
features

array([[nan, nan, nan, nan, nan,  1.,  0.,  0., nan, nan],
       [nan, nan, nan, nan, nan,  0.,  1.,  0., nan, nan],
       [nan, nan, nan, nan, nan,  0.,  0.,  1., nan, nan]])

In [8]:
step = 1
i = 0
features[:, :lags_shape] = last_window[-lags - (steps - i), :].transpose()
features


array([[ 5.,  5.,  5.,  5.,  5.,  1.,  0.,  0., nan, nan],
       [ 5.,  5.,  5.,  5.,  5.,  0.,  1.,  0., nan, nan],
       [ 5.,  5.,  5.,  5.,  5.,  0.,  0.,  1., nan, nan]])

In [9]:
if exog is not None:
    features[:, -exog_shape:] = exog[step][i, ].transpose()
features

array([[ 5.,  5.,  5.,  5.,  5.,  1.,  0.,  0., 99., 99.],
       [ 5.,  5.,  5.,  5.,  5.,  0.,  1.,  0., 99., 99.],
       [ 5.,  5.,  5.,  5.,  5.,  0.,  0.,  1., 99., 99.]])

In [10]:
predictions

array([[nan, nan, nan],
       [nan, nan, nan]])

In [11]:
pred = forecaster.regressor.predict(features)
pred

array([1.7669726 , 1.84544141, 1.79877931])

In [12]:
predictions[i, :] = pred
predictions

array([[1.7669726 , 1.84544141, 1.79877931],
       [       nan,        nan,        nan]])

In [13]:
last_window

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [nan, nan, nan],
       [nan, nan, nan]])

In [14]:
last_window[-(steps - i), :] = pred
last_window

array([[5.        , 5.        , 5.        ],
       [5.        , 5.        , 5.        ],
       [5.        , 5.        , 5.        ],
       [5.        , 5.        , 5.        ],
       [5.        , 5.        , 5.        ],
       [1.7669726 , 1.84544141, 1.79877931],
       [       nan,        nan,        nan]])

In [15]:
# data = fetch_dataset(name="items_sales")
# data.to_parquet('items_sales_2.parquet', index=True)
data = pd.read_parquet("items_sales_2.parquet")
data = data.asfreq("D")
data.head()
exog = pd.DataFrame(
    {
        "exog_1": np.random.normal(loc=0, scale=1, size=data.shape[0]),
        "exog_2": np.random.normal(loc=0, scale=1, size=data.shape[0]),
    },
    index=data.index,
)

end_train = "2014-07-15 23:59:00"
data_train = data.loc[:end_train, :].copy()
data_test = data.loc[end_train:, :].copy()

forecaster.fit(series=data_train, exog=exog.loc[data_train.index])

(last_window_values_dict, exog_values_dict, levels, prediction_index, _) = (
    forecaster._create_predict_inputs(
        steps=2, levels=None, last_window=None, exog=exog.loc[data_test.index]
    )
)

In [16]:
exog_values_dict

{'item_1': array([[-0.90467518,  1.57613029],
        [-0.84942286,  0.36144276]]),
 'item_2': array([[-0.90467518,  1.57613029],
        [-0.84942286,  0.36144276]]),
 'item_3': array([[-0.90467518,  1.57613029],
        [-0.84942286,  0.36144276]])}

In [17]:
exog_values = np.concat(list(exog_values_dict.values()))
exog_values

AttributeError: module 'numpy' has no attribute 'concat'

In [None]:
exog_values_dict_2 = {}
for i in range(steps):
    exog_values_dict_2[i+1] = exog_values[i::steps, :]
exog_values_dict_2
    

{1: array([[-0.18765452, -0.87008195],
        [-0.18765452, -0.87008195],
        [-0.18765452, -0.87008195]]),
 2: array([[0.43221017, 0.4456721 ],
        [0.43221017, 0.4456721 ],
        [0.43221017, 0.4456721 ]])}

In [None]:
predictions_old = forecaster.predict(steps=50, exog=exog.loc[data_test.index])
predictions_new = forecaster.predict_new(steps=50, exog=exog.loc[data_test.index])
assert predictions_old.equals(predictions_new)

## Benchmark

In [23]:
n_series = 10
n=365
index = pd.date_range(start='2021-01-01',periods=n, freq="D")
series = [pd.Series(np.random.normal(size=n), index=index, name=f"series_{i+1}") for i in range(n_series)]
data = pd.concat(series, axis=1)
print(f"Data shape: {data.shape}")


forecaster = ForecasterAutoregMultiSeries(
                 regressor          = LGBMRegressor(random_state=123, verbose=-1, n_estimators=20),
                 lags               = 5,
                 encoding           = 'onehot',
                 transformer_series = StandardScaler(),
                 differentiation    = 1,
             )
forecaster.fit(series = data)

Data shape: (365, 10)


In [28]:
forecaster._create_predict_inputs_new(steps=5, levels=None, last_window=None, exog=None)[0]

Unnamed: 0,series_1,series_2,series_3,series_4,series_5,series_6,series_7,series_8,series_9,series_10
2021-12-26,,,,,,,,,,
2021-12-27,1.764491,-2.015992,0.500346,2.835097,0.906547,2.672527,-0.587551,0.802951,-0.128567,1.662249
2021-12-28,-1.882484,2.266741,-0.538605,-1.759172,-0.209785,-0.912942,1.280256,-2.749229,1.325683,-0.672648
2021-12-29,-0.04345,-2.225327,0.388344,0.203108,0.611531,0.064906,-3.976492,1.197661,-2.788776,2.189782
2021-12-30,2.379408,-0.207626,0.143539,-1.197654,-0.570224,1.674584,1.133102,-0.379131,0.911555,-1.373248
2021-12-31,-2.574443,-1.376949,-0.710126,0.332241,2.095367,-1.336444,1.78623,1.666234,-0.968743,-0.085311


In [29]:
forecaster._create_predict_inputs(steps=5, levels=None, last_window=None, exog=None)[0]

{'series_1': array([        nan,  1.76449101, -1.8824843 , -0.04345037,  2.37940779,
        -2.57444287]),
 'series_2': array([        nan, -2.01599217,  2.26674075, -2.22532696, -0.20762575,
        -1.37694901]),
 'series_3': array([        nan,  0.50034638, -0.53860489,  0.38834378,  0.14353922,
        -0.71012572]),
 'series_4': array([        nan,  2.83509741, -1.75917227,  0.20310842, -1.19765425,
         0.33224091]),
 'series_5': array([        nan,  0.90654676, -0.20978491,  0.61153134, -0.57022371,
         2.09536746]),
 'series_6': array([        nan,  2.67252743, -0.91294244,  0.06490637,  1.67458375,
        -1.33644439]),
 'series_7': array([        nan, -0.58755089,  1.28025569, -3.97649202,  1.13310236,
         1.78623031]),
 'series_8': array([        nan,  0.80295136, -2.74922881,  1.19766088, -0.37913127,
         1.66623425]),
 'series_9': array([        nan, -0.12856697,  1.3256835 , -2.78877645,  0.91155492,
        -0.96874323]),
 'series_10': array([       

In [24]:
import timeit

def benchmark_function():
    forecaster.predict(steps=5)
times = timeit.repeat(benchmark_function, repeat=5, number=1)
times = np.array(times)
print(f"Mean time: {times.mean()} , std: {times.std()}, max: {times.max()}, min: {times.min()}")

def benchmark_function():
    forecaster.predict_new(steps=5)
times = timeit.repeat(benchmark_function, repeat=5, number=1)
times = np.array(times)
print(f"Mean time: {times.mean()} , std: {times.std()}, max: {times.max()}, min: {times.min()}")

Mean time: 0.04043924000288825 , std: 0.0035832025703343793, max: 0.04576779999479186, min: 0.036055500007933006
Mean time: 0.01889563999720849 , std: 0.001906353063487209, max: 0.020398299995576963, min: 0.015139499999349937


In [25]:
assert forecaster.predict(steps=5).equals(forecaster.predict_new(steps=5))