In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [2]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

from skforecast.datasets import fetch_dataset
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries
from skforecast.model_selection_multiseries import bayesian_search_forecaster_multiseries


# Data download
# ==============================================================================
#data = fetch_dataset(name="items_sales")
#data.to_parquet('items_sales.parquet', index=True)
data = pd.read_parquet('items_sales.parquet')
data = data.asfreq('D')
data.head()
exog = pd.DataFrame({
    'exog_1': np.random.normal(loc=0, scale=1, size=data.shape[0]),
    'exog_2': np.random.normal(loc=0, scale=1, size=data.shape[0]),
    # 'exog_3': (['A'] * int(data.shape[0] / 2) + ['B'] * int(data.shape[0] / 2 + 1)),
}, index=data.index)

end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train, :].copy()
data_test  = data.loc[end_train:, :].copy()

transformer_exog = ColumnTransformer(
                        [('scale', StandardScaler(), ['exog_1', 'exog_2']),
                        ('onehot', OneHotEncoder(), ['exog_3'])],
                        remainder = 'passthrough',
                        verbose_feature_names_out = False
                    )

# Create and train ForecasterAutoregMultiSeries
# ==============================================================================
forecaster = ForecasterAutoregMultiSeries(
                 regressor          = LGBMRegressor(random_state=123, verbose=-1),
                 lags               = [1, 5],
                 encoding           = 'onehot',
                 transformer_series = StandardScaler(),
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None,
                 differentiation    = None,
                 dropna_from_series = False,
                 fit_kwargs         = None,
                 forecaster_id      = None
             )

forecaster.fit(series=data_train, exog=exog.loc[data_train.index])

In [3]:
last_window, exog_values_dict, levels, prediction_index, residuals = (
    forecaster._create_predict_inputs_new(steps=2, exog=exog.loc[data_test.index])
)

In [4]:
# create sample DataFrame
# ==============================================================================
df = pd.DataFrame({
    'series': np.random.normal(loc=0, scale=1, size=10),
    'exog_1': np.random.normal(loc=0, scale=1, size=10),
    'exog_2': np.random.normal(loc=0, scale=1, size=10),
    'exog_3': (['A'] * 5 + ['B'] * 5),
}, index=pd.date_range(start='2020-01-01', periods=10, freq='D'))
df

Unnamed: 0,series,exog_1,exog_2,exog_3
2020-01-01,-0.666854,-1.436774,1.89345,A
2020-01-02,-0.733807,-1.209973,-0.344568,A
2020-01-03,1.35158,0.596933,0.012947,A
2020-01-04,0.003248,-0.301152,-1.177887,A
2020-01-05,1.287025,-1.155264,1.792421,A
2020-01-06,-1.445481,0.52789,-1.305115,B
2020-01-07,-1.117211,-1.749041,0.019879,B
2020-01-08,1.157582,-0.447523,0.229614,B
2020-01-09,1.213744,-0.407021,1.049825,B
2020-01-10,0.155931,-1.495649,-0.340978,B


In [5]:
predictions = np.full(shape=(2, 4), fill_value=np.nan, dtype=float)
np.concatenate((df, predictions), axis=0)

array([[-0.6668540721965589, -1.4367735373685158, 1.893449528243407, 'A'],
       [-0.7338068695031076, -1.2099733335032965, -0.3445679772214376,
        'A'],
       [1.3515801136612737, 0.5969327451348766, 0.012947296294516275,
        'A'],
       [0.003248452986988212, -0.3011518145437802, -1.1778868397491524,
        'A'],
       [1.287024537513369, -1.1552641368760248, 1.7924212374777957, 'A'],
       [-1.4454805780216438, 0.5278896515944603, -1.3051149776763384,
        'B'],
       [-1.1172110715526042, -1.749040707888334, 0.019879024489918726,
        'B'],
       [1.1575824387882823, -0.44752289990589117, 0.22961353297568346,
        'B'],
       [1.2137444790208047, -0.4070206811778494, 1.0498249148728203, 'B'],
       [0.15593085313263452, -1.4956487424474096, -0.3409775531011584,
        'B'],
       [nan, nan, nan, nan],
       [nan, nan, nan, nan]], dtype=object)

In [6]:
_ = forecaster._recursive_predict_new(
    steps=2,
    levels=levels,
    last_window=last_window,
    exog=exog_values_dict
)

TypeError: ForecasterAutoregMultiSeries._recursive_predict_new() got an unexpected keyword argument 'exog'

In [42]:
forecaster._create_predict_inputs_new(steps=5, exog=exog.loc[data_test.index])[1]

{1: array([[-1.16086064,  1.13869053],
        [-1.16086064,  1.13869053],
        [-1.16086064,  1.13869053]]),
 2: array([[ 1.14521952, -0.02551577],
        [ 1.14521952, -0.02551577],
        [ 1.14521952, -0.02551577]]),
 3: array([[-0.45105887, -0.73163628],
        [-0.45105887, -0.73163628],
        [-0.45105887, -0.73163628]]),
 4: array([[0.73607151, 0.10636474],
        [0.73607151, 0.10636474],
        [0.73607151, 0.10636474]]),
 5: array([[1.23016068, 2.58262341],
        [1.23016068, 2.58262341],
        [1.23016068, 2.58262341]])}

In [30]:
levels = ['item_1', 'item_2', 'item_3']
n_levels = len(levels)
lags = np.array([1, 2, 3, 4, 5])
steps = 2
lags_shape = len(lags)
exog_shape = 2
encoding = 'onehot'
series_col_names = levels
encoding_mapping = {'item_1': 0, 'item_2': 1, 'item_3': 2}
# Exog es un diccionario donde las claves son el step y los son numpy arrays en los
# que cada fila es un level y cada columna es una variable exogena.
exog = {
    1:np.full(shape=(steps, exog_shape), fill_value=99, dtype=float),
    2:np.full(shape=(steps, exog_shape), fill_value=999, dtype=float),
}
exog

{1: array([[99., 99.],
        [99., 99.]]),
 2: array([[999., 999.],
        [999., 999.]])}

In [4]:
if encoding is not None:
    if encoding == 'onehot':
        levels_encoded = np.zeros((n_levels, len(series_col_names)), dtype=float)
        for idx, level in enumerate(levels):
            if level in series_col_names:
                levels_encoded[idx, series_col_names.index(level)] = 1.
    else:
        levels_encoded = np.array([encoding_mapping.get(level, None) for level in levels], dtype='float64').reshape(-1, 1)
    levels_encoded_shape = levels_encoded.shape[1]
else:
    levels_encoded_shape = 0
levels_encoded

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [52]:
last_window = np.full(shape=(lags_shape, n_levels), fill_value=5, dtype=float)
predictions = np.full(shape=(steps, n_levels), fill_value=np.nan, dtype=float)
print(last_window)
print(predictions)
last_window = np.concatenate((last_window, predictions), axis=0)
last_window

[[5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]
 [5. 5. 5.]]
[[nan nan nan]
 [nan nan nan]]


array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [nan, nan, nan],
       [nan, nan, nan]])

In [6]:
features_shape = lags_shape + levels_encoded_shape + exog_shape
features = np.full(shape=(n_levels, features_shape), fill_value=np.nan, dtype=float)
features

array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]])

In [7]:
if encoding is not None:
    features[:, lags_shape:lags_shape + levels_encoded_shape] = levels_encoded
features

array([[nan, nan, nan, nan, nan,  1.,  0.,  0., nan, nan],
       [nan, nan, nan, nan, nan,  0.,  1.,  0., nan, nan],
       [nan, nan, nan, nan, nan,  0.,  0.,  1., nan, nan]])

In [8]:
step = 1
i = 0
features[:, :lags_shape] = last_window[-lags - (steps - i), :].transpose()
features


array([[ 5.,  5.,  5.,  5.,  5.,  1.,  0.,  0., nan, nan],
       [ 5.,  5.,  5.,  5.,  5.,  0.,  1.,  0., nan, nan],
       [ 5.,  5.,  5.,  5.,  5.,  0.,  0.,  1., nan, nan]])

In [9]:
if exog is not None:
    features[:, -exog_shape:] = exog[step][i, ].transpose()
features

array([[ 5.,  5.,  5.,  5.,  5.,  1.,  0.,  0., 99., 99.],
       [ 5.,  5.,  5.,  5.,  5.,  0.,  1.,  0., 99., 99.],
       [ 5.,  5.,  5.,  5.,  5.,  0.,  0.,  1., 99., 99.]])

In [10]:
predictions

array([[nan, nan, nan],
       [nan, nan, nan]])

In [11]:
pred = forecaster.regressor.predict(features)
pred

array([1.7669726 , 1.84544141, 1.79877931])

In [12]:
predictions[i, :] = pred
predictions

array([[1.7669726 , 1.84544141, 1.79877931],
       [       nan,        nan,        nan]])

In [13]:
last_window

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [nan, nan, nan],
       [nan, nan, nan]])

In [14]:
last_window[-(steps - i), :] = pred
last_window

array([[5.        , 5.        , 5.        ],
       [5.        , 5.        , 5.        ],
       [5.        , 5.        , 5.        ],
       [5.        , 5.        , 5.        ],
       [5.        , 5.        , 5.        ],
       [1.7669726 , 1.84544141, 1.79877931],
       [       nan,        nan,        nan]])

In [15]:
# data = fetch_dataset(name="items_sales")
# data.to_parquet('items_sales_2.parquet', index=True)
data = pd.read_parquet("items_sales_2.parquet")
data = data.asfreq("D")
data.head()
exog = pd.DataFrame(
    {
        "exog_1": np.random.normal(loc=0, scale=1, size=data.shape[0]),
        "exog_2": np.random.normal(loc=0, scale=1, size=data.shape[0]),
    },
    index=data.index,
)

end_train = "2014-07-15 23:59:00"
data_train = data.loc[:end_train, :].copy()
data_test = data.loc[end_train:, :].copy()

forecaster.fit(series=data_train, exog=exog.loc[data_train.index])

(last_window_values_dict, exog_values_dict, levels, prediction_index, _) = (
    forecaster._create_predict_inputs(
        steps=2, levels=None, last_window=None, exog=exog.loc[data_test.index]
    )
)

In [16]:
exog_values_dict

{'item_1': array([[-0.90467518,  1.57613029],
        [-0.84942286,  0.36144276]]),
 'item_2': array([[-0.90467518,  1.57613029],
        [-0.84942286,  0.36144276]]),
 'item_3': array([[-0.90467518,  1.57613029],
        [-0.84942286,  0.36144276]])}

In [17]:
exog_values = np.concat(list(exog_values_dict.values()))
exog_values

AttributeError: module 'numpy' has no attribute 'concat'

In [None]:
exog_values_dict_2 = {}
for i in range(steps):
    exog_values_dict_2[i+1] = exog_values[i::steps, :]
exog_values_dict_2
    

{1: array([[-0.18765452, -0.87008195],
        [-0.18765452, -0.87008195],
        [-0.18765452, -0.87008195]]),
 2: array([[0.43221017, 0.4456721 ],
        [0.43221017, 0.4456721 ],
        [0.43221017, 0.4456721 ]])}

In [None]:
predictions_old = forecaster.predict(steps=50, exog=exog.loc[data_test.index])
predictions_new = forecaster.predict_new(steps=50, exog=exog.loc[data_test.index])
assert predictions_old.equals(predictions_new)

## Benchmark

In [8]:
n_series = 10
n=365
index = pd.date_range(start='2021-01-01',periods=n, freq="D")
series = [pd.Series(np.random.normal(size=n), index=index, name=f"series_{i+1}") for i in range(n_series)]
data = pd.concat(series, axis=1)
print(f"Data shape: {data.shape}")


forecaster = ForecasterAutoregMultiSeries(
                 regressor          = LGBMRegressor(random_state=123, verbose=-1, n_estimators=20),
                 lags               = 5,
                 encoding           = 'onehot',
                 transformer_series = StandardScaler(),
                 differentiation    = 1,
             )
forecaster.fit(series = data)

Data shape: (365, 10)


In [9]:
forecaster._create_predict_inputs_new(steps=5, levels=None, last_window=None, exog=None)[0]

Unnamed: 0,series_1,series_2,series_3,series_4,series_5,series_6,series_7,series_8,series_9,series_10
2021-12-26,,,,,,,,,,
2021-12-27,0.487551,-2.356213,-0.83133,0.453255,-1.076585,0.040146,0.659862,-2.538427,0.388393,1.037757
2021-12-28,-1.809225,1.570264,-1.089376,2.196489,-1.308448,-1.385764,-0.475214,-0.683102,-1.037519,-4.339165
2021-12-29,-0.30573,-1.154117,1.053818,-2.792496,2.222869,1.536936,0.902564,2.167814,0.951248,3.308403
2021-12-30,0.976394,0.149849,-0.64108,-0.108283,-2.644264,-1.769008,0.774342,-0.949039,-0.507695,-0.699486
2021-12-31,1.959613,1.298234,0.792547,0.107901,1.252684,0.933853,-2.795712,-0.733838,0.709666,-0.330176


In [10]:
forecaster._create_predict_inputs(steps=5, levels=None, last_window=None, exog=None)[0]

{'series_1': array([        nan,  0.48755142, -1.80922485, -0.30573044,  0.97639386,
         1.95961329]),
 'series_2': array([        nan, -2.35621326,  1.57026427, -1.15411683,  0.14984901,
         1.29823392]),
 'series_3': array([        nan, -0.83133045, -1.08937591,  1.05381769, -0.64107978,
         0.79254667]),
 'series_4': array([        nan,  0.45325508,  2.19648881, -2.79249597, -0.10828251,
         0.10790121]),
 'series_5': array([        nan, -1.07658462, -1.30844772,  2.22286942, -2.64426408,
         1.25268428]),
 'series_6': array([        nan,  0.04014555, -1.38576375,  1.53693558, -1.7690084 ,
         0.93385291]),
 'series_7': array([        nan,  0.65986166, -0.47521419,  0.90256409,  0.7743416 ,
        -2.79571169]),
 'series_8': array([        nan, -2.53842714, -0.6831017 ,  2.1678137 , -0.94903912,
        -0.73383762]),
 'series_9': array([        nan,  0.38839326, -1.0375193 ,  0.95124795, -0.50769477,
         0.70966636]),
 'series_10': array([       

In [13]:
import timeit

def benchmark_function():
    forecaster.predict(steps=5)
times = timeit.repeat(benchmark_function, repeat=5, number=1)
times = np.array(times)
print(f"Old mean time: {times.mean()} , std: {times.std()}, max: {times.max()}, min: {times.min()}")

def benchmark_function():
    forecaster.predict_new(steps=5)
times = timeit.repeat(benchmark_function, repeat=5, number=1)
times = np.array(times)
print(f"New mean time: {times.mean()} , std: {times.std()}, max: {times.max()}, min: {times.min()}")

Old mean time: 0.03509813999990001 , std: 0.0026509583920882674, max: 0.040010099997743964, min: 0.032092200010083616
New mean time: 0.010175140015780926 , std: 0.0009696960908206146, max: 0.012085900001693517, min: 0.009443200018722564


In [14]:
assert forecaster.predict(steps=5).equals(forecaster.predict_new(steps=5))

In [15]:
from sklearn.linear_model import LinearRegression

series_2 = pd.DataFrame({'1': pd.Series(np.arange(start=0, stop=50)), 
                         '2': pd.Series(np.arange(start=50, stop=100))})

In [16]:
last_window = pd.DataFrame(
                      {'1': [45, 46, 47, 48, 49], 
                       '2': [95, 96, 97, 98, 99], 
                       '3': [1, 2, 3, 4, 5]}, 
                      index = pd.RangeIndex(start=45, stop=50, step=1)
                  )

forecaster = ForecasterAutoregMultiSeries(LinearRegression(), lags=5)
forecaster.fit(series=series_2)



ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 3 and the array at index 1 has size 1

In [24]:
df = forecaster._create_predict_inputs_new(steps=5, levels='1', last_window=last_window)[0]
df

Unnamed: 0,1
45,45
46,46
47,47
48,48
49,49


In [23]:
# Get column position in DataFrame using iloc
columns_to_get = ['1', '2']
df.iloc[:, df.columns.get_indexer(columns_to_get)]

Unnamed: 0,1,2
45,45,95
46,46,96
47,47,97
48,48,98
49,49,99


In [19]:
forecaster._create_predict_inputs(steps=5, levels='1', last_window=last_window)[0]

{'1': array([45, 46, 47, 48, 49])}

In [None]:

predictions_1 = forecaster.predict(steps=5, levels='1', last_window=last_window)