In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path
# (presumably so the local repository checkout is the one imported), then
# display that path as the cell output.
repo_root = str(Path.cwd().parent)
sys.path.insert(1, repo_root)
repo_root

'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [62]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries

In [46]:
# Data download and preparation
# ==============================================================================
url = (
       'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/'
       'data/simulated_items_sales.csv'
)

# Read the CSV, parse the `date` column, use it as the index, and expand the
# index to a regular daily frequency in chronological order.
data = (
    pd.read_csv(url, sep=',')
    .assign(date=lambda raw: pd.to_datetime(raw['date'], format='%Y-%m-%d'))
    .set_index('date')
    .asfreq('D')
    .sort_index()
)
data.head()

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737


In [47]:
# First and last timestamps covered by the index
first_date = data.index.min()
last_date = data.index.max()
first_date, last_date

(Timestamp('2012-01-01 00:00:00'), Timestamp('2015-01-01 00:00:00'))

In [48]:
# Delete observations of item_2 and item_3
# ==============================================================================
# Blank out the early history of two series so that they contain leading NaNs
# while item_1 stays complete.
#
# The rows and the column are selected in a single `.loc` call on `data`.
# The chained form `data[col].loc[rows] = value` assigns into an intermediate
# Series: it emits warnings on recent pandas versions and leaves `data`
# untouched when Copy-on-Write is enabled.
data.loc[:'2013-01-01', 'item_2'] = np.nan
data.loc[:'2013-07-13', 'item_3'] = np.nan

# Number of missing values per column after masking
data.isna().sum()

item_1      0
item_2    367
item_3    560
dtype: int64

In [49]:
# Add exog
# ==============================================================================
# Exogenous variable: a running integer counter 0, 1, 2, ... with one value
# per row of `data`.
n_obs = len(data)
data['exog_1'] = np.arange(n_obs)
data.head()

Unnamed: 0_level_0,item_1,item_2,item_3,exog_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01,8.253175,,,0
2012-01-02,22.777826,,,1
2012-01-03,27.549099,,,2
2012-01-04,25.895533,,,3
2012-01-05,21.379238,,,4


In [50]:
# Split data into train-test
# ==============================================================================
# Both partitions are label-based slices around the same cutoff; each one is
# copied so later edits do not propagate back to `data`.
end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train].copy()
data_test = data.loc[end_train:].copy()

train_first, train_last = data_train.index.min(), data_train.index.max()
test_first, test_last = data_test.index.min(), data_test.index.max()

print(
    f"Train dates : {train_first} --- {train_last}   "
    f"(n={len(data_train)})"
)
print(
    f"Test dates  : {test_first} --- {test_last}   "
    f"(n={len(data_test)})"
)

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00   (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00   (n=170)


In [51]:
# Create and fit forecaster multi series
# ==============================================================================
series_names = ['item_1', 'item_2', 'item_3']
ridge = Ridge(random_state=123)

# No transformers and no weighting are configured; the model uses 5 lags.
forecaster = ForecasterAutoregMultiSeries(
    regressor=ridge,
    lags=5,
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=None,
)

# Fit on the three item series (item_2 and item_3 contain leading NaNs) with
# `exog_1` as the exogenous variable, then display the fitted forecaster.
forecaster.fit(
    series=data_train[series_names],
    exog=data_train['exog_1'],
)
forecaster

ForecasterAutoregMultiSeries 
Regressor: Ridge(random_state=123) 
Lags: [1 2 3 4 5] 
Transformer for series: None 
Transformer for exog: None 
Window size: 5 
Series levels (names): ['item_1', 'item_2', 'item_3'] 
Series weights: None 
Weight function included: False 
Exogenous included: True 
Type of exogenous variable: <class 'pandas.core.series.Series'> 
Exogenous variables names: ['exog_1'] 
Training range: [Timestamp('2012-01-01 00:00:00'), Timestamp('2014-07-15 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2023-06-06 18:35:24 
Last fit date: 2023-06-06 18:35:24 
Skforecast version: 0.9.0 
Python version: 3.10.11 
Forecaster id: None 

In [56]:
# Build the training matrices without exog and report their sizes and the
# date range that each series contributes.
series_cols = ['item_1', 'item_2', 'item_3']
X_train, y_train, y_index, y_train_index = forecaster.create_train_X_y(data_train[series_cols])

# Rows of X_train belonging to each series: those where the column named after
# the series equals 1.
len_1, len_2, len_3 = (len(X_train[X_train[col] == 1.]) for col in series_cols)

# Cumulative row offsets marking where each series' rows end.
stop_1 = len_1
stop_2 = stop_1 + len_2
stop_3 = stop_2 + len_3

# data_train
print("data_train", data_train.shape, "", sep="\n")

# X_train
print("X_train")
print(X_train.shape)
print('item_1 len:', len_1)
print('item_2 len:', len_2)
print('item_3 len:', len_3)
print("len_1 + len_2 + len_3:", stop_3)
print("")

# y_train
print("y_train", y_train.shape, "", sep="\n")

# y_index
print("y_index")
print(y_index.shape)
print("data_train:", data_train.index[0], data_train.index[-1])
print("y_index:", y_index[0], y_index[-1])
print("")

# y_train_index: first and last entry of each series' segment
print("y_train_index")
print(y_train_index.shape)
print(y_train_index[0], y_train_index[stop_1 - 1])
print(y_train_index[stop_1], y_train_index[stop_2 - 1])
print(y_train_index[stop_2], y_train_index[stop_3 - 1])
print("")


data_train
(927, 4)

X_train
(1839, 8)
item_1 len: 922
item_2 len: 555
item_3 len: 362
len_1 + len_2 + len_3: 1839

y_train
(1839,)

y_index
(927,)
data_train: 2012-01-01 00:00:00 2014-07-15 00:00:00
y_index: 2012-01-01 00:00:00 2014-07-15 00:00:00

y_train_index
(1839,)
2012-01-06 00:00:00 2014-07-15 00:00:00
2013-01-07 00:00:00 2014-07-15 00:00:00
2013-07-19 00:00:00 2014-07-15 00:00:00



In [57]:
# Display the window stored by the fitted forecaster; per the rendered output
# it holds the final 5 training rows of each series.
forecaster.last_window

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-07-11,25.662128,11.002083,10.396751
2014-07-12,23.773923,11.008333,16.139173
2014-07-13,22.609388,8.1,13.028927
2014-07-14,23.307307,10.895833,9.315334
2014-07-15,25.980745,10.489583,9.908915


In [58]:
# Forecast 5 steps ahead, passing the test-period values of the exogenous
# variable.
horizon = 5
exog_test = data_test['exog_1']
forecaster.predict(horizon, exog=exog_test)

Unnamed: 0,item_1,item_2,item_3
2014-07-16,24.96464,11.386619,11.927721
2014-07-17,23.862185,12.156144,13.466882
2014-07-18,23.376065,12.224168,13.771249
2014-07-19,23.253302,12.649076,13.537742
2014-07-20,23.440808,12.958848,13.657885


## Tests

In [64]:
import re
import pytest

In [65]:


# Fitting must fail with a ValueError when one series contains only NaN:
# column '1' holds 0..6 and column '2' is entirely missing.
series = pd.DataFrame(
    {'1': np.arange(7), '2': [np.nan] * 7},
    index=pd.date_range(start='2022-01-01', periods=7, freq='1D'),
)
forecaster = ForecasterAutoregMultiSeries(LinearRegression(), lags=5)

# `pytest.raises(match=...)` treats the text as a regex, hence the escape.
err_msg = re.escape("All values of series '2' are missing.")
with pytest.raises(ValueError, match=err_msg):
    forecaster.fit(series=series)