In [2]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Make the parent of the current working directory importable (presumably the
# repository root, so the local checkout of the package is the one imported).
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
repo_root = str(Path.cwd().parent)
if repo_root not in sys.path:
    sys.path.insert(1, repo_root)
repo_root

'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [14]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

from skforecast.datasets import fetch_dataset
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries
from skforecast.model_selection_multiseries import bayesian_search_forecaster_multiseries

In [4]:
# Load the "items_sales" example dataset and preview its first rows
# ==============================================================================
dataset_name = "items_sales"
data = fetch_dataset(name=dataset_name)
data.head(5)

items_sales
-----------
Simulated time series for the sales of 3 different items.
Simulated data.
Shape of the dataset: (1097, 3)


Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737


In [5]:
# Split data into train-test partitions
# ==============================================================================
# The cut-off carries a time of 23:59 so the label-based slices do not share
# the boundary day: rows up to the cut-off go to train, later rows to test.
end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train].copy()
data_test = data.loc[end_train:].copy()

# Report the date range and number of rows of each partition.
for label, subset in (("Train dates :", data_train), ("Test dates  :", data_test)):
    first_date, last_date = subset.index.min(), subset.index.max()
    print(f"{label} {first_date} --- {last_date}   (n={len(subset)})")

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00   (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00   (n=170)


In [16]:
# Build a multi-series forecaster on top of LightGBM and train it
# ==============================================================================
lgbm_regressor = LGBMRegressor(random_state=123)

# Every constructor argument is spelled out so the configuration is explicit.
forecaster_params = {
    'regressor': lgbm_regressor,
    'lags': 3,
    'encoding': 'ordinal',
    'transformer_series': None,
    'transformer_exog': None,
    'weight_func': None,
    'series_weights': None,
    'differentiation': None,
    'dropna_from_series': False,
    'fit_kwargs': None,
    'forecaster_id': None,
}
forecaster = ForecasterAutoregMultiSeries(**forecaster_params)

# Train on the training partition only.
forecaster.fit(series=data_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 768
[LightGBM] [Info] Number of data points in the train set: 2772, number of used features: 4
[LightGBM] [Info] Start training from score 18.713730


In [59]:
# Build the training matrices and keep only the first returned element
# (the frame with the lag columns and the `_level_skforecast` column).
train_matrices = forecaster.create_train_X_y(series=data_train)
X_train = train_matrices[0]
X_train

Unnamed: 0_level_0,lag_1,lag_2,lag_3,_level_skforecast
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-04,27.549099,22.777826,8.253175,0
2012-01-05,25.895533,27.549099,22.777826,0
2012-01-06,21.379238,25.895533,27.549099,0
2012-01-07,21.106643,21.379238,25.895533,0
2012-01-08,20.533871,21.106643,21.379238,0
...,...,...,...,...
2014-07-11,9.677730,12.199832,14.072343,2
2014-07-12,10.396751,9.677730,12.199832,2
2014-07-13,16.139173,10.396751,9.677730,2
2014-07-14,13.028927,16.139173,10.396751,2


In [64]:
# Scratch check: a dictionary can use NaN as a key.
d = {}
d[np.nan] = [1, 2, 3]
d

{nan: [1, 2, 3]}

In [65]:
# Retrieve the value stored under the NaN key. The lookup works even though
# NaN != NaN because the very same `np.nan` object is used as key, and dict
# lookups check identity before equality.
d[np.nan]

[1, 2, 3]

In [60]:
# Keep only the lag predictors, leaving out the `_level_skforecast` column.
lag_columns = ['lag_1', 'lag_2', 'lag_3']
X_train_no_encoding = X_train.loc[:, lag_columns]
X_train_no_encoding

Unnamed: 0_level_0,lag_1,lag_2,lag_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-04,27.549099,22.777826,8.253175
2012-01-05,25.895533,27.549099,22.777826
2012-01-06,21.379238,25.895533,27.549099
2012-01-07,21.106643,21.379238,25.895533
2012-01-08,20.533871,21.106643,21.379238
...,...,...,...
2014-07-11,9.677730,12.199832,14.072343
2014-07-12,10.396751,9.677730,12.199832
2014-07-13,16.139173,10.396751,9.677730
2014-07-14,13.028927,16.139173,10.396751


In [61]:
# Identity of the full training frame, to compare with the column subset.
id(X_train)

2138948260496

In [62]:
# Identity of the column subset; a value different from id(X_train) means the
# selection produced a separate DataFrame object.
id(X_train_no_encoding)

2138945366416

In [36]:
# Take the last window the forecaster stored for 'item_1' and shift every
# value by +10 to obtain an altered window for the prediction experiments.
last_window_item_1 = forecaster.last_window['item_1']
lw = last_window_item_1 + 10
lw

date
2014-07-13    32.609388
2014-07-14    33.307307
2014-07-15    35.980745
Freq: D, Name: item_1, dtype: float64

In [18]:
# Last window stored by the forecaster for 'item_1' (unshifted values).
forecaster.last_window['item_1']

date
2014-07-13    22.609388
2014-07-14    23.307307
2014-07-15    25.980745
Freq: D, Name: item_1, dtype: float64

In [37]:
# Try to build the one-step prediction matrix for 'item_4', a level that is
# not among the series the forecaster was fitted on.
# NOTE(review): per this cell's recorded output the call raises ValueError
# (`levels` must be one of item_1, item_2, item_3), so X_predict is NOT
# assigned here. Cells that use X_predict afterwards only work with a value
# left in the kernel by an earlier run — this breaks Restart & Run All.
# NOTE(review): `lw` is a Series derived from the 'item_1' last window;
# confirm the type/column naming `last_window` expects.
X_predict = forecaster.create_predict_X(steps=1, levels='item_4', last_window=lw)['item_4']
X_predict

ValueError: `levels` names must be included in the series used during fit (['item_1', 'item_2', 'item_3']). Got ['item_4'].

In [31]:
# Predict directly with the underlying regressor on the prediction matrix.
# NOTE(review): the execution count (In [31]) is lower than that of the cell
# creating X_predict (In [37], which raised), so X_predict here comes from an
# earlier, no longer visible run — hidden kernel state.
forecaster.regressor.predict(X_predict)

array([10.62834768])

In [32]:
# Blank out the series identifier column to see how the regressor predicts
# when the level is missing. `assign` binds X_predict to a new frame instead
# of mutating the existing object in place, which keeps the cell idempotent
# and avoids chained-assignment warnings if X_predict is a view.
X_predict = X_predict.assign(_level_skforecast=np.nan)
X_predict

Unnamed: 0,lag_1,lag_2,lag_3,_level_skforecast
2014-07-16,10.489583,10.895833,8.1,


In [33]:
# Predict again with the underlying regressor, now that `_level_skforecast`
# has been set to NaN, to compare against the prediction with a valid level.
forecaster.regressor.predict(X_predict)

array([10.6616944])

In [8]:
# Column names recorded by the forecaster (presumably those of its training
# matrix — confirm against the library docs).
# NOTE(review): the saved output lists lag_1..lag_24, which does not match the
# lags=3 forecaster defined in this notebook; the execution count (In [8])
# predates that fit, so the output looks stale — re-run to refresh it.
forecaster.X_train_col_names

['lag_1',
 'lag_2',
 'lag_3',
 'lag_4',
 'lag_5',
 'lag_6',
 'lag_7',
 'lag_8',
 'lag_9',
 'lag_10',
 'lag_11',
 'lag_12',
 'lag_13',
 'lag_14',
 'lag_15',
 'lag_16',
 'lag_17',
 'lag_18',
 'lag_19',
 'lag_20',
 'lag_21',
 'lag_22',
 'lag_23',
 'lag_24',
 '_level_skforecast']