In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom

In [3]:
# Download data
# ==============================================================================
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/data/h2o.csv')
data = pd.read_csv(url, sep=',', header=0, names=['y', 'date'])
data['date'] = pd.to_datetime(data['date'], format='%Y/%m/%d')
data = data.set_index('date')
data = data.asfreq('MS')
data = data.y
data = pd.concat((data, data*10), axis=1)
data.columns = ['series_1', 'series_2']
exog_1 = pd.Series(np.arange(len(data)), index=data.index)
exog_2 = exog_1 * 10
exog = pd.concat((exog_1, exog_2), axis=1)
exog.columns = ['exog_1', 'exog_2']

data_train = data.loc[:'2007-12-31']
exog_train = exog.loc[:'2007-12-31']
data_test = data.loc['2008-01-01 00:00:00':]
exog_test = exog.loc['2008-01-01 00:00:00':]

In [4]:
# Custom function to create predictors
# ==============================================================================
def create_predictors(y):
    """
    Create first 3 lags of a time series.
    Calculate moving average with window 20.
    """

    lags = y[-1:-3:-1]
    mean = np.mean(y[-20:])
    predictors = np.hstack([lags, mean])

    return predictors

In [5]:
# Create forecaster
# ==============================================================================
forecaster = ForecasterAutoregMultiSeriesCustom(
                 regressor       = Ridge(random_state=123),
                 fun_predictors  = create_predictors,
                 window_size     = 20,
                 name_predictors = ['lag_1', 'lag_2', 'lag_3']
             )

In [6]:
X_train, y_train, y_index, y_train_index = forecaster.create_train_X_y(series=data, exog=exog)

In [7]:
forecaster.name_predictors

['lag_1', 'lag_2', 'lag_3']

In [8]:
X_train

Unnamed: 0,lag_1,lag_2,lag_3,exog_1,exog_2,series_1,series_2
0,0.387554,0.751503,0.496401,20.0,200.0,1.0,0.0
1,0.427283,0.387554,0.496275,21.0,210.0,1.0,0.0
2,0.413890,0.427283,0.496924,22.0,220.0,1.0,0.0
3,0.428859,0.413890,0.496759,23.0,230.0,1.0,0.0
4,0.470126,0.428859,0.495638,24.0,240.0,1.0,0.0
...,...,...,...,...,...,...,...
363,12.199410,11.765890,9.803897,199.0,1990.0,0.0,1.0
364,7.618220,12.199410,9.785823,200.0,2000.0,0.0,1.0
365,6.494350,7.618220,9.668384,201.0,2010.0,0.0,1.0
366,8.278870,6.494350,9.557504,202.0,2020.0,0.0,1.0


In [9]:
forecaster.name_predictors

['lag_1', 'lag_2', 'lag_3']

In [10]:
forecaster.fit(series=data_train, exog=exog_train)
forecaster

ForecasterAutoregMultiSeriesCustom 
Regressor: Ridge(random_state=123) 
Predictors created with function: create_predictors 
Transformer for series: None 
Transformer for exog: None 
Window size: 20 
Series levels (names): ['series_1', 'series_2'] 
Series weights: None 
Weight function included: False 
Exogenous included: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: [Timestamp('1991-07-01 00:00:00'), Timestamp('2007-12-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
Creation date: 2023-03-03 20:38:25 
Last fit date: 2023-03-03 20:38:26 
Skforecast version: 0.7.0 
Python version: 3.10.9 
Forecaster id: None 

In [11]:
forecaster.predict(levels=None, steps=3, exog=exog_test, last_window=data_train)

Unnamed: 0,series_1,series_2
2008-01-01,1.363601,10.883469
2008-02-01,1.500655,10.273039
2008-03-01,1.589148,9.934138


In [12]:
forecaster.predict_interval(levels=None, steps=3, exog=exog_test, last_window=data_train, in_sample_residuals=True)

Unnamed: 0,series_1,series_1_lower_bound,series_1_upper_bound,series_2,series_2_lower_bound,series_2_upper_bound
2008-01-01,1.363601,0.883801,1.621091,10.883469,7.613457,12.641747
2008-02-01,1.500655,1.117235,1.867066,10.273039,6.931031,12.626412
2008-03-01,1.589148,1.129733,1.96853,9.934138,6.840337,12.292837


In [13]:
from scipy.stats import norm

forecaster.predict_dist(levels=None, steps=3, exog=exog_test, last_window=data_train, in_sample_residuals=True, distribution=norm)

Unnamed: 0,series_1_loc,series_1_scale,series_2_loc,series_2_scale
2008-01-01,1.354973,0.205648,10.857572,1.424304
2008-02-01,1.508291,0.233441,10.417242,1.645392
2008-03-01,1.588666,0.250132,10.068625,1.709521


In [14]:
forecaster.predict_bootstrapping(levels=None, steps=3, exog=exog_test, last_window=data_train, in_sample_residuals=True)

{'series_1':             pred_boot_0  pred_boot_1  pred_boot_2  pred_boot_3  pred_boot_4  \
 2008-01-01     1.438828     1.369708     1.342008     1.266740     1.368328   
 2008-02-01     1.683688     1.532451     1.616588     1.535570     1.578656   
 2008-03-01     1.779712     1.674101     1.434168     1.781432     1.901560   
 
             pred_boot_5  pred_boot_6  pred_boot_7  pred_boot_8  pred_boot_9  \
 2008-01-01     1.675864     1.649412     1.237521     1.523739     0.883801   
 2008-02-01     1.881057     1.681632     1.514862     1.892050     1.165323   
 2008-03-01     2.076429     1.700308     1.594826     1.709442     1.455770   
 
             ...  pred_boot_490  pred_boot_491  pred_boot_492  pred_boot_493  \
 2008-01-01  ...       1.289885       1.308392       1.335334       1.539796   
 2008-02-01  ...       1.351547       1.565091       1.715707       1.911346   
 2008-03-01  ...       1.534464       1.420947       1.578282       1.677592   
 
             pred_boot

In [15]:
forecaster.set_out_sample_residuals(forecaster.in_sample_residuals)

In [16]:
forecaster.get_feature_importance()

Unnamed: 0,feature,importance
0,lag_1,0.702683
1,lag_2,-0.082468
2,lag_3,0.121407
3,exog_1,3.3e-05
4,exog_2,0.000331
5,series_1,-0.946917
6,series_2,0.946917


In [17]:
forecaster.set_params({'random_state':9999})
forecaster

ForecasterAutoregMultiSeriesCustom 
Regressor: Ridge(random_state=9999) 
Predictors created with function: create_predictors 
Transformer for series: None 
Transformer for exog: None 
Window size: 20 
Series levels (names): ['series_1', 'series_2'] 
Series weights: None 
Weight function included: False 
Exogenous included: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: [Timestamp('1991-07-01 00:00:00'), Timestamp('2007-12-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 9999, 'solver': 'auto', 'tol': 0.0001} 
Creation date: 2023-03-03 20:38:25 
Last fit date: 2023-03-03 20:38:26 
Skforecast version: 0.7.0 
Python version: 3.10.9 
Forecaster id: None 

In [18]:
forecaster.summary()

ForecasterAutoregMultiSeriesCustom 
Regressor: Ridge(random_state=9999) 
Predictors created with function: create_predictors 
Transformer for series: None 
Transformer for exog: None 
Window size: 20 
Series levels (names): ['series_1', 'series_2'] 
Series weights: None 
Weight function included: False 
Exogenous included: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: [Timestamp('1991-07-01 00:00:00'), Timestamp('2007-12-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 9999, 'solver': 'auto', 'tol': 0.0001} 
Creation date: 2023-03-03 20:38:25 
Last fit date: 2023-03-03 20:38:26 
Skforecast version: 0.7.0 
Python version: 3.10.9 
Forecaster id: None 



In [19]:
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries

In [20]:
# Data download
# ==============================================================================
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/' +
       'data/simulated_items_sales.csv')
data = pd.read_csv(url, sep=',')

# Data preparation
# ==============================================================================
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
data = data.set_index('date')
data = data.asfreq('D')
data = data.sort_index()
data.head()

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737


In [21]:
# Split data into train-val-test
# ==============================================================================
end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train, :].copy()
data_test  = data.loc[end_train:, :].copy()

print(f"Train dates : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
print(f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00  (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00  (n=170)


In [22]:

# Create and fit forecaster multi series
# ==============================================================================
forecaster = ForecasterAutoregMultiSeriesCustom(
                 regressor          = Ridge(random_state=123),
                 fun_predictors     = create_predictors, 
                 window_size        = 10,
                 transformer_series = None,
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None
             )

forecaster.fit(series=data_train)
forecaster

ForecasterAutoregMultiSeriesCustom 
Regressor: Ridge(random_state=123) 
Predictors created with function: create_predictors 
Transformer for series: None 
Transformer for exog: None 
Window size: 10 
Series levels (names): ['item_1', 'item_2', 'item_3'] 
Series weights: None 
Weight function included: False 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: [Timestamp('2012-01-01 00:00:00'), Timestamp('2014-07-15 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
Creation date: 2023-03-03 20:38:32 
Last fit date: 2023-03-03 20:38:32 
Skforecast version: 0.7.0 
Python version: 3.10.9 
Forecaster id: None 

In [23]:

# Predict and predict_interval
# ==============================================================================
steps = 24

# Predictions for item_1
predictions_item_1 = forecaster.predict(steps=steps, levels='item_1')
display(predictions_item_1.head(3))

# Interval predictions for item_1
predictions_intervals = forecaster.predict_interval(steps=steps, levels=['item_1', 'item_2'])
display(predictions_intervals.head(3))

Unnamed: 0,item_1
2014-07-16,24.889346
2014-07-17,24.213885
2014-07-18,24.09308


Unnamed: 0,item_1,item_1_lower_bound,item_1_upper_bound,item_2,item_2_lower_bound,item_2_upper_bound
2014-07-16,24.889346,21.882081,28.050743,11.270633,7.073703,16.240086
2014-07-17,24.213885,20.824267,27.560137,11.661207,6.691828,17.358172
2014-07-18,24.09308,20.809975,27.795983,11.787952,6.217752,17.597177


In [24]:
# Backtesting Multi Series
# ==============================================================================
metrics_levels, backtest_predictions = backtesting_forecaster_multiseries(
                                           forecaster         = forecaster,
                                           series             = data,
                                           levels             = None,
                                           steps              = 24,
                                           metric             = 'mean_absolute_error',
                                           initial_train_size = len(data_train),
                                           refit              = True,
                                           fixed_train_size   = True,
                                           verbose            = False
                                       )

print("Backtest metrics")
display(metrics_levels)
print("")
print("Backtest predictions")
backtest_predictions.head(4)

Backtest metrics


Unnamed: 0,levels,mean_absolute_error
0,item_1,1.554776
1,item_2,2.483191
2,item_3,3.268641



Backtest predictions


Unnamed: 0,item_1,item_2,item_3
2014-07-16,24.889346,11.270633,11.981894
2014-07-17,24.213885,11.661207,12.923297
2014-07-18,24.09308,11.787952,13.033283
2014-07-19,24.09974,11.816027,12.926981


In [28]:

# Grid search Multi Series
# ==============================================================================
param_grid = {'alpha': [0.01, 0.1, 1]}

levels = ['item_1', 'item_2', 'item_3']

results = grid_search_forecaster_multiseries(
              forecaster          = forecaster,
              series              = data,
              exog                = None,
              levels              = levels,
              param_grid          = param_grid,
              steps               = 24,
              metric              = 'mean_absolute_error',
              initial_train_size  = len(data_train),
              refit               = True,
              fixed_train_size    = True,
              return_best         = False,
              verbose             = False
          )

results

3 models compared for 3 level(s). Number of iterations: 3.


loop lags_grid: 100%|███████████████████████████████████████| 1/1 [00:03<00:00,  3.08s/it]


Unnamed: 0,levels,lags,params,mean_absolute_error,alpha
2,"[item_1, item_2, item_3]",custom predictors,{'alpha': 1},2.435536,1.0
1,"[item_1, item_2, item_3]",custom predictors,{'alpha': 0.1},2.435667,0.1
0,"[item_1, item_2, item_3]",custom predictors,{'alpha': 0.01},2.43568,0.01
