In [2]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Add the parent of the current working directory to the import path at
# position 1 -- presumably so the local skforecast checkout is the one imported
# below (TODO confirm).
sys.path.insert(1, str(Path.cwd().parent))
# Echo the path that was just inserted as a quick sanity check.
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom

In [4]:
# Download data
# ==============================================================================
# Read the h2o series, index it by month-start dates and keep only column 'y'.
url = 'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/data/h2o.csv'
data = (
    pd.read_csv(url, sep=',', header=0, names=['y', 'date'])
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y/%m/%d'))
    .set_index('date')
    .asfreq('MS')
    .loc[:, 'y']
)

# Two-series frame: the original values and the same values multiplied by 10.
data = pd.concat(
    [data.rename('series_1'), (data * 10).rename('series_2')],
    axis=1
)

# Synthetic exogenous variables: a 0..n-1 ramp and the same ramp times 10.
exog_1 = pd.Series(np.arange(len(data)), index=data.index)
exog_2 = exog_1 * 10
exog = pd.concat([exog_1, exog_2], axis=1, keys=['exog_1', 'exog_2'])

# Train = everything up to the end of 2007, test = 2008 onwards.
data_train, data_test = data.loc[:'2007-12-31'], data.loc['2008-01-01 00:00:00':]
exog_train, exog_test = exog.loc[:'2007-12-31'], exog.loc['2008-01-01 00:00:00':]

In [5]:
# Custom function to create predictors
# ==============================================================================
def create_predictors(y):
    """
    Return the three most recent values of `y`, newest first.

    The result is ordered as (lag 1, lag 2, lag 3). If `y` holds fewer than
    three values, all of them are returned in reversed order.
    """
    n_lags = 3
    newest_first = y[::-1]

    return newest_first[:n_lags]

In [6]:
# Create forecaster
# ==============================================================================
# window_size and the three names in name_predictors both line up with the
# three values returned by create_predictors.
forecaster = ForecasterAutoregMultiSeriesCustom(
                 regressor       = Ridge(random_state=123),
                 fun_predictors  = create_predictors,
                 window_size     = 3,
                 name_predictors = ['lag_1', 'lag_2', 'lag_3']
             )

In [7]:
# Build the training matrices from the full `data` / `exog` (not the train
# split); the result is unpacked into four objects.
X_train, y_train, y_index, y_train_index = forecaster.create_train_X_y(series=data, exog=exog)

In [8]:
# Show the predictor names configured on the forecaster.
forecaster.name_predictors

['lag_1', 'lag_2', 'lag_3']

In [9]:
# Inspect the training matrix: lag columns, exog columns and one 0/1 column
# per series.
X_train

Unnamed: 0,lag_1,lag_2,lag_3,exog_1,exog_2,series_1,series_2
0,0.432159,0.400906,0.429795,3.0,30.0,1.0,0.0
1,0.492543,0.432159,0.400906,4.0,40.0,1.0,0.0
2,0.502369,0.492543,0.432159,5.0,50.0,1.0,0.0
3,0.602652,0.502369,0.492543,6.0,60.0,1.0,0.0
4,0.660119,0.602652,0.502369,7.0,70.0,1.0,0.0
...,...,...,...,...,...,...,...
397,12.199410,11.765890,11.635343,199.0,1990.0,0.0,1.0
398,7.618220,12.199410,11.765890,200.0,2000.0,0.0,1.0
399,6.494350,7.618220,12.199410,201.0,2010.0,0.0,1.0
400,8.278870,6.494350,7.618220,202.0,2020.0,0.0,1.0


In [10]:
# Same check repeated after create_train_X_y -- presumably to confirm the
# configured names were not altered by that call.
forecaster.name_predictors

['lag_1', 'lag_2', 'lag_3']

In [11]:
# Fit on the training split only, then display the forecaster's summary repr.
forecaster.fit(series=data_train, exog=exog_train)
forecaster

ForecasterAutoregMultiSeriesCustom 
Regressor: Ridge(random_state=123) 
Predictors created with function: create_predictors 
Transformer for series: None 
Transformer for exog: None 
Window size: 3 
Series levels (names): ['series_1', 'series_2'] 
Series weights: None 
Weight function included: False 
Exogenous included: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: [Timestamp('1991-07-01 00:00:00'), Timestamp('2007-12-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
Creation date: 2023-03-05 10:37:18 
Last fit date: 2023-03-05 10:37:19 
Skforecast version: 0.7.0 
Python version: 3.10.9 
Forecaster id: None 

In [12]:
# Point forecasts for 3 steps; with levels=None the output has one column per
# series. The training data is passed explicitly as last_window.
forecaster.predict(levels=None, steps=3, exog=exog_test, last_window=data_train)

Unnamed: 0,series_1,series_2
2008-01-01,1.551018,10.876073
2008-02-01,1.819297,10.182508
2008-03-01,2.028878,9.642368


In [13]:
# Same forecast with lower/upper bound columns per series, using in-sample
# residuals.
forecaster.predict_interval(levels=None, steps=3, exog=exog_test, last_window=data_train, in_sample_residuals=True)

Unnamed: 0,series_1,series_1_lower_bound,series_1_upper_bound,series_2,series_2_lower_bound,series_2_upper_bound
2008-01-01,1.551018,1.141742,1.92583,10.876073,7.474456,12.612808
2008-02-01,1.819297,1.252981,2.358647,10.182508,6.556179,12.376842
2008-03-01,2.028878,1.390835,2.589698,9.642368,6.032644,11.977512


In [14]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top.
from scipy.stats import norm

# Per-step parameters of a normal distribution (loc / scale columns per series).
forecaster.predict_dist(levels=None, steps=3, exog=exog_test, last_window=data_train, in_sample_residuals=True, distribution=norm)

Unnamed: 0,series_1_loc,series_1_scale,series_2_loc,series_2_scale
2008-01-01,1.549241,0.271071,10.933677,1.296312
2008-02-01,1.807914,0.331204,10.137603,1.728709
2008-03-01,2.00067,0.372886,9.546498,1.844489


In [15]:
# Bootstrapped predictions: a dict keyed by series name, each value a frame
# with one pred_boot_* column per bootstrap iteration.
forecaster.predict_bootstrapping(levels=None, steps=3, exog=exog_test, last_window=data_train, in_sample_residuals=True)

{'series_1':             pred_boot_0  pred_boot_1  pred_boot_2  pred_boot_3  pred_boot_4  \
 2008-01-01     1.730726     1.634672     1.620305     1.923553     1.584633   
 2008-02-01     1.965297     1.626658     2.090960     2.041083     1.789307   
 2008-03-01     2.186154     1.869448     1.881356     2.450151     2.271959   
 
             pred_boot_5  pred_boot_6  pred_boot_7  pred_boot_8  pred_boot_9  \
 2008-01-01     1.816138     1.882047     1.299869     1.814463     1.603736   
 2008-02-01     2.240179     2.126442     1.592939     2.310003     1.918924   
 2008-03-01     2.684806     2.322991     1.752885     2.059541     2.454869   
 
             ...  pred_boot_490  pred_boot_491  pred_boot_492  pred_boot_493  \
 2008-01-01  ...       1.205852       1.443138       1.325146       1.852428   
 2008-02-01  ...       2.158258       1.695884       2.183525       2.366901   
 2008-03-01  ...       2.404712       1.798815       2.088458       2.063198   
 
             pred_boot

In [16]:
# Reuse the stored in-sample residuals as the out-of-sample residuals.
forecaster.set_out_sample_residuals(forecaster.in_sample_residuals)

In [17]:
# One importance value per training feature (lags, exog and series columns).
forecaster.get_feature_importance()

Unnamed: 0,feature,importance
0,lag_1,0.718542
1,lag_2,0.035249
2,lag_3,-0.119443
3,exog_1,5.1e-05
4,exog_2,0.000512
5,series_1,-1.270146
6,series_2,1.270146


In [18]:
# Change the regressor's random_state; the repr below shows the new value
# while the last fit date stays the same.
forecaster.set_params({'random_state':9999})
forecaster

ForecasterAutoregMultiSeriesCustom 
Regressor: Ridge(random_state=9999) 
Predictors created with function: create_predictors 
Transformer for series: None 
Transformer for exog: None 
Window size: 3 
Series levels (names): ['series_1', 'series_2'] 
Series weights: None 
Weight function included: False 
Exogenous included: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: [Timestamp('1991-07-01 00:00:00'), Timestamp('2007-12-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 9999, 'solver': 'auto', 'tol': 0.0001} 
Creation date: 2023-03-05 10:37:18 
Last fit date: 2023-03-05 10:37:19 
Skforecast version: 0.7.0 
Python version: 3.10.9 
Forecaster id: None 

In [19]:
# Print the same summary information via the explicit summary() method.
forecaster.summary()

ForecasterAutoregMultiSeriesCustom 
Regressor: Ridge(random_state=9999) 
Predictors created with function: create_predictors 
Transformer for series: None 
Transformer for exog: None 
Window size: 3 
Series levels (names): ['series_1', 'series_2'] 
Series weights: None 
Weight function included: False 
Exogenous included: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: [Timestamp('1991-07-01 00:00:00'), Timestamp('2007-12-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 9999, 'solver': 'auto', 'tol': 0.0001} 
Creation date: 2023-03-05 10:37:18 
Last fit date: 2023-03-05 10:37:19 
Skforecast version: 0.7.0 
Python version: 3.10.9 
Forecaster id: None 



In [20]:
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries

In [21]:
# Data download and preparation
# ==============================================================================
# Read the simulated item sales, parse the date column, index by it at daily
# frequency and sort chronologically.
url = (
    'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/'
    'data/simulated_items_sales.csv'
)
data = (
    pd.read_csv(url, sep=',')
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d'))
    .set_index('date')
    .asfreq('D')
    .sort_index()
)
data.head()

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737


In [22]:
# Split data into train-test
# ==============================================================================
# The boundary is set to 23:59 so it falls between two daily timestamps: the
# label-based slices below then share no row (2014-07-15 ends the train set,
# 2014-07-16 starts the test set).
end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train, :].copy()
data_test  = data.loc[end_train:, :].copy()

# Report the date range and number of rows of each split.
print(f"Train dates : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
print(f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00  (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00  (n=170)


In [23]:

# Create and fit forecaster multi series
# ==============================================================================
# NOTE(review): window_size is 10 here although create_predictors only reads
# the last 3 values of the window -- confirm the larger window is intentional.
# name_predictors is not passed in this cell.
forecaster = ForecasterAutoregMultiSeriesCustom(
                 regressor          = Ridge(random_state=123),
                 fun_predictors     = create_predictors, 
                 window_size        = 10,
                 transformer_series = None,
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None
             )

# Fit on the training split (no exogenous variables) and display the summary.
forecaster.fit(series=data_train)
forecaster

ForecasterAutoregMultiSeriesCustom 
Regressor: Ridge(random_state=123) 
Predictors created with function: create_predictors 
Transformer for series: None 
Transformer for exog: None 
Window size: 10 
Series levels (names): ['item_1', 'item_2', 'item_3'] 
Series weights: None 
Weight function included: False 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: [Timestamp('2012-01-01 00:00:00'), Timestamp('2014-07-15 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
Creation date: 2023-03-05 10:37:25 
Last fit date: 2023-03-05 10:37:25 
Skforecast version: 0.7.0 
Python version: 3.10.9 
Forecaster id: None 

In [24]:

# Predict and predict_interval
# ==============================================================================
# Forecast horizon shared by both calls below.
steps = 24

# Predictions for item_1 (levels given as a single string)
predictions_item_1 = forecaster.predict(steps=steps, levels='item_1')
display(predictions_item_1.head(3))

# Interval predictions for item_1 and item_2 (levels given as a list)
predictions_intervals = forecaster.predict_interval(steps=steps, levels=['item_1', 'item_2'])
display(predictions_intervals.head(3))

Unnamed: 0,item_1
2014-07-16,24.676247
2014-07-17,23.89091
2014-07-18,23.894093


Unnamed: 0,item_1,item_1_lower_bound,item_1_upper_bound,item_2,item_2_lower_bound,item_2_upper_bound
2014-07-16,24.676247,21.408197,27.712555,11.442835,7.286725,16.825325
2014-07-17,23.89091,20.189607,27.424957,12.527598,7.14971,18.692713
2014-07-18,23.894093,20.169258,28.029639,13.107934,7.244975,19.436806


In [25]:
# Backtesting Multi Series
# ==============================================================================
# Backtest over the full `data`; the first len(data_train) rows form the
# initial training set. The call returns per-level metrics and the
# backtest predictions.
metrics_levels, backtest_predictions = backtesting_forecaster_multiseries(
                                           forecaster         = forecaster,
                                           series             = data,
                                           levels             = None,
                                           steps              = 24,
                                           metric             = 'mean_absolute_error',
                                           initial_train_size = len(data_train),
                                           refit              = True,
                                           fixed_train_size   = True,
                                           verbose            = False
                                       )

# Show the metric table first, then the head of the predictions as cell output.
print("Backtest metrics")
display(metrics_levels)
print("")
print("Backtest predictions")
backtest_predictions.head(4)

Backtest metrics


Unnamed: 0,levels,mean_absolute_error
0,item_1,1.72998
1,item_2,3.206235
2,item_3,3.819735



Backtest predictions


Unnamed: 0,item_1,item_2,item_3
2014-07-16,24.676247,11.442835,12.172567
2014-07-17,23.89091,12.527598,12.94245
2014-07-18,23.894093,13.107934,13.452817
2014-07-19,23.700115,13.598445,14.131863


In [26]:

# Grid search Multi Series
# ==============================================================================
# Candidate values for the Ridge regularisation strength.
param_grid = {'alpha': [0.01, 0.1, 1]}

# Series the search is evaluated on.
levels = ['item_1', 'item_2', 'item_3']

# return_best=False: the forecaster passed in is not requested to be replaced
# by the best configuration; only the results table is used below.
results = grid_search_forecaster_multiseries(
              forecaster          = forecaster,
              series              = data,
              exog                = None,
              levels              = levels,
              param_grid          = param_grid,
              steps               = 24,
              metric              = 'mean_absolute_error',
              initial_train_size  = len(data_train),
              refit               = True,
              fixed_train_size    = True,
              return_best         = False,
              verbose             = False
          )

results

3 models compared for 3 level(s). Number of iterations: 3.


loop lags_grid: 100%|███████████████████████████████████████| 1/1 [00:00<00:00,  1.06it/s]


Unnamed: 0,levels,lags,params,mean_absolute_error,alpha
2,"[item_1, item_2, item_3]",custom predictors,{'alpha': 1},2.91865,1.0
1,"[item_1, item_2, item_3]",custom predictors,{'alpha': 0.1},2.918977,0.1
0,"[item_1, item_2, item_3]",custom predictors,{'alpha': 0.01},2.91901,0.01


In [27]:
# Unit test get_feature_importance ForecasterAutoregMultiSeriesCustom
# ==============================================================================
import re
import pytest
from pytest import approx
import numpy as np
import pandas as pd
from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from sklearn.exceptions import NotFittedError
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

# Fixtures
# Two identical integer ramps (0..9), one column per series level.
series = pd.DataFrame(
    {level: pd.Series(np.arange(10)) for level in ('1', '2')}
)


def create_predictors(y): # pragma: no cover
    """
    Return the last three values of `y` in reverse order (lag 1, lag 2, lag 3).

    If `y` holds fewer than three values, all of them are returned reversed.
    """
    last_three = y[-3:]

    return last_three[::-1]

In [31]:
# Fixtures ForecasterAutoregMultiSeriesCustom
# ==============================================================================
import numpy as np
import pandas as pd

# Fixtures
# The literal values below appear to have been produced by the following
# snippet; it is kept only as a provenance note and is not meant to be run:
# np.random.seed(123)
# series_1 = np.random.rand(50)
# series_2 = np.random.rand(50)
# exog = np.random.rand(50)
# Two series of 50 hard-coded values each, stored under the level names '1' and '2'.
series = pd.DataFrame({'1': pd.Series(np.array(
                                [0.69646919, 0.28613933, 0.22685145, 0.55131477, 0.71946897,
                                 0.42310646, 0.9807642 , 0.68482974, 0.4809319 , 0.39211752,
                                 0.34317802, 0.72904971, 0.43857224, 0.0596779 , 0.39804426,
                                 0.73799541, 0.18249173, 0.17545176, 0.53155137, 0.53182759,
                                 0.63440096, 0.84943179, 0.72445532, 0.61102351, 0.72244338,
                                 0.32295891, 0.36178866, 0.22826323, 0.29371405, 0.63097612,
                                 0.09210494, 0.43370117, 0.43086276, 0.4936851 , 0.42583029,
                                 0.31226122, 0.42635131, 0.89338916, 0.94416002, 0.50183668,
                                 0.62395295, 0.1156184 , 0.31728548, 0.41482621, 0.86630916,
                                 0.25045537, 0.48303426, 0.98555979, 0.51948512, 0.61289453]
                                )
                            ), 
                       '2': pd.Series(np.array(
                                [0.12062867, 0.8263408 , 0.60306013, 0.54506801, 0.34276383,
                                 0.30412079, 0.41702221, 0.68130077, 0.87545684, 0.51042234,
                                 0.66931378, 0.58593655, 0.6249035 , 0.67468905, 0.84234244,
                                 0.08319499, 0.76368284, 0.24366637, 0.19422296, 0.57245696,
                                 0.09571252, 0.88532683, 0.62724897, 0.72341636, 0.01612921,
                                 0.59443188, 0.55678519, 0.15895964, 0.15307052, 0.69552953,
                                 0.31876643, 0.6919703 , 0.55438325, 0.38895057, 0.92513249,
                                 0.84167   , 0.35739757, 0.04359146, 0.30476807, 0.39818568,
                                 0.70495883, 0.99535848, 0.35591487, 0.76254781, 0.59317692,
                                 0.6917018 , 0.15112745, 0.39887629, 0.2408559 , 0.34345601]
                                )
                            )
                      }
         )
    
# Exogenous fixture with 50 rows: a numeric column 'col_1' of hard-coded values
# and a categorical column 'col_2' ('a' for the first 25 rows, 'b' for the rest).
exog = pd.DataFrame({'col_1': pd.Series(np.array(
                                [0.51312815, 0.66662455, 0.10590849, 0.13089495, 0.32198061,
                                 0.66156434, 0.84650623, 0.55325734, 0.85445249, 0.38483781,
                                 0.3167879 , 0.35426468, 0.17108183, 0.82911263, 0.33867085,
                                 0.55237008, 0.57855147, 0.52153306, 0.00268806, 0.98834542,
                                 0.90534158, 0.20763586, 0.29248941, 0.52001015, 0.90191137,
                                 0.98363088, 0.25754206, 0.56435904, 0.80696868, 0.39437005,
                                 0.73107304, 0.16106901, 0.60069857, 0.86586446, 0.98352161,
                                 0.07936579, 0.42834727, 0.20454286, 0.45063649, 0.54776357,
                                 0.09332671, 0.29686078, 0.92758424, 0.56900373, 0.457412  ,
                                 0.75352599, 0.74186215, 0.04857903, 0.7086974 , 0.83924335]
                                )
                              ),
                     'col_2': ['a']*25 + ['b']*25}
       )

# Copy of `exog` re-indexed to positions 50..99, i.e. the 50 rows that follow
# the default 0..49 index of the fixtures above.
exog_predict = exog.set_axis(pd.RangeIndex(start=50, stop=100), axis=0)

def create_predictors(y): # pragma: no cover
    """
    Build the predictors from the tail of `y`: its three most recent values,
    most recent first. Shorter inputs are returned whole, reversed.
    """
    n_lags = 3

    return y[::-1][:n_lags]


In [43]:
def test_predict_interval_output_when_regressor_is_LinearRegression_with_transform_series_and_transform_exog():
    """
    Check predict_interval against stored reference values when the regressor
    is LinearRegression, the series are transformed with StandardScaler and the
    exogenous variables go through a ColumnTransformer (StandardScaler on
    'col_1', OneHotEncoder on 'col_2').
    """
    exog_pipeline = ColumnTransformer(
        [('scale', StandardScaler(), ['col_1']),
         ('onehot', OneHotEncoder(), ['col_2'])],
        remainder='passthrough',
        verbose_feature_names_out=False
    )
    model = ForecasterAutoregMultiSeriesCustom(
        regressor=LinearRegression(),
        fun_predictors=create_predictors,
        window_size=3,
        transformer_series=StandardScaler(),
        transformer_exog=exog_pipeline,
    )
    model.fit(series=series, exog=exog)
    predictions = model.predict_interval(steps=5, levels=['1', '2'], exog=exog_predict)

    # Reference values: one row per step; per level the prediction followed by
    # its lower and upper bound.
    expected_values = np.array([
        [0.50201669, 0.10891904, 0.90089875, 0.52531076, 0.10696675, 0.9156779],
        [0.49804821, 0.09776535, 0.88356798, 0.51683613, 0.08385857, 0.92154261],
        [0.59201747, 0.1880649, 0.96148336, 0.61509778, 0.18270673, 1.02114822],
        [0.60179565, 0.216096, 0.9880778, 0.60446614, 0.18396533, 0.98997181],
        [0.56736867, 0.17944505, 0.97036278, 0.56524059, 0.13232921, 0.97872883],
    ])
    expected_columns = ['1', '1_lower_bound', '1_upper_bound',
                        '2', '2_lower_bound', '2_upper_bound']
    expected = pd.DataFrame(
        data=expected_values,
        index=pd.RangeIndex(start=50, stop=55, step=1),
        columns=expected_columns
    )

    pd.testing.assert_frame_equal(predictions, expected)

# Run the test immediately so a failure surfaces in the cell output.
test_predict_interval_output_when_regressor_is_LinearRegression_with_transform_series_and_transform_exog()


In [42]:
# Scratch cell repeating the body of the test defined in cell In [43], printing
# the raw prediction array -- presumably the source of that test's expected
# values (this cell's execution count, 42, precedes the test's).
# Scale 'col_1', one-hot encode 'col_2', pass any other column through.
transformer_exog = ColumnTransformer(
                        [('scale', StandardScaler(), ['col_1']),
                            ('onehot', OneHotEncoder(), ['col_2'])],
                        remainder = 'passthrough',
                        verbose_feature_names_out = False
                    )
forecaster = ForecasterAutoregMultiSeriesCustom(
                    regressor          = LinearRegression(),
                    fun_predictors     = create_predictors,
                    window_size        = 3,
                    transformer_series = StandardScaler(),
                    transformer_exog   = transformer_exog,
                )
forecaster.fit(series=series, exog=exog)
predictions = forecaster.predict_interval(steps=5, levels=['1', '2'], exog=exog_predict)

# Raw values only, in a form that can be pasted into np.array([...]).
predictions.values

array([[0.50201669, 0.10891904, 0.90089875, 0.52531076, 0.10696675,
        0.9156779 ],
       [0.49804821, 0.09776535, 0.88356798, 0.51683613, 0.08385857,
        0.92154261],
       [0.59201747, 0.1880649 , 0.96148336, 0.61509778, 0.18270673,
        1.02114822],
       [0.60179565, 0.216096  , 0.9880778 , 0.60446614, 0.18396533,
        0.98997181],
       [0.56736867, 0.17944505, 0.97036278, 0.56524059, 0.13232921,
        0.97872883]])

In [69]:
# Forecaster without name_predictors: the output below shows the predictor
# columns named custom_predictor_0..2.
forecaster = ForecasterAutoregMultiSeriesCustom(
                    regressor       = LinearRegression(),
                    fun_predictors  = create_predictors,
                    window_size     = 3
                )
# NOTE(review): this reassigns the global `series`, replacing the 50-row fixture
# used by the test cell above; re-running that cell afterwards will see this
# 7-row frame instead.
series = pd.DataFrame({'1': pd.Series(np.arange(7, dtype=float)), 
                        '2': pd.Series(np.arange(7, dtype=float))
                        })

# Keep the full return value and display only its first element, the
# predictor matrix.
results = forecaster.create_train_X_y(series=series)
results[0]

Unnamed: 0,custom_predictor_0,custom_predictor_1,custom_predictor_2,1,2
0,2.0,1.0,0.0,1.0,0.0
1,3.0,2.0,1.0,1.0,0.0
2,4.0,3.0,2.0,1.0,0.0
3,5.0,4.0,3.0,1.0,0.0
4,2.0,1.0,0.0,0.0,1.0
5,3.0,2.0,1.0,0.0,1.0
6,4.0,3.0,2.0,0.0,1.0
7,5.0,4.0,3.0,0.0,1.0
