In [1]:
# Notebook setup
# ==============================================================================
# Load the autoreload extension and set it to mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Disabled development hook: when uncommented, it inserts a local skforecast
# checkout near the front of the import path. `sys` is only needed for this line.
# NOTE(review): the path is an absolute, machine-specific location.
#sys.path.insert(1, '/home/ximo/Documents/GitHub/skforecast')
# Turn off jedi in the IPython completer.
%config Completer.use_jedi = False

In [2]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster

Since version 0.4.0, skforecast allows using sklearn pipelines as regressors. This is useful since many machine learning models need specific data preprocessing transformations. For example, linear models with Ridge or Lasso regularization benefit from features being scaled.

> **⚠ WARNING:**  
> Version 0.4 of the skforecast library does not allow including a `ColumnTransformer` in the pipeline used as regressor, so if the preprocessing transformations only apply to some specific columns, they have to be applied to the data set before training the model. A more detailed example can be found [here](https://www.cienciadedatos.net/documentos/py39-forecasting-time-series-with-skforecast-xgboost-lightgbm-catboost.html#Exogenous-variables).

In [3]:
# Download and prepare data
# ==============================================================================
# The CSV is read with explicit column names, the `date` column is parsed to
# datetime and used as the index, and the index is set to month-start ('MS')
# frequency. Everything is done in one chain so the cell is idempotent.
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/data/h2o_exog.csv')
data = (
    pd.read_csv(url, sep=',', header=0, names=['date', 'y', 'exog_1', 'exog_2'])
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y/%m/%d'))
    .set_index('date')
    .asfreq('MS')
)

In [4]:
# Pipeline used as regressor: StandardScaler followed by Ridge
# ==============================================================================
# make_pipeline names each step automatically; the displayed repr shows the
# generated names ('standardscaler', 'ridge').
pipe = make_pipeline(StandardScaler(), Ridge())
pipe

Pipeline(steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [5]:
# Build the forecaster around the pipeline and train it
# ==============================================================================
# The pipeline is passed as the regressor; the forecaster is configured with
# 10 lags and trained with the two exogenous columns.
forecaster = ForecasterAutoreg(regressor=pipe, lags=10)
forecaster.fit(
    y    = data['y'],
    exog = data[['exog_1', 'exog_2']]
)

# Display the forecaster summary
forecaster

ForecasterAutoreg 
Regressor: Pipeline(steps=[('standardscaler', StandardScaler()), ('ridge', Ridge())]) 
Lags: [ 1  2  3  4  5  6  7  8  9 10] 
Window size: 10 
Included exogenous: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: [Timestamp('1992-04-01 00:00:00'), Timestamp('2008-06-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'ridge__alpha': 1.0, 'ridge__copy_X': True, 'ridge__fit_intercept': True, 'ridge__max_iter': None, 'ridge__normalize': 'deprecated', 'ridge__positive': False, 'ridge__random_state': None, 'ridge__solver': 'auto', 'ridge__tol': 0.001} 
Creation date: 2022-01-02 16:47:46 
Last fit date: 2022-01-02 16:47:46 
Skforecast version: 0.4.2 

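When performing a grid search over a sklearn pipeline, the name of each hyperparameter must be prefixed with the name of the pipeline step it belongs to, followed by a double underscore (for example, `ridge__alpha` for the `alpha` parameter of the `ridge` step).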

In [6]:
# Grid search over regressor hyperparameters and lags
# ==============================================================================
# A new pipeline and forecaster are created for the search. The `lags` value
# given here is only a placeholder: it is replaced by each entry of `lags_grid`.
pipe = make_pipeline(StandardScaler(), Ridge())
forecaster = ForecasterAutoreg(regressor=pipe, lags=10)

# Candidate hyperparameters, keyed as `<pipeline step>__<parameter>`
param_grid = {'ridge__alpha': np.logspace(-3, 5, 10)}

# Candidate lag configurations used as predictors
lags_grid = [5, 24, [1, 2, 3, 23, 24]]

# Observations up to and including 2000-04-01 form the initial training set.
# With return_best=True the forecaster is updated with the best configuration.
results_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=data['y'],
    exog=data[['exog_1', 'exog_2']],
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=5,
    metric='mean_absolute_error',
    refit=False,
    initial_train_size=len(data.loc[:'2000-04-01']),
    return_best=True,
    verbose=False,
)

Number of models compared: 30


loop lags_grid:   0%|                                               | 0/3 [00:00<?, ?it/s]
loop param_grid:   0%|                                             | 0/10 [00:00<?, ?it/s][A
loop param_grid:  10%|███▋                                 | 1/10 [00:00<00:01,  7.71it/s][A
loop param_grid:  30%|███████████                          | 3/10 [00:00<00:00, 10.74it/s][A
loop param_grid:  50%|██████████████████▌                  | 5/10 [00:00<00:00, 12.45it/s][A
loop param_grid:  70%|█████████████████████████▉           | 7/10 [00:00<00:00, 13.30it/s][A
loop param_grid:  90%|█████████████████████████████████▎   | 9/10 [00:00<00:00, 12.91it/s][A
loop lags_grid:  33%|█████████████                          | 1/3 [00:00<00:01,  1.17it/s][A
loop param_grid:   0%|                                             | 0/10 [00:00<?, ?it/s][A
loop param_grid:  20%|███████▍                             | 2/10 [00:00<00:00, 11.57it/s][A
loop param_grid:  40%|██████████████▊                      | 4/

Refitting `forecaster` using the best found lags and parameters and the whole data set: 
  Lags: [1 2 3 4 5] 
  Parameters: {'ridge__alpha': 0.001}
  Backtesting metric: 6.845311709573567e-05



In [7]:
# Print the grid search results as a GitHub-flavoured markdown table
print(results_grid.to_markdown(tablefmt="github"))

|    | lags                                                                      | params                                 |      metric |    ridge__alpha |
|----|---------------------------------------------------------------------------|----------------------------------------|-------------|-----------------|
|  0 | [1 2 3 4 5]                                                               | {'ridge__alpha': 0.001}                | 6.84531e-05 |      0.001      |
| 10 | [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] | {'ridge__alpha': 0.001}                | 0.000187797 |      0.001      |
|  1 | [1 2 3 4 5]                                                               | {'ridge__alpha': 0.007742636826811269} | 0.000526168 |      0.00774264 |
| 11 | [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] | {'ridge__alpha': 0.007742636826811269} | 0.00141293  |      0.00774264 |
|  2 | [1 2 3 4 5]                                              