In [17]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [2]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries
from skforecast.model_selection_multiseries import random_search_forecaster_multiseries

In [3]:
# Data download
# ==============================================================================
url = (
       'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/'
       'data/guangyuan_air_pollution.csv'
)
data = pd.read_csv(url, sep=',')

# Data preparation
# ==============================================================================
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
data = data.set_index('date')
data = data.asfreq('D')
data = data.sort_index()
data = data[['CO', 'SO2', 'PM2.5']]
data.head()

Unnamed: 0_level_0,CO,SO2,PM2.5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-03-01,9600.0,204.0,181.0
2013-03-02,20198.0,674.0,633.0
2013-03-03,47195.0,1661.0,1956.0
2013-03-04,15000.0,485.0,438.0
2013-03-05,59594.0,2001.0,3388.0


In [4]:
# Create and fit forecaster MultiVariate
# ==============================================================================
forecaster = ForecasterAutoregMultiVariate(
                 regressor          = Ridge(random_state=123),
                 level              = 'PM2.5',
                 lags               = {'CO': 2, 'SO2': 2, 'PM2.5': None},
                 steps              = 7,
                 transformer_series = None,
                 transformer_exog   = None,
                 weight_func        = None,
                 n_jobs             = 'auto'
             )

forecaster.fit(series=data)
forecaster

ForecasterAutoregMultiVariate 
Regressor: Ridge(random_state=123) 
Lags: {'CO': array([1, 2]), 'SO2': array([1, 2]), 'PM2.5': None} 
Transformer for series: None 
Transformer for exog: None 
Weight function included: False 
Window size: 2 
Target series, level: PM2.5 
Multivariate series (names): ['CO', 'SO2', 'PM2.5'] 
Maximum steps predicted: 7 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: [Timestamp('2013-03-01 00:00:00'), Timestamp('2017-02-28 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': 123, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-02-25 18:41:19 
Last fit date: 2024-02-25 18:41:19 
Skforecast version: 0.12.0 
Python version: 3.11.5 
Forecaster id: None 

In [5]:
forecaster.last_window

Unnamed: 0_level_0,CO,SO2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-02-27,29700.0,451.0
2017-02-28,13200.0,162.0


In [6]:
forecaster.predict(steps=7).head()

Unnamed: 0,PM2.5
2017-03-01,1269.395659
2017-03-02,1658.617492
2017-03-03,1865.249156
2017-03-04,1945.549921
2017-03-05,1920.196962


In [7]:
lw = data.iloc[-2:]

forecaster.predict(steps=7, last_window=lw).head()

ValueError: `last_window` columns must be the same as the `series` column names used to create the X_train matrix.
    `last_window` columns : ['CO', 'SO2', 'PM2.5'].
    `series` columns      : ['CO', 'SO2'].

In [None]:
forecaster.lags_

{'l1': None, 'l2': array([1, 2, 3, 4, 5])}

In [None]:
forecaster.lags

{'l1': None, 'l2': array([1, 2, 3, 4, 5])}

In [None]:
np.hstack([np.ones(5), np.zeros(5)])

array([1., 1., 1., 1., 1., 0., 0., 0., 0., 0.])

In [None]:
series_col_names = ['l1', 'l2']
level = 'l2'
other_dict = {'l1': 3, 'l2': None}

In [None]:
cols_to_matrix = {
    col: ('both' if col == level else 'X')
    for col in series_col_names
    if col == level or other_dict.get(col) is not None
}

# Ajustamos el valor de 'level' en caso de que su valor en other_dict sea None
if other_dict.get(level) is None:
    cols_to_matrix[level] = 'y'

cols_to_matrix

{'l1': 'X', 'l2': 'y'}

In [None]:
result_dict = {
    col: 'X' if col != level else ('both' if other_dict.get(level) is not None else 'y')
    for col in series_col_names
}
result_dict

{'l1': 'y', 'l2': 'X'}

In [None]:
other_dict.get(level)

In [None]:
X_train_col_names_list = []

In [None]:
X_train_col_names_list.extend([1, 2, 3])
X_train_col_names_list

[1, 2, 3]

In [None]:
X_train_col_names_list.extend([4, 5, 6])
X_train_col_names_list

[1, 2, 3, 4, 5, 6]

In [None]:
series = pd.DataFrame({'l1': pd.Series(np.arange(10)), 
                           'l2': pd.Series(np.arange(100, 110))})
exog = None

forecaster = ForecasterAutoregMultiVariate(Ridge(), level='l2',
                                            lags={'l1': 3, 'l2': None}, 
                                            steps=2, transformer_series=None)
results = forecaster.create_train_X_y(series=series, exog=exog)

In [None]:
results[0]

Unnamed: 0,l1_lag_1,l1_lag_2,l1_lag_3
4,2.0,1.0,0.0
5,3.0,2.0,1.0
6,4.0,3.0,2.0
7,5.0,4.0,3.0
8,6.0,5.0,4.0
9,7.0,6.0,5.0


In [None]:
results[1]

{1: 3    103.0
 4    104.0
 5    105.0
 6    106.0
 7    107.0
 8    108.0
 Name: l2_step_1, dtype: float64,
 2: 4    104.0
 5    105.0
 6    106.0
 7    107.0
 8    108.0
 9    109.0
 Name: l2_step_2, dtype: float64}

In [None]:
forecaster.transformer_series

In [None]:
forecaster.transformer_series_

{'l1': StandardScaler(), 'l2': StandardScaler()}

In [None]:
forecaster.regressor

In [None]:
forecaster.lags

{'l1': None, 'l2': array([1, 2, 3, 4])}

In [None]:
forecaster.series_col_names

In [None]:
d = {'l1': 3, 'l2': None}
level = 'l1'

series_col_names = ['l1']

In [None]:
d = {'l1': None, 'l2': 3}
level = 'l1'

series_col_names = ['l1', 'l2']

In [None]:
cols_to_create_lags = {
    'l1': 'X',
    'l2': 'both',
    'l3': 'y'
}

series_col_names = ['l1', 'l2', 'l3']

In [None]:
[col for col in series_col_names 
  if cols_to_create_lags[col] in ['X', 'both']]

['l1', 'l2']

In [9]:
import re
import pytest
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor

from skforecast.ForecasterAutoregMultiVariate.tests.fixtures_ForecasterAutoregMultiVariate import series

In [16]:
forecaster = ForecasterAutoregMultiVariate(LinearRegression(), level='l1',
                                               lags={'l1': 5, 'l2': None}, steps=3)
forecaster.fit(series=series)
results = forecaster.predict(steps=3)
results.to_numpy()

array([[0.61119488],
       [0.48858659],
       [0.46753222]])

In [15]:
forecaster.last_window

Unnamed: 0,l1
45,0.250455
46,0.483034
47,0.98556
48,0.519485
49,0.612895


In [20]:
len(set(['l1', 'l2', 'l4']) - set(['l1', 'l2', 'l3'])) > 0

True

In [15]:
import warnings

X_train_col_names = ['l1', 'l2', 'l3']
last_window_cols = ['l1', 'l2', 'l3', 'l4']
last_window_cols = ['l1', 'l2']

print(X_train_col_names, last_window_cols)

['l1', 'l2', 'l3'] ['l1', 'l2']


In [16]:
if len(set(X_train_col_names) - set(last_window_cols)) > 0:
    raise ValueError(
        (f"`last_window` columns must be the same as the `series` "
            f"column names used to create the X_train matrix.\n"
            f"    `last_window` columns    : {list(last_window_cols)}.\n"
            f"    `series` columns X train : {X_train_col_names}.")
    )

if len(set(last_window_cols) - set(X_train_col_names)) > 0:
    warnings.warn(
        (f"`last_window` contains columns that are not used to create the "
         f"X_train matrix. These columns will be ignored.\n"
         f"    `last_window` columns    : {list(last_window_cols)}.\n"
         f"    `series` columns X train : {X_train_col_names}.")
    )

ValueError: `last_window` columns must be the same as the `series` column names used to create the X_train matrix.
    `last_window` columns    : ['l1', 'l2'].
    `series` columns X train : ['l1', 'l2', 'l3'].

In [18]:
import os
import re
import sys
import pytest
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries.model_selection_multiseries import _bayesian_search_optuna_multiseries
import optuna
from optuna.samplers import TPESampler
from tqdm import tqdm
from functools import partialmethod

optuna.logging.set_verbosity(optuna.logging.WARNING)
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) # hide progress bar

# Fixtures
from skforecast.model_selection_multiseries.tests.fixtures_model_selection_multiseries import series

In [20]:
series.head(2)

Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341


In [50]:
forecaster = ForecasterAutoregMultiVariate(
                     regressor          = Ridge(random_state=123),
                     level              = 'l1',
                     lags               = 2,
                     steps              = 3,
                     transformer_series = None
                 )
steps = 3
n_validation = 12

def search_space(trial):
    search_space  = {
        'alpha': trial.suggest_float('alpha', 1e-2, 1.0),
        'lags' : trial.suggest_categorical('lags', [{'l1': 2, 'l2': [1, 3]}, 
                                                    {'l1': None, 'l2': [1, 3]}, 
                                                    {'l1': [1, 3], 'l2': None}])
    }
    
    return search_space

In [51]:
results = _bayesian_search_optuna_multiseries(
                  forecaster         = forecaster,
                  series             = series,
                  steps              = steps,
                  search_space       = search_space,
                  metric             = 'mean_absolute_error',
                  refit              = True,
                  initial_train_size = len(series) - n_validation,
                  fixed_train_size   = True,
                  n_trials           = 10,
                  random_state       = 123,
                  return_best        = False,
                  verbose            = False
              )[0]

  0%|          | 0/10 [00:00<?, ?it/s]

    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l2'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l2'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l2'] 


In [52]:
results

Unnamed: 0,levels,lags,params,mean_absolute_error,alpha
7,[l1],"{'l1': [1, 2], 'l2': [1, 3]}",{'alpha': 0.30077690592494105},0.208448,0.300777
8,[l1],"{'l1': [1, 2], 'l2': [1, 3]}",{'alpha': 0.4365541356963474},0.208803,0.436554
5,[l1],"{'l1': [1, 2], 'l2': [1, 3]}",{'alpha': 0.6380569489658079},0.209237,0.638057
6,[l1],"{'l1': None, 'l2': [1, 3]}",{'alpha': 0.7252189487445193},0.216851,0.725219
1,[l1],"{'l1': None, 'l2': [1, 3]}",{'alpha': 0.7222742800877074},0.216855,0.722274
9,[l1],"{'l1': None, 'l2': [1, 3]}",{'alpha': 0.43208779389318014},0.217337,0.432088
0,[l1],"{'l1': [1, 3], 'l2': None}",{'alpha': 0.6995044937418831},0.220668,0.699504
2,[l1],"{'l1': [1, 3], 'l2': None}",{'alpha': 0.48612258246951734},0.221598,0.486123
3,[l1],"{'l1': [1, 3], 'l2': None}",{'alpha': 0.4441865222328282},0.221803,0.444187
4,[l1],"{'l1': [1, 3], 'l2': None}",{'alpha': 0.190666813148965},0.22324,0.190667


In [53]:
results.dtypes

levels                  object
lags                    object
params                  object
mean_absolute_error    float64
alpha                  float64
dtype: object

In [54]:
results.index

Index([7, 8, 5, 6, 1, 9, 0, 2, 3, 4], dtype='int64')

In [55]:
results.to_numpy()

array([[list(['l1']), {'l1': array([1, 2]), 'l2': array([1, 3])},
        {'alpha': 0.30077690592494105}, 0.20844762947854312,
        0.30077690592494105],
       [list(['l1']), {'l1': array([1, 2]), 'l2': array([1, 3])},
        {'alpha': 0.4365541356963474}, 0.20880336411565956,
        0.4365541356963474],
       [list(['l1']), {'l1': array([1, 2]), 'l2': array([1, 3])},
        {'alpha': 0.6380569489658079}, 0.2092371153650312,
        0.6380569489658079],
       [list(['l1']), {'l1': None, 'l2': array([1, 3])},
        {'alpha': 0.7252189487445193}, 0.21685083725475654,
        0.7252189487445193],
       [list(['l1']), {'l1': None, 'l2': array([1, 3])},
        {'alpha': 0.7222742800877074}, 0.2168551702095223,
        0.7222742800877074],
       [list(['l1']), {'l1': None, 'l2': array([1, 3])},
        {'alpha': 0.43208779389318014}, 0.21733651515831423,
        0.43208779389318014],
       [list(['l1']), {'l1': array([1, 3]), 'l2': None},
        {'alpha': 0.6995044937418831},

In [33]:
forecaster = ForecasterAutoregMultiVariate(
                     regressor          = Ridge(random_state=123),
                     level              = 'l1',
                     lags               = 2,
                     steps              = 3,
                     transformer_series = None
                 )

lags_grid = {
    'lags_1': {'l1': 2, 'l2': 3},
    'lags_2': {'l1': [1, 3], 'l2': 3},
    'lags_3': {'l1': 2, 'l2': [1, 4]},
    'lags_4': {'l1': 2, 'l2': None},
    'lags_5': {'l1': None, 'l2': 2},
    'lags_6': 3
}
steps = 3
n_validation = 12
param_grid = [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}]

In [34]:
from skforecast.model_selection_multiseries.model_selection_multiseries import _evaluate_grid_hyperparameters_multiseries

results = _evaluate_grid_hyperparameters_multiseries(
                  forecaster         = forecaster,
                  series             = series,
                  param_grid         = param_grid,
                  steps              = steps,
                  metric             = mean_absolute_error,
                  initial_train_size = len(series) - n_validation,
                  fixed_train_size   = False,
                  levels             = None,
                  exog               = None,
                  lags_grid          = lags_grid,
                  refit              = False,
                  return_best        = False,
                  verbose            = False
              )

18 models compared for 1 level(s). Number of iterations: 18.


lags grid:   0%|          | 0/6 [00:00<?, ?it/s]

params grid:   0%|          | 0/3 [00:00<?, ?it/s]

    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    `last_window` columns    : ['l1', 'l2']
    `series` columns X train : ['l1'] 
    

In [35]:
results

Unnamed: 0,levels,lags,lags_label,params,mean_absolute_error,alpha
9,[l1],"{'l1': [1, 2], 'l2': None}",lags_4,{'alpha': 0.01},0.201553,0.01
10,[l1],"{'l1': [1, 2], 'l2': None}",lags_4,{'alpha': 0.1},0.202082,0.1
11,[l1],"{'l1': [1, 2], 'l2': None}",lags_4,{'alpha': 1},0.205161,1.0
0,[l1],"{'l1': [1, 2], 'l2': [1, 2, 3]}",lags_1,{'alpha': 0.01},0.20532,0.01
1,[l1],"{'l1': [1, 2], 'l2': [1, 2, 3]}",lags_1,{'alpha': 0.1},0.205552,0.1
2,[l1],"{'l1': [1, 2], 'l2': [1, 2, 3]}",lags_1,{'alpha': 1},0.206778,1.0
12,[l1],"{'l1': None, 'l2': [1, 2]}",lags_5,{'alpha': 0.01},0.210052,0.01
13,[l1],"{'l1': None, 'l2': [1, 2]}",lags_5,{'alpha': 0.1},0.210075,0.1
14,[l1],"{'l1': None, 'l2': [1, 2]}",lags_5,{'alpha': 1},0.210719,1.0
17,[l1],"[1, 2, 3]",lags_6,{'alpha': 1},0.213537,1.0


In [38]:
results['mean_absolute_error'].to_numpy()

array([0.20155258, 0.20208154, 0.20516149, 0.2053202 , 0.20555199,
       0.20677802, 0.21005165, 0.21007475, 0.21071924, 0.21353688,
       0.21443621, 0.21622784, 0.2166998 , 0.21801147, 0.21863968,
       0.22401526, 0.22830217, 0.22878132])

In [37]:
results.index

Index([9, 10, 11, 0, 1, 2, 12, 13, 14, 17, 5, 16, 15, 4, 3, 8, 7, 6], dtype='int64')