In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell execution.
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Insert the parent of the current working directory at position 1 of the
# import search path (presumably so the local skforecast checkout is the one
# that gets imported -- TODO confirm).
sys.path.insert(1, str(Path.cwd().parent))
# Last expression: shows the inserted path as the cell output.
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [2]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import select_features_multiseries

# Data download
# ==============================================================================
url = (
    'https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/'
    'data/simulated_items_sales.csv'
)
data = pd.read_csv(url, sep=',')

# Data preparation
# ==============================================================================
# Parse the `date` column, use it as the index, put the frame on a daily
# frequency and keep it in chronological order.
data = (
    data
    .assign(date=pd.to_datetime(data['date'], format='%Y-%m-%d'))
    .set_index('date')
    .asfreq('D')
    .sort_index()
)

# The exogenous frame is a copy of the series themselves, with the columns
# renamed positionally to exog_0, exog_1, ...
exog = data.copy()
exog.columns = [f'exog_{position}' for position, _ in enumerate(exog.columns)]

# Split data into train-test
# ==============================================================================
# Both the series and the exogenous frame are cut at the same label; the slices
# are copied so they do not share data with the original frames.
end_train = '2014-07-15 23:59:00'
data_train, data_test = data.loc[:end_train].copy(), data.loc[end_train:].copy()
exog_train, exog_test = exog.loc[:end_train].copy(), exog.loc[end_train:].copy()

# Report the date range and number of rows of each partition.
print(
    f"Train dates : {data_train.index.min()} --- {data_train.index.max()}   "
    f"(n={len(data_train)})"
)
print(
    f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}   "
    f"(n={len(data_test)})"
)

Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00   (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00   (n=170)


In [3]:
# Show the prepared series frame (date index, one column per item) as the cell output.
data

Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737
...,...,...,...
2014-12-28,17.329233,18.189583,20.586030
2014-12-29,19.611623,24.539583,28.127390
2014-12-30,18.857026,17.677083,21.555782
2014-12-31,18.721223,17.391667,18.605453


In [4]:
# Show the exogenous frame (same values as `data`, columns renamed exog_0..exog_2).
exog

Unnamed: 0_level_0,exog_0,exog_1,exog_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737
...,...,...,...
2014-12-28,17.329233,18.189583,20.586030
2014-12-29,19.611623,24.539583,28.127390
2014-12-30,18.857026,17.677083,21.555782
2014-12-31,18.721223,17.391667,18.605453


In [5]:
# Multi-series autoregressive forecaster used by the following cells: LightGBM
# regressor, 24 lags, ordinal series encoding. Every optional component
# (transformers, weights, id) is explicitly passed as None.
forecaster = ForecasterAutoregMultiSeries(
    regressor=LGBMRegressor(random_state=123, verbose=-1),
    lags=24,
    encoding='ordinal',
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=None,
    forecaster_id=None,
)

# Last expression: render the forecaster summary as the cell output.
forecaster

ForecasterAutoregMultiSeries 
Regressor: LGBMRegressor(random_state=123, verbose=-1) 
Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
Transformer for series: None 
Transformer for exog: None 
Series encoding: ordinal 
Window size: 24 
Series levels (names): None 
Series weights: None 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: None 
Training index type: None 
Training index frequency: None 
Regressor parameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None, 'random_state': 123, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'verbos

In [6]:
# Build the training matrices with the forecaster's private helper and keep the
# elements at positions 0, 1 and 3 of what it returns.
output = forecaster._create_train_X_y(data_train, exog_train)
X_train, y_train, series_col_names = (output[position] for position in (0, 1, 3))


In [7]:
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit

In [8]:
# Feature selection (autoregressive and exog) with scikit-learn RFECV
# ==============================================================================
# Estimator used only inside the selector; it is a different LightGBM
# configuration from the one held by `forecaster`.
regressor = LGBMRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=15926,
    verbose=-1,
)

# Cross-validated recursive elimination: one feature dropped per step, 3 folds,
# never going below 10 features, using all available cores.
selector = RFECV(
    estimator=regressor,
    step=1,
    cv=3,
    min_features_to_select=10,
    n_jobs=-1,
)

# NOTE(review): selection runs on the full `data` / `exog` frames, not on the
# `*_train` partitions created earlier -- confirm this is intended.
selected_autoreg, selected_exog = select_features_multiseries(
    forecaster=forecaster,
    selector=selector,
    series=data,
    exog=exog,
    select_only=None,
    force_inclusion=None,
    subsample=0.5,
    random_state=123,
    verbose=True,
)

Recursive feature elimination (RFECV)
-------------------------------------
Total number of records available: 3219
Total number of records used for feature selection: 4827
Number of features available: 27
    Autoreg (n=24)
    Exog    (n=3)
Number of features selected: 11
    Autoreg (n=7) : [1, 5, 7, 14, 15, 18, 21]
    Exog    (n=3) : ['exog_0', 'exog_1', 'exog_2']


In [9]:
import pytest
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from skforecast.model_selection_multiseries import select_features_multiseries


In [10]:
# Fixtures model_selection_multiseries
# ==============================================================================
import numpy as np
import pandas as pd

# Fixtures
# series_1 = np.random.rand(50)
# series_2 = np.random.rand(50)
# exog_1   = series_1 + np.random.normal(0, 0.1, 50)
# exog_2   = np.random.rand(50)
# exog_3   = np.random.rand(50)
# exog_4   = series_2 + np.random.normal(0, 0.1, 50)

# Fixed two-column fixture ('l1', 'l2') with 50 hard-coded float values per
# column and a default integer index.
series = pd.DataFrame({
    'l1': np.array([
        0.69646919, 0.28613933, 0.22685145, 0.55131477, 0.71946897,
        0.42310646, 0.9807642, 0.68482974, 0.4809319, 0.39211752,
        0.34317802, 0.72904971, 0.43857224, 0.0596779, 0.39804426,
        0.73799541, 0.18249173, 0.17545176, 0.53155137, 0.53182759,
        0.63440096, 0.84943179, 0.72445532, 0.61102351, 0.72244338,
        0.32295891, 0.36178866, 0.22826323, 0.29371405, 0.63097612,
        0.09210494, 0.43370117, 0.43086276, 0.4936851, 0.42583029,
        0.31226122, 0.42635131, 0.89338916, 0.94416002, 0.50183668,
        0.62395295, 0.1156184, 0.31728548, 0.41482621, 0.86630916,
        0.25045537, 0.48303426, 0.98555979, 0.51948512, 0.61289453,
    ]),
    'l2': np.array([
        0.12062867, 0.8263408, 0.60306013, 0.54506801, 0.34276383,
        0.30412079, 0.41702221, 0.68130077, 0.87545684, 0.51042234,
        0.66931378, 0.58593655, 0.6249035, 0.67468905, 0.84234244,
        0.08319499, 0.76368284, 0.24366637, 0.19422296, 0.57245696,
        0.09571252, 0.88532683, 0.62724897, 0.72341636, 0.01612921,
        0.59443188, 0.55678519, 0.15895964, 0.15307052, 0.69552953,
        0.31876643, 0.6919703, 0.55438325, 0.38895057, 0.92513249,
        0.84167, 0.35739757, 0.04359146, 0.30476807, 0.39818568,
        0.70495883, 0.99535848, 0.35591487, 0.76254781, 0.59317692,
        0.6917018, 0.15112745, 0.39887629, 0.2408559, 0.34345601,
    ]),
})

# Fixed four-column exogenous fixture ('exog1'..'exog4') with 50 hard-coded
# float values per column and a default integer index.
# NOTE: this rebinds `exog`, which earlier cells use for the date-indexed copy
# of the sales data.
exog = pd.DataFrame({
    'exog1': np.array([
        0.81362466, 0.18065237, 0.23475578, 0.65981251, 0.77626016,
        0.41868245, 0.96643556, 0.67516195, 0.4466783, 0.44168289,
        0.35158485, 0.75925757, 0.40625296, 0.19853505, 0.34611829,
        0.61579746, 0.07568532, 0.06880357, 0.59517447, 0.57993728,
        0.79023383, 0.66388325, 0.67390093, 0.68257639, 0.47676493,
        0.3977578, 0.56785123, 0.34779524, 0.26016756, 0.70679266,
        0.04774995, 0.39197318, 0.46367839, 0.4370433, 0.41450122,
        0.37961077, 0.47225148, 0.79649699, 1.0426137, 0.48792391,
        0.50458267, 0.20520444, 0.23720236, 0.39452153, 0.85171668,
        0.15336444, 0.4738726, 1.17622403, 0.53176631, 0.55083837,
    ]),
    'exog2': np.array([
        0.22529048, 0.97937984, 0.17235964, 0.24529647, 0.7127206,
        0.62075889, 0.11435243, 0.91700825, 0.5831643, 0.04030412,
        0.65716865, 0.30658072, 0.46395434, 0.48452434, 0.46278193,
        0.18426942, 0.26344748, 0.23883066, 0.72779828, 0.07488211,
        0.94899474, 0.44163677, 0.25777764, 0.90987212, 0.38569441,
        0.77341595, 0.70397664, 0.61953314, 0.66083099, 0.24472837,
        0.37629717, 0.09605039, 0.9278549, 0.3234861, 0.272853,
        0.73149366, 0.84567995, 0.03531591, 0.25809087, 0.53148211,
        0.83952041, 0.26563705, 0.60606193, 0.0181884, 0.60906828,
        0.19357335, 0.16319844, 0.58427611, 0.81820829, 0.08336001,
    ]),
    'exog3': np.array([
        0.14173692, 0.19838271, 0.48780824, 0.08424373, 0.08567588,
        0.70887437, 0.04705875, 0.22284741, 0.92726105, 0.99537986,
        0.83901923, 0.29050851, 0.05909171, 0.27456474, 0.25750109,
        0.43010083, 0.29702035, 0.56146054, 0.1388417, 0.84215781,
        0.83955923, 0.96457563, 0.66089525, 0.15896887, 0.61108399,
        0.27603516, 0.92614876, 0.48826627, 0.32792168, 0.87529287,
        0.6452307, 0.50767682, 0.26563346, 0.54585537, 0.80222916,
        0.88929714, 0.86435062, 0.71305703, 0.18654522, 0.41497294,
        0.66556244, 0.71263307, 0.98795819, 0.6306933, 0.14902407,
        0.16021244, 0.66550264, 0.77537995, 0.07558725, 0.21460743,
    ]),
    'exog4': np.array([
        -0.06574814, 0.90956685, 0.62142653, 0.82637769, 0.56159495,
        0.34376551, 0.45848779, 0.38489167, 0.89467872, 0.54904309,
        0.8185359, 0.48700256, 0.68844761, 0.69774397, 0.76506425,
        0.00440831, 0.79692897, 0.12322595, 0.12812382, 0.62445613,
        0.2433968, 0.84983804, 0.6108338, 0.68502064, 0.01186788,
        0.54597677, 0.44377848, 0.29329784, -0.02316005, 0.45542694,
        0.221795, 0.7963138, 0.57343061, 0.33216654, 0.91545132,
        0.93857233, 0.39897906, 0.16776203, 0.31379519, 0.3557038,
        0.83869898, 0.99444057, 0.32071535, 0.80514198, 0.52945861,
        0.76451127, 0.37253453, 0.37938831, 0.12500821, 0.30680189,
    ]),
})

In [11]:
def test_TypeError_select_features_raise_when_forecaster_is_not_supported():
    """
    Check that select_features_multiseries raises TypeError when `forecaster`
    is not one of the supported forecaster classes.
    """
    expected_message = re.escape(
        "`forecaster` must be one of the following classes: ['ForecasterAutoregMultiSeries', "
        "'ForecasterAutoregMultiSeriesCustom']."
    )
    # Bare `object()` placeholders stand in for every argument; the expected
    # error message only concerns the class of `forecaster`.
    placeholder_arguments = {
        'selector': object(),
        'forecaster': object(),
        'series': object(),
        'exog': object(),
    }

    with pytest.raises(TypeError, match=expected_message):
        select_features_multiseries(**placeholder_arguments)


# Run the check immediately in the notebook.
test_TypeError_select_features_raise_when_forecaster_is_not_supported()

In [12]:
def test_ValueError_select_features_raise_when_select_only_is_not_autoreg_exog_None(select_only):
    """
    Check that select_features_multiseries raises ValueError when `select_only`
    is anything other than 'autoreg', 'exog' or None.

    Parameters
    ----------
    select_only : object
        Value forwarded as `select_only`; the call is expected to reject it.
    """
    lag_forecaster = ForecasterAutoregMultiSeries(regressor=LinearRegression(), lags=5)
    # The selector wraps the forecaster's own regressor instance.
    rfe_selector = RFE(estimator=lag_forecaster.regressor, n_features_to_select=3)

    expected_message = re.escape(
        "`select_only` must be one of the following values: 'autoreg', 'exog', None."
    )

    # `series` and `exog` are bare placeholders here.
    with pytest.raises(ValueError, match=expected_message):
        select_features_multiseries(
            selector=rfe_selector,
            forecaster=lag_forecaster,
            series=object(),
            exog=object(),
            select_only=select_only,
        )


# Run the check in the notebook with `False` as the rejected value.
test_ValueError_select_features_raise_when_select_only_is_not_autoreg_exog_None(False)

In [23]:
# RFE over lags and exogenous columns on the fixture frames, with one-hot
# series encoding and `force_inclusion=['lag_1']`.
forecaster = ForecasterAutoregMultiSeries(
    regressor=LinearRegression(),
    lags=5,
    encoding='onehot',
)
# The selector shares the forecaster's regressor instance and keeps 3 features.
selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

selected_autoreg, selected_exog = select_features_multiseries(
    selector=selector,
    forecaster=forecaster,
    series=series,
    exog=exog,
    select_only=None,
    force_inclusion=['lag_1'],
    verbose=True,
)


Recursive feature elimination (RFE)
-----------------------------------
Total number of records available: 90
Total number of records used for feature selection: 90
Number of features available: 9
    Autoreg (n=5)
    Exog    (n=4)
Number of features selected: 3
    Autoreg (n=1) : [1]
    Exog    (n=3) : ['exog1', 'exog3', 'exog4']


In [14]:
import warnings

# Emit the "no autoregressive features" message as a UserWarning.
warnings.warn(
    "No autoregressive features has been selected. Since a Forecaster "
    "cannot be created without them, be sure to include at least one "
    "using the `force_inclusion` parameter.",
    category=UserWarning,
)



In [104]:
# RFE restricted to the exogenous columns (`select_only='exog'`) on the fixture
# frames, with one-hot series encoding.
forecaster = ForecasterAutoregMultiSeries(
    regressor=LinearRegression(),
    lags=5,
    encoding='onehot',
)
# The selector gets its own LinearRegression instance and keeps 2 features.
selector = RFE(estimator=LinearRegression(), n_features_to_select=2)

selected_autoreg, selected_exog = select_features_multiseries(
    selector=selector,
    forecaster=forecaster,
    series=series,
    exog=exog,
    select_only='exog',
    verbose=True,
)

Recursive feature elimination (RFE)
-----------------------------------
Total number of records available: 90
Total number of records used for feature selection: 90
Number of features available: 9
    Autoreg (n=5)
    Exog    (n=4)
Number of features selected: 2
    Autoreg (n=5) : [1, 2, 3, 4, 5]
    Exog    (n=2) : ['exog1', 'exog4']


In [52]:
# Show the selector's current `n_features_to_select` value as the cell output.
selector.n_features_to_select

13

In [51]:
# Increase the selector's feature-count parameter by 10, using whichever of the
# two attributes the selector exposes (`n_features_to_select` is checked first).
# NOTE: not idempotent -- the new value is computed from the current one, so
# every execution of this cell adds another 10.
# NOTE(review): the addition assumes the attribute currently holds a number; a
# selector whose attribute is None would fail here -- confirm against callers.
if hasattr(selector, 'n_features_to_select'):
    n_features_to_select = selector.n_features_to_select + 10
    selector.set_params(n_features_to_select=n_features_to_select)
elif hasattr(selector, 'min_features_to_select'):
    min_features_to_select = selector.min_features_to_select + 10
    selector.set_params(min_features_to_select=min_features_to_select)

In [64]:
# Draw 50 uniform samples in [0, 1) from a generator seeded with a fixed value,
# so the cell shows the same array after every kernel restart and re-run.
# The result is bound to a name so it can be reused instead of only displayed.
random_sample = np.random.default_rng(123).random(50)
random_sample

array([0.18920282, 0.71740442, 0.74196793, 0.84316573, 0.73553999,
       0.47425836, 0.22983873, 0.39374686, 0.58599894, 0.69600811,
       0.58230332, 0.17794978, 0.39182429, 0.24260633, 0.54306653,
       0.05084278, 0.3429293 , 0.94857519, 0.39354258, 0.1036654 ,
       0.01501619, 0.35612163, 0.12183972, 0.49380503, 0.9915272 ,
       0.75792333, 0.94115699, 0.05275798, 0.51981235, 0.06665954,
       0.12124489, 0.01495866, 0.23089716, 0.27690914, 0.71819339,
       0.38557835, 0.53317773, 0.46772104, 0.98604754, 0.04593223,
       0.88395942, 0.56223342, 0.80942671, 0.41684799, 0.95302716,
       0.97216776, 0.34489461, 0.79841948, 0.55978385, 0.65491623])