In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [2]:
from typing import Union, Optional
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
import re

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.datasets import fetch_dataset
import numpy as np
import pandas as pd
from astral.sun import sun
from astral import LocationInfo
from skforecast.datasets import fetch_dataset
from skforecast.model_selection import select_features


In [3]:
# Downloading data
# ==============================================================================
data = fetch_dataset('bike_sharing_extended_features', raw=False)

bike_sharing_extended_features
------------------------------
Hourly usage of the bike share system in the city of Washington D.C. during the
years 2011 and 2012. In addition to the number of users per hour, the dataset
was enriched by introducing supplementary features. Addition includes calendar-
based variables (day of the week, hour of the day, month, etc.), indicators for
sunlight, incorporation of rolling temperature averages, and the creation of
polynomial features generated from variable pairs. All cyclic variables are
encoded using sine and cosine functions to ensure accurate representation.
Fanaee-T,Hadi. (2013). Bike Sharing Dataset. UCI Machine Learning Repository.
https://doi.org/10.24432/C5W894.
Shape of the dataset: (17352, 90)


In [4]:
# Select exogenous variables to be included in the model
# ==============================================================================
df_exogenous_features = data.drop(columns=['users'])
exog_features = []
# Columns that ends with _sin or _cos are selected
exog_features.extend(df_exogenous_features.filter(regex='_sin$|_cos$').columns.tolist())
# columns that start with temp_ are selected
exog_features.extend(df_exogenous_features.filter(regex='^temp_.*').columns.tolist())
# Columns that start with holiday_ are selected
exog_features.extend(df_exogenous_features.filter(regex='^holiday_.*').columns.tolist())
exog_features.extend(['temp', 'holiday'])

df_exogenous_features = df_exogenous_features.filter(exog_features, axis=1)
df_exogenous_features['dummy_variable'] = 1

In [8]:
forecaster = ForecasterAutoreg(
                regressor = Ridge(random_state=123),
                lags      = 100,
            )

selector = RFECV(
    estimator              = forecaster.regressor,
    min_features_to_select = 1,
    cv                     = 3
)

selected_lags, selected_exog = select_features(
    selector             = selector,
    forecaster           = forecaster,
    y                    = data['users'],
    exog                 = df_exogenous_features,
    select_only          = None,
    subsample            = 0.1,
    force_inclusion      = "^dummy_variable",
    verbose              = True,
)    

Recursive feature elimination (RFECV)
-------------------------------------
Total number of records available: 17252
Total number of records used for feature selection: 1725
Number of features available: 189
    Autoreg (n=100)
    Exog    (n=89)
Number of features selected: 128
    Autoreg (n=43) : [1, 2, 3, 6, 8, 9, 10, 14, 16, 17, 18, 23, 24, 25, 26, 31, 32, 33, 37, 41, 52, 56, 57, 62, 63, 65, 72, 73, 74, 75, 77, 78, 79, 80, 81, 91, 93, 94, 95, 96, 97, 98, 99]
    Exog    (n=86) : ['month_sin', 'month_cos', 'week_of_year_sin', 'week_of_year_cos', 'week_day_sin', 'week_day_cos', 'hour_day_sin', 'hour_day_cos', 'sunrise_hour_sin', 'sunrise_hour_cos', 'sunset_hour_sin', 'sunset_hour_cos', 'poly_month_sin__month_cos', 'poly_month_sin__week_of_year_sin', 'poly_month_sin__week_of_year_cos', 'poly_month_sin__week_day_sin', 'poly_month_sin__week_day_cos', 'poly_month_sin__hour_day_sin', 'poly_month_sin__hour_day_cos', 'poly_month_sin__sunrise_hour_sin', 'poly_month_sin__sunrise_hour_cos', '

In [9]:
def create_predictors(y):
    lags = y[-1:-6:-1]                
    return lags


forecaster = ForecasterAutoregCustom(
                regressor = Ridge(random_state=123),
                fun_predictors = create_predictors,
                window_size = 5,
            )

selector = RFECV(
    estimator              = forecaster.regressor,
    min_features_to_select = 1,
    cv                     = 3
)

selected_lags, selected_exog = select_features(
    selector             = selector,
    forecaster           = forecaster,
    y                    = data['users'],
    exog                 = df_exogenous_features,
    select_only          = None,
    subsample            = 0.1,
    force_inclusion      = "^dummy_variable",
    verbose              = True,
)  

Recursive feature elimination (RFECV)
-------------------------------------
Total number of records available: 17347
Total number of records used for feature selection: 1734
Number of features available: 94
    Autoreg (n=5)
    Exog    (n=89)
Number of features selected: 91
    Autoreg (n=3) : ['custom_predictor_0', 'custom_predictor_1', 'custom_predictor_2']
    Exog    (n=89) : ['month_sin', 'month_cos', 'week_of_year_sin', 'week_of_year_cos', 'week_day_sin', 'week_day_cos', 'hour_day_sin', 'hour_day_cos', 'sunrise_hour_sin', 'sunrise_hour_cos', 'sunset_hour_sin', 'sunset_hour_cos', 'poly_month_sin__month_cos', 'poly_month_sin__week_of_year_sin', 'poly_month_sin__week_of_year_cos', 'poly_month_sin__week_day_sin', 'poly_month_sin__week_day_cos', 'poly_month_sin__hour_day_sin', 'poly_month_sin__hour_day_cos', 'poly_month_sin__sunrise_hour_sin', 'poly_month_sin__sunrise_hour_cos', 'poly_month_sin__sunset_hour_sin', 'poly_month_sin__sunset_hour_cos', 'poly_month_cos__week_of_year_sin', 

In [10]:
from sklearn.feature_selection import SelectFromModel

forecaster = ForecasterAutoreg(
                regressor = Ridge(random_state=123),
                lags      = 100,
            )

selector = SelectFromModel(
    estimator    = forecaster.regressor,
    threshold    = 0.25,
    max_features = 25
)

selected_lags, selected_exog = select_features(
    selector             = selector,
    forecaster           = forecaster,
    y                    = data['users'],
    exog                 = df_exogenous_features,
    select_only         = None,
    subsample            = 0.1,
    force_inclusion      = "^dummy_variable",
    verbose              = True,
)  

Recursive feature elimination (SelectFromModel)
-----------------------------------------------
Total number of records available: 17252
Total number of records used for feature selection: 1725
Number of features available: 189
    Autoreg (n=100)
    Exog    (n=89)
Number of features selected: 25
    Autoreg (n=0) : []
    Exog    (n=26) : ['month_sin', 'week_of_year_sin', 'week_day_sin', 'sunset_hour_sin', 'poly_month_sin__month_cos', 'poly_month_sin__week_of_year_sin', 'poly_month_sin__week_of_year_cos', 'poly_month_sin__week_day_sin', 'poly_month_sin__week_day_cos', 'poly_month_sin__sunset_hour_sin', 'poly_month_sin__sunset_hour_cos', 'poly_month_cos__week_of_year_sin', 'poly_month_cos__week_of_year_cos', 'poly_month_cos__week_day_sin', 'poly_week_of_year_sin__week_day_sin', 'poly_week_of_year_sin__week_day_cos', 'poly_week_of_year_sin__sunrise_hour_cos', 'poly_week_of_year_sin__sunset_hour_sin', 'poly_week_of_year_cos__sunset_hour_cos', 'poly_week_day_sin__sunrise_hour_sin', 'poly



In [11]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import ShuffleSplit

forecaster = ForecasterAutoreg(
                regressor = Ridge(random_state=123),
                lags      = 10,
            )

selector = SequentialFeatureSelector(
    estimator    = forecaster.regressor,
    n_features_to_select = 25,
    direction    = 'forward',
    cv           = ShuffleSplit(n_splits=1, test_size=0.3, random_state=951),
    scoring      = 'neg_mean_absolute_error',
    n_jobs       = -1
)

selected_lags, selected_exog = select_features(
    selector             = selector,
    forecaster           = forecaster,
    y                    = data['users'],
    exog                 = df_exogenous_features,
    select_only          = None,
    subsample            = 0.1,
    force_inclusion      = "^dummy_variable",
    verbose              = True,
)  

Recursive feature elimination (SequentialFeatureSelector)
---------------------------------------------------------
Total number of records available: 17342
Total number of records used for feature selection: 1734
Number of features available: 99
    Autoreg (n=10)
    Exog    (n=89)
Number of features selected: 25
    Autoreg (n=5) : [1, 2, 3, 6, 7]
    Exog    (n=21) : ['sunset_hour_sin', 'poly_month_sin__week_of_year_sin', 'poly_month_sin__week_day_sin', 'poly_month_sin__week_day_cos', 'poly_month_cos__week_of_year_sin', 'poly_month_cos__hour_day_cos', 'poly_week_of_year_sin__week_day_sin', 'poly_week_of_year_sin__week_day_cos', 'poly_week_of_year_cos__week_day_cos', 'poly_week_of_year_cos__hour_day_cos', 'poly_week_of_year_cos__sunrise_hour_cos', 'poly_week_day_cos__hour_day_sin', 'poly_week_day_cos__hour_day_cos', 'poly_hour_day_sin__hour_day_cos', 'poly_hour_day_cos__sunrise_hour_sin', 'poly_hour_day_cos__sunrise_hour_cos', 'poly_sunrise_hour_sin__sunset_hour_sin', 'temp_roll_max

In [24]:
# Simulate time series with hourly frequency with reproducible results
# ==============================================================================
from sklearn.feature_selection import RFE
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
import pytest

exog, y = make_regression(n_samples=500, n_features=5, n_informative=2, random_state=123)
exog = pd.DataFrame(
    exog,
    index = pd.date_range(start='2020-01-01', periods=len(exog), freq='H'),
    columns=[f"exog_{i}" for i in range(exog.shape[1])]
)
y = pd.Series(y, index=exog.index, name="y")

In [40]:
from skforecast.model_selection.fixtures_model_selection.fixtures_model_selection import y_feature_selection as y
from .fixtures_model_selection import exog_feature_selection as exog

ModuleNotFoundError: No module named 'skforecast.model_selection.fixtures_model_selection'

In [None]:
def test_select_features_when_selector_is_RFE_and_select_only_exog_is_True():
    forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        verbose              = False,
    )

    assert selected_lags == [1, 2, 3, 4, 5]
    assert selected_exog == ['exog_1', 'exog_2', 'exog_4']

def test_select_features_when_selector_is_RFE_and_select_only_exog_is_True_regressor():
    forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )

    selector = RFE(estimator=LinearRegression(), n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        verbose              = False,
    )

    assert selected_lags == [1, 2, 3, 4, 5]
    assert selected_exog == ['exog_1', 'exog_2', 'exog_4']


def test_select_features_when_selector_is_RFE_and_select_only_exog_is_True_ForecasterAutoregCustom():
    forecaster = ForecasterAutoregCustom(
                    regressor = LinearRegression(),
                    fun_predictors = lambda y: y[-1:-6:-1],
                    window_size = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        verbose              = False,
    )

    assert selected_lags == ['custom_predictor_0', 'custom_predictor_1', 'custom_predictor_2',
                            'custom_predictor_3', 'custom_predictor_4']
    assert selected_exog == ['exog_1', 'exog_2', 'exog_4']


def test_select_features_when_selector_is_RFE_and_select_only_exog_is_False():
    forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=5)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = False,
        verbose              = False,
    )

    assert selected_lags == [5]
    assert selected_exog == ['exog_0', 'exog_1', 'exog_2', 'exog_3']


def test_select_features_when_selector_is_RFE_and_select_only_exog_is_False_ForecasterAutoregCustom():
    forecaster = ForecasterAutoregCustom(
                    regressor = LinearRegression(),
                    fun_predictors = lambda y: y[-1:-6:-1],
                    window_size = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=5)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = False,
        verbose              = False,
    )

    assert selected_lags == ['custom_predictor_4']
    assert selected_exog == ['exog_0', 'exog_1', 'exog_2', 'exog_3']


def test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_regex():
    forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        force_inclusion      = "^exog_3",
        verbose              = False,
    )

    assert selected_lags == [1, 2, 3, 4, 5]
    assert selected_exog == ['exog_1', 'exog_2', 'exog_3', 'exog_4']


def test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_regex_ForecasterAutoregCustom():
    forecaster = ForecasterAutoregCustom(
                    regressor = LinearRegression(),
                    fun_predictors = lambda y: y[-1:-6:-1],
                    window_size = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        force_inclusion      = "^exog_3",
        verbose              = False,
    )

    assert selected_lags == ['custom_predictor_0', 'custom_predictor_1', 'custom_predictor_2',
                            'custom_predictor_3', 'custom_predictor_4']
    assert selected_exog == ['exog_1', 'exog_2', 'exog_3', 'exog_4']


def test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_list():
    forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        force_inclusion      = ['exog_3'],
        verbose              = False,
    )

    assert selected_lags == [1, 2, 3, 4, 5]
    assert selected_exog == ['exog_1', 'exog_2', 'exog_3', 'exog_4']


def test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_list_ForecasterAutoregCustom():
    forecaster = ForecasterAutoregCustom(
                    regressor = LinearRegression(),
                    fun_predictors = lambda y: y[-1:-6:-1],
                    window_size = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        force_inclusion      = ['exog_3'],
        verbose              = False,
    )

    assert selected_lags == ['custom_predictor_0', 'custom_predictor_1', 'custom_predictor_2',
                            'custom_predictor_3', 'custom_predictor_4']
    assert selected_exog == ['exog_1', 'exog_2', 'exog_3', 'exog_4']

def test_select_features_raise_error_when_forecaster_is_not_supported():
    """
    Test that select_features raises an error when forecaster is not supported.
    """
    forecaster = ForecasterAutoregDirect(
                    regressor = LinearRegression(),
                    lags      = 5,
                    steps     = 3
                )
    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)
    err_msg = re.escape(
            "`forecaster` must be one of the following classes: ['ForecasterAutoreg', "
            "'ForecasterAutoregCustom']."
        )
    with pytest.raises(Exception, match = err_msg):
        selected_lags, selected_exog = select_features(
            selector             = selector,
            forecaster           = forecaster,
            y                    = y,
            exog                 = exog,
            select_only_exog     = True,
            verbose              = False,
        )

def test_select_features_raise_error_when_subsamle_is_not_between_0_and_1():
    """
    Test that select_features raises an error when subsample is not between 0 and 1.
    """
    forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )
    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)
    err_msg = re.escape(
            "`subsample` must be a number between 0 and 1."
        )
    with pytest.raises(Exception, match = err_msg):
        selected_lags, selected_exog = select_features(
            selector             = selector,
            forecaster           = forecaster,
            y                    = y,
            exog                 = exog,
            select_only_exog     = True,
            subsample            = 2,
            verbose              = False,
        )

test_select_features_when_selector_is_RFE_and_select_only_exog_is_True()
test_select_features_when_selector_is_RFE_and_select_only_exog_is_True_regressor()
test_select_features_when_selector_is_RFE_and_select_only_exog_is_True_ForecasterAutoregCustom()
test_select_features_when_selector_is_RFE_and_select_only_exog_is_False()
test_select_features_when_selector_is_RFE_and_select_only_exog_is_False_ForecasterAutoregCustom()
test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_regex()
test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_regex_ForecasterAutoregCustom()
test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_list()
test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_list_ForecasterAutoregCustom()
test_select_features_raise_error_when_forecaster_is_not_supported()
test_select_features_raise_error_when_subsamle_is_not_between_0_and_1()

AssertionError: 

In [16]:
def test_select_features_when_selector_is_RFE_and_select_only_is_exog():
    """
    Test that select_features returns the expected values when selector is RFE
    and select_only is 'exog'.
    """
    forecaster = ForecasterAutoreg(
                     regressor = LinearRegression(),
                     lags      = 5,
                 )
    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

    selected_autoreg, selected_exog = select_features(
        selector    = selector,
        forecaster  = forecaster,
        y           = y,
        exog        = exog,
        select_only = 'exog',
        verbose     = True,
    )

    assert selected_autoreg == [1, 2, 3, 4, 5]
    assert selected_exog == ['exog_1', 'exog_2', 'exog_4']


def test_select_features_when_selector_is_RFE_and_select_only_is_exog_regressor():
    """
    Test that select_features returns the expected values when selector is RFE
    and select_only is 'exog' and regressor is passed to the selector instead
    of forecaster.regressor.
    """
    forecaster = ForecasterAutoreg(
                     regressor = LinearRegression(),
                     lags      = 5,
                 )
    selector = RFE(estimator=LinearRegression(), n_features_to_select=3)

    selected_autoreg, selected_exog = select_features(
        selector    = selector,
        forecaster  = forecaster,
        y           = y,
        exog        = exog,
        select_only = 'exog',
        verbose     = False,
    )

    assert selected_autoreg == [1, 2, 3, 4, 5]
    assert selected_exog == ['exog_0', 'exog_1', 'exog_2']


test_select_features_when_selector_is_RFE_and_select_only_is_exog()
test_select_features_when_selector_is_RFE_and_select_only_is_exog_regressor()

Recursive feature elimination (RFE)
-----------------------------------
Total number of records available: 495
Total number of records used for feature selection: 247
Number of features available: 5
    Autoreg (n=5)
    Exog    (n=5)
Number of features selected: 3
    Autoreg (n=5) : [1, 2, 3, 4, 5]
    Exog    (n=3) : ['exog_1', 'exog_2', 'exog_4']


AssertionError: 

In [23]:
forecaster = ForecasterAutoreg(
                     regressor = LinearRegression(),
                     lags      = 5,
                 )
selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

selected_autoreg, selected_exog = select_features(
    selector        = selector,
    forecaster      = forecaster,
    y               = y,
    exog            = exog,
    select_only     = 'exog',
    force_inclusion = "^exog_3",
    verbose         = False,
)


selected_autoreg
selected_exog

['exog_1', 'exog_2', 'exog_3', 'exog_4']

In [16]:
import re
import pytest
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import select_features

# Fixtures
exog, y = make_regression(n_samples=500, n_features=5, n_informative=2, random_state=124)
exog = pd.DataFrame(
           data    = exog,
           index   = pd.date_range(start='2020-01-01', periods=len(exog), freq='H'),
           columns = [f"exog_{i}" for i in range(exog.shape[1])]
       )
y = pd.Series(y, index=exog.index, name="y")

In [21]:
forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )
selector = RFE(estimator=forecaster.regressor, n_features_to_select=5)

selected_lags, selected_exog = select_features(
    selector         = selector,
    forecaster       = forecaster,
    y                = y,
    exog             = exog,
    select_only_exog = False,
    force_inclusion  = "^exog_0",
    verbose          = False,
)

{'exog_0'}
{'exog_3', 'lag_4', 'exog_1', 'lag_5', 'exog_2'}
{'exog_0'}
['exog_1', 'exog_2', 'exog_3', 'exog_0']
['exog_0', 'exog_1', 'exog_2', 'exog_3', 'exog_4']
['exog_0', 'exog_1', 'exog_2', 'exog_3']


In [12]:
selected_lags

[5]

In [13]:
selected_exog

['exog_0', 'exog_1', 'exog_2', 'exog_3', 'exog_4']

In [23]:
set(['lag1', 'lag2']) - set(['exog_3', 'lag1'])

{'lag2'}