In [10]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
c:\Users\Joaquín Amat\Documents\GitHub\skforecast


In [44]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from skforecast.datasets import fetch_dataset

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import select_features
from skforecast.preprocessing import RollingFeatures

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import select_features_multiseries

In [58]:
# Load the example dataset and preview its first rows
# ==============================================================================
data = fetch_dataset(
    name = "bike_sharing_extended_features"
)
data.head(n=3)

bike_sharing_extended_features
------------------------------
Hourly usage of the bike share system in the city of Washington D.C. during the
years 2011 and 2012. In addition to the number of users per hour, the dataset
was enriched by introducing supplementary features. Addition includes calendar-
based variables (day of the week, hour of the day, month, etc.), indicators for
sunlight, incorporation of rolling temperature averages, and the creation of
polynomial features generated from variable pairs. All cyclic variables are
encoded using sine and cosine functions to ensure accurate representation.
Fanaee-T,Hadi. (2013). Bike Sharing Dataset. UCI Machine Learning Repository.
https://doi.org/10.24432/C5W894.
Shape of the dataset: (17352, 90)


Unnamed: 0_level_0,users,weather,month_sin,month_cos,week_of_year_sin,week_of_year_cos,week_day_sin,week_day_cos,hour_day_sin,hour_day_cos,...,temp_roll_mean_1_day,temp_roll_mean_7_day,temp_roll_max_1_day,temp_roll_min_1_day,temp_roll_max_7_day,temp_roll_min_7_day,holiday_previous_day,holiday_next_day,temp,holiday
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-08 00:00:00,25.0,mist,0.5,0.866025,0.120537,0.992709,-0.781832,0.62349,0.258819,0.965926,...,8.063334,10.127976,9.02,6.56,18.86,4.92,0.0,0.0,7.38,0.0
2011-01-08 01:00:00,16.0,mist,0.5,0.866025,0.120537,0.992709,-0.781832,0.62349,0.5,0.866025,...,8.029166,10.113334,9.02,6.56,18.86,4.92,0.0,0.0,7.38,0.0
2011-01-08 02:00:00,16.0,mist,0.5,0.866025,0.120537,0.992709,-0.781832,0.62349,0.707107,0.707107,...,7.995,10.103572,9.02,6.56,18.86,4.92,0.0,0.0,7.38,0.0


In [59]:

# Data selection (reduce data size to speed up the example)
# ==============================================================================
# `data` is overwritten in place, so on a second execution the "weather" column
# is already gone. errors="ignore" makes the drop a no-op in that case instead
# of raising KeyError, which keeps the cell safe to re-run.
data = data.drop(columns="weather", errors="ignore")

# Keep only the records from 2012-01-01 onwards (label-based slice on the index).
data = data.loc["2012-01-01 00:00:00":]

In [65]:
# Recursive forecaster with lags and rolling-window features
# ==============================================================================
# Four rolling statistics, each computed over a window of size 10.
roll = RollingFeatures(
    stats        = ['mean', 'std', 'min', 'max'],
    window_sizes = 10,
)

forecaster = ForecasterAutoreg(
    regressor       = LGBMRegressor(
        n_estimators = 900,
        random_state = 15926,
        max_depth    = 7,
        verbose      = -1,
    ),
    lags            = 48,
    window_features = roll,
)

# Feature selection (autoregressive and exog) with scikit-learn RFECV
# ==============================================================================
# A small, shallow LightGBM model is used inside the selector; it is separate
# from the regressor held by the forecaster.
regressor = LGBMRegressor(
    n_estimators = 10,
    max_depth    = 3,
    random_state = 15926,
    verbose      = -1,
)

# One feature is eliminated per step, with 3-fold cross-validation, and at
# least 25 features are kept.
selector = RFECV(
    estimator              = regressor,
    step                   = 1,
    cv                     = 3,
    min_features_to_select = 25,
    n_jobs                 = -1,
)

# Every column except the target is offered as an exogenous candidate.
selected_autoreg, selected_exog = select_features(
    forecaster      = forecaster,
    selector        = selector,
    y               = data["users"],
    exog            = data.drop(columns="users"),
    select_only     = None,
    force_inclusion = None,
    subsample       = 0.1,
    random_state    = 123,
    verbose         = True,
)

Recursive feature elimination (RFECV)
-------------------------------------
Total number of records available: 8712
Total number of records used for feature selection: 871
Number of features available: 140
    Autoreg (n=52)
    Exog    (n=88)
Number of features selected: 25
    Autoreg (n=15) : [1, 2, 3, 4, 5, 7, 10, 14, 23, 24, 25, 27, 35, 44, 'roll_max_10']
    Exog    (n=10) : ['hour_day_sin', 'hour_day_cos', 'sunrise_hour_sin', 'sunrise_hour_cos', 'sunset_hour_sin', 'poly_month_sin__week_of_year_cos', 'poly_week_day_sin__hour_day_cos', 'poly_hour_day_sin__hour_day_cos', 'poly_hour_day_sin__sunset_hour_sin', 'poly_hour_day_cos__sunset_hour_sin']


In [57]:
# RFECV-based selection of autoregressive and exogenous features
# ==============================================================================
# Uses the `forecaster` object that is already defined in the kernel.
regressor = LGBMRegressor(
    n_estimators = 10,
    max_depth    = 3,
    random_state = 15926,
    verbose      = -1,
)

# One feature is eliminated per step, with 3-fold cross-validation, and at
# least 25 features are kept.
selector = RFECV(
    estimator              = regressor,
    step                   = 1,
    cv                     = 3,
    min_features_to_select = 25,
    n_jobs                 = -1,
)

# Every column except the target is offered as an exogenous candidate.
selected_autoreg, selected_exog = select_features(
    forecaster      = forecaster,
    selector        = selector,
    y               = data["users"],
    exog            = data.drop(columns="users"),
    select_only     = None,
    force_inclusion = None,
    subsample       = 0.1,
    random_state    = 123,
    verbose         = True,
)

Recursive feature elimination (RFECV)
-------------------------------------
Total number of records available: 8712
Total number of records used for feature selection: 871
Number of features available: 140
    Autoreg (n=52)
    Exog    (n=88)
Number of features selected: 25
    Autoreg (n=15) : [1, 2, 3, 4, 5, 7, 10, 14, 23, 24, 25, 27, 35, 44, 'roll_max_10']
    Exog    (n=10) : ['hour_day_sin', 'hour_day_cos', 'sunrise_hour_sin', 'sunrise_hour_cos', 'sunset_hour_sin', 'poly_month_sin__week_of_year_cos', 'poly_week_day_sin__hour_day_cos', 'poly_hour_day_sin__hour_day_cos', 'poly_hour_day_sin__sunset_hour_sin', 'poly_hour_day_cos__sunset_hour_sin']


In [64]:
# Direct multi-step forecaster (one model per step, 24 steps)
# ==============================================================================
forecaster = ForecasterAutoregDirect(
                 regressor = LGBMRegressor(
                                 n_estimators = 900,
                                 random_state = 15926,
                                 max_depth    = 7,
                                 verbose      = -1
                             ),
                 lags      = 48,
                 steps=24
             )

# Mock attributes: the rolling-features object and the names of the columns it
# produces are attached after construction rather than passed to the
# constructor. The original cell ended in an unfinished assignment
# (`window_features_names = `), which is a SyntaxError; it is completed here
# with the four names that match the stats/window size configured in `roll`.
forecaster.window_features = roll
forecaster.window_features_names = ['roll_mean_10', 'roll_std_10', 'roll_min_10', 'roll_max_10']

In [67]:
# Direct multi-step forecaster (24 steps) with mocked window features
# ==============================================================================
forecaster = ForecasterAutoregDirect(
    regressor = LGBMRegressor(
        n_estimators = 900,
        random_state = 15926,
        max_depth    = 7,
        verbose      = -1,
    ),
    lags      = 48,
    steps     = 24,
)

# Mock attributes: the rolling-features object and its feature names are set
# on the instance after construction instead of through the constructor.
# NOTE(review): the recorded output of this cell reports 136 features available
# while the breakdown shows Autoreg n=52 + Exog n=88 = 140; presumably the
# mocked window features are counted by name but are not present in the
# training matrix -- confirm.
forecaster.window_features = roll
forecaster.window_features_names = ['roll_mean_10', 'roll_std_10', 'roll_min_10', 'roll_max_10']

# Feature selection (autoregressive and exog) with scikit-learn RFECV
# ==============================================================================
regressor = LGBMRegressor(
    n_estimators = 10,
    max_depth    = 3,
    random_state = 15926,
    verbose      = -1,
)

# One feature is eliminated per step, with 3-fold cross-validation, and at
# least 25 features are kept.
selector = RFECV(
    estimator              = regressor,
    step                   = 1,
    cv                     = 3,
    min_features_to_select = 25,
    n_jobs                 = -1,
)

# Every column except the target is offered as an exogenous candidate.
selected_autoreg, selected_exog = select_features(
    forecaster      = forecaster,
    selector        = selector,
    y               = data["users"],
    exog            = data.drop(columns="users"),
    select_only     = None,
    force_inclusion = None,
    subsample       = 0.1,
    random_state    = 123,
    verbose         = True,
)

Recursive feature elimination (RFECV)
-------------------------------------
Total number of records available: 8689
Total number of records used for feature selection: 868
Number of features available: 136
    Autoreg (n=52)
    Exog    (n=88)
Number of features selected: 25
    Autoreg (n=11) : [1, 3, 4, 5, 6, 9, 10, 11, 12, 24, 35]
    Exog    (n=14) : ['week_day_cos', 'hour_day_sin', 'hour_day_cos', 'sunrise_hour_sin', 'temp_roll_mean_1_day', 'temp_roll_mean_7_day', 'temp_roll_max_1_day', 'temp_roll_min_1_day', 'temp_roll_max_7_day', 'temp_roll_min_7_day', 'holiday_previous_day', 'holiday_next_day', 'temp', 'holiday']
