In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path
# (presumably so the local skforecast checkout is picked up -- confirm),
# then echo that path as the cell output.
project_root = str(Path.cwd().parent)
sys.path.insert(1, project_root)
project_root

'/home/ubuntu/varios/skforecast'

# Descripción

Propuesta para que los forecasters tengan un argumento `calendar_features` (en los ejemplos de este notebook se llama `datetime_features`) con el que pasar una función o un transformer que extraiga características de calendario a partir de un índice datetime.

Cuando existen otras exógenas, esto puede hacerse con `transformer_exog`, pero pensando en el usuario, y dado que en series temporales las features de calendario son muy comunes, sería interesante tener un argumento que permita hacerlo de forma más sencilla.

Cuando no hay otras exógenas, no se puede hacer mediante un transformer_exog.

Comparar las posibilidades utilizando sklearn y feature-engine.

# Aproximación con sklearn + skforecast

In [2]:
import pandas as pd
import numpy as np
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.linear_model import Ridge
from skforecast.preprocessing import extract_calendar_features
from skforecast.preprocessing import CalendarFeatures

# Toy data: 10 daily observations for the target series plus two exogenous columns.
# A seeded generator makes the values reproducible under Restart & Run All;
# outputs saved from an earlier, unseeded run will not match until the cell is re-executed.
rng = np.random.default_rng(seed=123)
date_range = pd.date_range(start='2021-01-01', periods=10, freq='D')
y = pd.Series(rng.random(10), index=date_range)
df = pd.DataFrame(rng.random((10, 2)), index=date_range, columns=['feature1', 'feature2'])
display(y.head(3))
display(df.head(3))

2021-01-01    0.171336
2021-01-02    0.458110
2021-01-03    0.088386
Freq: D, dtype: float64

Unnamed: 0,feature1,feature2
2021-01-01,0.662735,0.338198
2021-01-02,0.321158,0.337484
2021-01-03,0.441785,0.578962


## Using function

In [3]:
# Function-based extraction: cyclic encoding of month and day of week,
# computed from `y` (a Series indexed by daily dates).
calendar_features = extract_calendar_features(
    X=y,
    features=['month', 'day_of_week'],
    encoding='cyclic',
)
calendar_features

Unnamed: 0,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-01,0.5,0.866025,-0.8660254,-0.5
2021-01-02,0.5,0.866025,-0.8660254,0.5
2021-01-03,0.5,0.866025,-2.449294e-16,1.0
2021-01-04,0.5,0.866025,0.0,1.0
2021-01-05,0.5,0.866025,0.8660254,0.5
2021-01-06,0.5,0.866025,0.8660254,-0.5
2021-01-07,0.5,0.866025,1.224647e-16,-1.0
2021-01-08,0.5,0.866025,-0.8660254,-0.5
2021-01-09,0.5,0.866025,-0.8660254,0.5
2021-01-10,0.5,0.866025,-2.449294e-16,1.0


## Custom Transformer

In [5]:
# skforecast CalendarFeatures transformer
# ======================================================================================
# Same request as the function-based cell (month and day of week, cyclic
# encoding), expressed through the transformer class and applied to `df`.
datetime_transformer = CalendarFeatures(
    features=['month', 'day_of_week'],
    encoding='cyclic',
)
datetime_features = datetime_transformer.transform(df)
display(datetime_features)

Unnamed: 0,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-01,0.5,0.866025,-0.8660254,-0.5
2021-01-02,0.5,0.866025,-0.8660254,0.5
2021-01-03,0.5,0.866025,-2.449294e-16,1.0
2021-01-04,0.5,0.866025,0.0,1.0
2021-01-05,0.5,0.866025,0.8660254,0.5
2021-01-06,0.5,0.866025,0.8660254,-0.5
2021-01-07,0.5,0.866025,1.224647e-16,-1.0
2021-01-08,0.5,0.866025,-0.8660254,-0.5
2021-01-09,0.5,0.866025,-0.8660254,0.5
2021-01-10,0.5,0.866025,-2.449294e-16,1.0


# Utilizando feature-engine

In [6]:
from feature_engine.datetime import DatetimeFeatures
from feature_engine.creation import CyclicalFeatures
from sklearn.pipeline import Pipeline


# Step 1: derive calendar columns (year, month, day of month) from the DatetimeIndex.
dtf = DatetimeFeatures(
    features_to_extract=["year", "month", "day_of_month"],
    variables="index",
    drop_original=True,
)

# Step 2: sine/cosine encoding of the periodic columns only.
# `year` is intentionally not listed: it is not periodic and has no entry in
# `max_values`, so it is kept as a plain integer column.
cf = CyclicalFeatures(
    variables=["month", "day_of_month"],
    max_values={"month": 12, "day_of_month": 31},
    drop_original=True,
)

# Chain both transformers; fitting the pipeline fits `dtf` and `cf` in order.
pipe = Pipeline([
    ("dtf", dtf),
    ("cf", cf),
])
pipe.fit(df)
X_transformed = pipe.transform(df)
X_transformed

Unnamed: 0,feature1,feature2,year,month_sin,month_cos,day_of_month_sin,day_of_month_cos
2021-01-01,0.662735,0.338198,2021,0.5,0.866025,0.201299,0.97953
2021-01-02,0.321158,0.337484,2021,0.5,0.866025,0.394356,0.918958
2021-01-03,0.441785,0.578962,2021,0.5,0.866025,0.571268,0.820763
2021-01-04,0.944855,0.603428,2021,0.5,0.866025,0.724793,0.688967
2021-01-05,0.212865,0.029051,2021,0.5,0.866025,0.848644,0.528964
2021-01-06,0.459289,0.00178,2021,0.5,0.866025,0.937752,0.347305
2021-01-07,0.795934,0.28448,2021,0.5,0.866025,0.988468,0.151428
2021-01-08,0.01667,0.260544,2021,0.5,0.866025,0.998717,-0.050649
2021-01-09,0.274621,0.144996,2021,0.5,0.866025,0.968077,-0.250653
2021-01-10,0.331288,0.789159,2021,0.5,0.866025,0.897805,-0.440394


# Examples

In [23]:
# Example target series: `n` daily observations drawn uniformly from [0, 1).
# A seeded generator makes the series reproducible under Restart & Run All;
# outputs saved from an earlier, unseeded run will not match until the cell is re-executed.
n = 100
rng = np.random.default_rng(seed=123)
date_range = pd.date_range(start='2021-01-01', periods=n, freq='D')
y = pd.Series(rng.random(n), index=date_range)
display(y.head(3))

2021-01-01    0.349659
2021-01-02    0.715197
2021-01-03    0.664452
Freq: D, dtype: float64

In [24]:
# Forecaster that receives the extraction function itself, with no extra arguments.
forecaster = ForecasterAutoreg(
    regressor=Ridge(),
    lags=3,
    datetime_features=extract_calendar_features,
)
forecaster.fit(y=y)
print(forecaster)

# Preview the training matrix built by the forecaster.
X_train, y_train = forecaster.create_train_X_y(y=y)
X_train.head(3)

ForecasterAutoreg 
Regressor: Ridge() 
Lags: [1 2 3] 
Transformer for y: None 
Transformer for exog: None 
Window size: 3 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Exogenous variables names: None 
Datetime features: [('extract_calendar_features', {})] 
Datetime features names: ['year', 'weekend', 'month_sin', 'month_cos', 'week_sin', 'week_cos', 'day_of_week_sin', 'day_of_week_cos', 'day_of_month_sin', 'day_of_month_cos', 'day_of_year_sin', 'day_of_year_cos', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'second_sin', 'second_cos'] 
Training range: [Timestamp('2021-01-01 00:00:00'), Timestamp('2021-04-10 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-15 11:11:52 
Last fit date: 2024-07-15 11:11:52 

Unnamed: 0,lag_1,lag_2,lag_3,year,weekend,month_sin,month_cos,week_sin,week_cos,day_of_week_sin,...,day_of_month_sin,day_of_month_cos,day_of_year_sin,day_of_year_cos,hour_sin,hour_cos,minute_sin,minute_cos,second_sin,second_cos
2021-01-04,0.664452,0.715197,0.349659,2021,0,0.5,0.866025,0.120537,0.992709,0.0,...,0.724793,0.688967,0.068615,0.997643,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-05,0.82578,0.664452,0.715197,2021,0,0.5,0.866025,0.120537,0.992709,0.866025,...,0.848644,0.528964,0.085731,0.996318,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-06,0.03138,0.82578,0.664452,2021,0,0.5,0.866025,0.120537,0.992709,0.866025,...,0.937752,0.347305,0.102821,0.9947,0.0,1.0,0.0,1.0,0.0,1.0


In [26]:
# Alternative route: compute the calendar features up front and hand them
# to the forecaster as ordinary exogenous variables.
exog = extract_calendar_features(X=y)
forecaster = ForecasterAutoreg(regressor=Ridge(), lags=3)
forecaster.fit(y=y, exog=exog)
print(forecaster)

X_train, y_train = forecaster.create_train_X_y(y=y, exog=exog)
X_train

ForecasterAutoreg 
Regressor: Ridge() 
Lags: [1 2 3] 
Transformer for y: None 
Transformer for exog: None 
Window size: 3 
Weight function included: False 
Differentiation order: None 
Exogenous included: True 
Exogenous variables names: ['year', 'weekend', 'month_sin', 'month_cos', 'week_sin', 'week_cos', 'day_of_week_sin', 'day_of_week_cos', 'day_of_month_sin', 'day_of_month_cos', 'day_of_year_sin', 'day_of_year_cos', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'second_sin', 'second_cos'] 
Datetime features: None 
Datetime features names: None 
Training range: [Timestamp('2021-01-01 00:00:00'), Timestamp('2021-04-10 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-15 11:11:56 
Last fit date: 2024-07-15 11:11:56 
Skforecast version: 0.13.0 
Pyt

Unnamed: 0,lag_1,lag_2,lag_3,year,weekend,month_sin,month_cos,week_sin,week_cos,day_of_week_sin,...,day_of_month_sin,day_of_month_cos,day_of_year_sin,day_of_year_cos,hour_sin,hour_cos,minute_sin,minute_cos,second_sin,second_cos
2021-01-04,0.664452,0.715197,0.349659,2021,0,0.500000,0.866025,0.120537,0.992709,0.000000e+00,...,0.724793,0.688967,0.068615,0.997643,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-05,0.825780,0.664452,0.715197,2021,0,0.500000,0.866025,0.120537,0.992709,8.660254e-01,...,0.848644,0.528964,0.085731,0.996318,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-06,0.031380,0.825780,0.664452,2021,0,0.500000,0.866025,0.120537,0.992709,8.660254e-01,...,0.937752,0.347305,0.102821,0.994700,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-07,0.009675,0.031380,0.825780,2021,0,0.500000,0.866025,0.120537,0.992709,1.224647e-16,...,0.988468,0.151428,0.119881,0.992788,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-08,0.263764,0.009675,0.031380,2021,0,0.500000,0.866025,0.120537,0.992709,-8.660254e-01,...,0.998717,-0.050649,0.136906,0.990584,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-06,0.078752,0.282912,0.859246,2021,0,0.866025,-0.500000,0.992709,-0.120537,8.660254e-01,...,0.937752,0.347305,0.997018,-0.077175,0.0,1.0,0.0,1.0,0.0,1.0
2021-04-07,0.782119,0.078752,0.282912,2021,0,0.866025,-0.500000,0.992709,-0.120537,8.660254e-01,...,0.988468,0.151428,0.995546,-0.094279,0.0,1.0,0.0,1.0,0.0,1.0
2021-04-08,0.921301,0.782119,0.078752,2021,0,0.866025,-0.500000,0.992709,-0.120537,1.224647e-16,...,0.998717,-0.050649,0.993781,-0.111355,0.0,1.0,0.0,1.0,0.0,1.0
2021-04-09,0.833510,0.921301,0.782119,2021,0,0.866025,-0.500000,0.992709,-0.120537,-8.660254e-01,...,0.968077,-0.250653,0.991723,-0.128398,0.0,1.0,0.0,1.0,0.0,1.0


In [29]:
# A (function, kwargs) pair: only the month is extracted, with cyclic encoding.
month_cyclic = (extract_calendar_features, {"features": ["month"], "encoding": "cyclic"})

forecaster = ForecasterAutoreg(
    regressor=Ridge(),
    lags=3,
    datetime_features=[month_cyclic],
)
forecaster.fit(y=y)
print(forecaster)

X_train, y_train = forecaster.create_train_X_y(y=y)
X_train

ForecasterAutoreg 
Regressor: Ridge() 
Lags: [1 2 3] 
Transformer for y: None 
Transformer for exog: None 
Window size: 3 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Exogenous variables names: None 
Datetime features: [('extract_calendar_features', {'features': ['month'], 'encoding': 'cyclic'})] 
Datetime features names: ['month_sin', 'month_cos'] 
Training range: [Timestamp('2021-01-01 00:00:00'), Timestamp('2021-04-10 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-15 11:13:01 
Last fit date: 2024-07-15 11:13:01 
Skforecast version: 0.13.0 
Python version: 3.12.4 
Forecaster id: None 



Unnamed: 0,lag_1,lag_2,lag_3,month_sin,month_cos
2021-01-04,0.664452,0.715197,0.349659,0.500000,0.866025
2021-01-05,0.825780,0.664452,0.715197,0.500000,0.866025
2021-01-06,0.031380,0.825780,0.664452,0.500000,0.866025
2021-01-07,0.009675,0.031380,0.825780,0.500000,0.866025
2021-01-08,0.263764,0.009675,0.031380,0.500000,0.866025
...,...,...,...,...,...
2021-04-06,0.078752,0.282912,0.859246,0.866025,-0.500000
2021-04-07,0.782119,0.078752,0.282912,0.866025,-0.500000
2021-04-08,0.921301,0.782119,0.078752,0.866025,-0.500000
2021-04-09,0.833510,0.921301,0.782119,0.866025,-0.500000


In [30]:
# Two (function, kwargs) pairs passed in order: month first, then day of week,
# both with cyclic encoding.
month_cyclic = (extract_calendar_features, {"features": ["month"], "encoding": "cyclic"})
day_of_week_cyclic = (extract_calendar_features, {"features": ["day_of_week"], "encoding": "cyclic"})

forecaster = ForecasterAutoreg(
    regressor=Ridge(),
    lags=3,
    datetime_features=[month_cyclic, day_of_week_cyclic],
)
forecaster.fit(y=y)
print(forecaster)

X_train, y_train = forecaster.create_train_X_y(y=y)
X_train

ForecasterAutoreg 
Regressor: Ridge() 
Lags: [1 2 3] 
Transformer for y: None 
Transformer for exog: None 
Window size: 3 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Exogenous variables names: None 
Datetime features: [('extract_calendar_features', {'features': ['month'], 'encoding': 'cyclic'}), ('extract_calendar_features', {'features': ['day_of_week'], 'encoding': 'cyclic'})] 
Datetime features names: ['month_sin', 'month_cos', 'day_of_week_sin', 'day_of_week_cos'] 
Training range: [Timestamp('2021-01-01 00:00:00'), Timestamp('2021-04-10 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-15 11:13:03 
Last fit date: 2024-07-15 11:13:03 
Skforecast version: 0.13.0 
Python version: 3.12.4 
Forecaster id: None 


Unnamed: 0,lag_1,lag_2,lag_3,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-04,0.664452,0.715197,0.349659,0.500000,0.866025,0.000000e+00,1.0
2021-01-05,0.825780,0.664452,0.715197,0.500000,0.866025,8.660254e-01,0.5
2021-01-06,0.031380,0.825780,0.664452,0.500000,0.866025,8.660254e-01,-0.5
2021-01-07,0.009675,0.031380,0.825780,0.500000,0.866025,1.224647e-16,-1.0
2021-01-08,0.263764,0.009675,0.031380,0.500000,0.866025,-8.660254e-01,-0.5
...,...,...,...,...,...,...,...
2021-04-06,0.078752,0.282912,0.859246,0.866025,-0.500000,8.660254e-01,0.5
2021-04-07,0.782119,0.078752,0.282912,0.866025,-0.500000,8.660254e-01,-0.5
2021-04-08,0.921301,0.782119,0.078752,0.866025,-0.500000,1.224647e-16,-1.0
2021-04-09,0.833510,0.921301,0.782119,0.866025,-0.500000,-8.660254e-01,-0.5
