In [2]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

# Descripción

Propuesta para que los forecasters tengan un argumento `calendar_features` con el que pasar una función o un transformer que extraiga características de un índice datetime.

Cuando existen otras exógenas, esto puede hacerse con `transformer_exog`, pero pensando en el usuario, y dado que en series temporales las features de calendario son muy comunes, sería interesante tener un argumento que permita hacerlo de forma más sencilla.

Cuando no hay otras exógenas, no se puede hacer mediante un transformer_exog.

Comparar las posibilidades utilizando sklearn y feature-engine.

# Aproximación con sklearn + skforecast

In [3]:
import pandas as pd
import numpy as np
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.linear_model import Ridge
from skforecast.preprocessing import extract_calendar_features
from skforecast.preprocessing import CalendarFeatures

# Toy data: 10 daily observations starting on 2021-01-01.
# NOTE(review): no random seed is set in the visible cells, so the values
# presumably change on every run - confirm whether that is acceptable.
date_range = pd.date_range(start='2021-01-01', periods=10, freq='D')

# Target series and a two-column exogenous frame sharing the same index.
y = pd.Series(np.random.rand(10), index=date_range)
df = pd.DataFrame(
    np.random.rand(10, 2),
    index=date_range,
    columns=['feature1', 'feature2'],
)

# Preview the first three rows of each object.
display(y.head(3), df.head(3))

2021-01-01    0.444147
2021-01-02    0.367673
2021-01-03    0.713494
Freq: D, dtype: float64

Unnamed: 0,feature1,feature2
2021-01-01,0.407161,0.307888
2021-01-02,0.144428,0.61718
2021-01-03,0.044078,0.436368


## Using function

In [8]:
# Function-based approach: request month and day-of-week features from the
# series `y`, with cyclic encoding enabled.
calendar_features = extract_calendar_features(
    X=y,
    features=['month', 'day_of_week'],
    cyclic_encoding=True,
)
calendar_features

Unnamed: 0,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-01,0.5,0.866025,-0.8660254,-0.5
2021-01-02,0.5,0.866025,-0.8660254,0.5
2021-01-03,0.5,0.866025,-2.449294e-16,1.0
2021-01-04,0.5,0.866025,0.0,1.0
2021-01-05,0.5,0.866025,0.8660254,0.5
2021-01-06,0.5,0.866025,0.8660254,-0.5
2021-01-07,0.5,0.866025,1.224647e-16,-1.0
2021-01-08,0.5,0.866025,-0.8660254,-0.5
2021-01-09,0.5,0.866025,-0.8660254,0.5
2021-01-10,0.5,0.866025,-2.449294e-16,1.0


## Custom Transformer

In [9]:
# skforecast CalendarFeatures transformer
# ======================================================================================
# Transformer-based approach: same feature list and cyclic encoding as the
# function call above, applied to the DataFrame `df`.
datetime_transformer = CalendarFeatures(
    features=['month', 'day_of_week'],
    cyclic_encoding=True,
)
datetime_features = datetime_transformer.transform(df)
display(datetime_features)

Unnamed: 0,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-01,0.5,0.866025,-0.8660254,-0.5
2021-01-02,0.5,0.866025,-0.8660254,0.5
2021-01-03,0.5,0.866025,-2.449294e-16,1.0
2021-01-04,0.5,0.866025,0.0,1.0
2021-01-05,0.5,0.866025,0.8660254,0.5
2021-01-06,0.5,0.866025,0.8660254,-0.5
2021-01-07,0.5,0.866025,1.224647e-16,-1.0
2021-01-08,0.5,0.866025,-0.8660254,-0.5
2021-01-09,0.5,0.866025,-0.8660254,0.5
2021-01-10,0.5,0.866025,-2.449294e-16,1.0


# Utilizando feature-engine

In [12]:
from feature_engine.datetime import DatetimeFeatures
from feature_engine.creation import CyclicalFeatures
from sklearn.pipeline import Pipeline


# Step 1: derive calendar columns (year, month, day of month) from the
# DataFrame index.
dtf = DatetimeFeatures(
    features_to_extract=["year", "month", "day_of_month"],
    variables="index",
    drop_original=True,
)

# Step 2: sine/cosine encoding using the period supplied for each variable.
# NOTE(review): "year" is listed in `variables` but has no entry in
# `max_values`, and the recorded output shows no year_sin/year_cos columns -
# confirm whether "year" should be listed here at all.
cf = CyclicalFeatures(
    variables=["year", "month", "day_of_month"],
    max_values={"month": 12, "day_of_month": 31},
)

# Pipeline with both transformers. Fitting the pipeline fits every step, so
# a separate `dtf.fit_transform(df)` call (whose result was discarded) is
# not needed.
pipe = Pipeline([
    ("dtf", dtf),
    ("cf", cf),
])
X_transformed = pipe.fit_transform(df)
X_transformed

Unnamed: 0,feature1,feature2,year,month,day_of_month,month_sin,month_cos,day_of_month_sin,day_of_month_cos
2021-01-01,0.407161,0.307888,2021,1,1,0.5,0.866025,0.201299,0.97953
2021-01-02,0.144428,0.61718,2021,1,2,0.5,0.866025,0.394356,0.918958
2021-01-03,0.044078,0.436368,2021,1,3,0.5,0.866025,0.571268,0.820763
2021-01-04,0.452743,0.565892,2021,1,4,0.5,0.866025,0.724793,0.688967
2021-01-05,0.112728,0.897489,2021,1,5,0.5,0.866025,0.848644,0.528964
2021-01-06,0.822022,0.00667,2021,1,6,0.5,0.866025,0.937752,0.347305
2021-01-07,0.288623,0.446365,2021,1,7,0.5,0.866025,0.988468,0.151428
2021-01-08,0.550883,0.080085,2021,1,8,0.5,0.866025,0.998717,-0.050649
2021-01-09,0.685144,0.080707,2021,1,9,0.5,0.866025,0.968077,-0.250653
2021-01-10,0.946271,0.261136,2021,1,10,0.5,0.866025,0.897805,-0.440394


# Examples

In [79]:
# Larger toy data set for the forecaster examples: `n` daily observations.
# NOTE(review): no random seed is set in the visible cells, so the values
# presumably change on every run - confirm whether that is acceptable.
n = 100
date_range = pd.date_range(start='2021-01-01', periods=n, freq='D')

# Target series and a two-column exogenous frame sharing the same index.
y = pd.Series(np.random.rand(n), index=date_range)
df = pd.DataFrame(
    np.random.rand(n, 2),
    index=date_range,
    columns=['feature1', 'feature2'],
)

# Preview the first three rows of each object.
display(y.head(3), df.head(3))

2021-01-01    0.543937
2021-01-02    0.127005
2021-01-03    0.292044
Freq: D, dtype: float64

Unnamed: 0,feature1,feature2
2021-01-01,0.004303,0.807228
2021-01-02,0.771435,0.674816
2021-01-03,0.42593,0.485995


In [81]:
# Unfitted forecaster: pass the extraction function directly through the
# proposed `datetime_features` argument.
forecaster = ForecasterAutoreg(
    regressor=Ridge(),
    lags=3,
    datetime_features=extract_calendar_features,
)
print(forecaster)

# Build the training matrices without fitting, to inspect the columns that
# are generated next to the lags.
X_train, y_train = forecaster.create_train_X_y(y=y)
X_train

ForecasterAutoreg 
Regressor: Ridge() 
Lags: [1 2 3] 
Transformer for y: None 
Transformer for exog: None 
Window size: 3 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Exogenous variables names: None 
Datetime features: [('extract_calendar_features', {})] 
Datetime features names: None 
Training range: None 
Training index type: None 
Training index frequency: None 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-14 20:47:36 
Last fit date: None 
Skforecast version: 0.13.0 
Python version: 3.12.4 
Forecaster id: None 



Unnamed: 0,lag_1,lag_2,lag_3,year,month,week,day_of_week,day_of_year,day_of_month,weekend,hour,minute,second
2021-01-04,0.292044,0.127005,0.543937,2021,1,1,0,4,4,False,0,0,0
2021-01-05,0.013112,0.292044,0.127005,2021,1,1,1,5,5,False,0,0,0
2021-01-06,0.485419,0.013112,0.292044,2021,1,1,2,6,6,False,0,0,0
2021-01-07,0.930663,0.485419,0.013112,2021,1,1,3,7,7,False,0,0,0
2021-01-08,0.816790,0.930663,0.485419,2021,1,1,4,8,8,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-06,0.738870,0.170019,0.443217,2021,4,14,1,96,6,False,0,0,0
2021-04-07,0.989179,0.738870,0.170019,2021,4,14,2,97,7,False,0,0,0
2021-04-08,0.559607,0.989179,0.738870,2021,4,14,3,98,8,False,0,0,0
2021-04-09,0.897082,0.559607,0.989179,2021,4,14,4,99,9,False,0,0,0


In [85]:
# Same configuration as the previous example, but fitted before printing so
# that the summary reflects the state after training.
forecaster = ForecasterAutoreg(
    regressor=Ridge(),
    lags=3,
    datetime_features=extract_calendar_features,
)
forecaster.fit(y=y)
print(forecaster)

# Training matrices for inspection.
X_train, y_train = forecaster.create_train_X_y(y=y)
X_train

ForecasterAutoreg 
Regressor: Ridge() 
Lags: [1 2 3] 
Transformer for y: None 
Transformer for exog: None 
Window size: 3 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Exogenous variables names: None 
Datetime features: [('extract_calendar_features', {})] 
Datetime features names: ['year', 'month', 'week', 'day_of_week', 'day_of_year', 'day_of_month', 'weekend', 'hour', 'minute', 'second'] 
Training range: [Timestamp('2021-01-01 00:00:00'), Timestamp('2021-04-10 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-14 20:48:36 
Last fit date: 2024-07-14 20:48:36 
Skforecast version: 0.13.0 
Python version: 3.12.4 
Forecaster id: None 



Unnamed: 0,lag_1,lag_2,lag_3,year,month,week,day_of_week,day_of_year,day_of_month,weekend,hour,minute,second
2021-01-04,0.292044,0.127005,0.543937,2021,1,1,0,4,4,False,0,0,0
2021-01-05,0.013112,0.292044,0.127005,2021,1,1,1,5,5,False,0,0,0
2021-01-06,0.485419,0.013112,0.292044,2021,1,1,2,6,6,False,0,0,0
2021-01-07,0.930663,0.485419,0.013112,2021,1,1,3,7,7,False,0,0,0
2021-01-08,0.816790,0.930663,0.485419,2021,1,1,4,8,8,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-06,0.738870,0.170019,0.443217,2021,4,14,1,96,6,False,0,0,0
2021-04-07,0.989179,0.738870,0.170019,2021,4,14,2,97,7,False,0,0,0
2021-04-08,0.559607,0.989179,0.738870,2021,4,14,3,98,8,False,0,0,0
2021-04-09,0.897082,0.559607,0.989179,2021,4,14,4,99,9,False,0,0,0


In [88]:
# `datetime_features` given as a list of (function, kwargs) pairs: the same
# extraction function is used twice with different keyword arguments.
month_spec = (
    extract_calendar_features,
    {"features": ["month"], "cyclic_encoding": True},
)
day_of_week_spec = (
    extract_calendar_features,
    {"features": ["day_of_week"], "cyclic_encoding": True},
)

forecaster = ForecasterAutoreg(
    regressor=Ridge(),
    lags=3,
    datetime_features=[month_spec, day_of_week_spec],
)
forecaster.fit(y=y)
print(forecaster)

# Training matrices for inspection.
X_train, y_train = forecaster.create_train_X_y(y=y)
X_train

ForecasterAutoreg 
Regressor: Ridge() 
Lags: [1 2 3] 
Transformer for y: None 
Transformer for exog: None 
Window size: 3 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Exogenous variables names: None 
Datetime features: [('extract_calendar_features', {'features': ['month'], 'cyclic_encoding': True}), ('extract_calendar_features', {'features': ['day_of_week'], 'cyclic_encoding': True})] 
Datetime features names: ['month_sin', 'month_cos', 'day_of_week_sin', 'day_of_week_cos'] 
Training range: [Timestamp('2021-01-01 00:00:00'), Timestamp('2021-04-10 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-14 20:49:48 
Last fit date: 2024-07-14 20:49:48 
Skforecast version: 0.13.0 
Python version: 3.12.4 
Forecaster id: 

Unnamed: 0,lag_1,lag_2,lag_3,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-04,0.292044,0.127005,0.543937,0.500000,0.866025,0.000000e+00,1.0
2021-01-05,0.013112,0.292044,0.127005,0.500000,0.866025,8.660254e-01,0.5
2021-01-06,0.485419,0.013112,0.292044,0.500000,0.866025,8.660254e-01,-0.5
2021-01-07,0.930663,0.485419,0.013112,0.500000,0.866025,1.224647e-16,-1.0
2021-01-08,0.816790,0.930663,0.485419,0.500000,0.866025,-8.660254e-01,-0.5
...,...,...,...,...,...,...,...
2021-04-06,0.738870,0.170019,0.443217,0.866025,-0.500000,8.660254e-01,0.5
2021-04-07,0.989179,0.738870,0.170019,0.866025,-0.500000,8.660254e-01,-0.5
2021-04-08,0.559607,0.989179,0.738870,0.866025,-0.500000,1.224647e-16,-1.0
2021-04-09,0.897082,0.559607,0.989179,0.866025,-0.500000,-8.660254e-01,-0.5


In [91]:
# Alternative route: supply the calendar transformer through
# `transformer_exog` instead of `datetime_features`.
forecaster = ForecasterAutoreg(
    regressor=Ridge(),
    lags=3,
    transformer_exog=CalendarFeatures(
        features=['month', 'day_of_week'],
        cyclic_encoding=True,
    ),
)

# NOTE(review): the forecaster is fitted WITHOUT exog, while
# create_train_X_y below receives a placeholder exog whose cells are all
# missing. The recorded summary reports "Exogenous included: False" and the
# resulting month/day_of_week columns are empty - confirm whether this cell
# is meant to illustrate the limitation of `transformer_exog`.
forecaster.fit(y=y)
print(forecaster)

# Placeholder exogenous frame: same index as `y`, no values.
exog_placeholder = pd.DataFrame(index=y.index, columns=['month', 'day_of_week'])
X_train, y_train = forecaster.create_train_X_y(y, exog=exog_placeholder)
X_train

ForecasterAutoreg 
Regressor: Ridge() 
Lags: [1 2 3] 
Transformer for y: None 
Transformer for exog: CalendarFeatures(cyclic_encoding=True, features=['month', 'day_of_week']) 
Window size: 3 
Weight function included: False 
Differentiation order: None 
Exogenous included: False 
Exogenous variables names: None 
Datetime features: None 
Datetime features names: None 
Training range: [Timestamp('2021-01-01 00:00:00'), Timestamp('2021-04-10 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.0001} 
fit_kwargs: {} 
Creation date: 2024-07-14 20:51:04 
Last fit date: 2024-07-14 20:51:04 
Skforecast version: 0.13.0 
Python version: 3.12.4 
Forecaster id: None 





Unnamed: 0,lag_1,lag_2,lag_3,month,day_of_week
2021-01-04,0.292044,0.127005,0.543937,,
2021-01-05,0.013112,0.292044,0.127005,,
2021-01-06,0.485419,0.013112,0.292044,,
2021-01-07,0.930663,0.485419,0.013112,,
2021-01-08,0.816790,0.930663,0.485419,,
...,...,...,...,...,...
2021-04-06,0.738870,0.170019,0.443217,,
2021-04-07,0.989179,0.738870,0.170019,,
2021-04-08,0.559607,0.989179,0.738870,,
2021-04-09,0.897082,0.559607,0.989179,,
