In [15]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'/home/ubuntu/varios/skforecast'

# Descripción

Propuesta para que los forecasters tengan un argumento `datetime_features` con el que pasar una función o un transformer que extraiga características de un indice datetime.

Tecnicamente esto puede hacerse con un transformer en exog, pero pensando en el usuario, y dado que en series temporales las features de calendario son muy comunes, sería interesante tener un argumento que permita hacerlo de forma más sencilla.

Comparar posinilidades utilizando sklearn  y con feature-engine

# Aproximación con sklearn

In [16]:
import pandas as pd
import numpy as np

date_range = pd.date_range(start='2021-01-01', periods=10, freq='D')
df = pd.DataFrame(np.random.rand(10, 2), index=date_range, columns=['feature1', 'feature2'])
df

Unnamed: 0,feature1,feature2
2021-01-01,0.864645,0.513011
2021-01-02,0.034696,0.984087
2021-01-03,0.833921,0.926339
2021-01-04,0.542774,0.805903
2021-01-05,0.972923,0.630587
2021-01-06,0.807533,0.32358
2021-01-07,0.021694,0.578052
2021-01-08,0.56151,0.613682
2021-01-09,0.848022,0.979702
2021-01-10,0.950405,0.908207


## Using FunctionTransformer

Constructs a transformer from an arbitrary callable.

A FunctionTransformer forwards its X (and optionally y) arguments to a user-defined function or function object and returns the result of this function. This is useful for stateless transformations such as taking the log of frequencies, doing custom scaling, etc.

In [17]:
# Using FunctionTransformer
# ======================================================================================
from sklearn.preprocessing import FunctionTransformer

# Function to extract month and day from datetime index
def extract_datetime_features(X):

    if not isinstance(X, pd.DataFrame):
        raise ValueError("Input must be a DataFrame")
    if not isinstance(X.index, pd.DatetimeIndex):
        raise ValueError("Dataframe must have a datetime index")
    
    features = pd.DataFrame(index=X.index)
    features['month'] = X.index.month
    features['day_of_week'] = X.index.day
    features['month_sin'] = np.sin(2 * np.pi * features['month'] / 12)
    features['month_cos'] = np.cos(2 * np.pi * features['month'] / 12)
    features['day_of_week_sin'] = np.sin(2 * np.pi * features['day_of_week'] / 6)
    features['day_of_week_cos'] = np.cos(2 * np.pi * features['day_of_week'] / 6)
    features = features.drop(columns=['month', 'day_of_week'])

    return features

In [18]:
# Create the FunctionTransformer
datetime_transformer = FunctionTransformer(extract_datetime_features)

# Transform the DataFrame
datetime_features = datetime_transformer.transform(df)
datetime_features

Unnamed: 0,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-01,0.5,0.866025,0.8660254,0.5
2021-01-02,0.5,0.866025,0.8660254,-0.5
2021-01-03,0.5,0.866025,1.224647e-16,-1.0
2021-01-04,0.5,0.866025,-0.8660254,-0.5
2021-01-05,0.5,0.866025,-0.8660254,0.5
2021-01-06,0.5,0.866025,-2.449294e-16,1.0
2021-01-07,0.5,0.866025,0.8660254,0.5
2021-01-08,0.5,0.866025,0.8660254,-0.5
2021-01-09,0.5,0.866025,3.67394e-16,-1.0
2021-01-10,0.5,0.866025,-0.8660254,-0.5


## Custom Transformer

In [19]:
# Version simple
# ======================================================================================
from sklearn.base import BaseEstimator, TransformerMixin

class DateTimeFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a DataFrame")
        if not isinstance(X.index, pd.DatetimeIndex):
            raise ValueError("Dataframe must have a datetime index")
        
        features = pd.DataFrame(index=X.index)
        features['month'] = X.index.month
        features['day_of_week'] = X.index.day
        features['month_sin'] = np.sin(2 * np.pi * features['month'] / 12)
        features['month_cos'] = np.cos(2 * np.pi * features['month'] / 12)
        features['day_of_week_sin'] = np.sin(2 * np.pi * features['day_of_week'] / 6)
        features['day_of_week_cos'] = np.cos(2 * np.pi * features['day_of_week'] / 6)
        features = features.drop(columns=['month', 'day_of_week'])

        return features

In [20]:
# Create the Transformer
datetime_transformer = DateTimeFeatures()

# Transform the DataFrame
datetime_features = datetime_transformer.transform(df)
datetime_features

Unnamed: 0,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-01,0.5,0.866025,0.8660254,0.5
2021-01-02,0.5,0.866025,0.8660254,-0.5
2021-01-03,0.5,0.866025,1.224647e-16,-1.0
2021-01-04,0.5,0.866025,-0.8660254,-0.5
2021-01-05,0.5,0.866025,-0.8660254,0.5
2021-01-06,0.5,0.866025,-2.449294e-16,1.0
2021-01-07,0.5,0.866025,0.8660254,0.5
2021-01-08,0.5,0.866025,0.8660254,-0.5
2021-01-09,0.5,0.866025,3.67394e-16,-1.0
2021-01-10,0.5,0.866025,-0.8660254,-0.5


In [21]:
# Version skforecast DateTimeFeatures
# ======================================================================================
from skforecast.preprocessing import DateTimeFeatures

datetime_transformer = DateTimeFeatures(features=['month', 'day_of_week'])
datetime_features = datetime_transformer.transform(df)
display(datetime_features)

Unnamed: 0,month,day_of_week
2021-01-01,1,4
2021-01-02,1,5
2021-01-03,1,6
2021-01-04,1,0
2021-01-05,1,1
2021-01-06,1,2
2021-01-07,1,3
2021-01-08,1,4
2021-01-09,1,5
2021-01-10,1,6


In [22]:
datetime_transformer = DateTimeFeatures(features=['month', 'day_of_week'], cyclic_encoding=True)
datetime_features = datetime_transformer.transform(df)
display(datetime_features)

Unnamed: 0,month_sin,month_cos,day_of_week_sin,day_of_week_cos
2021-01-01,0.5,0.866025,-0.8660254,-0.5
2021-01-02,0.5,0.866025,-0.8660254,0.5
2021-01-03,0.5,0.866025,-2.449294e-16,1.0
2021-01-04,0.5,0.866025,0.0,1.0
2021-01-05,0.5,0.866025,0.8660254,0.5
2021-01-06,0.5,0.866025,0.8660254,-0.5
2021-01-07,0.5,0.866025,1.224647e-16,-1.0
2021-01-08,0.5,0.866025,-0.8660254,-0.5
2021-01-09,0.5,0.866025,-0.8660254,0.5
2021-01-10,0.5,0.866025,-2.449294e-16,1.0


In [23]:
datetime_transformer = DateTimeFeatures(cyclic_encoding=True)
datetime_features = datetime_transformer.transform(df)
display(datetime_features)

Unnamed: 0,year,weekend,month_sin,month_cos,week_sin,week_cos,day_of_week_sin,day_of_week_cos,day_of_month_sin,day_of_month_cos,day_of_year_sin,day_of_year_cos,hour_sin,hour_cos,minute_sin,minute_cos,second_sin,second_cos
2021-01-01,2021,False,0.5,0.866025,0.120537,0.992709,-0.8660254,-0.5,0.201299,0.97953,0.017166,0.999853,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-02,2021,True,0.5,0.866025,0.120537,0.992709,-0.8660254,0.5,0.394356,0.918958,0.034328,0.999411,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-03,2021,True,0.5,0.866025,0.120537,0.992709,-2.449294e-16,1.0,0.571268,0.820763,0.051479,0.998674,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-04,2021,False,0.5,0.866025,0.120537,0.992709,0.0,1.0,0.724793,0.688967,0.068615,0.997643,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-05,2021,False,0.5,0.866025,0.120537,0.992709,0.8660254,0.5,0.848644,0.528964,0.085731,0.996318,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-06,2021,False,0.5,0.866025,0.120537,0.992709,0.8660254,-0.5,0.937752,0.347305,0.102821,0.9947,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-07,2021,False,0.5,0.866025,0.120537,0.992709,1.224647e-16,-1.0,0.988468,0.151428,0.119881,0.992788,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-08,2021,False,0.5,0.866025,0.120537,0.992709,-0.8660254,-0.5,0.998717,-0.050649,0.136906,0.990584,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-09,2021,True,0.5,0.866025,0.120537,0.992709,-0.8660254,0.5,0.968077,-0.250653,0.153891,0.988088,0.0,1.0,0.0,1.0,0.0,1.0
2021-01-10,2021,True,0.5,0.866025,0.120537,0.992709,-2.449294e-16,1.0,0.897805,-0.440394,0.17083,0.985301,0.0,1.0,0.0,1.0,0.0,1.0


# Utilizando feature-engine

In [30]:
from feature_engine.datetime import DatetimeFeatures
from feature_engine.creation import CyclicalFeatures
from sklearn.pipeline import Pipeline


dtf = DatetimeFeatures(
    features_to_extract = ["year", "month", "day_of_month"],
    variables = "index",
    drop_original=True
    )
cf = CyclicalFeatures(
    variables=["year", "month", "day_of_month"],
    max_values={"month": 12, "day_of_month": 31}
)

dtf.fit_transform(df)

# pipeline with both transformers
pipe = Pipeline([
    ("dtf", dtf),
    ("cf", cf)
])
pipe.fit(df)
X_transformed = pipe.transform(df)
X_transformed

Unnamed: 0,feature1,feature2,year,month,day_of_month,month_sin,month_cos,day_of_month_sin,day_of_month_cos
2021-01-01,0.864645,0.513011,2021,1,1,0.5,0.866025,0.201299,0.97953
2021-01-02,0.034696,0.984087,2021,1,2,0.5,0.866025,0.394356,0.918958
2021-01-03,0.833921,0.926339,2021,1,3,0.5,0.866025,0.571268,0.820763
2021-01-04,0.542774,0.805903,2021,1,4,0.5,0.866025,0.724793,0.688967
2021-01-05,0.972923,0.630587,2021,1,5,0.5,0.866025,0.848644,0.528964
2021-01-06,0.807533,0.32358,2021,1,6,0.5,0.866025,0.937752,0.347305
2021-01-07,0.021694,0.578052,2021,1,7,0.5,0.866025,0.988468,0.151428
2021-01-08,0.56151,0.613682,2021,1,8,0.5,0.866025,0.998717,-0.050649
2021-01-09,0.848022,0.979702,2021,1,9,0.5,0.866025,0.968077,-0.250653
2021-01-10,0.950405,0.908207,2021,1,10,0.5,0.866025,0.897805,-0.440394


¿¿¿¿¿¿ Como se podría hacer con feature-engine para que no se devuelvan las columnas originales?

In [31]:
X_transformed = pipe.transform(pd.DataFrame(index=df.index))

ValueError: 0 feature(s) (shape=(10, 0)) while a minimum of 1 is required.