In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Add the parent of the current working directory to the import search path.
# Presumably this makes the local skforecast checkout importable when the
# notebook is run from a sub-folder of the repository -- verify against the
# folder the notebook lives in.
path = str(Path.cwd().parent)
print(path)
# Inserted at position 1, i.e. right after the first sys.path entry.
sys.path.insert(1, path)

c:\Users\jaesc2\GitHub\skforecast


In [21]:
import re
import pytest
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from typing import Any, Union, Optional

from skforecast.preprocessing import RollingFeatures
from skforecast.preprocessing import TimeSeriesDifferentiator
from skforecast.recursive import ForecasterRecursive

# Fixtures
from skforecast.recursive.tests.tests_forecaster_recursive.fixtures_forecaster_recursive import y as y_categorical
from skforecast.recursive.tests.tests_forecaster_recursive.fixtures_forecaster_recursive import exog as exog_categorical
from skforecast.recursive.tests.tests_forecaster_recursive.fixtures_forecaster_recursive import exog_predict as exog_predict_categorical
from skforecast.recursive.tests.tests_forecaster_recursive.fixtures_forecaster_recursive import data  # to test results when using differentiation

In [32]:
from numba import njit



class RollingFeaturesExternal():
    """
    This class computes rolling features. To avoid data leakage, the last point 
    in the window is excluded from calculations, ('closed': 'left' and 
    'center': False).

    Two entry points are provided: `transform_batch` computes the features for 
    a whole pandas Series using pandas rolling windows, and `transform` computes 
    the features from the last `window_size` observations of a numpy array. 
    Both support every statistic listed in `stats`.

    Parameters
    ----------
    stats : str, list
        Statistics to compute over the rolling window. Can be a `string` or a `list`,
        and can have repeats. Available statistics are: 'mean', 'std', 'min', 'max',
        'sum', 'median', 'ratio_min_max', 'coef_variation'.
    window_sizes : int, list
        Size of the rolling window for each statistic. If an `int`, all stats share 
        the same window size. If a `list`, it should have the same length as stats.
    min_periods : int, list, default `None`
        Minimum number of observations in window required to have a value. 
        Similar to pandas rolling `min_periods` argument. If `None`, defaults 
        to `window_sizes`.
    features_names : list, default `None`
        Names of the output features. If `None`, default names will be used in the 
        format 'roll_stat_window_size', for example 'roll_mean_7'.
    fillna : str, float, default `None`
        Fill missing values in `transform_batch` method. Available 
        methods are: 'mean', 'median', 'ffill', 'bfill', or a float value.
    
    Attributes
    ----------
    stats : list
        Statistics to compute over the rolling window.
    n_stats : int
        Number of statistics to compute.
    window_sizes : list
        Size of the rolling window for each statistic.
    max_window_size : int
        Maximum window size.
    min_periods : list
        Minimum number of observations in window required to have a value.
    features_names : list
        Names of the output features.
    fillna : str, float
        Method to fill missing values in `transform_batch` method.
    unique_rolling_windows : dict
        Dictionary containing unique rolling window parameters and the corresponding
        statistics.
        
    """

    # Single source of truth for the statistics accepted by the validation and 
    # implemented by both the pandas and the numpy code paths.
    _ALLOWED_STATS = (
        'mean', 'std', 'min', 'max', 'sum', 'median', 
        'ratio_min_max', 'coef_variation'
    )

    def __init__(
        self, 
        stats: Union[str, list],
        window_sizes: Union[int, list],
        min_periods: Optional[Union[int, list]] = None,
        features_names: Optional[list] = None, 
        fillna: Optional[Union[str, float]] = None
    ) -> None:
        
        self._validate_params(
            stats,
            window_sizes,
            min_periods,
            features_names,
            fillna
        )

        if isinstance(stats, str):
            stats = [stats]
        self.stats = stats
        self.n_stats = len(stats)

        if isinstance(window_sizes, int):
            window_sizes = [window_sizes] * self.n_stats
        self.window_sizes = window_sizes
        self.max_window_size = max(window_sizes)
        
        if min_periods is None:
            # Copy instead of aliasing: otherwise mutating `window_sizes` 
            # later would silently change `min_periods` as well.
            min_periods = list(self.window_sizes)
        elif isinstance(min_periods, int):
            min_periods = [min_periods] * self.n_stats
        self.min_periods = min_periods

        if features_names is None:
            features_names = [
                f"roll_{stat}_{window_size}" 
                for stat, window_size in zip(self.stats, self.window_sizes)
            ]
        self.features_names = features_names
        
        self.fillna = fillna

        # Group the statistics by their (window, min_periods) combination so 
        # that `transform_batch` builds one pandas rolling object per unique 
        # combination instead of one per statistic.
        unique_rolling_windows = {}
        for i, (window_size, min_period) in enumerate(
            zip(self.window_sizes, self.min_periods)
        ):
            key = f"{window_size}_{min_period}"
            if key not in unique_rolling_windows:
                unique_rolling_windows[key] = {
                    'params': {
                        'window': window_size, 
                        'min_periods': min_period, 
                        'center': False,
                        'closed': 'left'
                    },
                    'stats_idx': [], 
                    'stats_names': [], 
                    'rolling_obj': None
                }
            unique_rolling_windows[key]['stats_idx'].append(i)
            unique_rolling_windows[key]['stats_names'].append(self.features_names[i])

        self.unique_rolling_windows = unique_rolling_windows

    def _validate_params(
        self, 
        stats, 
        window_sizes, 
        min_periods: Optional[Union[int, list]] = None,
        features_names: Optional[list] = None, 
        fillna: Optional[Union[str, float]] = None
    ) -> None:
        """
        Validate the parameters of the RollingFeatures class.

        Parameters
        ----------
        stats : str, list
            Statistics to compute over the rolling window. Can be a `string` or a `list`,
            and can have repeats. Available statistics are: 'mean', 'std', 'min', 'max',
            'sum', 'median', 'ratio_min_max', 'coef_variation'.
        window_sizes : int, list
            Size of the rolling window for each statistic. If an `int`, all stats share 
            the same window size. If a `list`, it should have the same length as stats.
        min_periods : int, list, default `None`
            Minimum number of observations in window required to have a value. 
            Similar to pandas rolling `min_periods` argument. If `None`, defaults 
            to `window_sizes`.
        features_names : list, default `None`
            Names of the output features. If `None`, default names will be used in the 
            format 'roll_stat_window_size', for example 'roll_mean_7'.
        fillna : str, float, default `None`
            Fill missing values in `transform_batch` method. Available 
            methods are: 'mean', 'median', 'ffill', 'bfill', or a float value.

        Returns
        -------
        None

        Raises
        ------
        TypeError
            If any argument has an unsupported type.
        ValueError
            If a statistic or fill strategy is not allowed, if list lengths do 
            not match the number of statistics, if a (stat, window_size) pair 
            is repeated, or if a min_period exceeds its window size.

        """

        # stats
        if not isinstance(stats, (str, list)):
            raise TypeError(
                f"`stats` must be a string or a list of strings. Got {type(stats)}."
            )        
        
        if isinstance(stats, str):
            stats = [stats]
        allowed_stats = list(self._ALLOWED_STATS)
        for stat in set(stats):
            if stat not in allowed_stats:
                raise ValueError(
                    f"Statistic '{stat}' is not allowed. Allowed stats are: {allowed_stats}."
                )
        
        n_stats = len(stats)
        
        # window_sizes
        if not isinstance(window_sizes, (int, list)):
            raise TypeError(
                f"`window_sizes` must be an int or a list of ints. Got {type(window_sizes)}."
            )
        
        if isinstance(window_sizes, list):
            n_window_sizes = len(window_sizes)
            if n_window_sizes != n_stats:
                raise ValueError(
                    (f"Length of `window_sizes` list ({n_window_sizes}) "
                     f"must match length of `stats` list ({n_stats}).")
                )
            
        # Check duplicates (stats, window_sizes)
        if isinstance(window_sizes, int):
            window_sizes = [window_sizes] * n_stats
        if len(set(zip(stats, window_sizes))) != n_stats:
            raise ValueError("Duplicate (stat, window_size) pairs are not allowed.")
        
        # min_periods
        if not isinstance(min_periods, (int, list, type(None))):
            raise TypeError(
                f"`min_periods` must be an int, list of ints, or None. Got {type(min_periods)}."
            )
        
        if min_periods is not None:
            if isinstance(min_periods, int):
                min_periods = [min_periods] * n_stats
            elif isinstance(min_periods, list):
                n_min_periods = len(min_periods)
                if n_min_periods != n_stats:
                    raise ValueError(
                        (f"Length of `min_periods` list ({n_min_periods}) "
                         f"must match length of `stats` list ({n_stats}).")
                    )
            
            for i, min_period in enumerate(min_periods):
                if min_period > window_sizes[i]:
                    raise ValueError(
                        ("Each min_period must be less than or equal to its "
                         "corresponding window_size.")
                    )
        
        # features_names
        if not isinstance(features_names, (list, type(None))):
            raise TypeError(
                f"`features_names` must be a list of strings or None. Got {type(features_names)}."
            )
        
        if isinstance(features_names, list):
            n_features_names = len(features_names)
            if n_features_names != n_stats:
                raise ValueError(
                    (f"Length of `features_names` list ({n_features_names}) "
                     f"must match length of `stats` list ({n_stats}).")
                )
        
        # fillna
        if fillna is not None:
            if not isinstance(fillna, (int, float, str)):
                raise TypeError(
                    f"`fillna` must be a float, string, or None. Got {type(fillna)}."
                )
            
            if isinstance(fillna, str):
                allowed_fill_strategy = ['mean', 'median', 'ffill', 'bfill']
                if fillna not in allowed_fill_strategy:
                    raise ValueError(
                        (f"'{fillna}' is not allowed. Allowed `fillna` "
                         f"values are: {allowed_fill_strategy} or a float value.")
                    )

    def _apply_stat_pandas(
        self, 
        rolling_obj: pd.core.window.rolling.Rolling, 
        stat: str
    ) -> pd.Series:
        """
        Apply the specified statistic to a pandas rolling object.

        Parameters
        ----------
        rolling_obj : pandas Rolling
            Rolling object to apply the statistic.
        stat : str
            Statistic to compute.
        
        Returns
        -------
        stat_series : pandas Series
            Series with the computed statistic.

        Raises
        ------
        ValueError
            If `stat` is not one of the implemented statistics.
        
        """

        if stat == 'mean':
            return rolling_obj.mean()
        elif stat == 'std':
            return rolling_obj.std()
        elif stat == 'min':
            return rolling_obj.min()
        elif stat == 'max':
            return rolling_obj.max()
        elif stat == 'sum':
            return rolling_obj.sum()
        elif stat == 'median':
            return rolling_obj.median()
        elif stat == 'ratio_min_max':
            return rolling_obj.min() / rolling_obj.max()
        elif stat == 'coef_variation':
            return rolling_obj.std() / rolling_obj.mean()
        else:
            raise ValueError(f"Statistic '{stat}' is not implemented.")

    def transform_batch(
        self, 
        X: pd.Series
    ) -> pd.DataFrame:
        """
        Transform an entire pandas Series using rolling windows and compute the 
        specified statistics. The first `max_window_size` rows are dropped from 
        the result.

        Parameters
        ----------
        X : pandas Series
            The input data series to transform.

        Returns
        -------
        rolling_features : pandas DataFrame
            A DataFrame containing the rolling features, one column per 
            statistic, named after `features_names`.
        
        """

        # One rolling object per unique (window, min_periods) combination. The 
        # object built from X is cached on the instance under 'rolling_obj'.
        for k in self.unique_rolling_windows.keys():
            rolling_obj = X.rolling(**self.unique_rolling_windows[k]['params'])
            self.unique_rolling_windows[k]['rolling_obj'] = rolling_obj
        
        rolling_features = []
        for i, stat in enumerate(self.stats):
            window_size = self.window_sizes[i]
            min_periods = self.min_periods[i]

            key = f"{window_size}_{min_periods}"
            rolling_obj = self.unique_rolling_windows[key]['rolling_obj']

            stat_series = self._apply_stat_pandas(rolling_obj=rolling_obj, stat=stat)            
            rolling_features.append(stat_series)

        rolling_features = pd.concat(rolling_features, axis=1)
        rolling_features.columns = self.features_names
        rolling_features = rolling_features.iloc[self.max_window_size:]

        if self.fillna is not None:
            if self.fillna == 'mean':
                rolling_features = rolling_features.fillna(rolling_features.mean())
            elif self.fillna == 'median':
                rolling_features = rolling_features.fillna(rolling_features.median())
            elif self.fillna == 'ffill':
                rolling_features = rolling_features.ffill()
            elif self.fillna == 'bfill':
                rolling_features = rolling_features.bfill()
            else:
                # Any remaining value has been validated as a number.
                rolling_features = rolling_features.fillna(self.fillna)
        
        return rolling_features

    def _apply_stat_numpy_jit(
        self, 
        X_window: np.ndarray, 
        stat: str
    ) -> float:
        """
        Apply the specified statistic to a numpy array. Despite the method 
        name, the statistics are computed with plain NumPy calls; nothing is 
        compiled with Numba here.

        All statistics accepted by the class are implemented, using the same 
        definitions as `_apply_stat_pandas`: 'std' (and therefore 
        'coef_variation') is the sample standard deviation (`ddof=1`), which is 
        the default of pandas rolling `std`.

        Parameters
        ----------
        X_window : numpy array
            Array with the rolling window. It is expected to be free of NaNs 
            (`transform` removes them before calling this method).
        stat : str
            Statistic to compute.

        Returns
        -------
        stat_value : float
            Value of the computed statistic. NaN if the window is empty, or if 
            it holds fewer than two observations and the statistic needs the 
            standard deviation.

        Raises
        ------
        ValueError
            If `stat` is not one of the implemented statistics.
        
        """
        
        if stat not in self._ALLOWED_STATS:
            raise ValueError(f"Statistic '{stat}' is not implemented.")

        n_obs = X_window.size
        if n_obs == 0:
            # np.min / np.max raise on empty input and np.mean warns; an empty 
            # window simply has no value.
            return np.nan

        if stat == 'mean':
            return np.mean(X_window)
        if stat == 'min':
            return np.min(X_window)
        if stat == 'max':
            return np.max(X_window)
        if stat == 'sum':
            return np.sum(X_window)
        if stat == 'median':
            return np.median(X_window)

        # Division by zero yields inf / NaN, as in the pandas code path; 
        # silence the NumPy warnings that would otherwise be emitted.
        with np.errstate(divide='ignore', invalid='ignore'):
            if stat == 'ratio_min_max':
                return np.min(X_window) / np.max(X_window)

            # Sample standard deviation is undefined for a single observation.
            std_value = np.std(X_window, ddof=1) if n_obs > 1 else np.nan
            if stat == 'std':
                return std_value
            
            # Only 'coef_variation' remains.
            return std_value / np.mean(X_window)

    def transform(
        self, 
        X: np.ndarray
    ) -> np.ndarray:
        """
        Transform a numpy array using rolling windows and compute the 
        specified statistics. The returned array will have the shape 
        (X.shape[1] if exists, n_stats). For example, if X is a flat
        array, the output will have shape (n_stats,). If X is a 2D array,
        the output will have shape (X.shape[1], n_stats).

        Each statistic is computed over the last `window_size` rows of X, 
        after discarding NaN values found in that window.

        Parameters
        ----------
        X : numpy ndarray
            The input data array to transform.

        Returns
        -------
        rolling_features : numpy ndarray
            An array containing the computed statistics.
        
        """

        array_ndim = X.ndim
        if array_ndim == 1:
            X = X[:, np.newaxis]
            
        rolling_features = np.full(
            shape=(X.shape[1], self.n_stats), fill_value=np.nan, dtype=float
        )

        for i in range(X.shape[1]):
            for j, stat in enumerate(self.stats):
                X_window = X[-self.window_sizes[j]:, i]
                X_window = X_window[~np.isnan(X_window)]
                rolling_features[i, j] = self._apply_stat_numpy_jit(X_window, stat)

        if array_ndim == 1:
            rolling_features = rolling_features.ravel()
        
        return rolling_features

In [33]:
# Copy the imported fixtures and give each copy a daily DatetimeIndex, leaving
# the fixture objects themselves untouched.
y_datetime = y_categorical.copy()
y_datetime.index = pd.date_range(start='2001-01-01', periods=len(y_datetime), freq='D')
exog_datetime = exog_categorical.copy()
exog_datetime.index = pd.date_range(start='2001-01-01', periods=len(exog_datetime), freq='D')
exog_predict_datetime = exog_predict_categorical.copy()
# Hard-coded start: 2001-02-20 is the day after the last training timestamp
# reported in the printed forecaster summary (2001-02-19). It must be updated
# by hand if the length of `y_categorical` changes.
exog_predict_datetime.index = pd.date_range(start='2001-02-20', periods=len(exog_predict_datetime), freq='D')

# Window features come from the class defined in this notebook (not from
# skforecast.preprocessing.RollingFeatures): mean over 3 and sum over 5
# observations.
rolling = RollingFeaturesExternal(stats=['mean', 'sum'], window_sizes=[3, 5])
forecaster = ForecasterRecursive(
    LGBMRegressor(verbose=-1, random_state=123), lags=3, window_features=rolling
)
forecaster.fit(y=y_datetime, exog=exog_datetime)
predictions = forecaster.predict(steps=10, exog=exog_predict_datetime)
predictions

2001-02-20    0.532665
2001-02-21    0.505028
2001-02-22    0.505028
2001-02-23    0.505028
2001-02-24    0.532665
2001-02-25    0.532665
2001-02-26    0.532665
2001-02-27    0.505028
2001-02-28    0.505028
2001-03-01    0.532665
Freq: D, Name: pred, dtype: float64

In [34]:
# Display the fitted forecaster through the notebook's repr of the cell's last expression.
forecaster

In [35]:
# Plain-text summary of the forecaster (regressor, lags, window features, training range, ...).
print(forecaster)

ForecasterRecursive 
Regressor: LGBMRegressor(random_state=123, verbose=-1) 
Lags: [1 2 3] 
Window features: ['roll_mean_3', 'roll_sum_5'] 
Window size: 5 
Exogenous included: True 
Exogenous names: exog 
Transformer for y: None 
Transformer for exog: None 
Weight function included: False 
Differentiation order: None 
Training range: [Timestamp('2001-01-01 00:00:00'), Timestamp('2001-02-19 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: D 
Regressor parameters: 
    {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0,
    'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1,
    'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0,
    'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None,
    'random_state': 123, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0,
    'subsample_for_bin': 200000, 'subsample_freq': 0, 'verbose': -1} 
fit_kwargs: {} 
Creation date: 2024-10-22 16:08:32 

In [36]:
from skforecast.utils import save_forecaster, load_forecaster

# Persist the fitted forecaster to 'forecaster.joblib' (relative path, so it
# lands in the current working directory).
# NOTE(review): the window-features class is defined inside this notebook, not
# in an importable module -- confirm the saved file can be loaded from a fresh
# session where that class is not defined.
save_forecaster(forecaster, 'forecaster.joblib', verbose=True)

In [28]:
# Reload the forecaster from disk.
# NOTE(review): this cell's execution count (28) is lower than that of the
# save cell (36), so the file read here may come from an earlier run; re-run
# the notebook top to bottom before trusting the comparison below.
forecaster_loaded = load_forecaster(file_name='forecaster.joblib', verbose=False)

In [29]:
# Predict with the reloaded forecaster; the values are expected to match the
# predictions of the in-memory forecaster shown earlier.
# Note that this rebinds the notebook-global `predictions`.
predictions = forecaster_loaded.predict(steps=10, exog=exog_predict_datetime)
predictions

2001-02-20    0.532665
2001-02-21    0.505028
2001-02-22    0.505028
2001-02-23    0.505028
2001-02-24    0.532665
2001-02-25    0.532665
2001-02-26    0.532665
2001-02-27    0.505028
2001-02-28    0.505028
2001-03-01    0.532665
Freq: D, Name: pred, dtype: float64

In [31]:
import dill

# Save the forecaster with dill, as an alternative to `save_forecaster`.
with open('forecaster_model.pkl', 'wb') as f:
    dill.dump(forecaster, f)

# Load the forecaster (left disabled). Unpickling can execute arbitrary code,
# so only load files from a trusted source.
# with open('forecaster_model.pkl', 'rb') as f:
#     forecaster_loaded = dill.load(f)
