In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [2]:
from typing import Union, Tuple, Optional, Callable
import warnings
import logging
import sys
import numpy as np
import pandas as pd
import sklearn
from sklearn.exceptions import NotFittedError
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from copy import copy
import inspect
from joblib import Parallel, delayed

import skforecast
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.ForecasterBase import ForecasterBase
from skforecast.exceptions import MissingValuesWarning
from skforecast.exceptions import IgnoredArgumentWarning
from skforecast.utils import initialize_lags
from skforecast.utils import initialize_weights
from skforecast.utils import initialize_transformer_series
from skforecast.utils import check_select_fit_kwargs
from skforecast.utils import check_preprocess_series
from skforecast.utils import check_preprocess_exog_multiseries
from skforecast.utils import align_series_and_exog_multiseries
from skforecast.utils import get_exog_dtypes
from skforecast.utils import check_exog_dtypes
from skforecast.utils import check_interval
from skforecast.utils import check_predict_input
from skforecast.utils import preprocess_last_window
from skforecast.utils import expand_index
from skforecast.utils import transform_series
from skforecast.utils import transform_dataframe
from skforecast.utils import set_skforecast_warnings
from skforecast.preprocessing import TimeSeriesDifferentiator


# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from skforecast.datasets import fetch_dataset
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from skforecast.model_selection_multiseries import backtesting_forecaster_multiseries
from skforecast.model_selection_multiseries import grid_search_forecaster_multiseries
from skforecast.model_selection_multiseries import bayesian_search_forecaster_multiseries

In [3]:
forecaster = ForecasterAutoregMultiSeries(
                 regressor          = RandomForestRegressor(random_state=123),
                 lags               = 3,
                 encoding           = 'ordinal',
                 transformer_series = StandardScaler(),
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None,
                 differentiation    = None,
                 dropna_from_series = False,
                 fit_kwargs         = None,
                 forecaster_id      = None
             )

self = forecaster

In [4]:
def _create_train_X_y(
        self,
        series: Union[pd.DataFrame, dict],
        exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None,
        store_last_window: Union[bool, list]=True,
        parallel: bool=False
    ) -> Tuple[pd.DataFrame, pd.Series, dict, list, list, list, dict, dict]:
        """
        Create training matrices from multiple time series and exogenous
        variables. See Notes section for more details depending on the type of
        `series` and `exog`.
        
        Parameters
        ----------
        series : pandas DataFrame, dict
            Training time series.
        exog : pandas Series, pandas DataFrame, dict, default `None`
            Exogenous variable/s included as predictor/s.
        store_last_window : bool, list, default `True`
            Whether or not to store the last window of training data.

            - If `True`, last_window is stored for all series. 
            - If `list`, last_window is stored for the series present in the list.
            - If `False`, last_window is not stored.

        Returns
        -------
        X_train : pandas DataFrame
            Training values (predictors).
        y_train : pandas Series
            Values (target) of the time series related to each row of `X_train`.
        series_indexes : dict
            Dictionary with the index of each series.
        series_col_names : list
            Names of the series (levels) provided by the user during training.
        series_X_train : list
            Names of the series (levels) included in the matrix `X_train` created
            internally for training. It can be different from `series_col_names` if
            some series are dropped during the training process because of NaNs or
            because they are not present in the training period.
        exog_col_names : list
            Names of the exogenous variables used during training.
        exog_dtypes : dict
            Type of each exogenous variable/s used in training. If `transformer_exog` 
            is used, the dtypes are calculated before the transformation.
        last_window : dict
            Last window of training data for each series. It stores the values 
            needed to predict the next `step` immediately after the training data.

        Notes
        -----
        - If `series` is a pandas DataFrame and `exog` is a pandas Series or 
        DataFrame, each exog is duplicated for each series. Exog must have the
        same index as `series` (type, length and frequency).
        - If `series` is a pandas DataFrame and `exog` is a dict of pandas Series 
        or DataFrames. Each key in `exog` must be a column in `series` and the 
        values are the exog for each series. Exog must have the same index as 
        `series` (type, length and frequency).
        - If `series` is a dict of pandas Series, `exog` must be a dict of pandas
        Series or DataFrames. The keys in `series` and `exog` must be the same.
        All series and exog must have a pandas DatetimeIndex with the same 
        frequency.
        
        """

        series_dict, series_indexes = check_preprocess_series(series=series)
        input_series_is_dict = isinstance(series, dict)
        series_col_names = list(series_dict.keys())

        if self.fitted and not (series_col_names == self.series_col_names):
            raise ValueError(
                (f"Once the Forecaster has been trained, `series` must have the "
                 f"same columns as the series used during training:\n" 
                 f" Got      : {series_col_names}\n"
                 f" Expected : {self.series_col_names}")
            )

        exog_dict = {serie: None for serie in series_col_names}
        exog_col_names = None
        if exog is not None:
            exog_dict, exog_col_names = check_preprocess_exog_multiseries(
                                            input_series_is_dict = input_series_is_dict,
                                            series_indexes       = series_indexes,
                                            series_col_names     = series_col_names,
                                            exog                 = exog,
                                            exog_dict            = exog_dict
                                        )

            if self.fitted:
                if self.exog_col_names is None:
                    raise ValueError(
                        ("Once the Forecaster has been trained, `exog` must be `None` "
                         "because no exogenous variables were added during training.")
                    )
                else:
                    if not set(exog_col_names) == set(self.exog_col_names):
                        raise ValueError(
                            (f"Once the Forecaster has been trained, `exog` must have the "
                             f"same columns as the series used during training:\n" 
                             f" Got      : {exog_col_names}\n"
                             f" Expected : {self.exog_col_names}")
                        )

        if not self.fitted:
            self.transformer_series_ = initialize_transformer_series(
                                           series_col_names = series_col_names,
                                           transformer_series = self.transformer_series
                                       )

        if self.differentiation is None:
            self.differentiator_ = {serie: None for serie in series_col_names}
        else:
            if not self.fitted:
                self.differentiator_ = {serie: clone(self.differentiator) 
                                        for serie in series_col_names}

        series_dict, exog_dict = align_series_and_exog_multiseries(
                                     series_dict          = series_dict,
                                     input_series_is_dict = input_series_is_dict,
                                     exog_dict            = exog_dict
                                 )

        ignore_exog = True if exog is None else False
        input_matrices = [
            [series_dict[k], exog_dict[k], ignore_exog]
             for k in series_dict.keys()
        ]

        if not parallel:
            # ======================================================================
            X_train_lags_buffer = []
            X_train_exog_buffer = []
            y_train_buffer = []
            for matrices in input_matrices:

                X_train_lags, X_train_exog, y_train = (
                    self._create_train_X_y_single_series(
                        y           = matrices[0],
                        exog        = matrices[1],
                        ignore_exog = matrices[2],
                    )
                )

                X_train_lags_buffer.append(X_train_lags)
                X_train_exog_buffer.append(X_train_exog)
                y_train_buffer.append(y_train)
            # ======================================================================
        else:
            def process_matrices(matrices):
                return self._create_train_X_y_single_series(
                            y           = matrices[0],
                            exog        = matrices[1],
                            ignore_exog = matrices[2],
                        )

            results = Parallel(n_jobs=-1)(delayed(process_matrices)(matrices) for matrices in input_matrices)
            X_train_lags_buffer, X_train_exog_buffer, y_train_buffer = zip(*results)

        X_train = pd.concat(X_train_lags_buffer, axis=0)
        y_train = pd.concat(y_train_buffer, axis=0)

        if self.fitted:
            encoded_values = self.encoder.transform(X_train[['_level_skforecast']])
        else:
            encoded_values = self.encoder.fit_transform(X_train[['_level_skforecast']])
            for i, code in enumerate(self.encoder.categories_[0]):
                self.encoding_mapping[code] = i

        X_train = pd.concat([
                      X_train.drop(columns='_level_skforecast'),
                      encoded_values
                  ], axis=1)

        if self.encoding == 'onehot':
            X_train.columns = X_train.columns.str.replace('_level_skforecast_', '')
        elif self.encoding == 'ordinal_category':
            X_train['_level_skforecast'] = (
                X_train['_level_skforecast'].astype('category')
            )

        del encoded_values

        exog_dtypes = None
        if exog is not None:

            X_train_exog = pd.concat(X_train_exog_buffer, axis=0)
            if '_dummy_exog_col_to_keep_shape' in X_train_exog.columns:
                X_train_exog = (
                    X_train_exog.drop(columns=['_dummy_exog_col_to_keep_shape'])
                )

            exog_col_names = X_train_exog.columns.to_list()
            exog_dtypes = get_exog_dtypes(exog=X_train_exog)

            fit_transformer = False if self.fitted else True
            X_train_exog = transform_dataframe(
                               df                = X_train_exog,
                               transformer       = self.transformer_exog,
                               fit               = fit_transformer,
                               inverse_transform = False
                           )

            check_exog_dtypes(X_train_exog, call_check_exog=False)
            if not (X_train_exog.index == X_train.index).all():
                raise ValueError(
                    ("Different index for `series` and `exog` after transformation. "
                     "They must be equal to ensure the correct alignment of values.")
                )

            X_train = pd.concat([X_train, X_train_exog], axis=1)

        if y_train.isnull().any():
            mask = y_train.notna().to_numpy()
            y_train = y_train.iloc[mask]
            X_train = X_train.iloc[mask,]
            warnings.warn(
                ("NaNs detected in `y_train`. They have been dropped because the "
                 "target variable cannot have NaN values. Same rows have been "
                 "dropped from `X_train` to maintain alignment. This is caused by "
                 "series with interspersed NaNs."),
                 MissingValuesWarning
            )

        if self.dropna_from_series:
            if X_train.isnull().any().any():
                mask = X_train.notna().all(axis=1).to_numpy()
                X_train = X_train.iloc[mask, ]
                y_train = y_train.iloc[mask]
                warnings.warn(
                    ("NaNs detected in `X_train`. They have been dropped. If "
                     "you want to keep them, set `forecaster.dropna_from_series = False`. " 
                     "Same rows have been removed from `y_train` to maintain alignment. "
                     "This caused by series with interspersed NaNs."),
                     MissingValuesWarning
                )
        else:
            if X_train.isnull().any().any():
                warnings.warn(
                    ("NaNs detected in `X_train`. Some regressors do not allow "
                     "NaN values during training. If you want to drop them, "
                     "set `forecaster.dropna_from_series = True`."),
                     MissingValuesWarning
                )

        if X_train.empty:
            raise ValueError(
                ("All samples have been removed due to NaNs. Set "
                 "`forecaster.dropna_from_series = False` or review `exog` values.")
            )
        
        if self.encoding == 'onehot':
            series_X_train = [
                col for col in series_col_names if X_train[col].sum() > 0
            ]
        else:
            series_X_train = [
                k for k, v in self.encoding_mapping.items()
                if v in X_train['_level_skforecast'].unique()
            ]

        # The last time window of training data is stored so that lags needed as
        # predictors in the first iteration of `predict()` can be calculated.
        last_window = None
        if store_last_window:

            series_to_store = (
                series_X_train if store_last_window is True else store_last_window
            )

            series_not_in_series_dict = set(series_to_store) - set(series_X_train)
            if series_not_in_series_dict:
                warnings.warn(
                    (f"Series {series_not_in_series_dict} are not present in "
                     f"`series`. No last window is stored for them."),
                    IgnoredArgumentWarning
                )
                series_to_store = [s for s in series_to_store 
                                   if s not in series_not_in_series_dict]

            if series_to_store:
                last_window = {
                    k: v.iloc[-self.window_size_diff:].copy()
                    for k, v in series_dict.items()
                    if k in series_to_store
                }

        return (
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        )

In [5]:
data = fetch_dataset(name="items_sales", verbose=False)
data_100 = pd.concat([data]*33, axis=1)
data_100.columns = [f'serie_{i}' for i in range(data_100.shape[1])]
data_1000 = pd.concat([data]*333, axis=1)
data_1000.columns = [f'serie_{i}' for i in range(data_1000.shape[1])]
print(f"Length of each series: {len(data)}")

Length of each series: 1097


In [6]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data, exog=None, store_last_window=True, parallel=False)

14.4 ms ± 326 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data, exog=None, store_last_window=True, parallel=True)

The slowest run took 48.25 times longer than the fastest. This could mean that an intermediate result is being cached.
260 ms ± 379 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data_100, exog=None, store_last_window=True, parallel=False)

434 ms ± 978 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data_100, exog=None, store_last_window=True, parallel=True)

357 ms ± 6.15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data_1000, exog=None, store_last_window=True, parallel=False)

11.2 s ± 66.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data_1000, exog=None, store_last_window=True, parallel=True)

10.6 s ± 169 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
data = fetch_dataset(name="items_sales", verbose=False)
data = pd.concat([data, data, data, data, data])
data.index = pd.date_range(start=data.index.min(), periods=len(data), freq=data.index.freq)

data_100 = pd.concat([data]*33, axis=1)
data_100.columns = [f'serie_{i}' for i in range(data_100.shape[1])]
data_1000 = pd.concat([data]*333, axis=1)
data_1000.columns = [f'serie_{i}' for i in range(data_1000.shape[1])]
print(f"Length of each series: {len(data)}")

Length of each series: 5485


In [13]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data, exog=None, store_last_window=True, parallel=False)

178 ms ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data, exog=None, store_last_window=True, parallel=True)

185 ms ± 1.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data_100, exog=None, store_last_window=True, parallel=False)

4.19 s ± 3.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data_100, exog=None, store_last_window=True, parallel=True)

4.19 s ± 21.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data_1000, exog=None, store_last_window=True, parallel=False)

41.9 s ± 256 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit
(
            X_train,
            y_train,
            series_indexes,
            series_col_names,
            series_X_train,
            exog_col_names,
            exog_dtypes,
            last_window,
        ) = _create_train_X_y(self, data_1000, exog=None, store_last_window=True, parallel=True)

42 s ± 319 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
