In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'c:\\Users\\jaesc2\\GitHub\\skforecast'

## Libraries

In [41]:
from typing import Union, Tuple, Optional, Callable
import warnings
import logging
import sys
import numpy as np
import pandas as pd
import sklearn
import sklearn.pipeline
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from copy import copy, deepcopy
import inspect

import skforecast
from skforecast.ForecasterBase import ForecasterBase
from skforecast.exceptions import IgnoredArgumentWarning
from skforecast.utils import initialize_lags
from skforecast.utils import initialize_weights
from skforecast.utils import check_select_fit_kwargs
from skforecast.utils import check_exog
from skforecast.utils import get_exog_dtypes
from skforecast.utils import check_exog_dtypes
from skforecast.utils import check_interval
from skforecast.utils import check_predict_input
from skforecast.utils import preprocess_y
from skforecast.utils import preprocess_last_window
from skforecast.utils import preprocess_exog
from skforecast.utils import expand_index
from skforecast.utils import transform_series
from skforecast.utils import transform_dataframe

from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries
from sklearn.linear_model import LinearRegression

## Data

In [3]:
from skforecast.ForecasterAutoregMultiSeries.tests.fixtures_ForecasterAutoregMultiSeries import series

In [4]:
# Rename the columns on a copy: `series` is imported from the test fixtures
# module, so assigning to `series.columns` directly would rename the columns
# of that shared fixture object for everything else using it in this kernel.
series = series.copy()
series.columns = ['l1', 'l2']
print(series.shape)
series.head(2)

(50, 2)


Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341


In [5]:
# Exog for series l1: two exogenous variables, one row per observation
exog_l1 = pd.DataFrame({
              'exog1': np.arange(50),
              'exog2': np.arange(50, 100),
          })


# Exog for series l2: a single exogenous variable.
# The values are passed as data and the variable name goes in `name`.
# Wrapping them in a dict ({'exog1': array}) would instead build a
# one-element Series whose only value is the whole array.
exog_l2 = pd.Series(np.arange(100, 150), name='exog1')


# Dictionary with exog for each series
exog = {
    'l1': exog_l1,
    'l2': exog_l2
}

## Create train X y

In [6]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

In [7]:
# Forecaster attributes needed.
# They are copied into notebook-level variables because the prototype
# functions defined further down (`_create_lags`,
# `check_exog_create_train_X_y`, `create_train_X_y`) read `max_lag`, `lags`,
# `transformer_series` and `transformer_exog` as globals.
max_lag = forecaster.max_lag
lags = forecaster.lags
transformer_series = forecaster.transformer_series
transformer_exog = forecaster.transformer_exog

# Quick check of the values the prototype functions will use
print(max_lag, lags, transformer_series, transformer_exog)

3 [1 2 3] None None


In [114]:
def _create_lags(
    y: np.ndarray,
    series_name: str
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build the lag matrix and the aligned target vector of a single series.

    Row `r` of the returned matrix holds the values that precede target
    `r`: lag 1 in the first column, lag 2 in the second, and so on,
    following the order of the notebook-level `lags`. The first `max_lag`
    observations cannot be used as targets and are dropped.

    Parameters
    ----------
    y : numpy ndarray
        1d numpy ndarray with the training time series.
    series_name : str
        Name of the series, only used in the error message.

    Returns
    -------
    X_data : numpy ndarray
        2d numpy ndarray of floats with the lagged values (predictors).
        Shape: (len(y) - max_lag, len(lags))
    y_data : numpy ndarray
        1d numpy ndarray with the target associated with each row of
        `X_data`.
        Shape: (len(y) - max_lag, )

    Raises
    ------
    ValueError
        If the series is not longer than `max_lag`.

    """

    n_rows = len(y) - max_lag
    if n_rows < 1:
        raise ValueError(
            (f"The maximum lag ({max_lag}) must be less than the length "
                f"of the series '{series_name}', ({len(y)}).")
        )

    lag_matrix = np.full((n_rows, len(lags)), np.nan, dtype=float)
    for column, lag in enumerate(lags):
        # Window shifted `lag` steps back from the target positions
        lag_matrix[:, column] = y[max_lag - lag: -lag]

    return lag_matrix, y[max_lag:]


def check_exog_create_train_X_y(series, exog, key=None):
    """
    Validate one exogenous input against `series` and return it transformed.

    Steps, in order: same number of rows as `series`; `check_exog` with
    `allow_nan=True`; conversion of a pandas Series into a DataFrame;
    transformation with the notebook-level `transformer_exog` (called with
    `fit=True`); `check_exog` with `allow_nan=False`; `check_exog_dtypes`;
    `preprocess_exog`; and equality of the index with `series.index`.

    Parameters
    ----------
    series : pandas DataFrame
        Training time series.
    exog : pandas Series, pandas DataFrame
        Exogenous variable/s to validate.
    key : default `None`
        Name of the series `exog` belongs to. Only used to identify the
        series in the error messages; `None` means `exog` is shared.

    Returns
    -------
    exog : pandas DataFrame
        Exogenous variables as returned by `transform_dataframe`.

    Raises
    ------
    ValueError
        If `exog` and `series` differ in length or in index.

    """

    # Explicit None test (not plain truthiness) so that falsy series names,
    # e.g. an integer column 0, are still reported in the messages.
    has_key = key is not None

    if len(exog) != len(series):
        if has_key:
            raise ValueError(
                (f"`exog` must have same number of samples as `series`.\n"
                 f"    length `exog`: {len(exog)} for series {key}\n"
                 f"    length `series`: {len(series)}")
            )
        else:
            raise ValueError(
                (f"`exog` must have same number of samples as `series`. "
                 f"length `exog`: ({len(exog)}), length `series`: ({len(series)})")
            )
    
    check_exog(exog=exog, allow_nan=True)

    if isinstance(exog, pd.Series):
        # Needed for pandas concat
        exog = exog.to_frame()
    
    # NOTE(review): `fit=True` with the single notebook-level transformer
    # presumably refits that same object on every call, so with a dict of
    # exog it would end up fitted on the last series only. Confirm.
    exog = transform_dataframe(
                df                = exog,
                transformer       = transformer_exog,
                fit               = True,
                inverse_transform = False
            )

    check_exog(exog=exog, allow_nan=False)
    check_exog_dtypes(exog)

    # The returned values are discarded; the call is kept as in the original
    # (presumably for the checks it performs on the index; TODO confirm).
    _, _ = preprocess_exog(exog=exog, return_values=False)
    if not (exog.index[:len(series)] == series.index).all():
        if has_key:
            raise ValueError(
                (f"Different index for `series` and `exog` for series {key}. "
                 f"They must be equal to ensure the correct alignment of values.")
            )
        else:
            raise ValueError(
                ("Different index for `series` and `exog`. They must be equal "
                 "to ensure the correct alignment of values.")
            )
        
    return exog


def create_train_X_y(
    series: pd.DataFrame,
    exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None
) -> Tuple[pd.DataFrame, pd.Series, pd.Index, pd.Index]:
    """
    Create training matrices from multiple time series and exogenous
    variables.

    The forecaster configuration is read from the notebook-level variables
    `lags` and `transformer_series` (and, through the helper functions,
    `max_lag` and `transformer_exog`).
    
    Parameters
    ----------
    series : pandas DataFrame
        Training time series.
    exog : pandas Series, pandas DataFrame, dict, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `series` and their indexes must be aligned.
        If a dict, each key is the name of a series (a column of `series`)
        and each value the exog of that series. The dict passed by the
        caller is not modified. Keys that are not columns of `series` are
        ignored and series without exog get NaN in the exog columns.

    Returns
    -------
    X_train : pandas DataFrame
        Training values (predictors): lags, exogenous variables (if any)
        and one dummy column per series.
    y_train : pandas Series
        Values (target) of the time series related to each row of `X_train`.
    y_index : pandas Index
        Index of `series`.
    y_train_index: pandas Index
        Index of `y_train`.

    Raises
    ------
    TypeError
        If `series` is not a pandas DataFrame.
    ValueError
        If a series is all NaN or has NaN after its first non-null value.
    
    """

    if not isinstance(series, pd.DataFrame):
        raise TypeError(f"`series` must be a pandas DataFrame. Got {type(series)}.")

    series_col_names = list(series.columns)

    if transformer_series is None:
        transformer_series_ = {serie: None for serie in series_col_names}
    elif not isinstance(transformer_series, dict):
        transformer_series_ = {serie: clone(transformer_series) 
                               for serie in series_col_names}
    else:
        transformer_series_ = {serie: None for serie in series_col_names}
        # Only elements already present in transformer_series_ are updated
        transformer_series_.update(
            (k, v) for k, v in deepcopy(transformer_series).items() 
            if k in transformer_series_
        )
        series_not_in_transformer_series = set(series.columns) - set(transformer_series.keys())
        if series_not_in_transformer_series:
            warnings.warn(
                (f"{series_not_in_transformer_series} not present in `transformer_series`."
                 f" No transformation is applied to these series."),
                 IgnoredArgumentWarning
            )
    
    exog_is_dict = isinstance(exog, dict)
    exog_ = None
    if exog is not None:

        if exog_is_dict:
            # The validated exog are stored in a new dict keyed by series
            # name, so the caller's dict is never written to and the later
            # row alignment does not depend on the order of its keys.
            exog_ = {serie: None for serie in series_col_names}
            for k, v in exog.items():
                if k in exog_ and v is not None:
                    exog_[k] = check_exog_create_train_X_y(series=series, exog=v, key=k)

            series_not_in_exog = {
                serie for serie in series_col_names if exog_[serie] is None
            }
            if series_not_in_exog:
                warnings.warn(
                    (f"{series_not_in_exog} not present in `exog`. All values "
                     f"of the exogenous variables for these series will be NaN."),
                     IgnoredArgumentWarning
                )
        else:   
            exog_ = check_exog_create_train_X_y(series=series, exog=exog, key=None)
            
        # TODO: adapt get_exog_dtypes when dict
        # exog_dtypes = get_exog_dtypes(exog=exog)
    
    X_train_parts = []
    y_train_parts = []
    X_levels = []
    len_series = []
    X_train_col_names = [f"lag_{lag}" for lag in lags]

    for serie in series_col_names:

        y = series[serie]
        y_values = y.to_numpy()

        if np.isnan(y_values).all():
            raise ValueError(f"All values of series '{serie}' are NaN.")
        
        # Leading NaNs are allowed (series of different lengths); anything
        # after the first valid observation must be complete.
        first_no_nan_idx = np.argmax(~np.isnan(y_values))
        y_values = y_values[first_no_nan_idx:]

        if np.isnan(y_values).any():
            raise ValueError(
                (f"'{serie}' Time series has missing values in between or "
                    f"at the end of the time series. When working with series "
                    f"of different lengths, all series must be complete after "
                    f"the first non-null value.")
            )
        
        y = transform_series(
                series            = y.iloc[first_no_nan_idx:],
                transformer       = transformer_series_[serie],
                fit               = True,
                inverse_transform = False
            )

        X_train_values, y_train_values = _create_lags(
                                             y           = y.to_numpy(),
                                             series_name = serie
                                         )

        # Collected per series and concatenated once after the loop
        X_train_parts.append(X_train_values)
        y_train_parts.append(y_train_values)
        X_levels.extend([serie]*len(X_train_values))
        len_series.append(len(y_train_values))
    
    X_levels = pd.get_dummies(pd.Series(X_levels), dtype=float)

    X_train = pd.DataFrame(
                  data    = np.concatenate(X_train_parts, axis=0),
                  columns = X_train_col_names
              )

    exog_to_train = None
    if exog_ is not None:
        # The first `max_lag` positions have to be removed from exog since
        # they are not in X_train, taking into account the length of each
        # series.
        if exog_is_dict:
            # Each block is labelled with the X_train row positions of its
            # series. The final reindex then leaves NaN rows for the series
            # that have no exog instead of shifting the following blocks.
            row_stops = np.cumsum(len_series)
            exog_blocks = []
            for serie, length, stop in zip(series_col_names, len_series, row_stops):
                if exog_[serie] is None:
                    continue
                rows = pd.RangeIndex(start=int(stop) - length, stop=int(stop))
                exog_blocks.append(
                    exog_[serie].iloc[-length:].set_axis(rows, axis=0)
                )
            if exog_blocks:
                exog_to_train = (
                    pd.concat(exog_blocks)
                    .reindex(pd.RangeIndex(len(X_train)))
                )
        else:
            # The same exog is repeated for every series
            exog_to_train = pd.concat(
                                [exog_.iloc[-length:] for length in len_series]
                            ).reset_index(drop=True)
    
    # `None` entries are dropped by pandas concat
    X_train = pd.concat([X_train, exog_to_train, X_levels], axis=1)

    y_train = pd.Series(
                  data = np.concatenate(y_train_parts, axis=0),
                  name = 'y'
              )

    _, y_index = preprocess_y(y=series, return_values=False)

    y_index_numpy = y_index.to_numpy()
    y_train_index = pd.Index(
                        np.concatenate(
                            [y_index_numpy[-length:] 
                             for length in len_series]
                        )
                    )

    return X_train, y_train, y_index, y_train_index

In [31]:
series.head(2)

Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341


In [32]:
exog.keys()

dict_keys(['l1', 'l2'])

In [37]:
# Number of rows to keep from the end of each exog (one entry per key)
len_series = [50, 50]

exog_to_train = []
for i, key in enumerate(exog):

    exog_ = exog[key]
    if isinstance(exog_, pd.Series):
        # Needed for pandas concat: a Series must become a one-column frame
        exog_ = exog_.to_frame()
    
    # Append the converted object (`exog_`), not the raw `exog[key]`
    exog_to_train.append(exog_.iloc[-len_series[i]:, ])


exog_to_train = pd.concat(exog_to_train).reset_index(drop=True)

  exog_to_train = pd.concat(exog_to_train).reset_index(drop=True)


In [117]:
# Exogenous variables of series l1: two columns, 50 observations
exog_l1 = pd.DataFrame(
              data    = np.column_stack((np.arange(50), np.arange(50, 100))),
              columns = ['exog1', 'exog2']
          )


# Exogenous variable of series l2: a single named Series, 50 observations
exog_l2 = pd.Series(data=np.arange(100, 150), name='exog1')


# One exog per series, keyed by series name
exog = dict(l1=exog_l1, l2=exog_l2)

In [121]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

# Forecaster attributes needed (read as globals by the prototype functions)
max_lag = forecaster.max_lag
lags = forecaster.lags
transformer_series = forecaster.transformer_series
# Unlike the three values above, this one is not taken from the forecaster:
# a StandardScaler is set directly, presumably to exercise the exog
# transformation in the cells that follow.
transformer_exog = StandardScaler()

# Quick check of the values the prototype functions will use
print(max_lag, lags, transformer_series, transformer_exog)

3 [1 2 3] None StandardScaler()


In [122]:
# Validate and transform the exog of each series and show its first rows.
# A named variable is used instead of `_`: in IPython `_` refers to the last
# displayed output, and rebinding it interferes with that output cache.
for k, v in exog.items():
    exog_checked = check_exog_create_train_X_y(series=series, exog=v, key=k)
    print(exog_checked.head(2))

      exog1     exog2
0 -1.697749 -1.697749
1 -1.628453 -1.628453
      exog1
0 -1.697749
1 -1.628453


In [123]:
create_train_X_y(series=series, exog=exog)[0]

Unnamed: 0,lag_1,lag_2,lag_3,exog1,exog2,l1,l2
0,0.226851,0.286139,0.696469,-1.489862,-1.489862,1.0,0.0
1,0.551315,0.226851,0.286139,-1.420566,-1.420566,1.0,0.0
2,0.719469,0.551315,0.226851,-1.351270,-1.351270,1.0,0.0
3,0.423106,0.719469,0.551315,-1.281974,-1.281974,1.0,0.0
4,0.980764,0.423106,0.719469,-1.212678,-1.212678,1.0,0.0
...,...,...,...,...,...,...,...
89,0.593177,0.762548,0.355915,1.420566,,0.0,1.0
90,0.691702,0.593177,0.762548,1.489862,,0.0,1.0
91,0.151127,0.691702,0.593177,1.559158,,0.0,1.0
92,0.398876,0.151127,0.691702,1.628453,,0.0,1.0


In [72]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

In [73]:
exog = pd.Series(np.arange(100, 150), name='exog1')

In [74]:
forecaster.create_train_X_y(series=series, exog=exog)[0]

Unnamed: 0,lag_1,lag_2,lag_3,exog1,l1,l2
0,0.226851,0.286139,0.696469,103,1.0,0.0
1,0.551315,0.226851,0.286139,104,1.0,0.0
2,0.719469,0.551315,0.226851,105,1.0,0.0
3,0.423106,0.719469,0.551315,106,1.0,0.0
4,0.980764,0.423106,0.719469,107,1.0,0.0
...,...,...,...,...,...,...
89,0.593177,0.762548,0.355915,145,0.0,1.0
90,0.691702,0.593177,0.762548,146,0.0,1.0
91,0.151127,0.691702,0.593177,147,0.0,1.0
92,0.398876,0.151127,0.691702,148,0.0,1.0


## Option 2: Parallelization

In [8]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

In [9]:
# Forecaster attributes needed.
# The prototype functions defined below (`_create_lags`, `create_train_X_y`)
# read `max_lag`, `lags`, `transformer_series` and `transformer_exog` as
# notebook-level globals, so they are reset here from the new forecaster.
max_lag = forecaster.max_lag
lags = forecaster.lags
transformer_series = forecaster.transformer_series
transformer_exog = forecaster.transformer_exog

# Quick check of the values the prototype functions will use
print(max_lag, lags, transformer_series, transformer_exog)

3 [1 2 3] None None


In [48]:
def _create_lags(
    y: np.ndarray,
    series_name: str
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Turn a 1d series into a lag matrix (X) and its aligned targets (y).

    Column `i` of the matrix contains the values located `lags[i]` steps
    before each target, where `lags` and `max_lag` are the notebook-level
    variables. The first `max_lag` observations are not used as targets.

    Parameters
    ----------
    y : numpy ndarray
        1d numpy ndarray with the training time series.
    series_name : str
        Name of the series, only used in the error message.

    Returns
    -------
    X_data : numpy ndarray
        2d float array of shape (len(y) - max_lag, len(lags)).
    y_data : numpy ndarray
        1d array with `y[max_lag:]`, one value per row of `X_data`.

    Raises
    ------
    ValueError
        If `len(y)` is not greater than `max_lag`.

    """

    usable_rows = len(y) - max_lag
    if not usable_rows > 0:
        raise ValueError(
            (f"The maximum lag ({max_lag}) must be less than the length "
                f"of the series '{series_name}', ({len(y)}).")
        )

    predictors = np.full(
                     shape      = (usable_rows, len(lags)),
                     fill_value = np.nan,
                     dtype      = float
                 )
    col = 0
    for lag in lags:
        predictors[:, col] = y[max_lag - lag: -lag]
        col += 1

    targets = y[max_lag:]

    return predictors, targets


def create_train_X_y(
    series: pd.DataFrame,
    exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None
) -> Tuple[pd.DataFrame, pd.Series, pd.Index, pd.Index]:
    """
    Create training matrices from multiple time series and exogenous
    variables. In this variant the exog of all series are stacked first and
    `transformer_exog` is applied once to the stacked DataFrame.

    The forecaster configuration is read from the notebook-level variables
    `lags`, `transformer_series` and `transformer_exog` (and `max_lag`
    through `_create_lags`).

    Parameters
    ----------
    series : pandas DataFrame
        Training time series.
    exog : pandas Series, pandas DataFrame, dict, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `series` and their indexes must be aligned.
        A Series or DataFrame is used for every series. If a dict, each key
        is the name of a series (a column of `series`) and each value the
        exog of that series; keys that are not columns of `series` are
        ignored and series without exog get NaN in the exog columns.

    Returns
    -------
    X_train : pandas DataFrame
        Training values (predictors): lags, exogenous variables (if any)
        and one dummy column per series.
    y_train : pandas Series
        Values (target) of the time series related to each row of `X_train`.
    y_index : pandas Index
        Index of `series`.
    y_train_index: pandas Index
        Index of `y_train`.

    Raises
    ------
    TypeError
        If `series` is not a pandas DataFrame.
    ValueError
        If a series is all NaN or has NaN after its first non-null value,
        or if an exog differs from `series` in length or index.

    """

    if not isinstance(series, pd.DataFrame):
        raise TypeError(f"`series` must be a pandas DataFrame. Got {type(series)}.")

    series_col_names = list(series.columns)

    if transformer_series is None:
        transformer_series_ = {serie: None for serie in series_col_names}
    elif not isinstance(transformer_series, dict):
        transformer_series_ = {serie: clone(transformer_series) 
                               for serie in series_col_names}
    else:
        transformer_series_ = {serie: None for serie in series_col_names}
        # Only elements already present in transformer_series_ are updated
        transformer_series_.update(
            (k, v) for k, v in deepcopy(transformer_series).items() 
            if k in transformer_series_
        )
        series_not_in_transformer_series = set(series.columns) - set(transformer_series.keys())
        if series_not_in_transformer_series:
            warnings.warn(
                (f"{series_not_in_transformer_series} not present in `transformer_series`."
                    f" No transformation is applied to these series."),
                    IgnoredArgumentWarning
            )

    exog_ = None
    if exog is not None:
        if not isinstance(exog, dict):
            exog_ = {serie: exog for serie in series_col_names}
        else:
            exog_ = {serie: None for serie in series_col_names}
            # Only elements already present in exog_ are updated
            exog_.update(
                (k, v) for k, v in exog.items() 
                if k in exog_
            )
            series_not_in_exog = {
                serie for serie in series_col_names if exog_[serie] is None
            }
            if series_not_in_exog:
                warnings.warn(
                    (f"{series_not_in_exog} not present in `exog`. All values "
                     f"of the exogenous variables for these series will be NaN."),
                     IgnoredArgumentWarning
                )

        for k, v in exog_.items():
            if v is None:
                # Series without exog: nothing to validate, filled with NaN later
                continue
            if len(v) != len(series):
                # TODO: when series of different length, exog with nans at the begining
                raise ValueError(
                    (f"`exog` must have same number of samples as `series`.\n"
                     f"    length `exog`: {len(v)} for series {k}\n"
                     f"    length `series`: {len(series)}")
                )
            check_exog(exog=v, allow_nan=True)

            # TODO: review this ValueError
            _, _ = preprocess_exog(exog=v, return_values=False)
            if not (v.index[:len(series)] == series.index).all():
                raise ValueError(
                    ("Different index for `series` and `exog`. They must be equal "
                     "to ensure the correct alignment of values.")
                )
    
    X_train_parts = []
    y_train_parts = []
    X_levels = []
    exog_levels = []
    len_series = []
    X_train_col_names = [f"lag_{lag}" for lag in lags]
    row_start = 0  # position in X_train of the first row of the current series

    for serie in series_col_names:

        y = series[serie]
        y_values = y.to_numpy()

        if np.isnan(y_values).all():
            raise ValueError(f"All values of series '{serie}' are NaN.")
        
        # Leading NaNs are allowed (series of different lengths); anything
        # after the first valid observation must be complete.
        first_no_nan_idx = np.argmax(~np.isnan(y_values))
        y_values = y_values[first_no_nan_idx:]

        if np.isnan(y_values).any():
            raise ValueError(
                (f"'{serie}' Time series has missing values in between or "
                    f"at the end of the time series. When working with series "
                    f"of different lengths, all series must be complete after "
                    f"the first non-null value.")
            )
        
        y = transform_series(
                series            = y.iloc[first_no_nan_idx:],
                transformer       = transformer_series_[serie],
                fit               = True,
                inverse_transform = False
            )

        X_train_values, y_train_values = _create_lags(
                                             y           = y.to_numpy(),
                                             series_name = serie
                                         )
        n_rows = len(y_train_values)

        # Collected per series and concatenated once after the loop
        X_train_parts.append(X_train_values)
        y_train_parts.append(y_train_values)
        X_levels.extend([serie]*len(X_train_values))
        len_series.append(n_rows)

        if exog_ is not None and exog_[serie] is not None:
            exog_level = exog_[serie].iloc[-n_rows:]
            if isinstance(exog_level, pd.Series):
                # Needed for pandas concat
                exog_level = exog_level.to_frame()
            # Label the rows with their position in X_train so that series
            # without exog leave a gap (NaN) instead of shifting the rest.
            exog_levels.append(
                exog_level.set_axis(
                    pd.RangeIndex(start=row_start, stop=row_start + n_rows),
                    axis=0
                )
            )
        row_start += n_rows

    X_levels = pd.get_dummies(pd.Series(X_levels), dtype=float)

    X_train = pd.DataFrame(
                  data    = np.concatenate(X_train_parts, axis=0),
                  columns = X_train_col_names
              )

    if exog_levels:
        exog_train = (
            pd.concat(exog_levels)
            .reindex(pd.RangeIndex(len(X_train)))
        )
        # The transformer is applied once, on the stacked exog of all series
        exog_train = transform_dataframe(
                         df                = exog_train,
                         transformer       = transformer_exog,
                         fit               = True,
                         inverse_transform = False
                     )
        
        # check_exog_dtypes calls check_exog
        # check_exog(exog=exog_train, allow_nan=False)
        check_exog_dtypes(exog_train)
        # NOTE(review): not used further in this prototype
        exog_dtypes = get_exog_dtypes(exog=exog_train)
    else:
        exog_train = None
    
    # `None` entries are dropped by pandas concat
    X_train = pd.concat([X_train, exog_train, X_levels], axis=1)

    y_train = pd.Series(
                    data = np.concatenate(y_train_parts, axis=0),
                    name = 'y'
                )

    _, y_index = preprocess_y(y=series, return_values=False)

    y_index_numpy = y_index.to_numpy()
    y_train_index = pd.Index(
                        np.concatenate(
                            [y_index_numpy[-length:] for length in len_series]
                        )
                    )

    return X_train, y_train, y_index, y_train_index

In [23]:
series_2 = series.head(10).copy()
series_2.head(2)

Unnamed: 0,l1,l2
0,0.696469,0.120629
1,0.286139,0.826341


In [56]:
# Exog for series l1 (10 observations, two variables)
exog_l1 = pd.DataFrame({
              'exog1': np.arange(10),
              'exog2': np.arange(50, 60),
          })

# Exog for series l2 (10 observations, one variable)
exog_l2 = pd.Series(np.arange(100, 110), name='exog1')

# Dictionary with exog for each series
exog = {
    'l1': exog_l1,
    'l2': exog_l2
}

# NOTE: this rebinds `exog`, so the dictionary built just above is discarded
# and `exog` ends up being a single DataFrame (a copy of `exog_l1`).
exog = exog_l1.copy()

In [57]:
# Create forecaster
forecaster = ForecasterAutoregMultiSeries(
                 regressor = LinearRegression(),
                 lags = 3,
                 transformer_series = None
             )

# Forecaster attributes needed (read as globals by the prototype functions)
max_lag = forecaster.max_lag
lags = forecaster.lags
transformer_series = forecaster.transformer_series
transformer_exog = forecaster.transformer_exog
# Alternative kept for manual testing: uncomment to use an exog transformer
# transformer_exog = StandardScaler()

# Quick check of the values the prototype functions will use
print(max_lag, lags, transformer_series, transformer_exog)

3 [1 2 3] None None


In [63]:
# Benchmark inputs: two series and two exogenous variables with 10,000
# observations each, every column holding the values 0..9999.
series_3 = pd.DataFrame({name: np.arange(10000) for name in ['l1', 'l2']})

exog_2 = pd.DataFrame({name: np.arange(10000) for name in ['exog1', 'exog2']})

In [67]:
%%timeit -n 1000 -r 4

create_train_X_y(series=series_3, exog=exog_2)[0]

2.96 ms ± 81.9 µs per loop (mean ± std. dev. of 4 runs, 1,000 loops each)


In [68]:
%%timeit -n 1000 -r 4

forecaster.create_train_X_y(series=series_3, exog=exog_2)[0]

2.85 ms ± 70.3 µs per loop (mean ± std. dev. of 4 runs, 1,000 loops each)
