In [24]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Add the parent of the current working directory to the module search path
# (at position 1), so that imports can be resolved from that directory.
sys.path.insert(1, str(Path.cwd().parent))
# Echo the directory that was just added so it is visible in the cell output.
str(Path.cwd().parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'/home/ubuntu/varios/skforecast'

In [25]:
import pandas as pd
import numpy as np
from typing import List, Union, Tuple, Optional

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [26]:
# Data simulation
# ==============================================================================
# A dedicated, seeded generator makes the simulated data identical on every
# run (Restart & Run All) without touching NumPy's global random state.
rng = np.random.default_rng(123)

n = 10
series = pd.DataFrame(
    rng.standard_normal((n, 5)),
    index=pd.date_range("2018-01-01", periods=n),
    columns=[f"series_{i}" for i in range(5)],
)

# One exog DataFrame per series, keyed by the series name.
exog = {
    f"series_{i}": pd.DataFrame(
        rng.standard_normal((n, 2)),
        index=pd.date_range("2018-01-01", periods=n),
        columns=[f"exog_{j}" for j in range(2)],
    )
    for i in range(5)
}

# Alternative input: a single exog DataFrame shared by all series.
# exog = pd.DataFrame(
#     np.random.randn(n, 2),
#     index=pd.date_range("2018-01-01", periods=n),
#     columns=["exog_0", "exog_1"],
# )

# Injecting missing values
# A single `.loc` call writes into `series` itself. Chained indexing
# (`series[col].iloc[...] = ...`) may write into a temporary copy and is a
# no-op under pandas Copy-on-Write.
series.loc[series.index[:2], 'series_0'] = np.nan
series.loc[series.index[:3], 'series_1'] = np.nan
series.loc[series.index[-3:], 'series_1'] = np.nan
# Interspersed NaNs are disabled for now: they need the exception raised in
# create_train_X_y when there are missing values in `y` to be removed.
# series.loc[series.index[[5, 6]], 'series_3'] = np.nan
series



Unnamed: 0,series_0,series_1,series_2,series_3,series_4
2018-01-01,,,-1.26185,0.08787,0.088626
2018-01-02,,,-0.345499,0.23078,0.952178
2018-01-03,1.867116,,1.873486,-0.305502,0.526883
2018-01-04,1.159279,0.304625,-1.417519,0.246185,0.688534
2018-01-05,1.484221,-0.005817,-2.142832,-1.651898,0.738103
2018-01-06,-0.688957,-0.141787,-1.449811,-2.236662,-1.016556
2018-01-07,0.0611,0.060116,-0.295795,0.786142,1.108431
2018-01-08,-1.218385,,-0.003603,-1.835462,-1.498089
2018-01-09,0.363553,,1.095959,0.255753,-0.21541
2018-01-10,1.851692,,-0.747716,0.387752,0.588002


In [27]:
# Forecaster under test: a linear regression fitted on lagged values of the
# series (`lags=3`).
forecaster = ForecasterAutoreg(regressor=LinearRegression(), lags=3)

In [28]:
# Preprocessing
# ==============================================================================
# Store series and exog as dict. New dict objects are always created so that
# the trimming below never modifies the inputs defined in the data cell, and
# unsupported types fail here instead of leaving `series_dict`/`exog_dict`
# undefined (or stale from a previous run).
if isinstance(series, pd.DataFrame):
    series_dict = series.to_dict("series")
elif isinstance(series, dict):
    series_dict = dict(series)
else:
    raise TypeError(
        f"`series` must be a pandas DataFrame or dict. Got {type(series)}."
    )

if isinstance(exog, pd.DataFrame):
    # The same exog DataFrame is shared by every series.
    exog_dict = dict.fromkeys(series_dict.keys(), exog)
elif isinstance(exog, dict):
    exog_dict = dict(exog)
else:
    raise TypeError(
        f"`exog` must be a pandas DataFrame or dict. Got {type(exog)}."
    )

# Remove leading and trailing nans from each series and exog. This is done
# so that there is no need to remove leading and trailing nans in each X_train.
series_dict = {
    k: v.loc[v.first_valid_index():v.last_valid_index()]
    for k, v in series_dict.items()
}
exog_dict = {
    k: v.loc[v.first_valid_index():v.last_valid_index()]
    for k, v in exog_dict.items()
}

print("Series lengths after removing leading and trailing nans")
for k, v in series_dict.items():
    print(f"{k}: {len(v)}")

print("Exog lengths after removing leading and trailing nans")
for k, v in exog_dict.items():
    print(f"{k}: {len(v)}")

Series lengths after removing leading and trailing nans
series_0: 8
series_1: 4
series_2: 10
series_3: 10
series_4: 10
Exog lengths after removing leading and trailing nans
series_0: 10
series_1: 10
series_2: 10
series_3: 10
series_4: 10


In [29]:
# Build the training matrices one series at a time and stack them.
X_train_buffer = []
y_train_buffer = []

# Loop-local names (`y_level`, `exog_level`, ...) are used on purpose: rebinding
# the globals `exog`/`y` here would replace the per-series `exog` dict with a
# single DataFrame and change what the preprocessing cell does when re-run.
for key, y_level in series_dict.items():
    # TODO: All needed checks go here
    # Keep only the exog rows that match the (trimmed) index of this series.
    exog_level = exog_dict[key].loc[y_level.index]
    print(y_level.name)
    print(exog_level.columns)
    X_train_level, y_train_level = forecaster.create_train_X_y(
        y=y_level, exog=exog_level
    )
    # Tag every row with the series it comes from.
    X_train_level['level'] = key
    X_train_buffer.append(X_train_level)
    y_train_buffer.append(y_train_level)

X_train = pd.concat(X_train_buffer, axis=0)
y_train = pd.concat(y_train_buffer, axis=0)

series_0
Index(['exog_0', 'exog_1'], dtype='object')
series_1
Index(['exog_0', 'exog_1'], dtype='object')
series_2
Index(['exog_0', 'exog_1'], dtype='object')
series_3
Index(['exog_0', 'exog_1'], dtype='object')
series_4
Index(['exog_0', 'exog_1'], dtype='object')


In [30]:
# Display the stacked predictor matrix: the per-series matrices concatenated
# row-wise, each row tagged with its series in the `level` column.
X_train

Unnamed: 0,lag_1,lag_2,lag_3,exog_0,exog_1,level
2018-01-06,1.484221,1.159279,1.867116,0.305531,2.765781,series_0
2018-01-07,-0.688957,1.484221,1.159279,-0.536736,2.53797,series_0
2018-01-08,0.0611,-0.688957,1.484221,-1.245,0.063485,series_0
2018-01-09,-1.218385,0.0611,-0.688957,1.450337,0.921844,series_0
2018-01-10,0.363553,-1.218385,0.0611,-0.109622,1.267858,series_0
2018-01-07,-0.141787,-0.005817,0.304625,0.289228,-0.469021,series_1
2018-01-04,1.873486,-0.345499,-1.26185,0.299628,1.389228,series_2
2018-01-05,-1.417519,1.873486,-0.345499,-0.293398,0.644642,series_2
2018-01-06,-2.142832,-1.417519,1.873486,1.994853,-1.037329,series_2
2018-01-07,-1.449811,-2.142832,-1.417519,-1.557848,-1.36962,series_2


In [32]:
# Display the stacked target values (per-series targets concatenated row-wise).
y_train

2018-01-06   -0.688957
2018-01-07    0.061100
2018-01-08   -1.218385
2018-01-09    0.363553
2018-01-10    1.851692
2018-01-07    0.060116
2018-01-04   -1.417519
2018-01-05   -2.142832
2018-01-06   -1.449811
2018-01-07   -0.295795
2018-01-08   -0.003603
2018-01-09    1.095959
2018-01-10   -0.747716
2018-01-04    0.246185
2018-01-05   -1.651898
2018-01-06   -2.236662
2018-01-07    0.786142
2018-01-08   -1.835462
2018-01-09    0.255753
2018-01-10    0.387752
2018-01-04    0.688534
2018-01-05    0.738103
2018-01-06   -1.016556
2018-01-07    1.108431
2018-01-08   -1.498089
2018-01-09   -0.215410
2018-01-10    0.588002
Name: aaaaa, dtype: float64

In [None]:
def _create_train_X_y_single_series(
    self,
    y: pd.Series,
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], pd.Series]:
    """
    Create training matrices from univariate time series and exogenous
    variables. This method does not transform the exog variables.
    
    Parameters
    ----------
    y : pandas Series
        Training time series.
    exog : pandas Series, pandas DataFrame, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and their indexes must be aligned.
        Missing values are allowed at this stage.

    Returns
    -------
    X_train_lags : pandas DataFrame
        Training values of lags
        Shape: (len(y) - self.max_lag, len(self.lags))
    X_train_exog : pandas DataFrame, None
        Training values of exogenous variables. `None` if `exog` is `None`.
        Shape: (len(y) - self.max_lag, len(exog.columns))
    y_train : pandas Series
        Values (target) of the time series related to each row of `X_train`.
        Shape: (len(y) - self.max_lag, )

    Raises
    ------
    ValueError
        If `exog` does not have the same number of observations as `y`, or
        if their indexes are not equal.
    
    Notes
    -----
    When `self.differentiation` is not `None`, the first
    `self.differentiation` rows are dropped from every returned object.
    
    """
    # The name is read before any transformation because it is the key used
    # to look up the differentiator of this series.
    y_name = y.name
    check_y(y=y)
    # NOTE(review): the single `self.transformer_y` is used here, while
    # `create_train_X_y` prepares one transformer per series in
    # `self.transformer_series_` — confirm which one is intended.
    y = transform_series(
        series            = y,
        transformer       = self.transformer_y,
        fit               = True,
        inverse_transform = False
    )
    y_values, y_index = preprocess_y(y=y)

    if self.differentiation is not None:
        y_values = self.differentiator_[y_name].fit_transform(y_values)
    
    if exog is not None:
        if len(exog) != len(y):
            raise ValueError(
                (f'`exog` must have same number of samples as `y`. '
                 f'length `exog`: ({len(exog)}), length `y`: ({len(y)})')
            )
        check_exog(exog=exog, allow_nan=True)
        check_exog_dtypes(exog)
        _, exog_index = preprocess_exog(exog=exog, return_values=False)
        if not (exog_index[:len(y_index)] == y_index).all():
            raise ValueError(
                ("Different index for `y` and `exog`. They must be equal to "
                 "ensure the correct alignment of values.")
            )
        # From here on exog is always handled as a DataFrame, so the
        # documented return type holds for Series input too. The conversion
        # is done after the checks so they still see the original object.
        if isinstance(exog, pd.Series):
            exog = pd.DataFrame(exog)
    
    X_train, y_train = self._create_lags(y=y_values)
    X_train_col_names = [f"lag_{i}" for i in self.lags]
    X_train_lags = pd.DataFrame(
                        data    = X_train,
                        columns = X_train_col_names,
                        index   = y_index[self.max_lag: ]
                   )

    X_train_exog = None
    if exog is not None:
        # The first `self.max_lag` positions have to be removed from exog
        # since they are not in X_train.
        X_train_exog = exog.iloc[self.max_lag:]
        # `exog_index` has been validated as equal to `y_index`, so after
        # this assignment the exog rows already carry the same index as
        # `X_train_lags`; no further label-based realignment is needed.
        X_train_exog.index = exog_index[self.max_lag:]

    y_train = pd.Series(
                    data  = y_train,
                    index = y_index[self.max_lag: ],
                    name  = 'y'
                )

    if self.differentiation is not None:
        # Drop the leading `self.differentiation` rows from all outputs so
        # they stay aligned.
        y_train = y_train.iloc[self.differentiation: ]
        X_train_lags = X_train_lags.iloc[self.differentiation: ]
        if X_train_exog is not None:
            X_train_exog = X_train_exog.iloc[self.differentiation: ]
                    
    return X_train_lags, X_train_exog, y_train

In [None]:
def create_train_X_y(
    self,
    series: Union[pd.DataFrame, dict],
    exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None
) -> Tuple[pd.DataFrame, pd.Series, pd.Index, pd.Index]:
    """
    Create training matrices from multiple time series and exogenous
    variables.

    Leading and trailing missing values are removed from each series and
    each exog. Training matrices are then built series by series with
    `_create_train_X_y_single_series` and stacked row-wise. The inputs are
    not modified.
    
    Parameters
    ----------
    series : pandas DataFrame, dict
        Training time series. If a DataFrame, each column is a series. If a
        dict, each value is a pandas Series and its key is used as the
        series name.
    exog : pandas Series, pandas DataFrame, dict, default `None`
        Exogenous variable/s included as predictor/s. A Series or DataFrame
        is shared by all series. A dict must contain one entry per series,
        using the same keys as `series`. For each series, exog is aligned to
        the index of that series; labels missing in exog are filled with NaN.

    Returns
    -------
    X_train : pandas DataFrame
        Training values (predictors): lag columns, a `level` column with the
        name of the series of each row and, if `exog` is provided, the
        exogenous columns.
    y_train : pandas Series
        Values (target) of the time series related to each row of `X_train`.
    y_index : pandas Index
        Index of `series`. For a DataFrame it is `series.index`; for a dict
        it is the union of the indexes of all its series.
        NOTE(review): the dict case is a design choice — confirm it matches
        what callers expect.
    y_train_index: pandas Index
        Index of `y_train`.

    Raises
    ------
    TypeError
        If `series` or `exog` are not of a supported type.
    ValueError
        If `series` contains no time series, or if the stacked exog and lag
        matrices do not share the same index.
    
    """

    if not isinstance(series, (pd.DataFrame, dict)):
        raise TypeError(f"`series` must be a pandas DataFrame or dict. Got {type(series)}.")
    
    if not isinstance(exog, (pd.Series, pd.DataFrame, dict, type(None))):
        raise TypeError(f"`exog` must be a pandas Series, DataFrame or dict. Got {type(exog)}.")

    # Work on new dict objects so the caller's inputs are never modified.
    if isinstance(series, pd.DataFrame):
        series_dict = series.to_dict("series")
    else:
        series_dict = dict(series)

    if not series_dict:
        raise ValueError("`series` must contain at least one time series.")

    # Index of the input series, captured before any trimming.
    if isinstance(series, pd.DataFrame):
        y_index = series.index
    else:
        y_index = None
        for v in series_dict.values():
            y_index = v.index if y_index is None else y_index.union(v.index)

    # `exog_dict` stays `None` when no exog is given; every later exog step
    # is guarded by it (the `exog` argument itself is never rebound).
    exog_dict = None
    if isinstance(exog, (pd.Series, pd.DataFrame)):
        exog_dict = dict.fromkeys(series_dict.keys(), exog)
    elif isinstance(exog, dict):
        exog_dict = dict(exog)

    series_names = list(series_dict.keys())
    if self.transformer_series is None:
        self.transformer_series_ = {serie: None for serie in series_names}
    elif not isinstance(self.transformer_series, dict):
        self.transformer_series_ = {serie: clone(self.transformer_series) 
                                    for serie in series_names}
    else:
        self.transformer_series_ = {serie: None for serie in series_names}
        # Only elements already present in transformer_series_ are updated
        self.transformer_series_.update(
            (k, v) for k, v in deepcopy(self.transformer_series).items() 
            if k in self.transformer_series_
        )
        # `series_names` is used (not `series.columns`) so that dict input
        # is supported as well.
        series_not_in_transformer_series = (
            set(series_names) - set(self.transformer_series.keys())
        )
        if series_not_in_transformer_series:
            warnings.warn(
                (f"{series_not_in_transformer_series} not present in `transformer_series`."
                 f" No transformation is applied to these series."),
                 IgnoredArgumentWarning
            )

    if self.differentiation is None:
        self.differentiator_ = {serie: None for serie in series_names}
    else:
        self.differentiator_ = {serie: clone(self.differentiator) for serie in series_names}

    # Remove leading and trailing nans from each series and exog. Each series
    # is also named after its key, because the single-series method looks up
    # its differentiator by the name of the series.
    series_dict = {
        k: v.loc[v.first_valid_index():v.last_valid_index()].rename(k)
        for k, v in series_dict.items()
    }
    if exog_dict is not None:
        for k, v in exog_dict.items():
            v = v.loc[v.first_valid_index():v.last_valid_index()]
            if isinstance(v, pd.Series):
                v = pd.DataFrame(v)
            exog_dict[k] = v

    # TODO: check all series have the same type of index and frequency
    # TODO: check that same exogs in different series have the same data types (important for later concatenation)
    
    X_train_lags_buffer = []
    X_train_exog_buffer = []
    y_train_buffer = []

    for key, y in series_dict.items():
        exog_level = None
        if exog_dict is not None:
            # Align exog to the index of this series. Labels of `y` missing
            # in exog become NaN rows instead of raising.
            exog_level = exog_dict[key].reindex(y.index, fill_value=np.nan)
        X_train_lags, X_train_exog, y_train = (
            self._create_train_X_y_single_series(y=y, exog=exog_level)
        )
        # Tag every row with the series it comes from.
        X_train_lags['level'] = key
        X_train_lags_buffer.append(X_train_lags)
        if X_train_exog is not None:
            X_train_exog_buffer.append(X_train_exog)
        y_train_buffer.append(y_train)

    X_train_lags = pd.concat(X_train_lags_buffer, axis=0)
    y_train = pd.concat(y_train_buffer, axis=0)
    if exog_dict is not None:
        X_train_exog = pd.concat(X_train_exog_buffer, axis=0)
        X_train_exog = transform_dataframe(
                            df                = X_train_exog,
                            transformer       = self.transformer_exog,
                            fit               = True,
                            inverse_transform = False
                        )
        # Missing values are not allowed in the final exog matrix.
        check_exog(exog=X_train_exog, allow_nan=False)
        check_exog_dtypes(X_train_exog)
        self.exog_dtypes = get_exog_dtypes(exog=X_train_exog)
        if not (X_train_exog.index == X_train_lags.index).all():
            raise ValueError(
                ("Different index for `y` and `exog`. They must be equal "
                "to ensure the correct alignment of values.")
            )
        X_train = pd.concat([X_train_lags, X_train_exog], axis=1)
    else:
        X_train = X_train_lags

    self.X_train_col_names = X_train.columns.to_list()
    y_train_index = y_train.index

    return X_train, y_train, y_index, y_train_index