In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [2]:
from typing import Union, Optional, List, Tuple, Dict
import numpy as np
import pandas as pd

from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from sklearn.linear_model import LinearRegression
from skforecast.utils import check_exog, check_y, transform_series, transform_dataframe, preprocess_exog, preprocess_last_window, preprocess_y, check_exog_dtypes, get_exog_dtypes

In [5]:
# Direct forecaster used as a fixture for the prototype functions defined in this notebook.
forecaster = ForecasterAutoregDirect(regressor=LinearRegression(), lags=3, steps=5)
# The prototypes below are written as methods (first parameter `self`) but live at
# module level; aliasing the forecaster to `self` lets them be called as
# `create_train_X_y(self, ...)` without editing their bodies.
self = forecaster

In [6]:
def create_train_X_y(
        self,
        y: pd.Series,
        exog: Optional[Union[pd.Series, pd.DataFrame]]=None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Create training matrices from univariate time series and exogenous
        variables. The resulting matrices contain the target variable and predictors
        needed to train all the regressors (one per step).

        Besides returning the matrices, this method updates the forecaster:
        `self.X_train_col_names` is always overwritten and, when `exog` is
        given, `self.included_exog` is set to True and `self.exog_dtypes` is
        stored. `self.transformer_y` and `self.transformer_exog` are applied
        with `fit=True`.
        
        Parameters
        ----------        
        y : pandas Series
            Training time series.
            
        exog : pandas Series, pandas DataFrame, default `None`
            Exogenous variable/s included as predictor/s. Must have the same
            number of observations as `y` and their indexes must be aligned.

        Returns 
        -------
        X_train : pandas DataFrame
            Training values (predictors) for every step. Columns are `lag_i`
            for each `i` in `self.lags`, followed (when `exog` is given) by the
            columns produced by `exog_to_direct`. Indexed by
            `y_index[self.max_lag + (self.steps - 1):]`.
            
        y_train : pandas DataFrame
            Target values related to each row of `X_train`, one column per
            step named `y_step_1` ... `y_step_{self.steps}`. Same index as
            `X_train`.

        Raises
        ------
        ValueError
            If `len(y) < self.max_lag + self.steps`, if `exog` and `y` have
            different lengths, or if their indexes differ. The `check_*`
            helpers may raise their own errors.
        
        """

        # At least one complete (lags window + `steps` targets) row is needed.
        if len(y) < self.max_lag + self.steps:
            raise ValueError(
                (f"Minimum length of `y` for training this forecaster is "
                 f"{self.max_lag + self.steps}. Got {len(y)}. Reduce the "
                 f"number of predicted steps, {self.steps}, or the maximum "
                 f"lag, {self.max_lag}, if no more data is available.")
            )

        check_y(y=y)
        y = transform_series(
                series            = y,
                transformer       = self.transformer_y,
                fit               = True,
                inverse_transform = False
            )
        y_values, y_index = preprocess_y(y=y)

        if exog is not None:
            if len(exog) != len(y):
                raise ValueError(
                    (f"`exog` must have same number of samples as `y`. "
                     f"length `exog`: ({len(exog)}), length `y`: ({len(y)})")
                )
            # First pass tolerates NaN; the strict check runs after transformation.
            check_exog(exog=exog, allow_nan=True)
            # Need here for filter_train_X_y_for_step to work without fitting
            self.included_exog = True
            if isinstance(exog, pd.Series):
                exog = transform_series(
                           series            = exog,
                           transformer       = self.transformer_exog,
                           fit               = True,
                           inverse_transform = False
                       )
            else:
                exog = transform_dataframe(
                           df                = exog,
                           transformer       = self.transformer_exog,
                           fit               = True,
                           inverse_transform = False
                       )

            # After transformation no NaN is allowed.
            check_exog(exog=exog, allow_nan=False)
            check_exog_dtypes(exog)
            self.exog_dtypes = get_exog_dtypes(exog=exog)

            # Only the index is needed here, to verify alignment with `y`.
            _, exog_index = preprocess_exog(exog=exog, return_values=False)
            if not (exog_index[:len(y_index)] == y_index).all():
                raise ValueError(
                    ("Different index for `y` and `exog`. They must be equal "
                     "to ensure the correct alignment of values.")      
                )

        # NOTE(review): `_create_lags` is defined elsewhere; it is assumed to return
        # one row per index label in `y_index[self.max_lag + (self.steps - 1):]`.
        X_train, y_train = self._create_lags(y=y_values)
        X_train_col_names = [f"lag_{i}" for i in self.lags]
        X_train = pd.DataFrame(
                      data    = X_train,
                      columns = X_train_col_names,
                      index   = y_index[self.max_lag + (self.steps -1): ]
                  )

        if exog is not None:
            # Transform exog to match direct format
            # Only the last `X_train.shape[0]` rows of the transformed exog are
            # kept, so the leading rows that have no counterpart in the lag
            # matrix are dropped.
            exog_to_train = exog_to_direct(exog=exog, steps=self.steps).iloc[-X_train.shape[0]:, :]
            X_train = pd.concat((X_train, exog_to_train), axis=1)

        self.X_train_col_names = X_train.columns.to_list()

        y_train_col_names = [f"y_step_{i+1}" for i in range(self.steps)]
        y_train = pd.DataFrame(
                      data    = y_train,
                      index   = y_index[self.max_lag + (self.steps -1): ],
                      columns = y_train_col_names,
                  )
        
        return X_train, y_train
        
def exog_to_direct_old(
    exog: Union[pd.Series, pd.DataFrame],
    steps: int
)-> pd.DataFrame:
    """
    Legacy implementation: reshape `exog` into the wide layout used by Direct
    forecasting, processing one exogenous variable at a time.

    For every column, `steps` shifted copies are placed side by side, so the
    output columns are grouped by variable:
    `<col>_step_1, ..., <col>_step_<steps>` for the first column, then the
    same for the second column, and so on.

    Parameters
    ----------
    exog : pandas Series, pandas DataFrame
        Exogenous variables.

    steps : int.
        Number of steps that will be predicted using exog.

    Returns 
    -------
    exog_transformed : pandas DataFrame
        Exogenous variables transformed. Its index is the tail of the
        original `exog` index.

    Raises
    ------
    TypeError
        If `exog` is neither a pandas Series nor a pandas DataFrame.

    """

    if not isinstance(exog, (pd.Series, pd.DataFrame)):
        raise TypeError(f"`exog` must be a pandas Series or DataFrame. Got {type(exog)}.")

    if isinstance(exog, pd.Series):
        exog = exog.to_frame()

    n_obs = len(exog)
    original_index = exog.index
    blocks_per_column = []

    for column in exog.columns:
        values = exog[column]

        # One shifted window of the column per step, all re-indexed from 0 so
        # they line up row by row when concatenated.
        shifted_windows = []
        for i in range(steps):
            stop = n_obs - (steps - 1 - i)
            shifted_windows.append(values.iloc[i:stop].reset_index(drop=True))

        exog_column_transformed = pd.concat(shifted_windows, axis=1)
        exog_column_transformed.columns = [
            f"{column}_step_{i+1}" for i in range(steps)
        ]
        blocks_per_column.append(exog_column_transformed)

    if len(blocks_per_column) > 1:
        exog_transformed = pd.concat(blocks_per_column, axis=1)
    else:
        # Single variable: the only block built in the loop is the result.
        exog_transformed = exog_column_transformed

    # Label each row with the tail of the original index.
    exog_transformed.index = original_index[-len(exog_transformed):]

    return exog_transformed
	

def exog_to_direct(
    exog: Union[pd.Series, pd.DataFrame],
    steps: int
)-> pd.DataFrame:
    """
    Transforms `exog` to a pandas DataFrame with the shape needed for Direct
    forecasting.

    Row `t` of the output holds, side by side, the exog values found at
    positions `t, t+1, ..., t+steps-1` of the input. Columns are grouped by
    step: all variables for step 1 (`<col>_step_1`), then all variables for
    step 2, and so on. Whole-frame slices are used instead of looping over
    columns.

    Parameters
    ----------
    exog : pandas Series, pandas DataFrame
        Exogenous variables.

    steps : int
        Number of steps that will be predicted using exog. Must be >= 1.

    Returns 
    -------
    exog_transformed : pandas DataFrame
        Exogenous variables transformed, with `max(len(exog) - steps + 1, 0)`
        rows labelled with the tail of the original `exog` index.

    Raises
    ------
    TypeError
        If `exog` is neither a pandas Series nor a pandas DataFrame.
    ValueError
        If `steps` is lower than 1.

    """
    if not isinstance(exog, (pd.Series, pd.DataFrame)):
        raise TypeError(f"`exog` must be a pandas Series or DataFrame. Got {type(exog)}.")

    if steps < 1:
        raise ValueError(
            f"`steps` must be an integer greater than or equal to 1. Got {steps}."
        )

    if isinstance(exog, pd.Series):
        exog = exog.to_frame()

    n_rows = len(exog)
    # Number of complete windows of `steps` consecutive rows. Clamped at 0 so
    # that a too-short `exog` yields an empty result instead of a negative
    # slice stop (which would silently select misaligned rows).
    n_windows = max(n_rows - steps + 1, 0)
    positional_index = pd.RangeIndex(n_windows)
    exog_transformed = []

    for i in range(steps):
        # `iloc` returns a new DataFrame object, so relabelling its axes does
        # not touch the caller's `exog`.
        exog_step = exog.iloc[i : i + n_windows]
        exog_step.index = positional_index
        exog_step.columns = [f"{col}_step_{i+1}" for col in exog_step.columns]
        exog_transformed.append(exog_step)

    if len(exog_transformed) > 1:
        # copy=False makes a noticeable performance difference here.
        exog_transformed = pd.concat(exog_transformed, axis=1, copy=False)
    else:
        exog_transformed = exog_transformed[0]

    # Explicit start position: `index[-0:]` would select the whole index when
    # there are no windows.
    exog_transformed.index = exog.index[n_rows - n_windows:]

    return exog_transformed

In [7]:
# Toy exogenous data with three dtypes (float, int, categorical) to exercise
# the reshaping on mixed-type columns.
exog = pd.DataFrame({'exog_1': pd.Series(np.arange(100, 150), dtype=float),
                     'exog_2': pd.Series(np.arange(1000, 1050), dtype=int),
                     'exog_3': pd.Categorical(range(100, 150))})
# NOTE(review): `exog_len` is not used by any cell visible in this notebook.
exog_len = len(exog)
# Not the last expression of the cell, so this preview is not rendered.
exog.head(3)

# Reshape for a 5-step direct forecaster; only this final expression is displayed.
exog_train = exog_to_direct(exog=exog, steps=5)
exog_train.head(5)

Unnamed: 0,exog_1_step_1,exog_2_step_1,exog_3_step_1,exog_1_step_2,exog_2_step_2,exog_3_step_2,exog_1_step_3,exog_2_step_3,exog_3_step_3,exog_1_step_4,exog_2_step_4,exog_3_step_4,exog_1_step_5,exog_2_step_5,exog_3_step_5
4,100.0,1000,100,101.0,1001,101,102.0,1002,102,103.0,1003,103,104.0,1004,104
5,101.0,1001,101,102.0,1002,102,103.0,1003,103,104.0,1004,104,105.0,1005,105
6,102.0,1002,102,103.0,1003,103,104.0,1004,104,105.0,1005,105,106.0,1006,106
7,103.0,1003,103,104.0,1004,104,105.0,1005,105,106.0,1006,106,107.0,1007,107
8,104.0,1004,104,105.0,1005,105,106.0,1006,106,107.0,1007,107,108.0,1008,108


In [8]:
# Call the module-level prototype, passing the forecaster explicitly as `self`.
X_train, y_train = create_train_X_y(self, y =pd.Series(np.arange(100, 150)),  exog=exog)
X_train.head(5)

Unnamed: 0,lag_1,lag_2,lag_3,exog_1_step_1,exog_2_step_1,exog_3_step_1,exog_1_step_2,exog_2_step_2,exog_3_step_2,exog_1_step_3,exog_2_step_3,exog_3_step_3,exog_1_step_4,exog_2_step_4,exog_3_step_4,exog_1_step_5,exog_2_step_5,exog_3_step_5
7,102.0,101.0,100.0,103.0,1003,103,104.0,1004,104,105.0,1005,105,106.0,1006,106,107.0,1007,107
8,103.0,102.0,101.0,104.0,1004,104,105.0,1005,105,106.0,1006,106,107.0,1007,107,108.0,1008,108
9,104.0,103.0,102.0,105.0,1005,105,106.0,1006,106,107.0,1007,107,108.0,1008,108,109.0,1009,109
10,105.0,104.0,103.0,106.0,1006,106,107.0,1007,107,108.0,1008,108,109.0,1009,109,110.0,1010,110
11,106.0,105.0,104.0,107.0,1007,107,108.0,1008,108,109.0,1009,109,110.0,1010,110,111.0,1011,111


In [23]:
# Get the positions of the columns whose name ends with the literal suffix
# 'step_1' (a plain `str.endswith` match, not a regex).
idx_columns_exog = np.flatnonzero(X_train.columns.str.endswith(f'step_{1}'))
idx_columns_exog

array([3, 4, 5])

In [24]:
def filter_train_X_y_for_step_old(
	self,
	step: int,
	X_train: pd.DataFrame,
	y_train: pd.Series,
	remove_suffix: bool=False
) -> Tuple[pd.DataFrame, pd.Series]:
	"""
	Legacy implementation: pick the predictors and target used to train the
	regressor of one specific step.

	Exogenous columns are located by position, taking every `self.steps`-th
	column after the lags. This only selects the right columns when the step
	columns of each exogenous variable are contiguous
	(`a_step_1, a_step_2, ..., b_step_1, b_step_2, ...`), i.e. the layout
	produced by `exog_to_direct_old`.

	Parameters
	----------
	step : int
		Step for which columns must be selected. Starts at 1.

	X_train : pandas DataFrame
		Pandas DataFrame with the training values (predictors).
		
	y_train : pandas Series
		Values (target) of the time series related to each row of `X_train`.
		It is indexed by column position, one column per step.

	remove_suffix : bool, default `False`
		If True, the text "_step_i" is removed from the column names and
		from the name of the returned target. The renaming is done in place
		on the selected objects.

	Returns 
	-------
	X_train_step : pandas DataFrame
		Pandas DataFrame with the training values (predictors) for step.
		
	y_train_step : pandas Series
		Values (target) of the time series related to each row of `X_train`.

	Raises
	------
	ValueError
		If `step` is lower than 1 or greater than `self.steps`.

	"""

	if (step > self.steps) or (step < 1):
		raise ValueError(
			(f"Invalid value `step`. For this forecaster, minimum value is 1 "
			 f"and the maximum step is {self.steps}.")
		)

	# The matrices are 0-based while `step` is 1-based.
	position = step - 1
	y_train_step = y_train.iloc[:, position]

	if self.included_exog:
		n_lags = len(self.lags)
		all_positions = np.arange(X_train.shape[1])
		# Lag columns come first; then one exog column every `self.steps`.
		exog_positions = all_positions[n_lags + position::self.steps]
		selected = np.hstack((np.arange(n_lags), exog_positions))
		X_train_step = X_train.iloc[:, selected]
	else:
		X_train_step = X_train

	if remove_suffix:
		suffix = f"_step_{position + 1}"
		X_train_step.columns = [
			name.replace(suffix, "") for name in X_train_step.columns
		]
		y_train_step.name = y_train_step.name.replace(suffix, "")

	return  X_train_step, y_train_step
	

def filter_train_X_y_for_step_new(
	self,
	step: int,
	X_train: pd.DataFrame,
	y_train: pd.DataFrame,
	remove_suffix: bool=False
) -> Tuple[pd.DataFrame, pd.Series]:
	"""
	Select the columns needed to train a forecaster for a specific step.  
	The input matrices should be created using `create_train_X_y()`. If 
	`remove_suffix=True` the suffix "_step_i" will be removed from the 
	column names. 

	Exogenous columns are located by name (those ending in "_step_<step>"),
	so the selection does not depend on how the step columns are ordered.

	Parameters
	----------
	step : int
		Step for which columns must be selected. Starts at 1.

	X_train : pandas DataFrame
		Pandas DataFrame with the training values (predictors). The first
		`len(self.lags)` columns are taken as the lag columns.
		
	y_train : pandas DataFrame
		Values (target) of the time series related to each row of `X_train`,
		one column per step.

	remove_suffix : bool, default `False`
		If True, the trailing "_step_i" is removed from the column names and
		from the name of the returned target. Only the suffix is stripped:
		a variable whose own name contains "_step_i" keeps it. The inputs
		`X_train` and `y_train` are never relabelled.

	Returns 
	-------
	X_train_step : pandas DataFrame
		Pandas DataFrame with the training values (predictors) for step.
		
	y_train_step : pandas Series
		Values (target) of the time series related to each row of `X_train`.

	Raises
	------
	ValueError
		If `step` is lower than 1 or greater than `self.steps`.

	"""

	if (step < 1) or (step > self.steps):
		raise ValueError(
			(f"Invalid value `step`. For this forecaster, minimum value is 1 "
			 f"and the maximum step is {self.steps}.")
		)

	suffix = f"_step_{step}"
	# Column positions are 0-based while `step` is 1-based.
	y_train_step = y_train.iloc[:, step - 1]

	if not self.included_exog:
		X_train_step = X_train
	else:
		idx_columns_lags = np.arange(len(self.lags))
		idx_columns_exog = np.flatnonzero(X_train.columns.str.endswith(suffix))
		idx_columns = np.hstack((idx_columns_lags, idx_columns_exog))
		X_train_step = X_train.iloc[:, idx_columns]

	if remove_suffix:
		# Strip the suffix only at the end of each name; `str.replace` would
		# also delete occurrences inside the variable name itself.
		stripped_names = [
			col_name[:-len(suffix)] if col_name.endswith(suffix) else col_name
			for col_name in X_train_step.columns
		]
		# `set_axis` / `rename` return new objects, so the caller's `X_train`
		# (which is `X_train_step` itself when there is no exog) and the
		# column stored in `y_train` keep their original labels.
		X_train_step = X_train_step.set_axis(stripped_names, axis=1)
		target_name = y_train_step.name
		if target_name.endswith(suffix):
			target_name = target_name[:-len(suffix)]
		y_train_step = y_train_step.rename(target_name)

	return  X_train_step, y_train_step


In [27]:
# Select the predictors and target for step 1, keeping the "_step_1" suffix.
# NOTE(review): the recorded output of this cell starts with a printed line of
# index arrays that the function body shown in this notebook does not print —
# presumably left over from an earlier debug version; re-run to refresh.
X, y = filter_train_X_y_for_step_new(self, step=1, X_train=X_train, y_train=y_train, remove_suffix=False)
X

[0 1 2] [3 4 5]


Unnamed: 0,lag_1,lag_2,lag_3,exog_1_step_1,exog_2_step_1,exog_3_step_1
7,102.0,101.0,100.0,103.0,1003,103
8,103.0,102.0,101.0,104.0,1004,104
9,104.0,103.0,102.0,105.0,1005,105
10,105.0,104.0,103.0,106.0,1006,106
11,106.0,105.0,104.0,107.0,1007,107
12,107.0,106.0,105.0,108.0,1008,108
13,108.0,107.0,106.0,109.0,1009,109
14,109.0,108.0,107.0,110.0,1010,110
15,110.0,109.0,108.0,111.0,1011,111
16,111.0,110.0,109.0,112.0,1012,112
