In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path
# (position 1, i.e. right after the first entry).
sys.path.insert(1, str(Path.cwd().parent))
# Last expression of the cell: echoes the directory that was just added.
str(Path.cwd().parent)

'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [2]:
from typing import Tuple, Union

import numpy as np
import pandas as pd

In [3]:

def exog_to_direct_old(
    exog: Union[pd.Series, pd.DataFrame],
    steps: int
)-> pd.DataFrame:
    """
    Reshape `exog` into the wide layout used for Direct forecasting
    (legacy, column-by-column implementation).

    For every column of `exog`, `steps` shifted copies are placed side by
    side, so each output row holds `steps` consecutive values of that column.
    Output columns are grouped by variable: `col_step_1 ... col_step_N` for
    the first column, then the same for the second column, and so on.

    Parameters
    ----------
    exog : pandas Series, pandas DataFrame
        Exogenous variables. A Series is converted with `to_frame()`.

    steps : int
        Number of consecutive values placed in each output row.

    Returns
    -------
    exog_transformed : pandas DataFrame
        Wide frame with `len(exog) - steps + 1` rows, indexed by the last
        labels of the original index.

    Raises
    ------
    TypeError
        If `exog` is neither a pandas Series nor a pandas DataFrame.

    """

    if not isinstance(exog, (pd.Series, pd.DataFrame)):
        raise TypeError(f"`exog` must be a pandas Series or DataFrame. Got {type(exog)}.")

    frame = exog.to_frame() if isinstance(exog, pd.Series) else exog

    n_rows = len(frame)
    original_index = frame.index

    blocks = []
    for column in frame.columns:
        # Window i starts at row i and stops early enough that all `steps`
        # windows have the same length.
        shifted = []
        for i in range(steps):
            stop = n_rows - (steps - 1 - i)
            shifted.append(frame[column].iloc[i:stop].reset_index(drop=True))

        block = pd.concat(shifted, axis=1)
        block.columns = [f"{column}_step_{i+1}" for i in range(steps)]
        blocks.append(block)

    # With a single variable the block built in the loop is returned as is.
    if len(blocks) > 1:
        exog_transformed = pd.concat(blocks, axis=1)
    else:
        exog_transformed = block

    # Each row is labelled with the index of the last value it contains.
    exog_transformed.index = original_index[-len(exog_transformed):]

    return exog_transformed
	

def exog_to_direct_new(
    exog: Union[pd.Series, pd.DataFrame],
    steps: int
)-> pd.DataFrame:
    """
    Transforms `exog` to a pandas DataFrame with the shape needed for Direct
    forecasting (step-by-step implementation).

    Each output row holds `steps` consecutive rows of `exog` laid side by
    side. Output columns are grouped by step: every variable for step 1,
    then every variable for step 2, and so on, each one named
    `{column}_step_{i}`.

    Parameters
    ----------
    exog : pandas Series, pandas DataFrame
        Exogenous variables. A Series is converted with `to_frame()`.

    steps : int
        Number of steps that will be predicted using exog. Must be at least
        1 and, for a non-empty `exog`, not greater than `len(exog)`.

    Returns
    -------
    exog_transformed : pandas DataFrame
        Exogenous variables transformed. It has `len(exog) - steps + 1` rows,
        indexed by the last labels of the original index. `exog` itself is
        not modified.

    Raises
    ------
    TypeError
        If `exog` is neither a pandas Series nor a pandas DataFrame.
    ValueError
        If `steps` is lower than 1, or greater than the number of rows of a
        non-empty `exog`.

    """
    if not isinstance(exog, (pd.Series, pd.DataFrame)):
        raise TypeError(f"`exog` must be a pandas Series or DataFrame. Got {type(exog)}.")

    if isinstance(exog, pd.Series):
        exog = exog.to_frame()

    n_rows = len(exog)

    if steps < 1:
        raise ValueError(
            f"`steps` must be an integer greater than or equal to 1. Got {steps}."
        )
    # A negative slice stop would count from the end and silently return
    # misaligned rows. An empty `exog` still yields an empty result.
    if 0 < n_rows < steps:
        raise ValueError(
            f"`steps` ({steps}) cannot be greater than the number of rows "
            f"in `exog` ({n_rows})."
        )

    exog_idx = exog.index
    exog_cols = exog.columns
    exog_steps = []

    for i in range(steps):
        # Window i starts at row i and stops early enough that all `steps`
        # windows have the same length.
        exog_step = exog.iloc[i : n_rows - (steps - 1 - i)]
        # Relabel the slice so that concat aligns the windows by position.
        exog_step.index = pd.RangeIndex(len(exog_step))
        exog_step.columns = [f"{col}_step_{i+1}" for col in exog_cols]
        exog_steps.append(exog_step)

    # Concatenating even a single window returns a frame independent of the
    # input. The `copy` keyword is left at the library default; whether the
    # data is copied has a noticeable effect on run time.
    exog_transformed = pd.concat(exog_steps, axis=1)

    # Each row is labelled with the index of the last value it contains.
    exog_transformed.index = exog_idx[-len(exog_transformed):]

    return exog_transformed

In [5]:
# Toy exogenous data: one float, one integer and one categorical column.
exog = pd.DataFrame(
    {
        'exog_1': pd.Series(np.arange(100, 150), dtype=float),
        'exog_2': pd.Series(np.arange(1000, 1050), dtype=int),
        'exog_3': pd.Categorical(range(100, 150)),
    }
)
exog_len = exog.shape[0]
exog.head(3)  # not rendered: only the last expression of a cell is displayed

# First five rows of the 5-step direct layout.
X_train = exog_to_direct_new(exog=exog, steps=5).head(5)
X_train

Unnamed: 0,exog_1_step_1,exog_2_step_1,exog_3_step_1,exog_1_step_2,exog_2_step_2,exog_3_step_2,exog_1_step_3,exog_2_step_3,exog_3_step_3,exog_1_step_4,exog_2_step_4,exog_3_step_4,exog_1_step_5,exog_2_step_5,exog_3_step_5
4,100.0,1000,100,101.0,1001,101,102.0,1002,102,103.0,1003,103,104.0,1004,104
5,101.0,1001,101,102.0,1002,102,103.0,1003,103,104.0,1004,104,105.0,1005,105
6,102.0,1002,102,103.0,1003,103,104.0,1004,104,105.0,1005,105,106.0,1006,106
7,103.0,1003,103,104.0,1004,104,105.0,1005,105,106.0,1006,106,107.0,1007,107
8,104.0,1004,104,105.0,1005,105,106.0,1006,106,107.0,1007,107,108.0,1008,108


In [6]:
# Positions of the columns whose name ends with the suffix of one step.
# `str.endswith` is a literal suffix match, not a regular expression.
# `step` is set here so the cell runs on a fresh kernel; 3 is the value that
# reproduces the recorded output (columns 6, 7 and 8 are the `_step_3` ones).
step = 3
idx_columns_exog = np.where(X_train.columns.str.endswith(f'_step_{step}'))
idx_columns_exog

(array([6, 7, 8]),)

In [None]:

def filter_train_X_y_for_step_old(
	self,
	step: int,
	X_train: pd.DataFrame,
	y_train: pd.Series,
	remove_suffix: bool=False
) -> Tuple[pd.DataFrame, pd.Series]:
	"""
	Pick the training columns that belong to a single forecasting step
	(legacy, stride-based implementation).

	The first `len(self.lags)` columns of `X_train` are always kept. When
	`self.included_exog` is true, the remaining columns are sampled every
	`self.steps` positions starting at `len(self.lags) + (step - 1)`; this
	presumably relies on each exogenous variable storing its step columns
	contiguously (confirm against the code that builds `X_train`).

	Parameters
	----------
	step : int
		Step for which columns must be selected. Starts at 1.

	X_train : pandas DataFrame
		Pandas DataFrame with the training values (predictors).

	y_train : pandas Series
		Target values. It is indexed as `y_train.iloc[:, step - 1]`, so it
		must be two-dimensional with one column per step.

	remove_suffix : bool, default `False`
		If True, "_step_{step}" is removed from the column names of
		`X_train_step` and from the name of `y_train_step`. The names are
		assigned in place; when exog is not included `X_train_step` is
		`X_train` itself.

	Returns
	-------
	X_train_step : pandas DataFrame
		Predictors for the requested step.

	y_train_step : pandas Series
		Column of `y_train` for the requested step.

	Raises
	------
	ValueError
		If `step` is lower than 1 or greater than `self.steps`.

	"""

	step_out_of_range = (step < 1) or (step > self.steps)
	if step_out_of_range:
		raise ValueError(
			(f"Invalid value `step`. For this forecaster, minimum value is 1 "
			 f"and the maximum step is {self.steps}.")
		)

	# Column positions in X_train and y_train are zero-based.
	step_pos = step - 1
	y_train_step = y_train.iloc[:, step_pos]

	if self.included_exog:
		n_lags = len(self.lags)
		lag_positions = np.arange(n_lags)
		exog_positions = np.arange(X_train.shape[1])[n_lags + step_pos::self.steps]
		X_train_step = X_train.iloc[:, np.hstack((lag_positions, exog_positions))]
	else:
		X_train_step = X_train

	if remove_suffix:
		suffix = f"_step_{step}"
		X_train_step.columns = [name.replace(suffix, "")
								for name in X_train_step.columns]
		y_train_step.name = y_train_step.name.replace(suffix, "")

	return X_train_step, y_train_step
	

def filter_train_X_y_for_step_new(
	self,
	step: int,
	X_train: pd.DataFrame,
	y_train: pd.DataFrame,
	remove_suffix: bool=False
) -> Tuple[pd.DataFrame, pd.Series]:
	"""
	Select the columns needed to train a forecaster for a specific step
	(name-based implementation).

	The first `len(self.lags)` columns of `X_train` are always kept. When
	`self.included_exog` is true, the exogenous columns are located by name:
	every column ending in "_step_{step}" is selected, whatever its position.
	If `remove_suffix=True` the suffix "_step_{step}" is removed from the
	returned names. Neither `X_train` nor `y_train` is modified.

	Parameters
	----------
	step : int
		Step for which columns must be selected. Starts at 1.

	X_train : pandas DataFrame
		Pandas DataFrame with the training values (predictors).

	y_train : pandas DataFrame
		Target values with one column per step; column `step - 1` is
		returned.

	remove_suffix : bool, default `False`
		If True, suffix "_step_{step}" is removed from the column names of
		`X_train_step` and from the name of `y_train_step`.

	Returns
	-------
	X_train_step : pandas DataFrame
		Pandas DataFrame with the training values (predictors) for step.

	y_train_step : pandas Series
		Column of `y_train` for the requested step.

	Raises
	------
	ValueError
		If `step` is lower than 1 or greater than `self.steps`.

	"""

	if (step < 1) or (step > self.steps):
		raise ValueError(
			(f"Invalid value `step`. For this forecaster, minimum value is 1 "
			 f"and the maximum step is {self.steps}.")
		)

	# Column names carry the one-based step; column positions are zero-based.
	suffix = f"_step_{step}"
	y_train_step = y_train.iloc[:, step - 1]

	if not self.included_exog:
		X_train_step = X_train
	else:
		idx_columns_lags = np.arange(len(self.lags))
		# flatnonzero gives a flat 1-D array of positions, which is what
		# hstack needs next to the lag positions.
		idx_columns_exog = np.flatnonzero(
			X_train.columns.str.endswith(suffix, na=False)
		)
		idx_columns = np.hstack((idx_columns_lags, idx_columns_exog))
		X_train_step = X_train.iloc[:, idx_columns]

	if remove_suffix:
		# Build renamed objects instead of assigning names in place, so the
		# caller's X_train / y_train keep their original labels.
		X_train_step = X_train_step.set_axis(
			[col_name.replace(suffix, "") for col_name in X_train_step.columns],
			axis=1
		)
		y_train_step = y_train_step.rename(y_train_step.name.replace(suffix, ""))

	return X_train_step, y_train_step


In [None]:
# Positions of the `X_train` columns whose name ends with "step_{step}".
# `np.where` with a single argument returns a tuple holding one index array.
# NOTE(review): this cell only reads `step` and `X_train`; it needs both to
# exist in the kernel already - confirm where `step` is assigned.
idx_columns_exog = np.where(X_train.columns.str.endswith(f'step_{step}'))
idx_columns_exog