In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to package sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Put the directory two levels above the working directory on the import path
# (presumably the repository root, so the in-development package is importable
# -- confirm against the repository layout).
sys.path.insert(1, str(Path.cwd().parent.parent))
# Turn off jedi-based tab completion in IPython.
%config Completer.use_jedi = False

In [2]:
import sys
from pathlib import Path
# Display the parent of the current working directory.
(Path.cwd().parent)

WindowsPath('C:/Users/jaesc2/OneDrive - IKEA/Documents/GitHub/skforecast/dev')

## Libraries

In [8]:
## ForecasterAutoregMultiSeries
# ==============================================================================
from typing import Optional, Tuple, Union

import numpy as np
import pandas as pd

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.utils import check_y
from skforecast.utils import check_exog
from skforecast.utils import preprocess_y
from skforecast.utils import preprocess_last_window
from skforecast.utils import preprocess_exog
from skforecast.utils import expand_index
from skforecast.utils import check_predict_input

In [None]:
def _create_lags(self, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    '''       
    Transforms a 1d array into a 2d array (X) and a 1d array (y). Each row
    in X is associated with a value of y and it represents the lags that
    precede it.

    Notice that, the returned matrix X_data, contains the lag 1 in the first
    column, the lag 2 in the second column and so on.

    Parameters
    ----------        
    y : 1d numpy ndarray
        Training time series.

    Returns 
    -------
    X_data : 2d numpy ndarray, shape (samples - max(self.lags), len(self.lags))
        2d numpy array with the lagged values (predictors).

    y_data : 1d np.ndarray, shape (samples - max(self.lags),)
        Values of the time series related to each row of `X_data`.

    '''

    n_splits = len(y) - self.max_lag
    X_data   = np.full(shape=(n_splits, self.max_lag), fill_value=np.nan, dtype=float)
    y_data   = np.full(shape=(n_splits, 1), fill_value=np.nan, dtype= float)

    for i in range(n_splits):
        X_index = np.arange(i, self.max_lag + i)
        y_index = [self.max_lag + i]
        X_data[i, :] = y[X_index]
        y_data[i]    = y[y_index]

    X_data = X_data[:, -self.lags] # Only keep needed lags
    y_data = y_data.ravel()

    return X_data, y_data


def create_train_X_y(
    self,
    y: pd.Series,
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None
) -> Tuple[pd.DataFrame, pd.Series]:
    '''
    Build the training matrices for a single time series, optionally
    extended with exogenous variables.

    Parameters
    ----------
    y : pandas Series
        Training time series.

    exog : pandas Series, pandas DataFrame, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and their indexes must be aligned.

    Returns
    -------
    X_train : pandas DataFrame
        One row per target value; one column per lag (`lag_<n>`) followed by
        the exogenous columns, if any. The first `self.max_lag` observations
        are dropped because their lags are not available.

    y_train : pandas Series, shape (len(y) - self.max_lag, )
        Values (target) of the time series related to each row of `X_train`.

    Raises
    ------
    Exception
        If `exog` has a different length than `y` or its index does not
        match the index of `y`.

    '''

    check_y(y=y)
    y_values, y_index = preprocess_y(y=y)
    has_exog = exog is not None

    if has_exog:
        if len(exog) != len(y):
            raise Exception("`exog` must have same number of samples as `y`.")
        check_exog(exog=exog)
        exog_values, exog_index = preprocess_exog(exog=exog)
        indexes_match = (exog_index[:len(y_index)] == y_index).all()
        if not indexes_match:
            raise Exception(
                'Different index for `y` and `exog`. They must be equal '
                'to ensure the correct alignment of values.'
            )

    lag_matrix, target_values = self._create_lags(y=y_values)
    column_names = [f"lag_{lag}" for lag in self.lags]

    if has_exog:
        if isinstance(exog, pd.DataFrame):
            column_names.extend(exog.columns)
        else:
            column_names.extend([exog.name])
        # The lag matrix starts at position `self.max_lag`, so the same
        # number of leading rows is dropped from the exogenous values.
        lag_matrix = np.column_stack((lag_matrix, exog_values[self.max_lag:, ]))

    train_index = y_index[self.max_lag: ]
    X_train = pd.DataFrame(data=lag_matrix, columns=column_names, index=train_index)
    self.X_train_col_names = column_names
    y_train = pd.Series(data=target_values, index=train_index, name='y')

    return X_train, y_train

## Tests

### Single series

In [27]:
# Fixed seed so the random series below is reproducible.
np.random.seed(123)
y = pd.Series(np.random.rand(10))

# Use the three most recent values as predictors; no exogenous variables.
lags = np.array([1, 2, 3])
max_lag = 3
exog = None

In [28]:
# Inline version of `_create_lags`, run on the notebook-level `y`, `max_lag`
# and `lags` so that the intermediate arrays can be inspected.
n_splits = len(y) - max_lag
X_data = np.full(shape=(n_splits, max_lag), fill_value=np.nan, dtype=float)
y_data = np.full(shape=(n_splits, 1), fill_value=np.nan, dtype=float)

for i, target_pos in enumerate(range(max_lag, len(y))):
    X_index = np.arange(i, target_pos)   # positions of the window before the target
    y_index = [target_pos]               # position of the target
    X_data[i, :] = y[X_index]
    y_data[i] = y[y_index]

# Only keep needed lags (negative positions count back from the target).
X_data = X_data[:, -lags]
y_data = y_data.ravel()

# `X_index` and `y_index` hold the values of the last loop iteration.
for item in (y, X_index, X_data, y_index, y_data):
    print(item)

X_train, y_train = X_data, y_data

0    0.696469
1    0.286139
2    0.226851
3    0.551315
4    0.719469
5    0.423106
6    0.980764
7    0.684830
8    0.480932
9    0.392118
dtype: float64
[6 7 8]
[[0.22685145 0.28613933 0.69646919]
 [0.55131477 0.22685145 0.28613933]
 [0.71946897 0.55131477 0.22685145]
 [0.42310646 0.71946897 0.55131477]
 [0.9807642  0.42310646 0.71946897]
 [0.68482974 0.9807642  0.42310646]
 [0.4809319  0.68482974 0.9807642 ]]
[9]
[0.55131477 0.71946897 0.42310646 0.9807642  0.68482974 0.4809319
 0.39211752]


In [31]:
# Inline version of `create_train_X_y` for a single series. `X_train` and
# `y_train` are the numpy arrays produced by the previous cell.
check_y(y=y)
y_values, y_index = preprocess_y(y=y)

if exog is not None:
    if len(exog) != len(y):
        raise Exception("`exog` must have same number of samples as `y`.")
    check_exog(exog=exog)
    exog_values, exog_index = preprocess_exog(exog=exog)
    indexes_match = (exog_index[:len(y_index)] == y_index).all()
    if not indexes_match:
        raise Exception(
            'Different index for `y` and `exog`. They must be equal '
            'to ensure the correct alignment of values.'
        )

X_train_col_names = [f"lag_{lag}" for lag in lags]
if exog is not None:
    if isinstance(exog, pd.DataFrame):
        col_names_exog = exog.columns
    else:
        col_names_exog = [exog.name]
    X_train_col_names.extend(col_names_exog)
    # The first `max_lag` positions have to be removed from exog since they
    # are not in X_train.
    X_train = np.column_stack((X_train, exog_values[max_lag:, ]))

train_index = y_index[max_lag: ]
X_train = pd.DataFrame(data=X_train, columns=X_train_col_names, index=train_index)
y_train = pd.Series(data=y_train, index=train_index, name='y')

print(y, "========", X_train, "========", y_train, sep="\n")

0    0.696469
1    0.286139
2    0.226851
3    0.551315
4    0.719469
5    0.423106
6    0.980764
7    0.684830
8    0.480932
9    0.392118
dtype: float64
      lag_1     lag_2     lag_3
3  0.226851  0.286139  0.696469
4  0.551315  0.226851  0.286139
5  0.719469  0.551315  0.226851
6  0.423106  0.719469  0.551315
7  0.980764  0.423106  0.719469
8  0.684830  0.980764  0.423106
9  0.480932  0.684830  0.980764
3    0.551315
4    0.719469
5    0.423106
6    0.980764
7    0.684830
8    0.480932
9    0.392118
Name: y, dtype: float64


### Multi-series

In [113]:
# Four random series of 10 observations: two targets and two exogenous
# variables. The draws depend on the state of NumPy's global random generator.
y_1 = pd.Series(np.random.rand(10))
y_2 = pd.Series(np.random.rand(10))
exog_1 = pd.Series(np.random.rand(10))
exog_2 = pd.Series(np.random.rand(10))

# Lowercase 'h' is used for the hourly frequency: the uppercase 'H' alias is
# deprecated in recent pandas releases.
df = (
    pd.DataFrame({
        'serie_1': y_1,
        'serie_2': y_2,
        'exog_1': exog_1,
        'exog_2': exog_2,
        'date': pd.date_range(start='2022-01-01', periods=10, freq='5h'),
    })
    .set_index('date')
    .asfreq('5h')
    .sort_index()
)

max_lag = 3
lags = np.array([1, 2, 3])

# Targets and exogenous variables are kept in separate frames that share
# the same datetime index.
df_y = df[['serie_1', 'serie_2']]
exog = df[['exog_1', 'exog_2']]

df

Unnamed: 0_level_0,serie_1,serie_2,exog_1,exog_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-01 00:00:00,0.945237,0.594644,0.550953,0.51928
2022-01-01 05:00:00,0.290086,0.62415,0.813314,0.694315
2022-01-01 10:00:00,0.727043,0.668073,0.859941,0.24466
2022-01-01 15:00:00,0.015016,0.172612,0.103521,0.338582
2022-01-01 20:00:00,0.879142,0.898713,0.663043,0.563628
2022-01-02 01:00:00,0.063939,0.620991,0.710075,0.886678
2022-01-02 06:00:00,0.733395,0.043569,0.294517,0.747326
2022-01-02 11:00:00,0.99461,0.684041,0.971364,0.209592
2022-01-02 16:00:00,0.50119,0.196084,0.278687,0.251777
2022-01-02 21:00:00,0.209334,0.027341,0.069982,0.523881


In [101]:
def _create_lags(y):
    '''
    Notebook-level variant of the forecaster's `_create_lags`. It reads
    `max_lag` and `lags` from the notebook namespace instead of from `self`.

    Parameters
    ----------
    y : 1d numpy ndarray
        Values of the time series.

    Returns
    -------
    X_data : 2d numpy ndarray, shape (len(y) - max_lag, len(lags))
        Lagged values (predictors), one column per entry of `lags`.

    y_data : 1d numpy ndarray, shape (len(y) - max_lag,)
        Value of the series associated with each row of `X_data`.

    '''

    n_rows = len(y) - max_lag
    predictors = np.full(shape=(n_rows, max_lag), fill_value=np.nan, dtype=float)
    targets = np.full(shape=(n_rows, 1), fill_value=np.nan, dtype=float)

    for row, target_pos in enumerate(range(max_lag, len(y))):
        predictors[row, :] = y[np.arange(row, target_pos)]
        targets[row] = y[[target_pos]]

    # Only keep needed lags (negative positions count back from the target).
    return predictors[:, -lags], targets.ravel()

In [126]:
# The columns of `df_y` must not include the exogenous variables.

# One predictor frame / target series per series; they are concatenated once
# after the loop.
X_parts = []
y_parts = []

for i, serie in enumerate(df_y.columns):

    y = df_y[serie]
    check_y(y=y)
    y_values, y_index = preprocess_y(y=y)

    # `exog` is shared by all the series, so it is validated only once.
    if i == 0 and exog is not None:
        if len(exog) != len(df_y):
            raise Exception("`exog` must have same number of samples as `y`.")
        check_exog(exog=exog)
        exog_values, exog_index = preprocess_exog(exog=exog)
        indexes_match = (exog_index[:len(y_index)] == y_index).all()
        if not indexes_match:
            raise Exception(
                'Different index for `y` and `exog`. They must be equal '
                'to ensure the correct alignment of values.'
            )

    X_train, y_train = _create_lags(y=y_values)
    X_train_col_names = [f"lag_{lag}" for lag in lags]

    # Two extra columns: the name of the series (`level`) and the timestamp of
    # each target value (`idx`).
    X_level = np.full(shape=(len(X_train), 1), fill_value=f'{serie}')
    X_train_col_names.extend(['level', 'idx'])
    y_index = y_index.strftime('%Y-%m-%d %H:%M:%S')
    X_train = np.column_stack((X_train, X_level, y_index[max_lag: ]))

    if exog is not None:
        if isinstance(exog, pd.DataFrame):
            col_names_exog = exog.columns
        else:
            col_names_exog = [exog.name]
        X_train_col_names.extend(col_names_exog)
        # The first `max_lag` positions have to be removed from exog since
        # they are not in X_train.
        X_train = np.column_stack((X_train, exog_values[max_lag:, ]))

    X_parts.append(pd.DataFrame(data=X_train, columns=X_train_col_names))
    y_parts.append(pd.Series(data=y_train, name='y'))

X_train_df = pd.concat(X_parts)
y_train_df = pd.concat(y_parts)


# `y` is the last series processed by the loop.
print(y)
print("========")
display(X_train_df)
print("========")
display(y_train_df)
print("========")
print("========")

date
2022-01-01 00:00:00    0.246243
2022-01-01 05:00:00    0.205350
2022-01-01 10:00:00    0.684826
2022-01-01 15:00:00    0.486112
2022-01-01 20:00:00    0.324910
2022-01-02 01:00:00    0.100214
2022-01-02 06:00:00    0.544763
2022-01-02 11:00:00    0.347025
2022-01-02 16:00:00    0.391096
2022-01-02 21:00:00    0.310509
Freq: 5H, Name: serie_2, dtype: float64


Unnamed: 0,lag_1,lag_2,lag_3,level,idx,exog_1,exog_2
0,0.114385,0.513679,0.899392,serie_1,2022-01-01 15:00:00,0.103521,0.338582
1,0.05258,0.114385,0.513679,serie_1,2022-01-01 20:00:00,0.663043,0.563628
2,0.330582,0.05258,0.114385,serie_1,2022-01-02 01:00:00,0.710075,0.886678
3,0.92033,0.330582,0.05258,serie_1,2022-01-02 06:00:00,0.294517,0.747326
4,0.947582,0.92033,0.330582,serie_1,2022-01-02 11:00:00,0.971364,0.209592
5,0.841164,0.947582,0.92033,serie_1,2022-01-02 16:00:00,0.278687,0.251777
6,0.158679,0.841164,0.947582,serie_1,2022-01-02 21:00:00,0.069982,0.523881
0,0.684826,0.20535,0.246243,serie_2,2022-01-01 15:00:00,0.103521,0.338582
1,0.486112,0.684826,0.20535,serie_2,2022-01-01 20:00:00,0.663043,0.563628
2,0.32491,0.486112,0.684826,serie_2,2022-01-02 01:00:00,0.710075,0.886678




0    0.052580
1    0.330582
2    0.920330
3    0.947582
4    0.841164
5    0.158679
6    0.419923
0    0.486112
1    0.324910
2    0.100214
3    0.544763
4    0.347025
5    0.391096
6    0.310509
Name: y, dtype: float64



In [49]:
# Experiment: a (1, max_lag) array filled with the series label.
X_level = np.full(shape=(1, max_lag), fill_value='serie_1')
X_level

array([['serie_1', 'serie_1', 'serie_1']], dtype='<U7')

In [56]:
# Inspect the shape of whatever `X_train` currently holds in the kernel
# (the result depends on which cell assigned it last).
X_train.shape

(7, 3)

In [57]:
# Experiment: a (len(X_train), 1) column filled with the series label.
# NOTE(review): `col` is not defined in any cell visible here -- presumably a
# leftover loop variable from an earlier version of the multi-series loop;
# confirm, otherwise this cell raises NameError on a fresh kernel.
X_level = np.full(shape=(len(X_train), 1), fill_value=f'{col}')
X_level.shape

(1, 7)

In [63]:
# Check how `np.column_stack` combines 1d arrays: each array becomes a column.
np.column_stack((np.array([1,2]), np.array([1,2]), np.array([1,2])))

array([[1, 1, 1],
       [2, 2, 2]])

### ForecasterAutoregMultiSeries

In [147]:
################################################################################
#                         ForecasterAutoregMultiSeries                         #
#                                                                              #
# This work by Joaquin Amat Rodrigo is licensed under a Creative Commons       #
# Attribution 4.0 International License.                                       #
################################################################################
# coding=utf-8

from typing import Union, Dict, List, Tuple, Any, Optional
import warnings
import logging
import numpy as np
import pandas as pd
import sklearn
import sklearn.pipeline
from sklearn.base import clone
from copy import copy

import skforecast
from skforecast.ForecasterBase import ForecasterBase
from skforecast.utils import check_y
from skforecast.utils import check_exog
from skforecast.utils import preprocess_y
from skforecast.utils import preprocess_last_window
from skforecast.utils import preprocess_exog
from skforecast.utils import expand_index
from skforecast.utils import check_predict_input

# Configure logging: messages of level INFO and above are reported using the
# layout "<logger name> <level name> <message>".
logging.basicConfig(
    format = '%(name)-10s %(levelname)-5s %(message)s', 
    level  = logging.INFO,
)


class ForecasterAutoregMultiSeries_test(ForecasterBase):
    '''
    This class turns any regressor compatible with the scikit-learn API into a
    recursive autoregressive (multi-step) forecaster.
    
    Parameters
    ----------
    regressor : regressor or pipeline compatible with the scikit-learn API
        An instance of a regressor or pipeline compatible with the scikit-learn API.
        
    lags : int, list, 1d numpy ndarray, range
        Lags used as predictors. Index starts at 1, so lag 1 is equal to t-1.
            `int`: include lags from 1 to `lags` (included).
            `list`, `numpy ndarray` or `range`: include only lags present in `lags`,
            all elements must be int.

    
    Attributes
    ----------
    regressor : regressor or pipeline compatible with the scikit-learn API
        An instance of a regressor or pipeline compatible with the scikit-learn API.
        
    lags : numpy ndarray
        Lags used as predictors.
        
    max_lag : int
        Maximum value of lag included in `lags`.

    last_window : pandas Series
        Last window the forecaster has seen during trained. It stores the
        values needed to predict the next `step` right after the training data.
        
    window_size: int
        Size of the window needed to create the predictors. It is equal to
        `max_lag`.
        
    fitted: Bool
        Tag to identify if the regressor has been fitted (trained).
        
    index_type : type
        Type of index of the input used in training.
        
    index_freq : str
        Frequency of Index of the input used in training.
        
    training_range: pandas Index
        First and last values of index of the data used during training.
        
    included_exog : bool
        If the forecaster has been trained using exogenous variable/s.
        
    exog_type : type
        Type of exogenous variable/s used in training.
        
    exog_col_names : list
        Names of columns of `exog` if `exog` used in training was a pandas
        DataFrame.

    X_train_col_names : list
        Names of columns of the matrix created internally for training.
        
    in_sample_residuals: numpy ndarray
        Residuals of the model when predicting training data. Only stored up to
        1000 values.
        
    out_sample_residuals: numpy ndarray
        Residuals of the model when predicting non training data. Only stored
        up to 1000 values.

    creation_date: str
        Date of creation.

    fit_date: str
        Date of last fit.

    skforcast_version: str
        Version of skforecast library used to create the forecaster.
     
    '''
    
    def __init__(self, regressor, lags: Union[int, np.ndarray, list, range]) -> None:
        '''
        Store the regressor, initialise the fit-related attributes and
        validate and normalise `lags` into a numpy array.

        Parameters
        ----------
        regressor : regressor or pipeline compatible with the scikit-learn API
            Stored as given (it is not cloned here).

        lags : int, list, 1d numpy ndarray, range
            `int`: lags from 1 to `lags` (included) are used.
            `list`, `numpy ndarray` or `range`: only the lags present in
            `lags` are used; all elements must be integers >= 1.

        Raises
        ------
        Exception
            If any lag is lower than 1, if a value in a list or array is not
            an integer, or if `lags` is of an unsupported type.

        '''

        self.regressor            = regressor
        self.index_type           = None
        self.index_freq           = None
        self.index_values         = None
        self.training_range       = None
        self.last_window          = None
        self.included_exog        = False
        self.exog_type            = None
        self.exog_col_names       = None
        self.X_train_col_names    = None
        self.in_sample_residuals  = None
        self.out_sample_residuals = None
        self.fitted               = False
        self.creation_date        = pd.Timestamp.today().strftime('%Y-%m-%d %H:%M:%S')
        self.fit_date             = None
        self.skforcast_version    = skforecast.__version__

        if isinstance(lags, int) and lags < 1:
            raise Exception('Minimum value of lags allowed is 1.')

        if isinstance(lags, (list, range, np.ndarray)) and min(lags) < 1:
            raise Exception('Minimum value of lags allowed is 1.')

        if isinstance(lags, (list, np.ndarray)):
            for lag in lags:
                # `np.integer` is the common base of every NumPy integer
                # scalar, so arrays of any integer dtype (int8, int16, uint8,
                # ...) are accepted, not only int32 / int64.
                if not isinstance(lag, (int, np.integer)):
                    raise Exception('Values in lags must be int.')

        if isinstance(lags, int):
            self.lags = np.arange(lags) + 1
        elif isinstance(lags, (list, range)):
            self.lags = np.array(lags)
        elif isinstance(lags, np.ndarray):
            self.lags = lags
        else:
            raise Exception(
                '`lags` argument must be int, 1d numpy ndarray, range or list. '
                f"Got {type(lags)}"
            )

        # The window needed to build the predictors is as long as the
        # largest lag.
        self.max_lag  = max(self.lags)
        self.window_size = self.max_lag


    def __repr__(self) -> str:
        '''
        Information displayed when the forecaster object is printed.
        '''

        if isinstance(self.regressor, sklearn.pipeline.Pipeline):
            # Only the parameters of the pipeline steps ("<step>__<param>")
            # are reported.
            name_pipe_steps = tuple(name + "__" for name in self.regressor.named_steps.keys())
            params = {key : value for key, value in self.regressor.get_params().items() \
                     if key.startswith(name_pipe_steps)}
        else:
            params = self.regressor.get_params()

        # `type(self).__name__` gives the bare class name wherever the class
        # is defined. Splitting `str(type(self))` on '.' returns the module
        # name for a packaged class and "<ClassName>'>" for a class defined
        # in `__main__` (e.g. in a notebook).
        class_name = type(self).__name__
        header     = '=' * len(class_name)

        info = (
            f"{header} \n"
            f"{class_name} \n"
            f"{header} \n"
            f"Regressor: {self.regressor} \n"
            f"Lags: {self.lags} \n"
            f"Window size: {self.window_size} \n"
            f"Included exogenous: {self.included_exog} \n"
            f"Type of exogenous variable: {self.exog_type} \n"
            f"Exogenous variables names: {self.exog_col_names} \n"
            f"Training range: {self.training_range.to_list() if self.fitted else None} \n"
            f"Training index type: {str(self.index_type).split('.')[-1][:-2] if self.fitted else None} \n"
            f"Training index frequency: {self.index_freq if self.fitted else None} \n"
            f"Regressor parameters: {params} \n"
            f"Creation date: {self.creation_date} \n"
            f"Last fit date: {self.fit_date} \n"
            f"Skforecast version: {self.skforcast_version} \n"
        )

        return info

    
    def _create_lags(self, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        '''       
        Transforms a 1d array into a 2d array (X) and a 1d array (y). Each row
        in X is associated with a value of y and it represents the lags that
        precede it.
        
        Notice that, the returned matrix X_data, contains the lag 1 in the first
        column, the lag 2 in the second column and so on.
        
        Parameters
        ----------        
        y : 1d numpy ndarray
            Training time series.

        Returns 
        -------
        X_data : 2d numpy ndarray, shape (samples - max(self.lags), len(self.lags))
            2d numpy array with the lagged values (predictors).
        
        y_data : 1d np.ndarray, shape (samples - max(self.lags),)
            Values of the time series related to each row of `X_data`.
            
        '''
          
        n_splits = len(y) - self.max_lag
        X_data   = np.full(shape=(n_splits, self.max_lag), fill_value=np.nan, dtype=float)
        y_data   = np.full(shape=(n_splits, 1), fill_value=np.nan, dtype= float)

        for i in range(n_splits):
            X_index = np.arange(i, self.max_lag + i)
            y_index = [self.max_lag + i]
            X_data[i, :] = y[X_index]
            y_data[i]    = y[y_index]
            
        X_data = X_data[:, -self.lags] # Only keep needed lags
        y_data = y_data.ravel()
            
        return X_data, y_data


    def create_train_X_y(
        self,
        series: pd.DataFrame,
        exog: Optional[Union[pd.Series, pd.DataFrame]]=None
    ) -> Tuple[pd.DataFrame, pd.Series, pd.Index]:
        '''
        Create training matrices from multiple time series and exogenous
        variables. The lagged predictors of every column of `series` are
        stacked vertically; the column `level` holds the name of the series
        each row comes from.

        Parameters
        ----------
        series : pandas DataFrame
            Training time series, one column per series.

        exog : pandas Series, pandas DataFrame, default `None`
            Exogenous variable/s included as predictor/s. Must have the same
            number of observations as `series` and their indexes must be
            aligned. The same values are used for every series.

        Returns
        -------
        X_train : pandas DataFrame
            Training values (predictors): the lag columns (`lag_<n>`), the
            `level` column and the exogenous columns, if any. Each series
            contributes `len(series) - self.max_lag` rows and keeps its own
            0-based row labels.

        y_train : pandas Series
            Values (target) of the time series related to each row of `X_train`.

        y_index : pandas Index
            Index returned by `preprocess_y` for the last column of `series`.

        Raises
        ------
        Exception
            If `series` is not a pandas DataFrame or has no columns, if `exog`
            has a different length than `series`, or if their indexes differ.

        '''

        if not isinstance(series, pd.DataFrame):
            raise Exception('`series` must be a pandas DataFrame.')

        if series.shape[1] == 0:
            raise Exception('`series` must contain at least one column.')

        lag_col_names = [f"lag_{lag}" for lag in self.lags]
        # One predictor frame / target series per column of `series`; they
        # are concatenated once after the loop instead of growing a frame on
        # every iteration.
        X_parts = []
        y_parts = []

        for i, serie in enumerate(series.columns):

            y = series[serie]
            check_y(y=y)
            y_values, y_index = preprocess_y(y=y)

            # `exog` is shared by all the series, so it is validated and
            # converted only once.
            if i == 0 and exog is not None:
                if len(exog) != len(series):
                    raise Exception(
                        "`exog` must have same number of samples as `series`."
                    )
                check_exog(exog=exog)
                exog_values, exog_index = preprocess_exog(exog=exog)
                if not (exog_index[:len(y_index)] == y_index).all():
                    raise Exception(
                        ('Different index for `series` and `exog`. They must be equal '
                        'to ensure the correct alignment of values.')
                    )
                col_names_exog = (
                    exog.columns.to_list() if isinstance(exog, pd.DataFrame) else [exog.name]
                )

            # Use the forecaster's own lags, not notebook-level variables.
            X_lags, y_train = self._create_lags(y=y_values)

            # The frame is assembled column-wise. Stacking the numeric lags
            # with the string `level` column through `np.column_stack` would
            # turn every predictor into a string.
            X_train = pd.DataFrame(data=X_lags, columns=lag_col_names)
            X_train['level'] = f'{serie}'

            if exog is not None:
                # The first `self.max_lag` positions have to be removed from
                # exog since they are not in X_train.
                exog_train = pd.DataFrame(
                                 data    = exog_values[self.max_lag:, ],
                                 columns = col_names_exog
                             )
                X_train = pd.concat([X_train, exog_train], axis=1)

            X_parts.append(X_train)
            y_parts.append(pd.Series(data=y_train, name='y'))

        X_train_df = pd.concat(X_parts)
        y_train_df = pd.concat(y_parts)
        self.X_train_col_names = X_train_df.columns.to_list()

        return X_train_df, y_train_df, y_index

        
    def fit(
        self,
        series: pd.DataFrame,
        exog: Optional[Union[pd.Series, pd.DataFrame]]=None
    ) -> None:
        '''
        Train the forecaster on the training matrices built by
        `create_train_X_y` and store the information needed to predict.

        Parameters
        ----------
        series : pandas DataFrame
            Training time series.

        exog : pandas Series, pandas DataFrame, default `None`
            Exogenous variable/s included as predictor/s. Must have the same
            number of observations as `series` and their indexes must be
            aligned.

        Returns
        -------
        None

        '''

        # Clear everything stored by a previous call to `fit`.
        for attr in (
            'index_type', 'index_freq', 'index_values', 'last_window',
            'exog_type', 'exog_col_names', 'X_train_col_names',
            'in_sample_residuals', 'training_range'
        ):
            setattr(self, attr, None)
        self.included_exog = False
        self.fitted        = False

        if exog is not None:
            self.included_exog = True
            self.exog_type = type(exog)
            if isinstance(exog, pd.DataFrame):
                self.exog_col_names = exog.columns.to_list()
            else:
                self.exog_col_names = exog.name

        X_train, y_train, y_index = self.create_train_X_y(series=series, exog=exog)

        # XGBRegressor is given plain numpy arrays; any other regressor
        # receives the pandas objects.
        is_xgboost = str(type(self.regressor)) == "<class 'xgboost.sklearn.XGBRegressor'>"
        if is_xgboost:
            self.regressor.fit(X=X_train.to_numpy(), y=y_train.to_numpy())
        else:
            self.regressor.fit(X=X_train, y=y_train)

        self.fitted = True
        self.fit_date = pd.Timestamp.today().strftime('%Y-%m-%d %H:%M:%S')
        self.training_range = y_index[[0, -1]]
        self.index_type = type(y_index)
        self.index_freq = (
            y_index.freqstr if isinstance(y_index, pd.DatetimeIndex) else y_index.step
        )
        self.index_values = y_index

        fitted_values = self.regressor.predict(X_train.to_numpy() if is_xgboost else X_train)
        residuals = y_train - fitted_values
        if len(residuals) > 1000:
            # Only up to 1000 residuals are stored (sampled with a fixed seed).
            rng = np.random.default_rng(seed=123)
            residuals = rng.choice(a=residuals, size=1000, replace=False)
        self.in_sample_residuals = residuals

        # Keep the last `max_lag` rows of the training data: they provide the
        # lags needed in the first iteration of `predict()`.
        self.last_window = series.iloc[-self.max_lag:, ].copy()
    

    def _recursive_predict(
        self,
        steps: int,
        last_window: np.array,
        exog: np.array
    ) -> pd.Series:
        '''
        Predict n steps ahead. It is an iterative process in which, each prediction,
        is used as a predictor for the next step.
        
        Parameters
        ----------
        steps : int
            Number of future steps predicted.
            
        last_window : numpy ndarray
            Values of the series used to create the predictors (lags) need in the 
            first iteration of prediction (t + 1).
            
        exog : numpy ndarray, pandas DataFrame
            Exogenous variable/s included as predictor/s.

        Returns 
        -------
        predictions : numpy ndarray
            Predicted values.
            
        '''

        predictions = np.full(shape=steps, fill_value=np.nan)

        for i in range(steps):
            X = last_window[-self.lags].reshape(1, -1)
            if exog is not None:
                X = np.column_stack((X, exog[i, ].reshape(1, -1)))

            with warnings.catch_warnings():
                # Suppress scikitlearn warning: "X does not have valid feature names,
                # but NoOpTransformer was fitted with feature names".
                warnings.simplefilter("ignore")
                prediction = self.regressor.predict(X)
                predictions[i] = prediction.ravel()[0]

            # Update `last_window` values. The first position is discarded and 
            # the new prediction is added at the end.
            last_window = np.append(last_window[1:], prediction)

        return predictions

            
    def predict(
        self,
        steps: int,
        last_window: Optional[pd.Series]=None,
        exog: Optional[Union[pd.Series, pd.DataFrame]]=None
    ) -> pd.Series:
        '''
        Predict n steps ahead. It is a recursive process in which each
        prediction is used as a predictor for the next step.

        Parameters
        ----------
        steps : int
            Number of future steps predicted.

        last_window : pandas Series, default `None`
            Values of the series used to create the predictors (lags) needed
            in the first iteration of prediction (t + 1).

            If `last_window = None`, the values stored in `self.last_window`
            are used to calculate the initial predictors, and the predictions
            start right after training data.

        exog : pandas Series, pandas DataFrame, default `None`
            Exogenous variable/s included as predictor/s. Only the first
            `steps` rows are used.

        Returns
        -------
        predictions : pandas Series
            Predicted values, named `pred`.

        '''

        check_predict_input(
            steps          = steps,
            fitted         = self.fitted,
            included_exog  = self.included_exog,
            index_type     = self.index_type,
            index_freq     = self.index_freq,
            window_size    = self.window_size,
            last_window    = last_window,
            exog           = exog,
            exog_type      = self.exog_type,
            exog_col_names = self.exog_col_names,
            max_steps      = None,
        )

        if exog is None:
            exog_values = None
        else:
            # For a DataFrame, the columns are taken in the order stored in
            # `self.exog_col_names`.
            if isinstance(exog, pd.DataFrame):
                exog_used = exog[self.exog_col_names]
            else:
                exog_used = exog
            exog_values, _ = preprocess_exog(exog=exog_used.iloc[:steps, ])

        # Fall back to the window stored in `self.last_window` when none is
        # given.
        window_source = self.last_window if last_window is None else last_window
        last_window_values, last_window_index = preprocess_last_window(
                                                    last_window = window_source
                                                )

        # Copies are passed to the recursive prediction.
        predicted_values = self._recursive_predict(
                               steps       = steps,
                               last_window = copy(last_window_values),
                               exog        = copy(exog_values)
                           )

        prediction_index = expand_index(index=last_window_index, steps=steps)

        return pd.Series(data=predicted_values, index=prediction_index, name='pred')
    
    
    def _estimate_boot_interval(
        self,
        steps: int,
        last_window: Optional[np.ndarray]=None,
        exog: Optional[np.ndarray]=None,
        interval: list=[5, 95],
        n_boot: int=500,
        random_state: int=123,
        in_sample_residuals: bool=True
    ) -> np.ndarray:
        '''
        Iterative process in which, each prediction, is used as a predictor
        for the next step and bootstrapping is used to estimate prediction
        intervals. This method only returns prediction intervals.
        See predict_intervals() to calculate both, predictions and intervals.
        
        Parameters
        ----------   
        steps : int
            Number of future steps predicted.
            
        last_window : 1d numpy ndarray shape (, max_lag), default `None`
            Values of the series used to create the predictors (lags) needed in the 
            first iteration of prediction (t + 1).
    
            If `last_window = `None`, the values stored in` self.last_window` are
            used to calculate the initial predictors, and the predictions start
            right after training data.
            
        exog : numpy ndarray, default `None`
            Exogenous variable/s included as predictor/s.
            
        n_boot: int, default `500`
            Number of bootstrapping iterations used to estimate prediction
            intervals.

        random_state: int
            Sets a seed to the random generator, so that boot intervals are always 
            deterministic.
            
        interval: list, default `[5, 95]`
            Confidence of the prediction interval estimated. Sequence of percentiles
            to compute, which must be between 0 and 100 inclusive.
            
        in_sample_residuals: bool, default `True`
            If `True`, residuals from the training data are used as proxy of
            prediction error to create prediction intervals. If `False`, out of
            sample residuals are used. In the latter case, the user should have
            calculated and stored the residuals within the forecaster (see
            `set_out_sample_residuals()`).
            

        Returns 
        -------
        prediction_interval : numpy ndarray, shape (steps, 2)
            Interval estimated for each prediction by bootstrapping:
                first column = lower bound of the interval.
                second column= upper bound interval of the interval.

        Notes
        -----
        More information about prediction intervals in forecasting:
        https://otexts.com/fpp2/prediction-intervals.html
        Forecasting: Principles and Practice (2nd ed) Rob J Hyndman and
        George Athanasopoulos.
            
        '''
        
        if last_window is None:
            last_window = self.last_window.values

        boot_predictions = np.full(
                                shape      = (steps, n_boot),
                                fill_value = np.nan,
                                dtype      = float
                           )
        rng = np.random.default_rng(seed=random_state)
        seeds = rng.integers(low=0, high=10000, size=n_boot)

        for i in range(n_boot):
            # In each bootstraping iteration the initial last_window and exog 
            # need to be restored.
            last_window_boot = last_window.copy()
            if exog is not None:
                exog_boot = exog.copy()
            else:
                exog_boot = None
 
            if in_sample_residuals:
                residuals = self.in_sample_residuals
            else:
                residuals = self.out_sample_residuals

            rng = np.random.default_rng(seed=seeds[i])
            sample_residuals = rng.choice(
                                    a       = residuals,
                                    size    = steps,
                                    replace = True
                               )

            for step in range(steps):
                prediction = self._recursive_predict(
                                steps       = 1,
                                last_window = last_window_boot,
                                exog        = exog_boot 
                            )
                
                prediction_with_residual  = prediction + sample_residuals[step]
                boot_predictions[step, i] = prediction_with_residual

                last_window_boot = np.append(
                                    last_window_boot[1:],
                                    prediction_with_residual
                                   )
                
                if exog is not None:
                    exog_boot = exog_boot[1:]
                            
        prediction_interval = np.percentile(boot_predictions, q=interval, axis=1)
        prediction_interval = prediction_interval.transpose()
        
        return prediction_interval
    
        
    def predict_interval(
        self,
        steps: int,
        last_window: Optional[pd.Series]=None,
        exog: Optional[Union[pd.Series, pd.DataFrame]]=None,
        interval: list=[5, 95],
        n_boot: int=500,
        random_state: int=123,
        in_sample_residuals: bool=True
    ) -> pd.DataFrame:
        '''
        Forecast `steps` values ahead recursively (each prediction is used as
        a predictor for the next step) and estimate prediction intervals by
        bootstrapping. Both predictions and intervals are returned.
        
        Parameters
        ---------- 
        steps : int
            Number of future steps predicted.
            
        last_window : pandas Series, default `None`
            Values of the series used to create the predictors (lags) needed in the 
            first iteration of prediction (t + 1).
    
            If `last_window = None`, the values stored in `self.last_window` are
            used to calculate the initial predictors, and the predictions start
            right after training data.
            
        exog : pandas Series, pandas DataFrame, default `None`
            Exogenous variable/s included as predictor/s. Only the first
            `steps` rows are used. When a DataFrame is given, only the
            columns listed in `self.exog_col_names` are used.
            
        interval: list, default `[5, 95]`
            Confidence of the prediction interval estimated. Sequence of percentiles
            to compute, which must be between 0 and 100 inclusive.
            
        n_boot: int, default `500`
            Number of bootstrapping iterations used to estimate prediction
            intervals.

        random_state: int, default 123
            Sets a seed to the random generator, so that boot intervals are always 
            deterministic.
            
        in_sample_residuals: bool, default `True`
            If `True`, residuals from the training data are used as proxy of
            prediction error to create prediction intervals. If `False`, out of
            sample residuals are used. In the latter case, the user should have
            calculated and stored the residuals within the forecaster (see
            `set_out_sample_residuals()`).

        Returns 
        -------
        predictions : pandas DataFrame
            Values predicted by the forecaster and their estimated interval:
                column pred = predictions.
                column lower_bound = lower bound of the interval.
                column upper_bound = upper bound of the interval.

        Notes
        -----
        More information about prediction intervals in forecasting:
        https://otexts.com/fpp2/prediction-intervals.html
        Forecasting: Principles and Practice (2nd ed) Rob J Hyndman and
        George Athanasopoulos.
            
        '''
        
        # Validate the request against the state stored in the forecaster.
        check_predict_input(
            steps          = steps,
            fitted         = self.fitted,
            included_exog  = self.included_exog,
            index_type     = self.index_type,
            index_freq     = self.index_freq,
            window_size    = self.window_size,
            last_window    = last_window,
            exog           = exog,
            exog_type      = self.exog_type,
            exog_col_names = self.exog_col_names,
            max_steps      = None,
        )

        if exog is None:
            exog_values = None
        else:
            if isinstance(exog, pd.DataFrame):
                exog_head = exog[self.exog_col_names].iloc[:steps, ]
            else:
                exog_head = exog.iloc[:steps, ]
            exog_values, _ = preprocess_exog(exog = exog_head)

        # Fall back to the window stored in the forecaster when none is given.
        window = self.last_window if last_window is None else last_window
        last_window_values, last_window_index = preprocess_last_window(
                                                    last_window = window
                                                )

        # Untouched copies are kept for the bootstrapping stage, which has to
        # start from the same initial state as the point predictions.
        last_window_values_original = last_window_values.copy()
        exog_values_original = None if exog is None else exog_values.copy()

        point_predictions = self._recursive_predict(
                                steps       = steps,
                                last_window = last_window_values,
                                exog        = exog_values
                            )

        interval_bounds = self._estimate_boot_interval(
                                steps               = steps,
                                last_window         = copy(last_window_values_original),
                                exog                = copy(exog_values_original),
                                interval            = interval,
                                n_boot              = n_boot,
                                random_state        = random_state,
                                in_sample_residuals = in_sample_residuals
                          )

        # One row per step: point prediction followed by its interval bounds.
        stacked = np.column_stack((point_predictions, interval_bounds))
        future_index = expand_index(index = last_window_index, steps = steps)

        return pd.DataFrame(
                    data    = stacked,
                    index   = future_index,
                    columns = ['pred', 'lower_bound', 'upper_bound']
               )

    
    def set_params(self, **params: dict) -> None:
        '''
        Set new values to the parameters of the scikit learn model stored in the
        forecaster. The stored regressor is replaced by a clone of itself
        configured with the new parameters.
        
        Parameters
        ----------
        params : dict
            Parameters values.

        Returns 
        -------
        None
        
        '''
        # The clone is configured before being stored: if `set_params` rejects
        # a parameter, the exception propagates and the regressor currently
        # held by the forecaster is left untouched.
        new_regressor = clone(self.regressor)
        new_regressor.set_params(**params)
        self.regressor = new_regressor
        
        
    def set_lags(self, lags: Union[int, list, np.ndarray, range]) -> None:
        '''      
        Set new value to the attribute `lags`.
        Attributes `max_lag` and `window_size` are also updated.
        
        Parameters
        ----------
        lags : int, list, 1D np.array, range
            Lags used as predictors. Index starts at 1, so lag 1 is equal to t-1.
                `int`: include lags from 1 to `lags`. NumPy integer scalars
                are accepted as well.
                `list`, `range` or `np.array`: include only lags present in `lags`.

        Returns 
        -------
        None

        Raises
        ------
        ValueError
            If `lags` is an empty list, range or array.
        Exception
            If any lag is lower than 1 or `lags` is of an unsupported type.
        
        '''
        
        if isinstance(lags, (int, np.integer)):
            if lags < 1:
                raise Exception('min value of lags allowed is 1')
            self.lags = np.arange(lags) + 1
        elif isinstance(lags, (list, range, np.ndarray)):
            # An empty collection would otherwise fail inside `min()` with a
            # message that does not mention `lags`.
            if len(lags) == 0:
                raise ValueError('`lags` must contain at least one value.')
            if min(lags) < 1:
                raise Exception('min value of lags allowed is 1')
            self.lags = lags if isinstance(lags, np.ndarray) else np.array(lags)
        else:
            raise Exception(
                f"`lags` argument must be `int`, `1D np.ndarray`, `range` or `list`. "
                f"Got {type(lags)}"
            )
            
        # The window needed to build the predictors spans up to the largest lag.
        self.max_lag  = max(self.lags)
        self.window_size = self.max_lag
        
        
    def set_out_sample_residuals(self, residuals: pd.Series, append: bool=True)-> None:
        '''
        Store residuals in the attribute `out_sample_residuals`. Out of sample
        residuals are meant to be calculated using observations that did not
        participate in the training process.
        
        Parameters
        ----------
        residuals : pd.Series
            Values of residuals. If len(residuals) > 1000, only a random sample
            of 1000 values (drawn without replacement, fixed seed) is stored.
            
        append : bool, default `True`
            If `True`, new residuals are added to the ones already stored in the
            attribute `out_sample_residuals`. Once the limit of 1000 values is
            reached, no more values are appended. If `False`, `out_sample_residuals`
            is overwritten with the new residuals.
            
        Returns 
        -------
        None
        '''

        max_stored = 1000

        if not isinstance(residuals, pd.Series):
            raise Exception(
                f"`residuals` argument must be `pd.Series`. Got {type(residuals)}"
            )

        # Oversized inputs are reduced to a reproducible random subsample.
        if len(residuals) > max_stored:
            sampler = np.random.default_rng(seed=123)
            residuals = pd.Series(
                            sampler.choice(a=residuals, size=max_stored, replace=False)
                        )

        if append and self.out_sample_residuals is not None:
            # Only as many new values as still fit under the limit are kept.
            free_space = max(0, max_stored - len(self.out_sample_residuals))
            incoming = (
                residuals if len(residuals) < free_space else residuals[:free_space]
            )
            residuals = np.hstack((self.out_sample_residuals, incoming))

        self.out_sample_residuals = pd.Series(residuals)

    
    def get_feature_importance(self) -> Optional[pd.DataFrame]:
        '''      
        Return feature importance of the regressor stored in the
        forecaster. Only valid when regressor stores internally the feature
        importance in the attribute `feature_importances_` or `coef_`.
        If the regressor is a scikit-learn Pipeline, its last step is used.

        Parameters
        ----------
        self

        Returns
        -------
        feature_importance : pandas DataFrame or None
            Feature importance associated with each predictor (columns
            'feature' and 'importance'). `None`, together with a warning,
            when the importance can not be obtained from the regressor.
        '''

        if isinstance(self.regressor, sklearn.pipeline.Pipeline):
            estimator = self.regressor[-1]
        else:
            estimator = self.regressor

        # `feature_importances_` takes precedence over `coef_`. Failures are
        # handled with `except Exception` (not a bare `except`) so that
        # KeyboardInterrupt and SystemExit are never silently swallowed.
        for attribute in ('feature_importances_', 'coef_'):
            try:
                return pd.DataFrame({
                            'feature': self.X_train_col_names,
                            'importance' : getattr(estimator, attribute)
                       })
            except Exception:
                continue

        warnings.warn(
            f"Impossible to access feature importance for regressor of type {type(estimator)}. "
            f"This method is only valid when the regressor stores internally "
            f"the feature importance in the attribute `feature_importances_` "
            f"or `coef_`."
        )

        return None

### Create forecaster

In [151]:
# Synthetic data: two target series ('1', '2') and two exogenous variables.
# A seeded generator keeps the values identical after a kernel restart.
data_rng = np.random.default_rng(123)
y_1 = pd.Series(data_rng.random(10))
y_2 = pd.Series(data_rng.random(10))
exog_1 = pd.Series(data_rng.random(10))
exog_2 = pd.Series(data_rng.random(10))

df = pd.DataFrame({'1': y_1, '2': y_2, 'exog_1': exog_1, 'exog_2': exog_2,
                   'date': pd.date_range(start='2022-01-01', periods=10, freq='5h')})

# Lowercase 'h' is the non-deprecated spelling of the hourly frequency alias.
df = df.set_index('date').asfreq('5h').sort_index()

max_lag = 3
lags = np.array([1, 2, 3])

# The datetime index is replaced by a default RangeIndex.
df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,1,2,exog_1,exog_2
0,0.351357,0.665588,0.902172,0.209264
1,0.065158,0.042819,0.880695,0.070805
2,0.442673,0.93618,0.071194,0.179824


In [156]:
# Last three observations of the data set
df.tail(3)

Unnamed: 0,1,2,exog_1,exog_2
7,0.170969,0.307253,0.35624,0.70648
8,0.324064,0.669441,0.74462,0.082111
9,0.574278,0.294294,0.044237,0.286787


In [149]:
# Create and fit forecaster
# ==============================================================================
from sklearn.ensemble import RandomForestRegressor

# Three lags of the series are used as predictors.
forecaster = ForecasterAutoregMultiSeries_test(
    regressor=RandomForestRegressor(random_state=123),
    lags=3,
)

# Columns '1' and '2' are the series to model; the `exog_*` columns are
# passed as exogenous predictors.
forecaster.fit(
    series=df.loc[:, ['1', '2']],
    exog=df.loc[:, ['exog_1', 'exog_2']],
)

forecaster

ForecasterAutoregMultiSeries_test'> 
Regressor: RandomForestRegressor(random_state=123) 
Lags: [1 2 3] 
Window size: 3 
Included exogenous: True 
Type of exogenous variable: <class 'pandas.core.frame.DataFrame'> 
Exogenous variables names: ['exog_1', 'exog_2'] 
Training range: [0, 9] 
Training index type: RangeIndex 
Training index frequency: 1 
Regressor parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 123, 'verbose': 0, 'warm_start': False} 
Creation date: 2022-05-24 16:34:27 
Last fit date: 2022-05-24 16:34:27 
Skforecast version: 0.5.dev1 

In [150]:
# Predict
# ==============================================================================
steps = 3
# Seeded generator so the exogenous test values are reproducible.
test_rng = np.random.default_rng(456)
exog_1_test = pd.Series(test_rng.random(steps))
exog_2_test = pd.Series(test_rng.random(steps))

df_test = pd.DataFrame({'exog_1': exog_1_test, 'exog_2': exog_2_test})

predictions = forecaster.predict(
                steps = steps,
                exog = df_test[['exog_1', 'exog_2']]
               )
# Relabel the predictions with the index of the test data. The raw values are
# used because building a Series from another Series with a new index aligns
# by label: labels missing from the original index would turn into NaN.
predictions = pd.Series(
                data  = predictions.to_numpy(),
                index = df_test.index,
                name  = predictions.name
              )
predictions.head(3)

ValueError: X has 8 features, but RandomForestRegressor is expecting 6 features as input.