In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
%config Completer.use_jedi = False

In [2]:
import sys
from pathlib import Path
(Path.cwd().parent)

WindowsPath('c:/Users/jaesc2/GitHub/skforecast')

In [3]:
str(Path.cwd().parent.parent)

'c:\\Users\\jaesc2\\GitHub'

## Librerias

In [4]:
## ForecasterAutoregMultiSeries
# ==============================================================================
from typing import Union, Dict, List, Tuple, Any, Optional
import warnings
import logging
import sys
import inspect
import numpy as np
import pandas as pd
import sklearn
import sklearn.pipeline
from sklearn.base import clone
from copy import copy

import skforecast
from skforecast.ForecasterBase import ForecasterBase
from skforecast.utils import check_y
from skforecast.utils import check_exog
from skforecast.utils import preprocess_y
from skforecast.utils import preprocess_last_window
from skforecast.utils import preprocess_exog
from skforecast.utils import expand_index
from skforecast.utils import check_predict_input
from skforecast.utils import transform_series
from skforecast.utils import transform_dataframe

## Create_train_X_y

In [7]:
class dummy_forecaster():
    
    def __init__(
        self,
        regressor,
        lags: Union[int, np.ndarray, list],
        transformer_series: Optional[object]=None,
        transformer_exog: Optional[object]=None,
        series_weights: Optional[dict]=None
    ) -> None:
        
        self.regressor            = regressor
        self.transformer_series   = transformer_series
        self.transformer_exog     = transformer_exog
        self.series_weights       = series_weights
        self.index_type           = None
        self.index_freq           = None
        self.index_values         = None
        self.training_range       = None
        self.last_window          = None
        self.included_exog        = False
        self.exog_type            = None
        self.exog_col_names       = None
        self.series_levels        = None
        self.X_train_col_names    = None
        self.in_sample_residuals  = None
        self.out_sample_residuals = None
        self.fitted               = False
        self.creation_date        = pd.Timestamp.today().strftime('%Y-%m-%d %H:%M:%S')
        self.fit_date             = None
        self.skforcast_version    = skforecast.__version__
        self.python_version       = sys.version.split(" ")[0]
        
        if isinstance(lags, int) and lags < 1:
            raise ValueError('Minimum value of lags allowed is 1.')

        if isinstance(lags, (list, np.ndarray)):
            for lag in lags:
                if not isinstance(lag, (int, np.int64, np.int32)):
                    raise TypeError('All values in `lags` must be int.')
            
        if isinstance(lags, (list, range, np.ndarray)) and min(lags) < 1:
            raise ValueError('Minimum value of lags allowed is 1.')
            
        if isinstance(lags, int):
            self.lags = np.arange(lags) + 1
        elif isinstance(lags, (list, range)):
            self.lags = np.array(lags)
        elif isinstance(lags, np.ndarray):
            self.lags = lags
        else:
            raise TypeError(
                '`lags` argument must be int, 1d numpy ndarray, range or list. '
                f"Got {type(lags)}"
            )

        if series_weights is not None:
            if 'sample_weight' not in inspect.getfullargspec(self.regressor.fit)[0]:
                warnings.warm(
                    f"""
                    Argument `series_weights` is ignored since regressor {self.regressor}
                    does not accept `sample_weight` in its `fit` method.
                    """
                )
                self.series_weights = None
            
        self.max_lag = max(self.lags)
        self.window_size = self.max_lag


    def __repr__(
        self
    ) -> str:
        """
        Information displayed when a ForecasterAutoregMultiSeries object is printed.
        """

        if isinstance(self.regressor, sklearn.pipeline.Pipeline):
            name_pipe_steps = tuple(name + "__" for name in self.regressor.named_steps.keys())
            params = {key : value for key, value in self.regressor.get_params().items() \
                     if key.startswith(name_pipe_steps)}
        else:
            params = self.regressor.get_params()

        info = (
            f"{'=' * len(str(type(self)).split('.')[1])} \n"
            f"{str(type(self)).split('.')[1]} \n"
            f"{'=' * len(str(type(self)).split('.')[1])} \n"
            f"Regressor: {self.regressor} \n"
            f"Lags: {self.lags} \n"
            f"Transformer for series: {self.transformer_series} \n"
            f"Transformer for exog: {self.transformer_exog} \n"
            f"Window size: {self.window_size} \n"
            f"Series levels (names): {self.series_levels} \n"
            f"Series weights: {self.series_weights} \n"
            f"Included exogenous: {self.included_exog} \n"
            f"Type of exogenous variable: {self.exog_type} \n"
            f"Exogenous variables names: {self.exog_col_names} \n"
            f"Training range: {self.training_range.to_list() if self.fitted else None} \n"
            f"Training index type: {str(self.index_type).split('.')[-1][:-2] if self.fitted else None} \n"
            f"Training index frequency: {self.index_freq if self.fitted else None} \n"
            f"Regressor parameters: {params} \n"
            f"Creation date: {self.creation_date} \n"
            f"Last fit date: {self.fit_date} \n"
            f"Skforecast version: {self.skforcast_version} \n"
            f"Python version: {self.python_version} \n"
        )

        return info


In [8]:
forecaster = dummy_forecaster(
                 regressor = 'dummy',
                 lags = 5
             )

In [9]:
forecaster.lags

array([1, 2, 3, 4, 5])

In [10]:
forecaster.max_lag

5

In [11]:
def _create_lags(
    self, 
    y: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
        
    n_splits = len(y) - self.max_lag
    if n_splits <= 0:
        raise ValueError(
            f'The maximum lag ({self.max_lag}) must be less than the length '
            f'of the series ({len(y)}).'
        )
    
    X_data = np.full(shape=(n_splits, len(self.lags)), fill_value=np.nan, dtype=float)

    for i, lag in enumerate(self.lags):
        X_data[:, i] = y[self.max_lag - lag: -lag]

    y_data = y[self.max_lag:]
        
    return X_data, y_data

In [16]:
_create_lags(forecaster, np.array(list(range(10))))

(array([[4., 3., 2., 1., 0.],
        [5., 4., 3., 2., 1.],
        [6., 5., 4., 3., 2.],
        [7., 6., 5., 4., 3.],
        [8., 7., 6., 5., 4.]]),
 array([5, 6, 7, 8, 9]))

In [13]:
def create_train_X_y(
    self,
    series: pd.DataFrame,
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None
) -> Tuple[pd.DataFrame, pd.Series]:

    if not isinstance(series, pd.DataFrame):
        raise TypeError(f'`series` must be a pandas DataFrame. Got {type(series)}.')

    series_levels = list(series.columns)

    if self.transformer_series is None:
        dict_transformers = {level: None for level in series_levels}
        self.transformer_series = dict_transformers
    elif not isinstance(self.transformer_series, dict):
        dict_transformers = {level: clone(self.transformer_series) 
                                for level in series_levels}
        self.transformer_series = dict_transformers
    else:
        if list(self.transformer_series.keys()) != series_levels:
            raise ValueError(
                (f'When `transformer_series` parameter is a `dict`, its keys '
                    f'must be the same as `series_levels` : {series_levels}.')
            )
    
    X_train_col_names = [f"{level}_lag_{lag}" for level in series_levels for lag in self.lags]

    for i, serie in enumerate(series.columns):

        y = series[serie]
        check_y(y=y)
        y = transform_series(
                series            = y,
                transformer       = self.transformer_series[serie],
                fit               = True,
                inverse_transform = False
            )
        y_values, y_index = preprocess_y(y=y)
        X_train_values, y_train_values = self._create_lags(y=y_values)

        if i == 0:
            X_train = X_train_values
            y_train = y_train_values
        else:
            X_train = np.hstack((X_train, X_train_values))
            y_train = np.append(y_train, y_train_values)

    if exog is not None:
        if len(exog) != len(series):
            raise ValueError(
                f'`exog` must have same number of samples as `series`. '
                f'length `exog`: ({len(exog)}), length `series`: ({len(series)})'
            )
        check_exog(exog=exog)
        if isinstance(exog, pd.Series):
            exog = transform_series(
                        series            = exog,
                        transformer       = self.transformer_exog,
                        fit               = True,
                        inverse_transform = False
                    )
        else:
            exog = transform_dataframe(
                        df                = exog,
                        transformer       = self.transformer_exog,
                        fit               = True,
                        inverse_transform = False
                    )
        exog_values, exog_index = preprocess_exog(exog=exog)
        if not (exog_index[:len(y_index)] == y_index).all():
            raise ValueError(
                ('Different index for `series` and `exog`. They must be equal '
                    'to ensure the correct alignment of values.')      
            )
        col_names_exog = exog.columns if isinstance(exog, pd.DataFrame) else [exog.name]
        X_train_col_names.extend(col_names_exog)

        # The first `self.max_lag` positions have to be removed from exog
        # since they are not in X_train. Then exog is cloned as many times
        # as series.
        if exog_values.ndim == 1:
            X_train = np.column_stack((
                        X_train,
                        np.tile(exog_values[self.max_lag:, ], series.shape[1])
                        )) 

        else:
            X_train = np.column_stack((
                        X_train,
                        np.tile(exog_values[self.max_lag:, ], [series.shape[1], 1])
                        ))

    X_levels = pd.Series(X_levels)
    X_levels = pd.get_dummies(X_levels, dtype=float)
    X_train_col_names.extend(X_levels.columns)
    X_train = np.column_stack((X_train, X_levels.values))

    X_train = pd.DataFrame(
                data    = X_train,
                columns = X_train_col_names
            )

    y_train = pd.Series(
                data  = y_train,
                name  = 'y'
            )
    
    self.X_train_col_names = X_train_col_names

    return X_train, y_train, y_index

In [35]:
series_1 = pd.DataFrame({'1': pd.Series(np.arange(5)),  
                         '2': pd.Series(np.arange(5))
                        })

series_2 = pd.DataFrame({'1': pd.Series(np.arange(100, 105)),  
                         '2': pd.Series(np.arange(100, 105))
                        })

X_train = np.hstack((series_1.values, series_2.values))
X_train

array([[  0,   0, 100, 100],
       [  1,   1, 101, 101],
       [  2,   2, 102, 102],
       [  3,   3, 103, 103],
       [  4,   4, 104, 104]])

In [38]:
lags = np.array([1,2,3])
levels = ['l1', 'l2']

[f"{level}_lag_{lag}" for level in levels for lag in lags]

['l1_lag_1', 'l1_lag_2', 'l1_lag_3', 'l2_lag_1', 'l2_lag_2', 'l2_lag_3']

## Init

In [8]:
class ForecasterAutoregMultiVariate(ForecasterBase):
        
    def __init__(
        self,
        regressor: dict[str, object],
        lags: dict[str, Union[int, np.ndarray, list]],
        transformer_series: Optional[Union[object, dict[str, object]]]=None,
        transformer_exog: Optional[Union[object, dict[str, object]]]=None,
        series_weights: Optional[dict[str, float]]=None,
        weight_func: Optional[Union[callable, dict[str, callable]]]=None
    ) -> None:
        
        self.regressor               = regressor
        self.transformer_series      = transformer_series
        self.transformer_exog        = transformer_exog
        self.series_weights          = series_weights
        self.weight_func             = weight_func
        self.source_code_weight_func = None
        self.index_type              = None
        self.index_freq              = None
        self.index_values            = None
        self.training_range          = None
        self.last_window             = None
        self.included_exog           = False
        self.exog_type               = None
        self.exog_col_names          = None
        self.series_levels           = None
        self.X_train_col_names       = None
        self.in_sample_residuals     = None
        self.out_sample_residuals    = None
        self.fitted                  = False
        self.creation_date           = pd.Timestamp.today().strftime('%Y-%m-%d %H:%M:%S')
        self.fit_date                = None
        self.skforcast_version       = skforecast.__version__
        self.python_version          = sys.version.split(" ")[0]
        
        if not isinstance(self.regressor, dict):
            raise TypeError(
                ('The `regressor` argument must be a `dict` containing the levels of '
                 'the forecaster (column names of the series) as keys and the '
                 'corresponding regressor for each level. {level (str) : regressor}.')
            )

        for key in self.regressor:
            if not isinstance(key, str):
                raise TypeError(
                    f"All regressor dict keys must be a string.\n"
                    f"    Key {key} is a {type(key)}"
                )

        # Next crear lags dict, si es un dict comprobar que coinciden las keys

        if isinstance(lags, int) and lags < 1:
            raise ValueError('Minimum value of lags allowed is 1.')

        if isinstance(lags, (list, np.ndarray)):
            for lag in lags:
                if not isinstance(lag, (int, np.int64, np.int32)):
                    raise TypeError('All values in `lags` must be int.')
            
        if isinstance(lags, (list, range, np.ndarray)) and min(lags) < 1:
            raise ValueError('Minimum value of lags allowed is 1.')
            
        if isinstance(lags, int):
            self.lags = np.arange(lags) + 1
        elif isinstance(lags, (list, range)):
            self.lags = np.array(lags)
        elif isinstance(lags, np.ndarray):
            self.lags = lags
        else:
            raise TypeError(
                '`lags` argument must be int, 1d numpy ndarray, range or list. '
                f"Got {type(lags)}"
            )

        if series_weights is not None:
            if 'sample_weight' not in inspect.getfullargspec(self.regressor.fit)[0]:
                warnings.warm(
                    f"""
                    Argument `series_weights` is ignored since regressor {self.regressor}
                    does not accept `sample_weight` in its `fit` method.
                    """
                )
                self.series_weights = None
            
        self.max_lag = max(self.lags)
        self.window_size = self.max_lag


    def __repr__(
        self
    ) -> str:
        """
        Information displayed when a ForecasterAutoregMultiSeries object is printed.
        """

        if isinstance(self.regressor, sklearn.pipeline.Pipeline):
            name_pipe_steps = tuple(name + "__" for name in self.regressor.named_steps.keys())
            params = {key : value for key, value in self.regressor.get_params().items() \
                     if key.startswith(name_pipe_steps)}
        else:
            params = self.regressor.get_params()

        info = (
            f"{'=' * len(str(type(self)).split('.')[1])} \n"
            f"{str(type(self)).split('.')[1]} \n"
            f"{'=' * len(str(type(self)).split('.')[1])} \n"
            f"Regressor: {self.regressor} \n"
            f"Lags: {self.lags} \n"
            f"Transformer for series: {self.transformer_series} \n"
            f"Transformer for exog: {self.transformer_exog} \n"
            f"Window size: {self.window_size} \n"
            f"Series levels (names): {self.series_levels} \n"
            f"Series weights: {self.series_weights} \n"
            f"Included exogenous: {self.included_exog} \n"
            f"Type of exogenous variable: {self.exog_type} \n"
            f"Exogenous variables names: {self.exog_col_names} \n"
            f"Training range: {self.training_range.to_list() if self.fitted else None} \n"
            f"Training index type: {str(self.index_type).split('.')[-1][:-2] if self.fitted else None} \n"
            f"Training index frequency: {self.index_freq if self.fitted else None} \n"
            f"Regressor parameters: {params} \n"
            f"Creation date: {self.creation_date} \n"
            f"Last fit date: {self.fit_date} \n"
            f"Skforecast version: {self.skforcast_version} \n"
            f"Python version: {self.python_version} \n"
        )

        return info

In [9]:
for key in {'1':1, 2:2}.keys():
    

{1: 1}

In [27]:
for key in {'1':1, 2:2}.keys():
    if not isinstance(key, str):
        raise TypeError(
            f"All regressor dict keys must be a string.\n"
            f"    Key {key} is a {type(key)}"
        )

TypeError: `lags` argument must be int, 1d numpy ndarray, range or list.
Got : [<class 'str'>, <class 'int'>]

In [26]:
[type(key) for key in {'1':1, 2:2}.keys()]

[str, int]

In [37]:
y = np.arange(10).reshape(-1, 1)
x = np.arange(10).reshape(-1, 1)
weights = [1, 1, 1, 1, -1, 1, 1, 1, 1, 1]

In [36]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X=x, y=y, sample_weight=weights)
model.predict(y)

  model.fit(X=x, y=y, sample_weight=weights)
  out[0] += prediction


array([-inf, -inf, -inf,  nan,  nan,  nan,  nan,  inf,  inf,  inf])

In [38]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=x, y=y, sample_weight=weights)
model.predict(y)

ValueError: Negative values in data passed to `sample_weight`

In [41]:
weights = [1, 1, 1, 1, 0, 1, 1, 1, 1, 1]

np.any(np.array(weights) < 0)

False

## ForecasterAutoregDirect

In [5]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [44]:
# Download data
# ==============================================================================
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/data/h2o.csv')
data = pd.read_csv(url, sep=',', header=0, names=['y', 'datetime'])

# Data preprocessing
# ==============================================================================
data['datetime'] = pd.to_datetime(data['datetime'], format='%Y/%m/%d')
data = data.set_index('datetime')
data = data.asfreq('MS')
data = data['y']
data = data.sort_index()

# Split train-test
# ==============================================================================
steps = 36
data_train = data[:-steps]
data_test  = data[-steps:]

# Plot
# ==============================================================================
# fig, ax=plt.subplots(figsize=(9, 4))
# data_train.plot(ax=ax, label='train')
# data_test.plot(ax=ax, label='test')
# ax.legend();

In [49]:
# Create and fit forecaster
# ==============================================================================
forecaster = ForecasterAutoregDirect(
                 regressor = Ridge(),
                 steps     = 5,
                 lags      = 15
             )

# forecaster.fit(y=data_train)
forecaster

ForecasterAutoregDirect 
Regressor: Ridge() 
Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] 
Transformer for y: None 
Transformer for exog: None 
Included weights function: False 
Window size: 15 
Maximum steps predicted: 5 
Included exogenous: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: None 
Training index type: None 
Training index frequency: None 
Regressor parameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'normalize': 'deprecated', 'positive': False, 'random_state': None, 'solver': 'auto', 'tol': 0.001} 
Creation date: 2022-10-31 17:32:31 
Last fit date: None 
Skforecast version: 0.6.0 
Python version: 3.9.13 

In [19]:
forecaster = ForecasterAutoregDirect(LinearRegression(), lags=3, steps=2)
X_train, y_train = forecaster.create_train_X_y(
                        y    = pd.Series(np.arange(10)),
                        exog = pd.Series(np.arange(100, 110), name='exog')
                    )
results = forecaster.filter_train_X_y_for_step(step=2, X_train=X_train, y_train=y_train)
results

(   lag_1  lag_2  lag_3  exog_step_1  exog_step_2
 4    2.0    1.0    0.0        103.0        104.0
 5    3.0    2.0    1.0        104.0        105.0
 6    4.0    3.0    2.0        105.0        106.0
 7    5.0    4.0    3.0        106.0        107.0
 8    6.0    5.0    4.0        107.0        108.0
 9    7.0    6.0    5.0        108.0        109.0,
 4    4.0
 5    5.0
 6    6.0
 7    7.0
 8    8.0
 9    9.0
 Name: y_step_2, dtype: float64)

In [8]:
X_train

Unnamed: 0,lag_1,lag_2,lag_3,exog_step_1,exog_step_2
4,2.0,1.0,0.0,103.0,104.0
5,3.0,2.0,1.0,104.0,105.0
6,4.0,3.0,2.0,105.0,106.0
7,5.0,4.0,3.0,106.0,107.0
8,6.0,5.0,4.0,107.0,108.0
9,7.0,6.0,5.0,108.0,109.0


In [18]:
idx_columns_lags = np.arange(3)
idx_columns_exog = np.arange(X_train.shape[1])[3 + 1::2]
idx_columns_exog
idx_columns = np.hstack((idx_columns_lags, idx_columns_exog))
X_train_step = X_train.iloc[:, idx_columns]
X_train_step

Unnamed: 0,lag_1,lag_2,lag_3,exog_step_2
4,2.0,1.0,0.0,104.0
5,3.0,2.0,1.0,105.0
6,4.0,3.0,2.0,106.0
7,5.0,4.0,3.0,107.0
8,6.0,5.0,4.0,108.0
9,7.0,6.0,5.0,109.0


In [9]:

aa = {'a1':3, 'a2':4}
max(aa.values())

4

In [9]:
lags = 3
multivariate_series = ['l1', 'l2']

dict_lags = {serie: lags for serie in multivariate_series}
lags = dict_lags

lags

{'l1': 3, 'l2': 3}

In [12]:
lags = [1, 2, 3]
multivariate_series = ['l1', 'l2']

lags = {serie: lags for serie in multivariate_series}

lags

{'l1': [1, 2, 3], 'l2': [1, 2, 3]}

In [13]:
lags

{'l1': [1, 2, 3], 'l2': [1, 2, 3]}

In [14]:
[f"{key}_lag_{lag}" for key in lags for lag in lags[key]]

['l1_lag_1', 'l1_lag_2', 'l1_lag_3', 'l2_lag_1', 'l2_lag_2', 'l2_lag_3']

In [15]:
serie = 'l1'
series = ['1', '2', '3']

if serie not in series:
    print(1)

1


## ForecasterAutoregMultiVariate

In [5]:
## ForecasterAutoregMultiSeries
# ==============================================================================
from typing import Union, Dict, List, Tuple, Any, Optional
import warnings
import logging
import sys
import inspect
import numpy as np
import pandas as pd
import sklearn
import sklearn.pipeline
from sklearn.linear_model import LinearRegression
from sklearn.base import clone
from copy import copy

import skforecast
from skforecast.ForecasterAutoregMultiVariate import ForecasterAutoregMultiVariate

In [71]:
# Create and fit forecaster
# ==============================================================================
forecaster = ForecasterAutoregMultiVariate(
                 regressor = LinearRegression(),
                 level     = 'l1', 
                 steps     = 3,
                 lags      = 5
             )

forecaster       

ForecasterAutoregMultiVariate 
Regressor: LinearRegression() 
Lags: [1 2 3 4 5] 
Transformer for series: None 
Transformer for exog: None 
Window size: 5 
Multivariate series (names): None 
Maximum steps predicted: 3 
Weight function included: False 
Exogenous included: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: None 
Training index type: None 
Training index frequency: None 
Regressor parameters: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': 'deprecated', 'positive': False} 
Creation date: 2022-11-01 14:32:46 
Last fit date: None 
Skforecast version: 0.6.0 
Python version: 3.9.13 

In [73]:
y = pd.Series(np.arange(10), name='l1')
forecaster.lags = {'l1': np.array([1, 2, 3, 4, 5])}

forecaster._create_lags(y=y, serie='l1')

5


(array([[4., 3., 2., 1., 0.],
        [5., 4., 3., 2., 1.],
        [6., 5., 4., 3., 2.]]),
 array([[5., 6., 7.],
        [6., 7., 8.],
        [7., 8., 9.]]))

In [74]:
forecaster = ForecasterAutoregMultiVariate(
                 regressor = LinearRegression(),
                 level     = 'l1', 
                 steps     = 3,
                 lags      = 5
             )

series = pd.DataFrame({'l1' : np.arange(10), 
                       'l2' : np.arange(100, 110)})

exog = pd.Series(np.arange(10), name='exog1')

forecaster.create_train_X_y(series=series, exog=exog)

5
5
[[  4.   3.   2.   1.   0. 104. 103. 102. 101. 100.   5.   6.   7.]
 [  5.   4.   3.   2.   1. 105. 104. 103. 102. 101.   6.   7.   8.]
 [  6.   5.   4.   3.   2. 106. 105. 104. 103. 102.   7.   8.   9.]]
[[5. 6. 7.]
 [6. 7. 8.]
 [7. 8. 9.]]


(   l1_lag_1  l1_lag_2  l1_lag_3  l1_lag_4  l1_lag_5  l2_lag_1  l2_lag_2  \
 7       4.0       3.0       2.0       1.0       0.0     104.0     103.0   
 8       5.0       4.0       3.0       2.0       1.0     105.0     104.0   
 9       6.0       5.0       4.0       3.0       2.0     106.0     105.0   
 
    l2_lag_3  l2_lag_4  l2_lag_5  exog1_step_1  exog1_step_2  exog1_step_3  
 7     102.0     101.0     100.0           5.0           6.0           7.0  
 8     103.0     102.0     101.0           6.0           7.0           8.0  
 9     104.0     103.0     102.0           7.0           8.0           9.0  ,
    l1_step_0  l1_step_1  l1_step_2
 7        5.0        6.0        7.0
 8        6.0        7.0        8.0
 9        7.0        8.0        9.0)

In [75]:
forecaster = ForecasterAutoregMultiVariate(
                 regressor = LinearRegression(),
                 level     = 'l1', 
                 steps     = 3,
                 lags      = 5
             )

series = pd.DataFrame({'l1' : np.arange(10), 
                       'l2' : np.arange(100, 110)})

exog = pd.DataFrame({'exog1' : np.arange(20, 30), 
                     'exog2' : np.arange(200, 210)})

forecaster.create_train_X_y(series=series, exog=exog)

5
5
[[  4.   3.   2.   1.   0. 104. 103. 102. 101. 100.  25.  26.  27. 205.
  206. 207.]
 [  5.   4.   3.   2.   1. 105. 104. 103. 102. 101.  26.  27.  28. 206.
  207. 208.]
 [  6.   5.   4.   3.   2. 106. 105. 104. 103. 102.  27.  28.  29. 207.
  208. 209.]]
[[5. 6. 7.]
 [6. 7. 8.]
 [7. 8. 9.]]


(   l1_lag_1  l1_lag_2  l1_lag_3  l1_lag_4  l1_lag_5  l2_lag_1  l2_lag_2  \
 7       4.0       3.0       2.0       1.0       0.0     104.0     103.0   
 8       5.0       4.0       3.0       2.0       1.0     105.0     104.0   
 9       6.0       5.0       4.0       3.0       2.0     106.0     105.0   
 
    l2_lag_3  l2_lag_4  l2_lag_5  exog1_step_1  exog1_step_2  exog1_step_3  \
 7     102.0     101.0     100.0          25.0          26.0          27.0   
 8     103.0     102.0     101.0          26.0          27.0          28.0   
 9     104.0     103.0     102.0          27.0          28.0          29.0   
 
    exog2_step_1  exog2_step_2  exog2_step_3  
 7         205.0         206.0         207.0  
 8         206.0         207.0         208.0  
 9         207.0         208.0         209.0  ,
    l1_step_0  l1_step_1  l1_step_2
 7        5.0        6.0        7.0
 8        6.0        7.0        8.0
 9        7.0        8.0        9.0)

In [78]:
forecaster = ForecasterAutoregMultiVariate(
                 regressor = LinearRegression(),
                 level     = 'l1', 
                 steps     = 3,
                 lags      = {'l1': 5, 'l2': 6}
             )

series = pd.DataFrame({'l1' : np.arange(10), 
                       'l2' : np.arange(100, 110)})

forecaster.create_train_X_y(series=series)

[[  5.   4.   3.   2.   1. 105. 104. 103. 102. 101. 100.]
 [  6.   5.   4.   3.   2. 106. 105. 104. 103. 102. 101.]]
[[6. 7. 8.]
 [7. 8. 9.]]


(   l1_lag_1  l1_lag_2  l1_lag_3  l1_lag_4  l1_lag_5  l2_lag_1  l2_lag_2  \
 8       5.0       4.0       3.0       2.0       1.0     105.0     104.0   
 9       6.0       5.0       4.0       3.0       2.0     106.0     105.0   
 
    l2_lag_3  l2_lag_4  l2_lag_5  l2_lag_6  
 8     103.0     102.0     101.0     100.0  
 9     104.0     103.0     102.0     101.0  ,
    l1_step_0  l1_step_1  l1_step_2
 8        6.0        7.0        8.0
 9        7.0        8.0        9.0)

In [79]:
forecaster = ForecasterAutoregMultiVariate(
                 regressor = LinearRegression(),
                 level     = 'l1', 
                 steps     = 3,
                 lags      = {'l1': 5, 'l2': 6}
             )

series = pd.DataFrame({'l1' : np.arange(10), 
                       'l2' : np.arange(100, 110)})

exog = pd.DataFrame({'exog1' : np.arange(20, 30), 
                     'exog2' : np.arange(200, 210)})

forecaster.create_train_X_y(series=series, exog=exog)

[[  5.   4.   3.   2.   1. 105. 104. 103. 102. 101. 100.  26.  27.  28.
  206. 207. 208.]
 [  6.   5.   4.   3.   2. 106. 105. 104. 103. 102. 101.  27.  28.  29.
  207. 208. 209.]]
[[6. 7. 8.]
 [7. 8. 9.]]


(   l1_lag_1  l1_lag_2  l1_lag_3  l1_lag_4  l1_lag_5  l2_lag_1  l2_lag_2  \
 8       5.0       4.0       3.0       2.0       1.0     105.0     104.0   
 9       6.0       5.0       4.0       3.0       2.0     106.0     105.0   
 
    l2_lag_3  l2_lag_4  l2_lag_5  l2_lag_6  exog1_step_1  exog1_step_2  \
 8     103.0     102.0     101.0     100.0          26.0          27.0   
 9     104.0     103.0     102.0     101.0          27.0          28.0   
 
    exog1_step_3  exog2_step_1  exog2_step_2  exog2_step_3  
 8          28.0         206.0         207.0         208.0  
 9          29.0         207.0         208.0         209.0  ,
    l1_step_0  l1_step_1  l1_step_2
 8        6.0        7.0        8.0
 9        7.0        8.0        9.0)

In [8]:
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries

forecaster = ForecasterAutoregMultiSeries(LinearRegression(), lags=3)
series = pd.DataFrame({'1': pd.Series(np.arange(7)), 
                       '2': pd.Series(np.arange(7))
                       })

results = forecaster.create_train_X_y(series=series)
print(results)
expected = (pd.DataFrame(
                data = np.array([[2.0, 1.0, 0.0, 1., 0.],
                                    [3.0, 2.0, 1.0, 1., 0.],
                                    [4.0, 3.0, 2.0, 1., 0.],
                                    [5.0, 4.0, 3.0, 1., 0.],
                                    [2.0, 1.0, 0.0, 0., 1.],
                                    [3.0, 2.0, 1.0, 0., 1.],
                                    [4.0, 3.0, 2.0, 0., 1.],
                                    [5.0, 4.0, 3.0, 0., 1.]]),
                index   = np.array([0, 1, 2, 3, 4, 5, 6, 7]),
                columns = ['lag_1', 'lag_2', 'lag_3', '1', '2']
            ),
            pd.Series(
                np.array([3., 4., 5., 6., 3., 4., 5., 6.]),
                index = np.array([0, 1, 2, 3, 4, 5, 6, 7]),
                name = 'y'
            ),
            pd.RangeIndex(start=0, stop =len(series), step=1
            ),
            pd.Index(np.array([3, 4, 5, 6, 3, 4, 5, 6])
            )
            )

pd.testing.assert_frame_equal(results[0], expected[0])
assert (results[1] == expected[1]).all()
assert (results[2] == expected[2]).all()
assert (results[3] == expected[3]).all()

(   lag_1  lag_2  lag_3    1    2
0    2.0    1.0    0.0  1.0  0.0
1    3.0    2.0    1.0  1.0  0.0
2    4.0    3.0    2.0  1.0  0.0
3    5.0    4.0    3.0  1.0  0.0
4    2.0    1.0    0.0  0.0  1.0
5    3.0    2.0    1.0  0.0  1.0
6    4.0    3.0    2.0  0.0  1.0
7    5.0    4.0    3.0  0.0  1.0, 0    3
1    4
2    5
3    6
4    3
5    4
6    5
7    6
Name: y, dtype: int32, RangeIndex(start=0, stop=7, step=1), Int64Index([3, 4, 5, 6, 3, 4, 5, 6], dtype='int64'))


In [35]:
pd.Index(np.repeat(
                            pd.RangeIndex(start=0, stop=len(series), step=1)[forecaster.max_lag: ].values,
                            repeats = len(list(series.columns))
                    ))

Int64Index([3, 3, 4, 4, 5, 5, 6, 6], dtype='int64')

In [36]:
pd.Index(list(pd.RangeIndex(start=0, stop=len(series), step=1)[forecaster.max_lag: ].values)*len(list(series.columns))
                    )

Int64Index([3, 4, 5, 6, 3, 4, 5, 6], dtype='int64')

In [33]:
pd.Index(list(pd.DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03'], dtype='datetime64[ns]', freq='D').values)*len(list(series.columns)))


DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03', '1990-01-01',
               '1990-01-02', '1990-01-03'],
              dtype='datetime64[ns]', freq=None)

In [7]:
pd.Index(
                            np.tile(
                                pd.DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03'], dtype='datetime64[ns]', freq='D').values,
                                reps = 2
                            )
                        )

DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03', '1990-01-01',
               '1990-01-02', '1990-01-03'],
              dtype='datetime64[ns]', freq=None)

In [9]:
pd.DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03', '1990-01-01', '1990-01-02', '1990-01-03'])

DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03', '1990-01-01',
               '1990-01-02', '1990-01-03'],
              dtype='datetime64[ns]', freq=None)

In [26]:
pd.DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03'], dtype='datetime64[ns]', freq='D').values

array(['1990-01-01T00:00:00.000000000', '1990-01-02T00:00:00.000000000',
       '1990-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

In [10]:
aa = pd.DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03'], dtype='datetime64[ns]', freq='D')
aa

DatetimeIndex(['1990-01-01', '1990-01-02', '1990-01-03'], dtype='datetime64[ns]', freq='D')

In [16]:

forecaster = ForecasterAutoregMultiSeries(LinearRegression(), lags=3)
series = pd.DataFrame({'1': (np.arange(7)), 
                        '2': (np.arange(7))},
                        index = pd.date_range("1990-01-01", periods=7, freq='D')
                        )

results = forecaster.create_train_X_y(
                        series = series,
                        exog = pd.DataFrame(
                                    {'exog_1' : np.arange(100, 107),
                                    'exog_2' : np.arange(1000, 1007)},
                                    index = pd.date_range("1990-01-01", periods=7, freq='D')
                                )           
            )

expected = (pd.DataFrame(
                data = np.array([[2.0, 1.0, 0.0, 103., 1003., 1., 0.],
                                    [3.0, 2.0, 1.0, 104., 1004., 1., 0.],
                                    [4.0, 3.0, 2.0, 105., 1005., 1., 0.],
                                    [5.0, 4.0, 3.0, 106., 1006., 1., 0.],
                                    [2.0, 1.0, 0.0, 103., 1003., 0., 1.],
                                    [3.0, 2.0, 1.0, 104., 1004., 0., 1.],
                                    [4.0, 3.0, 2.0, 105., 1005., 0., 1.],
                                    [5.0, 4.0, 3.0, 106., 1006., 0., 1.]]),
                index   = np.array([0, 1, 2, 3, 4, 5, 6, 7]),
                columns = ['lag_1', 'lag_2', 'lag_3', 'exog_1', 'exog_2', '1', '2']
            ),
            pd.Series(
                data = np.array([3., 4., 5., 6., 3., 4., 5., 6.]),
                index = np.array([0, 1, 2, 3, 4, 5, 6, 7]),
                name = 'y'),
            pd.date_range("1990-01-01", periods=7, freq='D'
            ),
            pd.Index(pd.DatetimeIndex(['1990-01-04', '1990-01-05', '1990-01-06', '1990-01-07', 
                                        '1990-01-04', '1990-01-05', '1990-01-06', '1990-01-07'])
            )
            )

for i in range(len(expected)):
    if isinstance(expected[i], pd.DataFrame):
        pd.testing.assert_frame_equal(results[i], expected[i])
    else:
        assert (results[i] == expected[i]).all()

In [14]:
pd.DataFrame(data={'1': np.arange(7), 
                           '2':np.arange(7)},
                           index = pd.date_range("1990-01-01", periods=7, freq='D')
                          )

Unnamed: 0,1,2
1990-01-01,0,0
1990-01-02,1,1
1990-01-03,2,2
1990-01-04,3,3
1990-01-05,4,4
1990-01-06,5,5
1990-01-07,6,6
