In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Add the parent of the current working directory to the import path (at
# position 1, i.e. right after the first entry).
# NOTE(review): presumably this makes the local skforecast checkout importable
# instead of an installed copy -- confirm.
sys.path.insert(1, str(Path.cwd().parent))
# Display the directory that was just added.
str(Path.cwd().parent)

'/home/ubuntu/varios/skforecast'

In [2]:
import pandas as pd
import numpy as np
from typing import List, Union, Tuple, Optional

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [3]:
# Data simulation
# ==============================================================================
# Five random daily series plus one two-column exog DataFrame per series.
n = 10
# Seeded generator so that Restart & Run All reproduces the same data.
rng = np.random.default_rng(123)

series = pd.DataFrame(
    rng.standard_normal((n, 5)),
    index=pd.date_range("2018-01-01", periods=n),
    columns=[f"series_{i}" for i in range(5)],
)

# One exog DataFrame per series, keyed by the series name.
exog = {
    f"series_{i}": pd.DataFrame(
        rng.standard_normal((n, 2)),
        index=pd.date_range("2018-01-01", periods=n),
        columns=[f"exog_{j}" for j in range(2)],
    )
    for i in range(5)
}

# Alternative input kept for manual testing: a single exog DataFrame shared by
# every series.
# exog = pd.DataFrame(
#     np.random.randn(n, 2),
#     index=pd.date_range("2018-01-01", periods=n),
#     columns=["exog_0", "exog_1"],
# )

# Injecting missing values
# A single `.loc` call writes into `series` itself. The chained form
# `series[col].iloc[...] = value` may write into a temporary copy instead
# (and never reaches `series` under pandas copy-on-write).
series.loc[series.index[:2], 'series_0'] = np.nan
series.loc[series.index[:3], 'series_1'] = np.nan
series.loc[series.index[-3:], 'series_1'] = np.nan
# TODO: interleaved missing values, e.g.
#   series.loc[series.index[[5, 6]], 'series_3'] = np.nan
# This needs the exception in create_train_X_y for missing values in y to be
# removed first.
series



Unnamed: 0,series_0,series_1,series_2,series_3,series_4
2018-01-01,,,-0.199187,-0.01532,0.656192
2018-01-02,,,0.532387,0.045695,-1.044882
2018-01-03,0.780637,,0.729556,-0.409606,0.49543
2018-01-04,-0.337556,0.190319,1.130121,0.307832,0.473396
2018-01-05,-0.264497,0.740949,-1.405627,-1.163562,0.801584
2018-01-06,1.403882,1.371611,-0.398193,-1.452256,0.012552
2018-01-07,0.929296,-2.077337,0.503161,0.071028,1.664818
2018-01-08,-0.39689,,-1.425457,1.15715,-1.593471
2018-01-09,2.014113,,1.729486,-1.503684,-0.451313
2018-01-10,0.693924,,-0.082551,0.689935,-0.537518


In [4]:
# Single-series forecaster (linear model on the last 3 lags); its
# `create_train_X_y` is called once per series further down.
forecaster = ForecasterAutoreg(regressor=LinearRegression(), lags=3)

In [5]:
# Preprocessing
# ==============================================================================
# Store series and exog as dict. Dict inputs are shallow-copied so that the
# trimming below never modifies the objects passed in by the user.
if isinstance(series, pd.DataFrame):
    series_dict = series.to_dict("series")
elif isinstance(series, dict):
    series_dict = dict(series)
else:
    raise TypeError(f"`series` must be a pandas DataFrame or dict. Got {type(series)}.")

if isinstance(exog, pd.DataFrame):
    # The same exog DataFrame is shared by every series.
    exog_dict = dict.fromkeys(series_dict.keys(), exog)
elif isinstance(exog, dict):
    exog_dict = dict(exog)
else:
    raise TypeError(f"`exog` must be a pandas DataFrame or dict. Got {type(exog)}.")

# Remove leading and trailing nans from each series and exog. This is done
# so then there is no need to remove leading and trailing nans in each X_train
series_dict = {
    k: v.loc[v.first_valid_index():v.last_valid_index()]
    for k, v in series_dict.items()
}
exog_dict = {
    k: v.loc[v.first_valid_index():v.last_valid_index()]
    for k, v in exog_dict.items()
}

# Report how many observations each series / exog keeps after trimming.
for label, trimmed in (("Series", series_dict), ("Exog", exog_dict)):
    print(f"{label} lengths after removing leading and trailing nans")
    for k, v in trimmed.items():
        print(f"{k}: {len(v)}")

Series lengths after removing leading and trailing nans
series_0: 8
series_1: 4
series_2: 10
series_3: 10
series_4: 10
Exog lengths after removing leading and trailing nans
series_0: 10
series_1: 10
series_2: 10
series_3: 10
series_4: 10


In [6]:
# Build one training matrix per series and stack them into a single
# X_train / y_train pair, tagging each row with the series it comes from.
X_train_buffer = []
y_train_buffer = []

for key, y in series_dict.items():
    # Loop-local name on purpose: rebinding `exog` here would replace the
    # notebook-level exog input with the last series' DataFrame, and a re-run
    # of the preprocessing cell would then silently use that single frame for
    # every series.
    exog_level = exog_dict[key]
    # TODO: All needed checks go here
    # Keep only the exog rows that match the (NaN-trimmed) series index.
    exog_level = exog_level.loc[y.index]
    print(y.name)
    print(exog_level.columns)
    X_train_level, y_train_level = forecaster.create_train_X_y(y=y, exog=exog_level)
    X_train_level['level'] = key
    X_train_buffer.append(X_train_level)
    y_train_buffer.append(y_train_level)

X_train = pd.concat(X_train_buffer, axis=0)
y_train = pd.concat(y_train_buffer, axis=0)

series_0
Index(['exog_0', 'exog_1'], dtype='object')
series_1
Index(['exog_0', 'exog_1'], dtype='object')
series_2
Index(['exog_0', 'exog_1'], dtype='object')
series_3
Index(['exog_0', 'exog_1'], dtype='object')
series_4
Index(['exog_0', 'exog_1'], dtype='object')


In [7]:
# Display the stacked predictors: lag columns, exog columns and the `level`
# column holding the name of the series each row belongs to.
X_train

Unnamed: 0,lag_1,lag_2,lag_3,exog_0,exog_1,level
2018-01-06,-0.264497,-0.337556,0.780637,1.208184,0.65276,series_0
2018-01-07,1.403882,-0.264497,-0.337556,0.769003,-0.534458,series_0
2018-01-08,0.929296,1.403882,-0.264497,-1.039888,0.374818,series_0
2018-01-09,-0.39689,0.929296,1.403882,-1.852216,-0.070069,series_0
2018-01-10,2.014113,-0.39689,0.929296,0.648391,-1.214645,series_0
2018-01-07,1.371611,0.740949,0.190319,-1.192708,-0.889509,series_1
2018-01-04,0.729556,0.532387,-0.199187,-1.15173,-0.781212,series_2
2018-01-05,1.130121,0.729556,0.532387,-0.319204,-0.608719,series_2
2018-01-06,-1.405627,1.130121,0.729556,0.864698,1.365684,series_2
2018-01-07,-0.398193,-1.405627,1.130121,0.634982,0.878234,series_2


In [8]:
# Display the stacked targets (the per-series y_train blocks concatenated
# along the rows).
y_train

2018-01-06    1.403882
2018-01-07    0.929296
2018-01-08   -0.396890
2018-01-09    2.014113
2018-01-10    0.693924
2018-01-07   -2.077337
2018-01-04    1.130121
2018-01-05   -1.405627
2018-01-06   -0.398193
2018-01-07    0.503161
2018-01-08   -1.425457
2018-01-09    1.729486
2018-01-10   -0.082551
2018-01-04    0.307832
2018-01-05   -1.163562
2018-01-06   -1.452256
2018-01-07    0.071028
2018-01-08    1.157150
2018-01-09   -1.503684
2018-01-10    0.689935
2018-01-04    0.473396
2018-01-05    0.801584
2018-01-06    0.012552
2018-01-07    1.664818
2018-01-08   -1.593471
2018-01-09   -0.451313
2018-01-10   -0.537518
Name: y, dtype: float64

In [9]:
def _create_train_X_y_single_series(
    self,
    y: pd.Series,
    exog: Optional[Union[pd.Series, pd.DataFrame]]=None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """
    Build the training matrices of a single series: lagged predictors,
    exogenous predictors and target. The exogenous variables are validated
    and sliced but not transformed here.

    Parameters
    ----------
    y : pandas Series
        Training time series.
    exog : pandas Series, pandas DataFrame, default `None`
        Exogenous variable/s included as predictor/s. Must have the same
        number of observations as `y` and their indexes must be aligned.

    Returns
    -------
    X_train_lags : pandas DataFrame
        Training values of lags
        Shape: (len(y) - self.max_lag, len(self.lags))
    X_train_exog : pandas DataFrame
        Training values of exogenous variables. When `exog` is `None`, a
        single all-NaN column named 'dummy_exog_col_to_keep_shape' is
        returned instead.
        Shape: (len(y) - self.max_lag, len(exog.columns))
    y_train : pandas Series
        Values (target) of the time series related to each row of `X_train`.
        Shape: (len(y) - self.max_lag, )

    Raises
    ------
    ValueError
        If `exog` and `y` differ in length or in index.

    Notes
    -----
    When `self.differentiation` is set, the first `self.differentiation`
    rows are additionally removed from the three outputs.

    """
    series_name = y.name
    has_exog = exog is not None
    use_differentiation = self.differentiation is not None

    check_y(y=y)
    y = transform_series(
        series=y,
        transformer=self.transformer_y,
        fit=True,
        inverse_transform=False,
    )
    y_values, y_index = preprocess_y(y=y)

    if use_differentiation:
        # One differentiator per series, looked up by the name of `y`.
        y_values = self.differentiator_[series_name].fit_transform(y_values)

    if has_exog:
        if len(exog) != len(y):
            raise ValueError(
                (f'`exog` must have same number of samples as `y`. '
                 f'length `exog`: ({len(exog)}), length `y`: ({len(y)})')
            )
        check_exog(exog=exog, allow_nan=True)
        check_exog_dtypes(exog)
        _, exog_index = preprocess_exog(exog=exog, return_values=False)
        indexes_match = (exog_index == y_index).all()
        if not indexes_match:
            raise ValueError(
                ("Different index for `y` and `exog`. They must be equal to "
                "ensure the correct alignment of values.")
            )

    lag_values, target_values = self._create_lags(y=y_values)
    lag_col_names = [f"lag_{lag}" for lag in self.lags]
    # The first `self.max_lag` observations cannot be used as targets, so all
    # three outputs share the index starting right after them.
    train_index = y_index[self.max_lag: ]

    X_train_lags = pd.DataFrame(
        data=lag_values, columns=lag_col_names, index=train_index
    )

    if has_exog:
        # Drop the same leading rows from exog so it lines up with the lags.
        X_train_exog = exog.iloc[self.max_lag:, ]
    else:
        # TODO: test if this is efficient
        # Placeholder column so that this function always returns a frame
        # with the same rows as `X_train_lags`.
        X_train_exog = pd.DataFrame(
            data=np.nan,
            columns=['dummy_exog_col_to_keep_shape'],
            index=train_index,
        )

    y_train = pd.Series(data=target_values, index=train_index, name='y')

    if use_differentiation:
        n_dropped = self.differentiation
        y_train = y_train.iloc[n_dropped: ]
        X_train_lags = X_train_lags.iloc[n_dropped: ]
        # `X_train_exog` is always a pandas object at this point (real exog
        # or the placeholder), so it is sliced unconditionally.
        X_train_exog = X_train_exog.iloc[n_dropped: ]

    assert X_train_lags.index.equals(y_train.index)
    assert X_train_exog.index.equals(y_train.index)

    return X_train_lags, X_train_exog, y_train

In [28]:
# Scratch check: adding the same value twice leaves a single element in a set.
my_set = set()
my_set.update(['element', 'element'])
my_set

{'element'}

In [10]:
def create_train_X_y(
    self,
    series: Union[pd.DataFrame, dict],
    exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None
) -> Tuple[pd.DataFrame, pd.Series, pd.Index, pd.Index]:
    """
    Create training matrices from multiple time series and exogenous
    variables.

    Leading and trailing NaNs are removed from every series, one block of
    lags / exog / target is built per series with
    `self._create_train_X_y_single_series`, and the blocks are stacked
    row-wise. A `level` column identifies the series of each row. Neither
    `series` nor `exog` is modified.

    Parameters
    ----------
    series : pandas DataFrame, dict
        Training time series. Either a DataFrame with one column per series
        or a dict mapping each series name to a pandas Series.
    exog : pandas Series, pandas DataFrame, dict, default `None`
        Exogenous variable/s included as predictor/s. A Series or DataFrame
        is shared by all series. A dict maps series names to their own
        Series / DataFrame; series without an entry (or with `None`) get no
        exog. Each exog is reindexed to the index of its series, so rows
        missing from exog become NaN.

    Returns
    -------
    X_train : pandas DataFrame
        Training values (predictors): lags, `level` and, if any exog is
        given, the exogenous columns.
    y_train : pandas Series
        Values (target) of the time series related to each row of `X_train`.
    y_index : pandas Index
        Index of `series`. For a DataFrame, `series.index`; for a dict, the
        union of the indexes of the NaN-trimmed series.
    y_train_index: pandas Index
        Index of `y_train`.

    Raises
    ------
    TypeError
        If `series` or `exog` have an unsupported type, if the series do not
        all share the same index type, or if a column common to several exog
        has different dtypes.
    ValueError
        If the series have a DatetimeIndex with different frequencies, or if
        lags and exog end up with different indexes.

    """

    if not isinstance(series, (pd.DataFrame, dict)):
        raise TypeError(f"`series` must be a pandas DataFrame or dict. Got {type(series)}.")

    if not isinstance(exog, (pd.Series, pd.DataFrame, dict, type(None))):
        raise TypeError(f"`exog` must be a pandas Series, DataFrame or dict. Got {type(exog)}.")

    if isinstance(series, pd.DataFrame):
        series_dict = series.to_dict("series")
    else:
        # Shallow copy: the trimming below must not touch the caller's dict.
        series_dict = dict(series)
    series_names = list(series_dict.keys())

    # One exog entry per series. `None` marks a series without exog.
    if exog is None:
        exog_dict = dict.fromkeys(series_names)
    elif isinstance(exog, dict):
        exog_dict = {name: exog.get(name) for name in series_names}
    else:
        exog_dict = dict.fromkeys(series_names, exog)
    # Series are promoted to one-column DataFrames so every exog has `.columns`.
    exog_dict = {
        name: level_exog.to_frame() if isinstance(level_exog, pd.Series) else level_exog
        for name, level_exog in exog_dict.items()
    }
    has_exog = any(level_exog is not None for level_exog in exog_dict.values())

    if self.transformer_series is None:
        self.transformer_series_ = {serie: None for serie in series_names}
    elif not isinstance(self.transformer_series, dict):
        self.transformer_series_ = {serie: clone(self.transformer_series)
                                    for serie in series_names}
    else:
        self.transformer_series_ = {serie: None for serie in series_names}
        # Only elements already present in transformer_series_ are updated
        self.transformer_series_.update(
            (k, v) for k, v in deepcopy(self.transformer_series).items()
            if k in self.transformer_series_
        )
        # `series_names` works for both DataFrame and dict inputs.
        series_not_in_transformer_series = set(series_names) - set(self.transformer_series.keys())
        if series_not_in_transformer_series:
            warnings.warn(
                (f"{series_not_in_transformer_series} not present in `transformer_series`."
                 f" No transformation is applied to these series."),
                 IgnoredArgumentWarning
            )

    if self.differentiation is None:
        self.differentiator_ = {serie: None for serie in series_names}
    else:
        self.differentiator_ = {serie: clone(self.differentiator) for serie in series_names}

    # Remove leading and trailing nans from each series. Each series is named
    # after its key because the per-series helper looks up
    # `self.differentiator_` by `y.name`.
    series_dict = {
        name: level_series.loc[
                  level_series.first_valid_index():level_series.last_valid_index()
              ].rename(name)
        for name, level_series in series_dict.items()
    }

    if has_exog:
        # Check that all exog have the same dtypes for common columns
        exog_dtype_dict = {}
        for level_exog in exog_dict.values():
            if level_exog is None:
                continue
            for col_name, col_dtype in level_exog.dtypes.items():
                exog_dtype_dict.setdefault(col_name, set()).add(col_dtype)
        for col_name, dtypes in exog_dtype_dict.items():
            if len(dtypes) > 1:
                raise TypeError(
                    (f"Column {col_name} has different dtypes in different exog "
                     f"DataFrames or Series.")
                )

    # Check that all series have the same index type and frequency
    series_indexes = [level_series.index for level_series in series_dict.values()]
    if len({index.dtype for index in series_indexes}) != 1:
        raise TypeError("All series must have the same index type.")
    # The test is done on the index object itself: a dtype is never an
    # instance of `pd.DatetimeIndex`.
    if isinstance(series_indexes[0], pd.DatetimeIndex):
        if len({index.freq for index in series_indexes}) != 1:
            raise ValueError("All series must have the same frequency.")

    if isinstance(series, pd.DataFrame):
        y_index = series.index
    else:
        y_index = series_indexes[0]
        for index in series_indexes[1:]:
            y_index = y_index.union(index)

    X_train_lags_buffer = []
    X_train_exog_buffer = []
    y_train_buffer = []

    for key, y in series_dict.items():
        level_exog = exog_dict[key]
        if level_exog is not None:
            # `reindex` aligns exog with the trimmed series and fills rows
            # missing from exog with NaN (`.loc` would raise on them).
            # TODO: add warnings and check that both series and exog have the same index
            level_exog = level_exog.reindex(y.index, fill_value=np.nan)
        X_train_lags, X_train_exog, y_train = (
            self._create_train_X_y_single_series(y=y, exog=level_exog)
        )
        X_train_lags['level'] = key
        X_train_lags_buffer.append(X_train_lags)
        X_train_exog_buffer.append(X_train_exog)
        y_train_buffer.append(y_train)

    X_train_lags = pd.concat(X_train_lags_buffer, axis=0)
    y_train = pd.concat(y_train_buffer, axis=0)
    if has_exog:
        X_train_exog = pd.concat(X_train_exog_buffer, axis=0)
        # The placeholder column only exists when some series had no exog.
        X_train_exog = X_train_exog.drop(
            columns=['dummy_exog_col_to_keep_shape'], errors='ignore'
        )
        X_train_exog = transform_dataframe(
                            df                = X_train_exog,
                            transformer       = self.transformer_exog,
                            fit               = True,
                            inverse_transform = False
                        )
        check_exog(exog=X_train_exog, allow_nan=False)
        check_exog_dtypes(X_train_exog)
        self.exog_dtypes = get_exog_dtypes(exog=X_train_exog)
        if not X_train_exog.index.equals(X_train_lags.index):
            raise ValueError(
                ("Different index for `y` and `exog`. They must be equal "
                "to ensure the correct alignment of values.")
            )
        X_train = pd.concat([X_train_lags, X_train_exog], axis=1)
    else:
        X_train = X_train_lags

    self.X_train_col_names = X_train.columns.to_list()
    y_train_index = y_train.index

    return X_train, y_train, y_index, y_train_index

In [14]:
# Benchmark inputs, variant 1: every group of frames has its own column names.
n = 10_000

# 20 one-column frames ('dummy_exog') with n daily rows, all NaN.
exog_dummy = [
    pd.DataFrame(
        data=np.nan,
        columns=['dummy_exog'],
        index=pd.date_range('2018-01-01', periods=n),
    )
    for _ in range(20)
]

# 20 frames with random values in columns exog_0..exog_2.
exog_1 = [
    pd.DataFrame(
        data=np.random.randn(n, 3),
        columns=['exog_0', 'exog_1', 'exog_2'],
        index=pd.date_range('2018-01-01', periods=n),
    )
    for _ in range(20)
]

# 20 frames with random values in columns exog_3..exog_5.
exog_2 = [
    pd.DataFrame(
        data=np.random.randn(n, 3),
        columns=['exog_3', 'exog_4', 'exog_5'],
        index=pd.date_range('2018-01-01', periods=n),
    )
    for _ in range(20)
]

In [18]:
%%timeit
# Timed statement: column-wise (axis=1) concatenation of the three lists of
# frames (exog_dummy, exog_1, exog_2) currently defined in the kernel.
# NOTE(review): the create_train_X_y prototype stacks its exog buffer with
# axis=0; confirm that axis=1 is the operation meant to be measured here.
pd.concat(exog_dummy + exog_1 + exog_2, axis=1)

10.7 ms ± 12 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [25]:
# Benchmark inputs, variant 2: every frame carries the six columns
# exog_0..exog_5, with NaN in the columns it has no data for.
n = 10_000
exog_cols = [f'exog_{j}' for j in range(6)]

# 20 frames with the six columns and n daily rows, all NaN.
exog_dummy = [
    pd.DataFrame(
        data=np.nan,
        columns=exog_cols,
        index=pd.date_range('2018-01-01', periods=n),
    )
    for _ in range(20)
]

# 20 frames with random values in exog_0..exog_2 and NaN in exog_3..exog_5.
# `reindex(columns=...)` adds the missing columns as NaN in the given order.
exog_1 = [
    pd.DataFrame(
        data=np.random.randn(n, 3),
        columns=exog_cols[:3],
        index=pd.date_range('2018-01-01', periods=n),
    ).reindex(columns=exog_cols)
    for _ in range(20)
]

# 20 frames with random values in exog_3..exog_5 and NaN in exog_0..exog_2.
exog_2 = [
    pd.DataFrame(
        data=np.random.randn(n, 3),
        columns=exog_cols[3:],
        index=pd.date_range('2018-01-01', periods=n),
    ).reindex(columns=exog_cols)
    for _ in range(20)
]


In [26]:
%%timeit
# Timed statement: same column-wise (axis=1) concatenation as the earlier
# benchmark cell, run on whatever exog_dummy / exog_1 / exog_2 currently hold.
# NOTE(review): the create_train_X_y prototype stacks its exog buffer with
# axis=0; confirm that axis=1 is the operation meant to be measured here.
pd.concat(exog_dummy + exog_1 + exog_2, axis=1)

17.2 ms ± 46.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
