In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

c:\Users\jaesc2\GitHub\skforecast


In [2]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

from skforecast.datasets import fetch_dataset
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.model_selection import TimeSeriesFold
from skforecast.model_selection import backtesting_forecaster_multiseries
from skforecast.model_selection import grid_search_forecaster_multiseries
from skforecast.model_selection import bayesian_search_forecaster_multiseries

In [3]:
# Data download
# ==============================================================================
data = fetch_dataset(name="items_sales")

# Split data into train-val-test
# ==============================================================================
end_train = '2014-07-15 23:59:00'
data_train = data.loc[:end_train, :].copy()
data_test  = data.loc[end_train:, :].copy()

print(
    f"Train dates : {data_train.index.min()} --- {data_train.index.max()}   "
    f"(n={len(data_train)})"
)
print(
    f"Test dates  : {data_test.index.min()} --- {data_test.index.max()}   "
    f"(n={len(data_test)})"
)
data.head()

items_sales
-----------
Simulated time series for the sales of 3 different items.
Simulated data.
Shape of the dataset: (1097, 3)
Train dates : 2012-01-01 00:00:00 --- 2014-07-15 00:00:00   (n=927)
Test dates  : 2014-07-16 00:00:00 --- 2015-01-01 00:00:00   (n=170)


Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737


In [4]:
# Create and train ForecasterRecursiveMultiSeries
# ==============================================================================
forecaster = ForecasterRecursiveMultiSeries(
                 regressor          = LGBMRegressor(random_state=123, verbose=-1),
                 lags               = 24,
                 window_features    = RollingFeatures(stats=['mean', 'mean'], window_sizes=[24, 48]),
                 encoding           = 'ordinal',
                 transformer_series = None,
                 transformer_exog   = None,
                 weight_func        = None,
                 series_weights     = None,
                 differentiation    = 1,
                 dropna_from_series = False,
                 fit_kwargs         = None,
                 forecaster_id      = None
             )

# forecaster.fit(series=data_train)
print(forecaster.differentiation)
print(forecaster.differentiation_max)
print(forecaster.differentiator)
print(forecaster.differentiator_)

1
1
TimeSeriesDifferentiator(order=1, window_size=49)
None


In [5]:
# Create df with categoricals
# ==============================================================================
data_train_cat = data_train.copy()
data_train_cat['item_id'] = np.array(['A', 'B'] * int(len(data_train) / 2) + ['A'])
data_train_cat['item_id'] = data_train_cat['item_id'].astype('category')
data_train_cat

Unnamed: 0_level_0,item_1,item_2,item_3,item_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-01-01,8.253175,21.047727,19.429739,A
2012-01-02,22.777826,26.578125,28.009863,B
2012-01-03,27.549099,31.751042,32.078922,A
2012-01-04,25.895533,24.567708,27.252276,B
2012-01-05,21.379238,18.191667,20.357737,A
...,...,...,...,...
2014-07-11,25.662128,11.002083,10.396751,A
2014-07-12,23.773923,11.008333,16.139173,B
2014-07-13,22.609388,8.100000,13.028927,A
2014-07-14,23.307307,10.895833,9.315334,B


In [6]:
data_train_cat.dtypes

item_1      float64
item_2      float64
item_3      float64
item_id    category
dtype: object

In [7]:
pd.DataFrame().reindex_like(data_train_cat).dtypes

item_1     float64
item_2     float64
item_3     float64
item_id    float64
dtype: object

In [10]:
data_train_cat.dtypes.to_dict()

{'item_1': dtype('float64'),
 'item_2': dtype('float64'),
 'item_3': dtype('float64'),
 'item_id': CategoricalDtype(categories=['A', 'B'], ordered=False, categories_dtype=object)}

In [12]:
pd.DataFrame(index=data_train_cat.index, columns=data_train_cat.columns).astype(data_train_cat.dtypes.to_dict()).dtypes

item_1      float64
item_2      float64
item_3      float64
item_id    category
dtype: object

In [21]:
from copy import copy

copy(None)

In [7]:
from __future__ import annotations

def sum(a: int | list | None = None) -> int:
    return a

In [11]:
import numpy as np

np.max([5, None])

TypeError: '>=' not supported between instances of 'int' and 'NoneType'

In [None]:
diff = {'l1': [1], 'l2': [2]}
differentiator_ = {'l1': [1], 'l2': [1], 'l3': None}
differentiator_.update(
    {k: deepcopy(v) for k, v in diff.items()}
)
differentiator_

{'l1': [1], 'l2': [2], 'l3': None}

In [28]:
differentiator_['l1'] = [1000]
differentiator_

{'l1': [1000], 'l2': [2], 'l3': None}

In [29]:
diff

{'l1': [1], 'l2': [2]}

In [3]:
import re
import pytest
import numpy as np
import pandas as pd
from skforecast.exceptions import MissingValuesWarning
from skforecast.exceptions import IgnoredArgumentWarning
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries

In [21]:
series = {
    "l1": pd.Series(np.array([14,  2, 85, 92, 77, 91, 63, 96, 11, 53])),
    "l2": pd.Series(np.array([16, 23, 98, 76, 75,  9, 23])),
    "l3": pd.Series(np.array([92,  2, 76, 94, 88, 10, 63])),
}
series["l1"].index = pd.date_range("1990-01-01", periods=10, freq="D")
series["l2"].index = pd.date_range("1990-01-05", periods=7, freq="D")
series["l3"].index = pd.date_range("1990-01-03", periods=7, freq="D")

exog = {
    "l1": pd.Series(np.arange(100, 110), name="exog_1", dtype=float),
    "l3": pd.DataFrame(
        {"exog_1": np.arange(203, 210, dtype=float), "exog_2": np.arange(303, 310, dtype=float)}
    ),
}
exog["l1"].index = pd.date_range("1990-01-01", periods=10, freq="D")
exog["l3"].index = pd.date_range("1990-01-03", periods=7, freq="D")

In [30]:
forecaster = ForecasterRecursiveMultiSeries(
        LinearRegression(), lags=3, window_features=RollingFeatures(stats='mean', window_sizes=4), 
        encoding='ordinal',
        transformer_series=None, differentiation={'l1': 1, 'l2': 2, 'l3': None}
    )
    
forecaster._create_train_X_y(series=series, exog=exog)[0]



Unnamed: 0,lag_1,lag_2,lag_3,roll_mean_4,_level_skforecast,exog_1,exog_2
1990-01-07,14.0,-15.0,7.0,22.25,0,106.0,
1990-01-08,-28.0,14.0,-15.0,-5.5,0,107.0,
1990-01-09,33.0,-28.0,14.0,1.0,0,108.0,
1990-01-10,-85.0,33.0,-28.0,-16.5,0,109.0,
1990-01-11,-65.0,21.0,-97.0,-18.25,1,,
1990-01-09,10.0,88.0,94.0,67.0,2,209.0,309.0


In [33]:
forecaster._create_train_X_y(series=series, exog=exog)[0].dtypes



lag_1                float64
lag_2                float64
lag_3                float64
roll_mean_4          float64
_level_skforecast      int32
exog_1               float64
exog_2               float64
dtype: object

In [42]:
forecaster._create_train_X_y(series=series, exog=exog)[9]



{'l1': 1990-01-05    77
 1990-01-06    91
 1990-01-07    63
 1990-01-08    96
 1990-01-09    11
 1990-01-10    53
 Freq: D, Name: l1, dtype: int32,
 'l2': 1990-01-06    23
 1990-01-07    98
 1990-01-08    76
 1990-01-09    75
 1990-01-10     9
 1990-01-11    23
 Freq: D, Name: l2, dtype: int32,
 'l3': 1990-01-04     2
 1990-01-05    76
 1990-01-06    94
 1990-01-07    88
 1990-01-08    10
 1990-01-09    63
 Freq: D, Name: l3, dtype: int32}

In [50]:
series = {
        "l1": pd.Series(np.array([14,  2, 85, 92, 77, 91, 63, 96, 11, 53], dtype=float)),
        "l2": pd.Series(np.array([16, 23, 98, 76, 75,  9, 23], dtype=float)),
        "l3": pd.Series(np.array([92,  2, 76, 94, 88, 10, 63], dtype=float)),
    }
series["l1"].index = pd.date_range("1990-01-01", periods=10, freq="D")
series["l2"].index = pd.date_range("1990-01-05", periods=7, freq="D")
series["l3"].index = pd.date_range("1990-01-03", periods=7, freq="D")

exog = {
    "l1": pd.Series(np.arange(100, 110), name="exog_1", dtype=float),
    "l3": pd.DataFrame(
        {"exog_1": np.arange(203, 210, dtype=float), 
            "exog_2": np.arange(303, 310, dtype=float)}
    ),
}
exog["l1"].index = pd.date_range("1990-01-01", periods=10, freq="D")
exog["l3"].index = pd.date_range("1990-01-03", periods=7, freq="D")

window_features = RollingFeatures(stats='mean', window_sizes=4)
forecaster = ForecasterRecursiveMultiSeries(
    LGBMRegressor(verbose=-1, random_state=123), lags=3, 
    encoding='ordinal', window_features=window_features,
    transformer_series=None, differentiation={'l1': 1, 'l2': 2, 'l3': None}
)

if True:
    forecaster.fit(series=series, exog=exog)

results = forecaster._create_train_X_y(series=series, exog=exog)

expected = (
    pd.DataFrame(
        data = np.array([[ 14., -15.,   7.,  22.25, 0., 106.  , np.nan],
                            [-28.,  14., -15.,  -5.5 , 0., 107.  , np.nan],
                            [ 33., -28.,  14.,   1.  , 0., 108.  , np.nan],
                            [-85.,  33., -28., -16.5 , 0., 109.  , np.nan],
                            [-65.,  21., -97., -18.25, 1., np.nan, np.nan],
                            [ 10.,  88.,  94.,  67.  , 2., 209.  , 309.  ]]),
        index   = pd.Index(
                        pd.DatetimeIndex(
                            ['1990-01-07', '1990-01-08', '1990-01-09', '1990-01-10',
                            '1990-01-11', 
                            '1990-01-09']
                        )
                    ),
        columns = ['lag_1', 'lag_2', 'lag_3', 'roll_mean_4', 
                    '_level_skforecast', 'exog_1', 'exog_2']
    ).astype({'_level_skforecast': int}
    ),
    pd.Series(
        data  = np.array([-28., 33., -85., 42., 80., 63.]),
        index = pd.Index(
                    pd.DatetimeIndex(
                        ['1990-01-07', '1990-01-08', '1990-01-09', '1990-01-10',
                            '1990-01-11', 
                            '1990-01-09']
                    )
                ),
        name  = 'y',
        dtype = float
    ),
    {'l1': pd.date_range("1990-01-01", periods=10, freq='D'),
        'l2': pd.date_range("1990-01-05", periods=7, freq='D'),
        'l3': pd.date_range("1990-01-03", periods=7, freq='D')},
    ['l1', 'l2', 'l3'],
    ['l1', 'l2', 'l3'],
    ['exog_1', 'exog_2'],
    ['roll_mean_4'],
    ['exog_1', 'exog_2'],
    {'exog_1': exog['l1'].dtypes, 'exog_2': exog['l3'].dtypes},
    {'l1': pd.Series(
                data  = np.array([77, 91, 63, 96, 11, 53]),
                index = pd.date_range("1990-01-05", periods=6, freq='D'),
                name  = 'l1',
                dtype = float
            ),
        'l2': pd.Series(
                data  = np.array([23, 98, 76, 75,  9, 23]),
                index = pd.date_range("1990-01-06", periods=6, freq='D'),
                name  = 'l2',
                dtype = float
            ),
        'l3': pd.Series(
                data  = np.array([2, 76, 94, 88, 10, 63]),
                index = pd.date_range("1990-01-04", periods=6, freq='D'),
                name  = 'l3',
                dtype = float
            )
    }
)

pd.testing.assert_frame_equal(results[0], expected[0])
pd.testing.assert_series_equal(results[1], expected[1])
for k in results[2].keys():
    pd.testing.assert_index_equal(results[2][k], expected[2][k])
assert results[3] == expected[3]
assert results[4] == expected[4]
assert results[5] == expected[5]
assert results[6] == expected[6]
assert results[7] == expected[7]
for k in results[8].keys():
    assert results[8][k] == expected[8][k]
for k in results[9].keys():
    pd.testing.assert_series_equal(results[9][k], expected[9][k])



AssertionError: 

In [51]:
results[8]

{'exog_1': dtype('float64'), 'exog_2': dtype('float64')}

In [53]:
expected[8]

{'exog_1': dtype('float64'),
 'exog_2': exog_1    float64
 exog_2    float64
 dtype: object}

In [52]:
expected[0].dtypes

lag_1                float64
lag_2                float64
lag_3                float64
roll_mean_4          float64
_level_skforecast      int32
exog_1               float64
exog_2               float64
dtype: object

In [47]:
{'exog_1': exog['l1'].dtypes, 'exog_2': exog['l3'].dtypes}

{'exog_1': dtype('float64'),
 'exog_2': exog_1    float64
 exog_2    float64
 dtype: object}

In [54]:
series = {
    'l1': pd.Series(np.arange(10, dtype=float)), 
    'l2': pd.Series(np.arange(15, 20, dtype=float)),
    'l3': pd.Series(np.arange(20, 25, dtype=float))
}
series['l1'].loc[3] = np.nan
series['l2'].loc[2] = np.nan
series['l1'].index = pd.date_range("1990-01-01", periods=10, freq='D')
series['l2'].index = pd.date_range("1990-01-05", periods=5, freq='D')
series['l3'].index = pd.date_range("1990-01-03", periods=5, freq='D')

exog = {
    'l1': pd.Series(np.arange(100, 110), name='exog_1', dtype=float),
    'l2': None,
    'l3': pd.DataFrame({'exog_1': np.arange(203, 207, dtype=float),
                        'exog_2': ['a', 'b', 'a', 'b']})
}
exog['l1'].index = pd.date_range("1990-01-01", periods=10, freq='D')
exog['l3'].index = pd.date_range("1990-01-03", periods=4, freq='D')

forecaster = ForecasterRecursiveMultiSeries(LinearRegression(), lags=3,
                                            encoding='onehot',
                                            transformer_series=None,
                                            dropna_from_series=False)
results = forecaster._create_train_X_y(series=series, exog=exog)
results[-2]



{'exog_1': dtype('float64'), 'exog_2': dtype('O')}

In [57]:
series = {
        'l1': pd.Series(np.arange(10, dtype=float)), 
        'l2': pd.Series(np.arange(15, 20, dtype=float)),
        'l3': pd.Series(np.arange(20, 25, dtype=float))
}
series['l1'].loc[3] = np.nan
series['l2'].loc[2] = np.nan
series['l1'].index = pd.date_range("1990-01-01", periods=10, freq='D')
series['l2'].index = pd.date_range("1990-01-05", periods=5, freq='D')
series['l3'].index = pd.date_range("1990-01-03", periods=5, freq='D')

exog = {
    'l1': pd.Series(np.arange(100, 110), name='exog_1', dtype=float),
    'l2': None,
    'l3': pd.DataFrame({'exog_1': np.arange(203, 207, dtype=float),
                        'exog_2': ['a', 'b', 'a', 'b']})
}
exog['l1'].index = pd.date_range("1990-01-01", periods=10, freq='D')
exog['l3'].index = pd.date_range("1990-01-03", periods=4, freq='D')

forecaster = ForecasterRecursiveMultiSeries(LinearRegression(), lags=3,
                                            encoding='onehot',
                                            transformer_series=None,
                                            dropna_from_series=False)
results = forecaster._create_train_X_y(series=series, exog=exog)

expected = (
    pd.DataFrame(
        data = np.array([[np.nan, 2., 1., 1., 0., 0., 104., np.nan],
                            [4., np.nan, 2., 1., 0., 0., 105., np.nan],
                            [5., 4., np.nan, 1., 0., 0., 106., np.nan],
                            [6., 5., 4., 1., 0., 0., 107., np.nan],
                            [7., 6., 5., 1., 0., 0., 108., np.nan],
                            [8., 7., 6., 1., 0., 0., 109., np.nan],
                            [np.nan, 16., 15., 0., 1., 0., np.nan, np.nan],
                            [18., np.nan, 16., 0., 1., 0., np.nan, np.nan],
                            [22., 21., 20., 0., 0., 1., 206., 'b'],
                            [23., 22., 21., 0., 0., 1., np.nan, np.nan]]),
        index   = pd.Index(
                        pd.DatetimeIndex(
                            ['1990-01-05', '1990-01-06', '1990-01-07', '1990-01-08',
                            '1990-01-09', '1990-01-10',
                            '1990-01-08', '1990-01-09', 
                            '1990-01-06', '1990-01-07']
                        )
                    ),
        columns = ['lag_1', 'lag_2', 'lag_3', 'l1', 'l2', 'l3', 
                    'exog_1', 'exog_2']
    ).astype({'lag_1': float, 'lag_2': float, 'lag_3': float, 'l1': float, 
                'l2': float, 'l3': float, 'exog_1': float, 'exog_2': object}
    ).astype({'l1': int, 'l2': int, 'l3': int}
    ),
    pd.Series(
        data  = np.array([4., 5., 6., 7., 8., 9., 18., 19., 23., 24.]),
        index = pd.Index(
                    pd.DatetimeIndex(
                        ['1990-01-05', '1990-01-06',
                            '1990-01-07', '1990-01-08',
                            '1990-01-09', '1990-01-10',
                            '1990-01-08', '1990-01-09', 
                            '1990-01-06', '1990-01-07']
                    )
                ),
        name  = 'y',
        dtype = float
    ),
    {'l1': pd.date_range("1990-01-01", periods=10, freq='D'),
        'l2': pd.date_range("1990-01-05", periods=5, freq='D'),
        'l3': pd.date_range("1990-01-03", periods=5, freq='D')},
    ['l1', 'l2', 'l3'],
    ['l1', 'l2', 'l3'],
    ['exog_1', 'exog_2'],
    None,
    ['exog_1', 'exog_2'],
    {'exog_1': exog['l1'].dtypes,
        'exog_2': exog['l3'].dtypes},
    {'l1': pd.Series(
                data  = np.array([7., 8., 9.]),
                index = pd.date_range("1990-01-08", periods=3, freq='D'),
                name  = 'l1',
                dtype = float
            ),
        'l2': pd.Series(
                data  = np.array([np.nan, 18., 19.]),
                index = pd.date_range("1990-01-07", periods=3, freq='D'),
                name  = 'l2',
                dtype = float
            ),
        'l3': pd.Series(
                data  = np.array([22., 23., 24.]),
                index = pd.date_range("1990-01-05", periods=3, freq='D'),
                name  = 'l3',
                dtype = float
            )
    }
)
expected[0].iloc[[0, 1, 2, 3, 4, 5, 6, 7, 9], -1] = np.nan



In [58]:
expected[-2]

{'exog_1': dtype('float64'),
 'exog_2': exog_1    float64
 exog_2     object
 dtype: object}

In [59]:
results[-2]

{'exog_1': dtype('float64'), 'exog_2': dtype('O')}

In [None]:
{'exog': np.dtype('float')}
{'exog_1': np.dtype('float'), 'exog_2': np.dtype('O')}
{'exog_1': np.dtype('float'), 
 'exog_2': np.dtype('int'), 
 'exog_3': pd.CategoricalDtype(categories=range(100, 110))
}

{'exog_1': dtype('float64'), 'exog_2': dtype('O')}

In [67]:
np.dtype(type('s'))

dtype('<U')

In [72]:
dtype = str
{'exog': np.dtype(dtype)} if dtype is bool else {'exog': np.dtype('O')}

{'exog': dtype('O')}