In [16]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'/home/ubuntu/varios/skforecast'

In [17]:
import pandas as pd
import numpy as np

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.linear_model import LinearRegression

In [18]:
# Data simulation
# ==============================================================================
# Seeded generator so that Restart & Run All reproduces the same data.
rng = np.random.default_rng(seed=123)

n = 10
date_index = pd.date_range("2018-01-01", periods=n)
series_names = [f"series_{i}" for i in range(5)]

# One column per series, all sharing the same daily index.
series = pd.DataFrame(
    rng.standard_normal((n, len(series_names))),
    index=date_index,
    columns=series_names,
)

# One exogenous DataFrame (two columns) per series, keyed by series name.
exog = {
    name: pd.DataFrame(
        rng.standard_normal((n, 2)),
        index=date_index,
        columns=[f"exog_{j}" for j in range(2)],
    )
    for name in series_names
}

# Alternative input: a single exog DataFrame instead of a dict.
# exog = pd.DataFrame(
#     rng.standard_normal((n, 2)),
#     index=date_index,
#     columns=["exog_0", "exog_1"],
# )

# Injecting missing values
# A single `.loc` call per gap writes into `series` itself. The chained form
# `series[col].iloc[...] = value` assigns to an intermediate object and does not
# update the frame when pandas Copy-on-Write is active.
series.loc[series.index[:2], "series_0"] = np.nan
series.loc[series.index[:3], "series_1"] = np.nan
series.loc[series.index[-3:], "series_1"] = np.nan
# TODO: also inject interior gaps once the exception raised by create_train_X_y
# when there are missing values in y has been removed:
# series.loc[series.index[[5, 6]], "series_3"] = np.nan
series



Unnamed: 0,series_0,series_1,series_2,series_3,series_4
2018-01-01,,,0.927543,0.272155,-0.660284
2018-01-02,,,-0.768084,-2.016062,0.858932
2018-01-03,-0.464179,,0.0379,-1.978839,-2.18004
2018-01-04,0.282657,-0.04815,1.358801,1.558254,1.066286
2018-01-05,0.0056,1.705463,-1.732001,0.183656,-0.47446
2018-01-06,1.127777,0.378419,-1.980796,0.400175,-0.116402
2018-01-07,1.790992,-0.874756,0.319139,-0.321604,0.098658
2018-01-08,-0.25036,,0.660625,-0.599652,-1.563703
2018-01-09,-0.144512,,-1.001138,-0.441618,1.092047
2018-01-10,-1.264163,,0.456936,0.884763,0.564554


In [19]:
# Forecaster used below to build the training matrices: a ForecasterAutoreg
# wrapping a LinearRegression regressor, configured with lags=3.
forecaster = ForecasterAutoreg(regressor=LinearRegression(), lags=3)

In [20]:
# Preprocessing
# ==============================================================================
# Store series and exog as dicts keyed by series name. New dicts are always
# built, so trimming below never overwrites the entries of a dict that was
# passed in as `series` or `exog`.
if isinstance(series, pd.DataFrame):
    series_dict = series.to_dict("series")
elif isinstance(series, dict):
    series_dict = dict(series)
else:
    raise TypeError(
        f"`series` must be a pandas DataFrame or a dict. Got {type(series)}."
    )

if isinstance(exog, pd.DataFrame):
    # The same exog DataFrame is associated with every series.
    exog_dict = dict.fromkeys(series_dict, exog)
elif isinstance(exog, dict):
    exog_dict = dict(exog)
else:
    raise TypeError(
        f"`exog` must be a pandas DataFrame or a dict. Got {type(exog)}."
    )

# Remove leading and trailing nans from each series and exog. This is done
# so then there is no need to remove leading and trailing nans in each X_train.
# The comprehensions create new dicts holding the trimmed slices.
series_dict = {
    k: v.loc[v.first_valid_index():v.last_valid_index()]
    for k, v in series_dict.items()
}

exog_dict = {
    k: v.loc[v.first_valid_index():v.last_valid_index()]
    for k, v in exog_dict.items()
}

print("Series lengths after removing leading and trailing nans")
for k, v in series_dict.items():
    print(f"{k}: {len(v)}")

print("Exog lengths after removing leading and trailing nans")
for k, v in exog_dict.items():
    print(f"{k}: {len(v)}")

Series lengths after removing leading and trailing nans
series_0: 8
series_1: 4
series_2: 10
series_3: 10
series_4: 10
Exog lengths after removing leading and trailing nans
series_0: 10
series_1: 10
series_2: 10
series_3: 10
series_4: 10


In [21]:
# Build the training matrices of every series and stack them.
X_train_buffer = []
y_train_buffer = []

for key, y in series_dict.items():
    # Loop-local name: rebinding `exog` here would replace the notebook-level
    # exog input with the DataFrame of the last series.
    # TODO: All needed checks go here
    exog_level = exog_dict[key].loc[y.index]  # keep only the rows at y's index
    print(y.name)
    print(exog_level.columns)
    X_train_level, y_train_level = forecaster.create_train_X_y(
        y=y, exog=exog_level
    )
    # Tag every row with the series it was built from.
    X_train_level['level'] = key
    X_train_buffer.append(X_train_level)
    y_train_buffer.append(y_train_level)

# Stack the per-series matrices row-wise into single training objects.
X_train = pd.concat(X_train_buffer, axis=0)
y_train = pd.concat(y_train_buffer, axis=0)

series_0
Index(['exog_0', 'exog_1'], dtype='object')
series_1
Index(['exog_0', 'exog_1'], dtype='object')
series_2
Index(['exog_0', 'exog_1'], dtype='object')
series_3
Index(['exog_0', 'exog_1'], dtype='object')
series_4
Index(['exog_0', 'exog_1'], dtype='object')


In [22]:
X_train

Unnamed: 0,lag_1,lag_2,lag_3,exog_0,exog_1,level
2018-01-06,0.0056,0.282657,-0.464179,-0.598923,-0.925823,series_0
2018-01-07,1.127777,0.0056,0.282657,-0.787976,0.575992,series_0
2018-01-08,1.790992,1.127777,0.0056,0.145728,0.484409,series_0
2018-01-09,-0.25036,1.790992,1.127777,0.888218,0.473401,series_0
2018-01-10,-0.144512,-0.25036,1.790992,0.03822,0.642649,series_0
2018-01-07,0.378419,1.705463,-0.04815,0.063588,0.500113,series_1
2018-01-04,0.0379,-0.768084,0.927543,-0.685962,-0.390861,series_2
2018-01-05,1.358801,0.0379,-0.768084,-0.139454,-0.433289,series_2
2018-01-06,-1.732001,1.358801,0.0379,-0.909661,-0.654653,series_2
2018-01-07,-1.980796,-1.732001,1.358801,-0.005629,0.81912,series_2


In [23]:
y_train

2018-01-06    1.127777
2018-01-07    1.790992
2018-01-08   -0.250360
2018-01-09   -0.144512
2018-01-10   -1.264163
2018-01-07   -0.874756
2018-01-04    1.358801
2018-01-05   -1.732001
2018-01-06   -1.980796
2018-01-07    0.319139
2018-01-08    0.660625
2018-01-09   -1.001138
2018-01-10    0.456936
2018-01-04    1.558254
2018-01-05    0.183656
2018-01-06    0.400175
2018-01-07   -0.321604
2018-01-08   -0.599652
2018-01-09   -0.441618
2018-01-10    0.884763
2018-01-04    1.066286
2018-01-05   -0.474460
2018-01-06   -0.116402
2018-01-07    0.098658
2018-01-08   -1.563703
2018-01-09    1.092047
2018-01-10    0.564554
Name: y, dtype: float64