In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

c:\Users\Joaquín Amat\Documents\GitHub\skforecast


In [2]:
import pandas as pd
import numpy as np
import os
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.datasets import fetch_dataset
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [3]:
# Download data
# ==============================================================================
data = fetch_dataset(
    name="h2o", raw=True, kwargs_read_csv={"names": ["y", "datetime"], "header": 0}
)

# Data preprocessing
# ==============================================================================
data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d')
data = data.set_index('datetime')
data = data.asfreq('MS')
data = data['y']
data = data.sort_index()

# Split train-test
# ==============================================================================
steps = 36
data_train = data[:-steps]
data_test  = data[-steps:]

h2o
---
Monthly expenditure ($AUD) on corticosteroid drugs that the Australian health
system had between 1991 and 2008.
Hyndman R (2023). fpp3: Data for Forecasting: Principles and Practice(3rd
Edition). http://pkg.robjhyndman.com/fpp3package/,https://github.com/robjhyndman
/fpp3package, http://OTexts.com/fpp3.
Shape of the dataset: (204, 2)


In [4]:
forecaster = ForecasterAutoreg(
                 regressor = LGBMRegressor(random_state=123, verbose=-1),
                 lags            = 15,
                 differentiation = 1,
                 transformer_y  = StandardScaler()
             )

forecaster.fit(y=data_train)
predictions = forecaster.predict(steps=36)

y_test = forecaster.transformer_y.transform(data_test.to_numpy().reshape(-1, 1)).flatten()
y_true = forecaster.transformer_y.transform(predictions.to_numpy().reshape(-1, 1)).flatten()
y_test = forecaster.differentiator.transform(y_test)[forecaster.differentiation:]
y_true = forecaster.differentiator.transform(y_true)[forecaster.differentiation:]
residuals = y_test - y_true
residuals = np.sort(residuals)

forecaster.set_out_sample_residuals(
    y_true=data_test,
    y_pred = predictions
)
out_sample_residuals_ = np.sort(forecaster.out_sample_residuals_)
np.testing.assert_array_almost_equal(residuals, out_sample_residuals_)


from sklearn.linear_model import LinearRegression

def test_forecaster_set_outsample_residuals_when_transformer_y_and_diferentiation():
    data_train = pd.Series(np.random.normal(loc=0, scale=1, size=100), index=range(100))
    data_test  = pd.Series(np.random.normal(loc=0, scale=1, size=36), index=range(100, 136))
    forecaster = ForecasterAutoreg(
                     regressor       = LinearRegression(),
                     lags            = 5,
                     differentiation = 1,
                     transformer_y   = StandardScaler()
                 )

    forecaster.fit(y=data_train)
    predictions = forecaster.predict(steps=36)
    forecaster.set_out_sample_residuals(
        y_true = data_test,
        y_pred = predictions
    )

    y_test = forecaster.transformer_y.transform(data_test.to_numpy().reshape(-1, 1)).flatten()
    y_true = forecaster.transformer_y.transform(predictions.to_numpy().reshape(-1, 1)).flatten()
    y_test = forecaster.differentiator.transform(y_test)[forecaster.differentiation:]
    y_true = forecaster.differentiator.transform(y_true)[forecaster.differentiation:]
    residuals = y_test - y_true
    residuals = np.sort(residuals)
    out_sample_residuals_ = np.sort(forecaster.out_sample_residuals_)

    np.testing.assert_array_almost_equal(residuals, out_sample_residuals_)

test_forecaster_set_outsample_residuals_when_transformer_y_and_diferentiation()



In [5]:
data_train = pd.Series(np.random.normal(loc=0, scale=1, size=100), index=range(100))
data_test  = pd.Series(np.random.normal(loc=0, scale=1, size=36), index=range(100, 136))
forecaster = ForecasterAutoreg(
                    regressor       = LinearRegression(),
                    lags            = 5,
                    differentiation = 1,
                    transformer_y   = StandardScaler()
                )

forecaster.fit(y=data_train)
predictions = forecaster.predict(steps=36)
forecaster.set_out_sample_residuals(
    y_true = data_test,
    y_pred = predictions
)

y_test = forecaster.transformer_y.transform(data_test.to_numpy().reshape(-1, 1)).flatten()
y_true = forecaster.transformer_y.transform(predictions.to_numpy().reshape(-1, 1)).flatten()
y_test = forecaster.differentiator.transform(y_test)[forecaster.differentiation:]
y_true = forecaster.differentiator.transform(y_true)[forecaster.differentiation:]
residuals = y_test - y_true
residuals = np.sort(residuals)
out_sample_residuals_ = np.sort(forecaster.out_sample_residuals_)

predictions



100    1.047076
101   -0.208507
102    0.036779
103   -0.193056
104    0.092140
105    0.226463
106    0.019968
107   -0.005131
108   -0.034870
109    0.097856
110    0.088266
111    0.044418
112    0.010025
113    0.030624
114    0.066774
115    0.061669
116    0.042338
117    0.032328
118    0.045196
119    0.055745
120    0.052805
121    0.044559
122    0.043185
123    0.048846
124    0.052242
125    0.050447
126    0.047568
127    0.047986
128    0.050428
129    0.051544
130    0.050724
131    0.049951
132    0.050537
133    0.051649
134    0.052083
135    0.051859
Name: pred, dtype: float64

In [6]:
forecaster.binner.n_bins_[0]

np.int64(10)

In [7]:
2510//forecaster.binner.n_bins_[0]

np.int64(251)

In [12]:
y_true = pd.Series(np.random.normal(loc=10, scale=10, size=5000))
y_pred = pd.Series(np.random.normal(loc=10, scale=10, size=5000))
forecaster = ForecasterAutoreg(LinearRegression(), lags=3)
forecaster.fit(y_true)
forecaster.set_out_sample_residuals(y_true=y_true, y_pred=y_pred)


5000
415


  f"No predicted values fall in the interval "


In [83]:
def print_directory_tree(startpath, indent_level=0, ignore_folders=None):
    if ignore_folders is None:
        ignore_folders = ['__pycache__', 'tests']  # Default folders to ignore

    for item in os.listdir(startpath):
        path = os.path.join(startpath, item)

        # Skip ignored folders
        if os.path.isdir(path) and item in ignore_folders:
            continue

        print(' ' * indent_level + '|-- ' + item)  # Print the item with indent

        # Recursively call for subdirectories
        if os.path.isdir(path):
            print_directory_tree(path, indent_level + 4, ignore_folders)

# Example usage: Set the path of the directory you want to print
directory_path = '/home/ubuntu/varios/skforecast/skforecast'
print_directory_tree(directory_path , ignore_folders=['__pycache__', 'tests', 'htmlcov'])

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/home/ubuntu/varios/skforecast/skforecast'

    
    |-- skforecast
        |-- autoreg
            |-- ForecasterRecursive.py                   
            |-- ForecasterMultiseriesRecursive.py
            |-- ForecasterSarimax.py
            |-- ForecasterEquivalentDate.py             
            |-- __init__.py
        |-- direct
            |-- ForecasterDirect.py
            |-- ForecasterMultivariateDirect.py
            |-- __init__.py
        |-- deep_learning
            |-- ForecasterRnn.py
        |-- base
            |-- ForecasterBase.py
        |-- __init__.py