In [11]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
c:\Users\Joaquín Amat\Documents\GitHub\skforecast


In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from timeit import repeat
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.linear_model import LinearRegression   

In [13]:
import numpy as np

class QuantileBinner:
    """
    QuantileBinner class to bin data into quantile-based bins.
    
    Parameters
    ----------
    n_bins : int, optional
        The number of quantile-based bins to create. Default is 4 (quartiles).
    method : str, default='linear'
        The method used to compute the quantiles. This parameter is passed to 
        `numpy.percentile`. Default is 'linear'. Valid values are 'linear',
        'lower', 'higher', 'midpoint', 'nearest'.
    subsample : int, default=200000
        The number of samples to use for computing quantiles. If the dataset 
        has more samples than `subsample`, a random subset will be used. 
        Default is 200000.
    dtype : data type, default=numpy.int32
        The data type to use for the bin indices. Default is `numpy.int32`.
    
    Attributes
    ----------
    n_bins : int, optional
        The number of quantile-based bins to create. Default is 4 (quartiles).
    method : str, default='linear'
        The method used to compute the quantiles. This parameter is passed to 
        `numpy.percentile`. Default is 'linear'. Valid values are 'linear',
        'lower', 'higher', 'midpoint', 'nearest'.
    subsample : int, default=200000
        The number of samples to use for computing quantiles. If the dataset 
        has more samples than `subsample`, a random subset will be used. 
        Default is 200000.
    dtype : data type, default=numpy.int32
        The data type to use for the bin indices. Default is `numpy.int32`.
    bin_edges_ : ndarray
        The edges of the bins learned during fitting.
    
    """
    
    def __init__(self, n_bins=4, method="linear", subsample=200000, dtype=np.int32):
        self.n_bins     = n_bins
        self.method     = method
        self.subsample  = subsample
        self.dtype      = dtype
        self.bin_edges_ = None
    
    def fit(self, X: np.ndarray):
        """
        Learn the bin edges based on quantiles from the training data.
        
        Parameters
        ----------
        X : numpy.ndarray
            The training data used to compute the quantiles.
        
        Returns
        -------
        self : QuantileBinner
            Fitted estimator.
        """

        if X.size == 0:
            raise ValueError("Input data `X` cannot be empty.")
        if len(X) > self.subsample:
            rng = np.random.default_rng()
            X = rng.choice(X, size=self.subsample, replace=False)
        
        if self.n_bins == 1:
            self.bin_edges_ = [np.min(X), np.max(X)]
        else:
            self.bin_edges_ = np.percentile(X, np.linspace(0, 100, self.n_bins + 1), method=self.method)
        return self
    
    def transform(self, X_new):
        """
        Assign new data to the learned bins.
        
        Parameters
        ----------
        X_new : array-like of shape (n_samples,)
            The data to assign to the bins.
        
        Returns
        -------
        bin_indices : ndarray of shape (n_samples,)
            The indices of the bins each value belongs to.
            Values less than the smallest bin edge are assigned to the first bin,
            and values greater than the largest bin edge are assigned to the last bin.
        """
        if self.bin_edges_ is None:
            raise ValueError("The model has not been fitted yet. Call 'fit' with training data first.")
        
        bin_indices = np.digitize(X_new, bins=self.bin_edges_, right=True)
        bin_indices = np.clip(bin_indices, 1, self.n_bins).astype(self.dtype)
        
        return bin_indices
    
    def fit_transform(self, X):
        """
        Fit the model to the data and return the bin indices for the same data.
        
        Parameters
        ----------
        X : array-like of shape (n_samples,)
            The data to fit and transform.
        
        Returns
        -------
        bin_indices : ndarray of shape (n_samples,)
            The indices of the bins each value in `X` belongs to.
        """
        self.fit(X)
        return self.transform(X)
    
    def get_bin_edges(self):
        """
        Get the learned bin edges after fitting.
        
        Returns
        -------
        bin_edges_ : ndarray
            The edges of the bins.
        """
        return self.bin_edges_
    
    def get_bin_intervals(self):
        """
        Get the bin intervals as a list of tuples (lower_bound, upper_bound).
        
        Returns
        -------
        intervals : list of tuple
            A list of bin intervals, where each interval is represented as a 
            tuple (lower_bound, upper_bound).
        """

        if self.bin_edges_ is None:
            raise ValueError("The model has not been fitted yet. Call 'fit' with training data first.")
        
        intervals = [(self.bin_edges_[i], self.bin_edges_[i + 1]) for i in range(self.n_bins)]
        return intervals
    
    def get_params(self):
        """
        Get the parameters of the quantile binner.
        
        Returns
        -------
        params : dict
            A dictionary of the parameters of the quantile binner.
        """
        return {
            "n_bins": self.n_bins,
            "method": self.method,
            "subsample": self.subsample,
            "dtype": self.dtype
        }


In [44]:
X = np.random.normal(10, 10, 10000)
X_reshaped = X.reshape(-1, 1)

In [45]:
binner = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform', dtype=np.float64)
times_fit = repeat("binner.fit_transform(X_reshaped)", repeat=100, number=1, globals=globals())
times_transform = repeat("binner.transform(X_reshaped)", repeat=100, number=1, globals=globals())
print(f"Sklearn KBinsDiscretizer {binner.get_params()}")
print(f"    Fit      : {1000 * np.mean(times_fit):.6f} μs +- {np.std(times_fit):.6f}")
print(f"    Transform: {1000 * np.mean(times_transform):.6f} μs +- {np.std(times_transform):.6f}")
print("")

binner = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile', dtype=np.float64)
times_fit = repeat("binner.fit_transform(X_reshaped)", repeat=100, number=1, globals=globals())
times_transform = repeat("binner.transform(X_reshaped)", repeat=100, number=1, globals=globals())
print(f"Sklearn KBinsDiscretizer {binner.get_params()}")
print(f"    Fit      : {1000 * np.mean(times_fit):.6f} μs +- {np.std(times_fit):.6f}")
print(f"    Transform: {1000 * np.mean(times_transform):.6f} μs +- {np.std(times_transform):.6f}")
print("")


binner = QuantileBinner(n_bins=10, method="linear", dtype=np.float64)
times_fit = repeat("binner.fit_transform(X)", repeat=100, number=1, globals=globals())
times_transform = repeat("binner.transform(X)", repeat=100, number=1, globals=globals())
print(f"QuantileBinner {binner.get_params()}")
print(f"    Fit      : {1000 * np.mean(times_fit):.6f} μs +- {np.std(times_fit):.6f}")
print(f"    Transform: {1000 * np.mean(times_transform):.6f} μs +- {np.std(times_transform):.6f}")
print("")

binner = QuantileBinner(n_bins=10, method="closest_observation", dtype=np.float64)
times_fit = repeat("binner.fit_transform(X)", repeat=100, number=1, globals=globals())
times_transform = repeat("binner.transform(X)", repeat=100, number=1, globals=globals())
print(f"QuantileBinner {binner.get_params()}")
print(f"    Fit      : {1000 * np.mean(times_fit):.6f} μs +- {np.std(times_fit):.6f}")
print(f"    Transform: {1000 * np.mean(times_transform):.6f} μs +- {np.std(times_transform):.6f}")


Sklearn KBinsDiscretizer {'dtype': <class 'numpy.float64'>, 'encode': 'ordinal', 'n_bins': 10, 'random_state': None, 'strategy': 'uniform', 'subsample': 200000}
    Fit      : 0.308964 μs +- 0.000051
    Transform: 0.170891 μs +- 0.000013

Sklearn KBinsDiscretizer {'dtype': <class 'numpy.float64'>, 'encode': 'ordinal', 'n_bins': 10, 'random_state': None, 'strategy': 'quantile', 'subsample': 200000}
    Fit      : 0.637658 μs +- 0.000056
    Transform: 0.219849 μs +- 0.000022

QuantileBinner {'n_bins': 10, 'method': 'linear', 'subsample': 200000, 'dtype': <class 'numpy.float64'>}
    Fit      : 0.434101 μs +- 0.000031
    Transform: 0.173879 μs +- 0.000012

QuantileBinner {'n_bins': 10, 'method': 'closest_observation', 'subsample': 200000, 'dtype': <class 'numpy.float64'>}
    Fit      : 0.416794 μs +- 0.000035
    Transform: 0.171105 μs +- 0.000012


In [16]:
# Assert equivalence
binner_1 = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
binner_2 = QuantileBinner(n_bins=10, method="linear")

binner_1.fit(X.reshape(-1, 1))
binner_2.fit(X)

transformed_1 = binner_1.transform(X.reshape(-1, 1)).flatten()
transformed_2 = binner_2.transform(X) -1

np.testing.assert_array_almost_equal(binner_1.bin_edges_[0], binner_2.get_bin_edges())
np.testing.assert_array_almost_equal(transformed_1, transformed_2)

In [43]:
24*365

8760

In [40]:
y = pd.Series(X)
forecaster = ForecasterAutoreg(regressor=LinearRegression(), lags=10)
forecaster.fit(y)
steps = 24
last_window_values, _, _  = forecaster._create_predict_inputs(
            steps=steps, last_window=forecaster.last_window_
        )
last_window_values

array([17.26292295,  3.89720011, 11.43095513, 10.74486889, 18.53360102,
        5.19816378, 29.445935  ,  5.96495197,  5.89220585, 10.44343867])

In [41]:
%%timeit -r 100 -n 2

forecaster._recursive_predict_13(
    steps=500,
    last_window_values=last_window_values,
    residuals=forecaster.in_sample_residuals_by_bin_,
    use_binned_residuals=True,
    rng=np.random.default_rng(),
)

79.2 ms ± 19.2 ms per loop (mean ± std. dev. of 100 runs, 2 loops each)


In [42]:
%%timeit -r 100 -n 2

forecaster._recursive_predict(
    steps=500,
    last_window_values=last_window_values,
    residuals=forecaster.in_sample_residuals_by_bin_,
    use_binned_residuals=True,
    rng=np.random.default_rng(),
)

59.7 ms ± 6.24 ms per loop (mean ± std. dev. of 100 runs, 2 loops each)
