In [3]:
import pandas as pd
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
import timeit
from numba import njit

In [2]:
# Create a sample pandas Series
np.random.seed(0)
series = pd.Series(np.random.randn(10000))
window = 50

# Benchmark numpy vs pandas

In [6]:
# Rolling mean
# ==============================================================================
# 1. Pandas Implementation
def pandas_rolling(series, window):
    return series.rolling(window=window).mean()

# 2. Numpy Implementation
def numpy_rolling(series, window):
    return np.mean(sliding_window_view(series, window), axis=1)

repeats = 5
number = 100
pandas_times = timeit.repeat(lambda: pandas_rolling(series, window), number=number, repeat=repeats)
numpy_times = timeit.repeat(lambda: numpy_rolling(series.to_numpy(), window), number=number, repeat=repeats)
pandas_mean = np.mean(pandas_times)
pandas_std = np.std(pandas_times)
numpy_mean = np.mean(numpy_times)
numpy_std = np.std(numpy_times)
print(f"Pandas Mean Time: {pandas_mean:.6f} seconds, Std Dev: {pandas_std:.6f} seconds")
print(f"Numpy Mean Time : {numpy_mean:.6f} seconds, Std Dev: {numpy_std:.6f} seconds")

np.testing.assert_allclose(
    pandas_rolling(series, window).dropna(),
    numpy_rolling(series.values, window)
)

Pandas Mean Time: 0.028297 seconds, Std Dev: 0.003984 seconds
Numpy Mean Time : 0.018500 seconds, Std Dev: 0.000188 seconds


In [7]:
# Rolling std
# ==============================================================================
# 1. Pandas Implementation
def pandas_rolling(series, window):
    return series.rolling(window=window).std()

# 2. Numpy Implementation
def numpy_rolling(series, window):
    return np.std(sliding_window_view(series, window), axis=1, ddof=1)

repeats = 5
number = 100
pandas_times = timeit.repeat(lambda: pandas_rolling(series, window), number=number, repeat=repeats)
numpy_times = timeit.repeat(lambda: numpy_rolling(series.to_numpy(), window), number=number, repeat=repeats)
pandas_mean = np.mean(pandas_times)
pandas_std = np.std(pandas_times)
numpy_mean = np.mean(numpy_times)
numpy_std = np.std(numpy_times)
print(f"Pandas Mean Time: {pandas_mean:.6f} seconds, Std Dev: {pandas_std:.6f} seconds")
print(f"Numpy Mean Time : {numpy_mean:.6f} seconds, Std Dev: {numpy_std:.6f} seconds")

np.testing.assert_allclose(
    pandas_rolling(series, window).dropna(),
    numpy_rolling(series.values, window)
)

Pandas Mean Time: 0.030795 seconds, Std Dev: 0.004558 seconds
Numpy Mean Time : 0.234323 seconds, Std Dev: 0.024244 seconds


In [8]:
# Rolling median
# ==============================================================================
# 1. Pandas Implementation
def pandas_rolling(series, window):
    return series.rolling(window=window).median()

# 2. Numpy Implementation
def numpy_rolling(series, window):
    return np.median(sliding_window_view(series, window), axis=1)

repeats = 5
number = 100
pandas_times = timeit.repeat(lambda: pandas_rolling(series, window), number=number, repeat=repeats)
numpy_times = timeit.repeat(lambda: numpy_rolling(series.to_numpy(), window), number=number, repeat=repeats)
pandas_mean = np.mean(pandas_times)
pandas_std = np.std(pandas_times)
numpy_mean = np.mean(numpy_times)
numpy_std = np.std(numpy_times)
print(f"Pandas Mean Time: {pandas_mean:.6f} seconds, Std Dev: {pandas_std:.6f} seconds")
print(f"Numpy Mean Time : {numpy_mean:.6f} seconds, Std Dev: {numpy_std:.6f} seconds")

np.testing.assert_allclose(
    pandas_rolling(series, window).dropna(),
    numpy_rolling(series.values, window)
)

Pandas Mean Time: 0.400690 seconds, Std Dev: 0.010012 seconds
Numpy Mean Time : 0.699383 seconds, Std Dev: 0.010848 seconds


In [9]:
# Rolling multiple stats
# ==============================================================================
# 1. Pandas Implementation
def pandas_rolling(series, window):
    rolling = series.rolling(window=window)
    stats = [rolling.mean(), rolling.std(), rolling.median(), rolling.max(), rolling.min(), rolling.sum()]
    return stats

# 2. Nunmpy Implementation
def numpy_rolling(series, window):

    rolling = sliding_window_view(series, window)
    stats = [
        np.mean(rolling, axis=1),
        np.std(rolling, axis=1, ddof=1),
        np.median(rolling, axis=1),
        np.max(rolling, axis=1),
        np.min(rolling, axis=1),
        np.sum(rolling, axis=1)
    ]
    return stats

repeats = 5
number = 100
pandas_times = timeit.repeat(lambda: pandas_rolling(series, window), number=number, repeat=repeats)
numpy_times = timeit.repeat(lambda: numpy_rolling(series.to_numpy(), window), number=number, repeat=repeats)
pandas_mean = np.mean(pandas_times)
pandas_std = np.std(pandas_times)
numpy_mean = np.mean(numpy_times)
numpy_std = np.std(numpy_times)
print(f"Pandas Mean Time: {pandas_mean:.6f} seconds, Std Dev: {pandas_std:.6f} seconds")
print(f"Numpy Mean Time : {numpy_mean:.6f} seconds, Std Dev: {numpy_std:.6f} seconds")

Pandas Mean Time: 0.526125 seconds, Std Dev: 0.012349 seconds
Numpy Mean Time : 1.083005 seconds, Std Dev: 0.030257 seconds


# Optimization with numba

Durante el predict recursivo, se tiene que aplicar una función repetitiva sobre un array de numpy.

In [18]:
predictions = [np.random.randn(50) for _ in range(1000)]
predictions_pd = [pd.Series(np.random.randn(50)) for _ in range(1000)]

def numpy_version(predictions):
    for last_window in predictions:
        np.median(last_window)

@njit
def numpy_numba_version(predictions):
    for last_window in predictions:
        np.median(last_window)

def pandas_version(predictions):
    for last_window in predictions:
        pd.Series(last_window).median()

repeats = 5
number = 10
numpy_times = timeit.repeat(lambda: numpy_version(predictions), number=number, repeat=repeats)
numba_times = timeit.repeat(lambda: numpy_numba_version(predictions), number=number, repeat=repeats)
pandas_times = timeit.repeat(lambda: pandas_version(predictions_pd), number=number, repeat=repeats)
numpy_mean = np.mean(numpy_times)
numpy_std = np.std(numpy_times)
numba_mean = np.mean(numba_times)
numba_std = np.std(numba_times)
pandas_mean = np.mean(pandas_times)
pandas_std = np.std(pandas_times)
print(f"Numpy Mean Time : {numpy_mean:.6f} seconds, Std Dev: {numpy_std:.6f} seconds")
print(f"Numba Mean Time : {numba_mean:.6f} seconds, Std Dev: {numba_std:.6f} seconds")
print(f"Pandas Mean Time: {pandas_mean:.6f} seconds, Std Dev: {pandas_std:.6f} seconds")

Numpy Mean Time : 0.154755 seconds, Std Dev: 0.004867 seconds
Numba Mean Time : 0.110940 seconds, Std Dev: 0.072713 seconds
Pandas Mean Time: 0.607512 seconds, Std Dev: 0.004432 seconds


In [22]:
predictions = [np.random.randn(50) for _ in range(1000)]
predictions_pd = [pd.Series(np.random.randn(50)) for _ in range(1000)]

def numpy_version(predictions):
    for last_window in predictions:
        stats = [
            np.mean(last_window),
            np.std(last_window),
            np.median(last_window),
            np.max(last_window),
            np.min(last_window),
            np.sum(last_window)
        ]

@njit
def numpy_numba_version(predictions):
    for last_window in predictions:
        stats = [
            np.mean(last_window),
            np.std(last_window),
            np.median(last_window),
            np.max(last_window),
            np.min(last_window),
            np.sum(last_window)
        ]

def pandas_version(predictions):
    for last_window in predictions:
        lw = pd.Series(last_window)
        stats = [
            lw.mean(),
            lw.std(),
            lw.median(),
            lw.max(),
            lw.min(),
            lw.sum()
        ]

repeats = 5
number = 10
numpy_times = timeit.repeat(lambda: numpy_version(predictions), number=number, repeat=repeats)
numba_times = timeit.repeat(lambda: numpy_numba_version(predictions), number=number, repeat=repeats)
pandas_times = timeit.repeat(lambda: pandas_version(predictions_pd), number=number, repeat=repeats)
numpy_mean = np.mean(numpy_times)
numpy_std = np.std(numpy_times)
numba_mean = np.mean(numba_times)
numba_std = np.std(numba_times)
pandas_mean = np.mean(pandas_times)
pandas_std = np.std(pandas_times)
print(f"Numpy Mean Time : {numpy_mean:.6f} seconds, Std Dev: {numpy_std:.6f} seconds")
print(f"Numba Mean Time : {numba_mean:.6f} seconds, Std Dev: {numba_std:.6f} seconds")
print(f"Pandas Mean Time: {pandas_mean:.6f} seconds, Std Dev: {pandas_std:.6f} seconds")

Numpy Mean Time : 0.464398 seconds, Std Dev: 0.016344 seconds
Numba Mean Time : 0.165382 seconds, Std Dev: 0.188210 seconds
Pandas Mean Time: 1.486151 seconds, Std Dev: 0.015396 seconds


In [55]:
series = pd.Series(np.array([np.nan, 2, np.nan, 4, 5, 6, 7, 8, 9, 10]))
series_2 = pd.Series(np.array([1, 2, 3, 4, 5, np.nan, 7, 8, 9, 10]))

In [56]:
pd.DataFrame({'1':series,
              '2': series_2
}).rolling(window=3, min_periods=3, closed='left').mean()

Unnamed: 0,1,2
0,,
1,,
2,,
3,,2.0
4,,3.0
5,,4.0
6,5.0,
7,6.0,
8,7.0,
9,8.0,8.0


In [None]:
RollingFeatures(
    stats = str, [str],
    window_sizes = int, [int],
    features_names = None, list,
    min_periods=None=window_size, int, [int],
    closed='left', [str],
    # fill_strategy=ffil, [], #warning, ffill no esta disponible durante predict, usa el argument fill_strategy_predict
    # fill_strategy_predict=None, [],
)

transform_batch
transform



# Skforecast

In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

c:\Users\jaesc2\GitHub\skforecast


In [19]:
import re
import pytest
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.preprocessing import TimeSeriesDifferentiator

# Fixtures
from skforecast.ForecasterAutoreg.tests.fixtures_ForecasterAutoreg import y as y_categorical
from skforecast.ForecasterAutoreg.tests.fixtures_ForecasterAutoreg import exog as exog_categorical
from skforecast.ForecasterAutoreg.tests.fixtures_ForecasterAutoreg import data  # to test results when using differentiation

from skforecast.preprocessing import RollingFeatures

In [30]:
rolling = RollingFeatures(
    stats = ['mean', 'std', 'median', 'max', 'min', 'sum', 'mean'],
    window_sizes = [5, 6, 7, 6, 5, 5, 10],
    # min_periods = [5, 5, 5, 5, 5, 4]
)

In [31]:
print('Stats: ', rolling.stats)
print('Number of stats: ', rolling.n_stats)
print('Window sizes: ', rolling.window_sizes)
print('Max window size: ', rolling.max_window_size)
print('Min periods: ', rolling.min_periods)
print('Features names: ', rolling.features_names)
print('Fill strategy: ', rolling.fill_strategy)

Stats:  ['mean', 'std', 'median', 'max', 'min', 'sum', 'mean']
Number of stats:  7
Window sizes:  [5, 6, 7, 6, 5, 5, 10]
Max window size:  10
Min periods:  [5, 6, 7, 6, 5, 5, 10]
Features names:  ['roll_mean_5', 'roll_std_6', 'roll_median_7', 'roll_max_6', 'roll_min_5', 'roll_sum_5', 'roll_mean_10']
Fill strategy:  None


In [33]:
import pprint
pprint.pp(rolling.unique_rolling_windows)

{'5_5': {'params': {'window': 5, 'min_periods': 5},
         'stats_idx': [0, 4, 5],
         'stats_names': ['roll_mean_5', 'roll_min_5', 'roll_sum_5'],
         'rolling_obj': None},
 '6_6': {'params': {'window': 6, 'min_periods': 6},
         'stats_idx': [1, 3],
         'stats_names': ['roll_std_6', 'roll_max_6'],
         'rolling_obj': None},
 '7_7': {'params': {'window': 7, 'min_periods': 7},
         'stats_idx': [2],
         'stats_names': ['roll_median_7'],
         'rolling_obj': None},
 '10_10': {'params': {'window': 10, 'min_periods': 10},
           'stats_idx': [6],
           'stats_names': ['roll_mean_10'],
           'rolling_obj': None}}


In [39]:
series = pd.Series(np.arange(15), name='series')
# series.iloc[7] = np.nan

df_roll = rolling.transform_batch(series)
df_roll

Unnamed: 0,roll_mean_5,roll_std_6,roll_median_7,roll_max_6,roll_min_5,roll_sum_5,roll_mean_10
9,7.0,1.870829,6.0,9.0,5.0,35.0,4.5
10,8.0,1.870829,7.0,10.0,6.0,40.0,5.5
11,9.0,1.870829,8.0,11.0,7.0,45.0,6.5
12,10.0,1.870829,9.0,12.0,8.0,50.0,7.5
13,11.0,1.870829,10.0,13.0,9.0,55.0,8.5
14,12.0,1.870829,11.0,14.0,10.0,60.0,9.5


In [43]:
type(rolling.unique_rolling_windows['5_5']['rolling_obj'])

pandas.core.window.rolling.Rolling

In [47]:
df_roll.fillna(df_roll.mean())

Unnamed: 0,roll_mean_5,roll_std_6,roll_median_7,roll_max_6,roll_min_5,roll_sum_5
0,7.0,1.870829,7.0,9.5,5.0,35.0
1,7.0,1.870829,7.0,9.5,5.0,35.0
2,7.0,1.870829,7.0,9.5,5.0,35.0
3,7.0,1.870829,7.0,9.5,5.0,35.0
4,2.0,1.870829,7.0,9.5,0.0,10.0
5,3.0,1.870829,7.0,5.0,1.0,15.0
6,4.0,1.870829,3.0,6.0,2.0,20.0
7,7.0,1.870829,7.0,9.5,5.0,35.0
8,7.0,1.870829,7.0,9.5,5.0,35.0
9,7.0,1.870829,7.0,9.5,5.0,35.0


In [46]:
df_roll.ffill()

Unnamed: 0,roll_mean_5,roll_std_6,roll_median_7,roll_max_6,roll_min_5,roll_sum_5
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,2.0,,,,0.0,10.0
5,3.0,1.870829,,5.0,1.0,15.0
6,4.0,1.870829,3.0,6.0,2.0,20.0
7,4.0,1.870829,3.0,6.0,2.0,20.0
8,4.0,1.870829,3.0,6.0,2.0,20.0
9,4.0,1.870829,3.0,6.0,2.0,20.0


In [34]:
df_roll['roll_mean_5'].fillna(df_roll['roll_mean_5'].median())

6      4.0
7     10.5
8     10.5
9     10.5
10    10.5
11    10.5
12    10.0
13    11.0
14    12.0
Name: roll_mean_5, dtype: float64

In [35]:
df_roll['roll_mean_5'].bfill()

6      4.0
7     10.0
8     10.0
9     10.0
10    10.0
11    10.0
12    10.0
13    11.0
14    12.0
Name: roll_mean_5, dtype: float64

In [31]:
df_roll['roll_mean_5'].fillna(method='ffill')

  df_roll['roll_mean_5'].fillna(method='ffill')


6      4.0
7      4.0
8      4.0
9      4.0
10     4.0
11     4.0
12    10.0
13    11.0
14    12.0
Name: roll_mean_5, dtype: float64

In [39]:
df_roll['roll_mean_5'].fillna(1.)

6      4.0
7      1.0
8      1.0
9      1.0
10     1.0
11     1.0
12    10.0
13    11.0
14    12.0
Name: roll_mean_5, dtype: float64

In [None]:
rolling = RollingFeatures(
    stats = ['mean', 'std', 'median', 'max', 'min', 'sum'],
    window_sizes = [5, 6, 7, 6, 5, 5],
    # min_periods = [5, 5, 5, 5, 5, 4]
)

In [58]:
@njit
def _np_min_max_ratio_jit(x):
    return np.min(x) / np.max(x)

In [54]:
@njit
def _np_mean_jit(x):
    return np.mean(x)

In [50]:
@njit
def _np_cv_jit(x):
    a_a, b_b = 0, 0
    for i in x:
        a_a = a_a + i
        b_b = b_b + i * i
    var = b_b / (len(x)) - ((a_a / (len(x))) ** 2)
    var = var * (len(x) / (len(x) - 1))
    std = np.sqrt(var)

    return std / np.mean(x)

In [52]:
arr = np.arange(10000)

In [56]:
%%timeit

for _ in range(1000):
    _np_cv_jit(arr)

13 ms ± 40.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [55]:
%%timeit

for _ in range(1000):
    _np_mean_jit(arr)

9.54 ms ± 7.74 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [59]:
%%timeit

for _ in range(1000):
    _np_min_max_ratio_jit(arr)

3.92 ms ± 137 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
arr = np.arange(10000)

In [5]:
rolling = RollingFeatures(
    stats = ['mean', 'std', 'median', 'max', 'min', 'sum'],
    window_sizes = [10, 10, 10, 10, 10, 10],
)

In [6]:
%%timeit

for _ in range(1000):
    rolling.transform(arr)

14.9 ms ± 614 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
rolling = RollingFeatures(
    stats = ['mean', 'std', 'min', 'max', 'sum', 'median', 'ratio_min_max', 'coef_variation'],
    window_sizes = [10, 10, 10, 10, 10, 10, 10, 10],
)

In [8]:
%%timeit

for _ in range(1000):
    rolling.transform(arr)

16.8 ms ± 902 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
series = pd.Series(np.arange(10000), name='y')

In [17]:
df_1 = rolling.transform_batch(series)
df_2 = rolling.transform_batch_2(series)

pd.testing.assert_frame_equal(df_1, df_2)

In [18]:
df_2.head()

Unnamed: 0,roll_mean_10,roll_std_10,roll_min_10,roll_max_10,roll_sum_10,roll_median_10,roll_ratio_min_max_10,roll_coef_variation_10
9,4.5,3.02765,0.0,9.0,45.0,4.5,0.0,0.672811
10,5.5,3.02765,1.0,10.0,55.0,5.5,0.1,0.550482
11,6.5,3.02765,2.0,11.0,65.0,6.5,0.181818,0.465792
12,7.5,3.02765,3.0,12.0,75.0,7.5,0.25,0.403687
13,8.5,3.02765,4.0,13.0,85.0,8.5,0.307692,0.356194


In [16]:
%%timeit

rolling.transform_batch_2(series).to_numpy()

5.5 ms ± 129 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
