In [6]:
# | default_exp classification.mlclassifier

In [1]:
# | export
import numpy as np
import pandas as pd

# import torch
from datasetsforecast.m4 import M4
from sklearn.preprocessing import LabelEncoder

In [2]:
## Prepare data

# Load Labels
label_df = pd.read_parquet("data/evaluation_df.parquet").set_index("unique_id")["best_model"]

# Load M4 dataset
groups = ["Yearly", "Monthly", "Quarterly", "Hourly", "Weekly", "Daily"]
m4_df_bucket = []
for group in groups:
    await M4.async_download("data", group=group)
    Y_df, *_ = M4.load(directory="data", group=group)
    m4_df_bucket.append(Y_df)
m4_dataset = pd.concat(m4_df_bucket)

m4_dataset = m4_dataset.sort_values(["unique_id", "ds"]).drop_duplicates(
    subset=["unique_id", "ds"]
)
no_of_datapoints = m4_dataset.groupby("unique_id").apply(len).to_dict()

# Convert to wide format (fixed)
m4_dataset = m4_dataset.pivot(index="unique_id", columns="ds", values="y")

# Merge with labels
m4_dataset = m4_dataset.merge(label_df, left_index=True, right_index=True, how="right")

best_model = m4_dataset["best_model"].to_dict()
df_min = m4_dataset.drop("best_model", axis=1).min(axis=1)
df_max = m4_dataset.drop("best_model", axis=1).max(axis=1)

m4_dataset = (m4_dataset.drop("best_model", axis=1) - df_min.values.reshape(-1, 1)) / (
    df_max - df_min
).values.reshape(-1, 1)

m4_dataset["best_model"] = m4_dataset.index.map(best_model)
m4_dataset["no_of_datapoints"] = m4_dataset.index.map(no_of_datapoints)

m4_dataset.sort_values("no_of_datapoints", inplace=True)
# m4_dataset.drop("no_of_datapoints", axis=1, inplace=True)
# clean up the memory
del df_max
del df_min
del m4_df_bucket, no_of_datapoints

# Encode categorical labels
le = LabelEncoder()
m4_dataset["best_model"] = le.fit_transform(m4_dataset["best_model"])
m4_dataset.columns = m4_dataset.columns.astype(str)

# Save as optimized Parquet
m4_dataset.to_parquet("data/m4_preprocessed.parquet", engine="fastparquet", compression="snappy")

print("✅ Preprocessing Complete! Data saved as Parquet.")

  no_of_datapoints = m4_dataset.groupby("unique_id").apply(len).to_dict()


✅ Preprocessing Complete! Data saved as Parquet.


In [3]:
df = pd.read_parquet("data/m4_preprocessed.parquet")
# df.drop_duplicates(inplace=True)

In [7]:
df_len = (
    df.drop(["best_model", "no_of_datapoints"], axis=1)
    .apply(lambda series: series.dropna().values, axis=1)
    .map(len)
)

In [18]:
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/src/commons.stats.ipynb.

# %% auto 0
__all__ = ["ensure_tensor", "extract_stats_features"]

from itertools import groupby

import antropy as ant
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm

# %% ../../nbs/src/commons.stats.ipynb 1
import torch
from nolds import dfa, hurst_rs
from scipy.fftpack import fft
from scipy.signal import find_peaks, welch
from scipy.stats import variation
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from statsmodels.tsa.stattools import acf, adfuller, kpss, pacf

# import PyWavelets

# %% ../../nbs/src/commons.stats.ipynb 2
# Ensure tensor input


def ensure_tensor(series):
    if isinstance(series, pd.Series):
        return torch.tensor(series.values, dtype=torch.float32)
    elif isinstance(series, np.ndarray):
        return torch.tensor(series, dtype=torch.float32)
    elif isinstance(series, torch.Tensor):
        return series.float()
    else:
        raise ValueError("Input must be a pandas Series, NumPy array, or PyTorch tensor")


# Feature extraction function


def extract_stats_features(series, max_lag=10):
    series = ensure_tensor(series)
    series_np = series.cpu().numpy()

    features = {}

    # Basic Stats
    features["mean"] = torch.mean(series).item()
    features["std"] = torch.std(series).item()
    features["var"] = torch.var(series).item()
    features["skewness"] = stats.skew(series_np)
    features["kurtosis"] = stats.kurtosis(series_np)
    features["min"] = torch.min(series).item()
    features["max"] = torch.max(series).item()
    features["range"] = features["max"] - features["min"]
    features["median"] = torch.median(series).item()
    features["iqr"] = np.percentile(series_np, 75) - np.percentile(series_np, 25)
    features["mad"] = torch.median(torch.abs(series - torch.median(series))).item()
    features["cv"] = variation(series_np)

    # Normality Tests
    _, features["shapiro_p"] = stats.shapiro(series_np)
    _, features["ks_p"] = stats.kstest(series_np, "norm")
    _, features["jarque_bera_p"] = stats.jarque_bera(series_np)

    # Stationarity Tests
    features["adf_p"] = adfuller(series_np)[1]
    features["kpss_p"] = kpss(series_np, regression="c")[1]

    # Autocorrelation
    if len(series) // 2 < max_lag:
        max_lag = len(series) // 2
    acf_values = acf(series_np, nlags=max_lag)
    pacf_values = pacf(series_np, nlags=max_lag)
    for lag in range(1, max_lag + 1):
        features[f"acf_{lag}"] = abs(acf_values[lag])
        features[f"pacf_{lag}"] = abs(pacf_values[lag])

    # Ljung-Box Test for white noise
    features["ljungbox_p"] = abs(acorr_ljungbox(series_np, lags=[1]).iloc[0, 1])

    # Fourier Transform (Dominant Frequency)
    fft_values = np.abs(fft(series_np))
    features["fft_peak"] = np.max(fft_values)

    # Power Spectral Density (PSD) peak
    freqs, psd = welch(series_np)
    features["psd_peak"] = np.max(psd)

    # Hurst Exponent
    features["hurst_exponent"] = hurst_rs(series_np)
    features["dfa"] = dfa(series_np)

    # Sparsity Measure
    features["sparsity"] = np.sum(series_np == 0) / len(series_np)

    # Entropy Measures
    features["perm_entropy"] = ant.perm_entropy(series_np)
    features["spectral_entropy"] = ant.spectral_entropy(series_np, sf=1.0)
    features["svd_entropy"] = ant.svd_entropy(series_np)
    features["approx_entropy"] = ant.app_entropy(series_np)
    features["sample_entropy"] = ant.sample_entropy(series_np)

    # Change-point detection
    peaks, properties = find_peaks(series_np, prominence=1)
    features["num_peaks"] = len(peaks)
    features["peak_prominence_mean"] = (
        np.mean(properties["prominences"]) if len(properties["prominences"]) > 0 else 0
    )

    # Trend and Seasonality
    stl = seasonal_decompose(series_np, period=6)
    features["sd_trend_std"] = np.std(stl.trend)
    features["sd_seasonal_std"] = np.std(stl.seasonal)
    features["sd_resid_std"] = np.std(stl.resid)

    # Trend and Seasonality
    stl = STL(series_np, period=6).fit()
    features["stl_trend_std"] = np.std(stl.trend)
    features["stl_seasonal_std"] = np.std(stl.seasonal)
    features["stl_resid_std"] = np.std(stl.resid)

    # Longest flat segment
    features["longest_flat_segment"] = max(
        [len(list(g)) for k, g in groupby(series_np) if k == 0], default=0
    )

    # # Wavelet Transform Features
    # coeffs = PyWavelets.wavedec(series_np, 'db4', level=3)
    # features['wavelet_energy'] = sum(np.sum(np.abs(c)**2) for c in coeffs)
    # features['wavelet_entropy'] = stats.entropy(np.hstack(coeffs))

    # Higher Order Moments
    features["mean_abs_change"] = np.mean(np.abs(np.diff(series_np)))
    features["longest_positive_run"] = max(
        [sum(1 for _ in g) for k, g in groupby(series_np > 0) if k], default=0
    )

    return pd.Series(features)

In [19]:
df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,9926,9927,9928,9929,9930,9931,9932,9933,best_model,no_of_datapoints
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Y22913,0.385933,0.698040,1.000000,0.659313,0.336100,0.252585,0.356881,0.337253,0.314206,0.431248,...,,,,,,,,,1,19
Y22914,0.000000,0.020058,0.304270,0.331231,0.348923,0.376489,0.416931,0.483002,0.552318,0.627689,...,,,,,,,,,5,19
Y22923,0.330275,0.229358,0.651376,0.834862,0.935780,0.733945,0.706422,0.834862,0.715596,1.000000,...,,,,,,,,,5,19
Y22924,0.174625,0.091405,0.190996,0.098226,0.000000,0.061392,0.070941,0.068213,0.068213,0.166439,...,,,,,,,,,6,19
Y22925,0.002457,0.054545,0.288452,0.181327,0.000000,0.029484,0.101229,0.112039,0.377396,0.520885,...,,,,,,,,,2,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D588,0.011088,0.012060,0.009486,0.010323,0.009403,0.009608,0.008087,0.005610,0.004579,0.005872,...,,,,,,,,,1,4754
D585,0.032336,0.034550,0.028690,0.030424,0.028330,0.028797,0.025310,0.019674,0.017183,0.020116,...,,,,,,,,,1,4754
D2194,0.000105,0.000110,0.000067,0.000036,0.000043,0.000043,0.000043,0.000086,0.000046,0.000048,...,,,,,,,,,3,7856
D2047,0.303717,0.288174,0.287502,0.285045,0.280434,0.263320,0.266955,0.278331,0.287553,0.287553,...,,,,,,,,,6,8533


In [None]:
features = (
    df.drop(["best_model", "no_of_datapoints"], axis=1)
    .apply(lambda series: series.dropna().values, axis=1)
    .map(extract_stats_features)
)

  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
look-up table. The actual p-value is greater than the p-value returned.

  features["kpss_p"] = kpss(series_np, regression="c")[1]
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  fre

In [21]:
from pycaret.classification import *

s = setup(df, target="best_model", session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,best_model
2,Target type,Multiclass
3,Original data shape,"(47752, 9934)"
4,Transformed data shape,"(47752, 7857)"
5,Transformed train set shape,"(33426, 7857)"
6,Transformed test set shape,"(14326, 7857)"
7,Numeric features,9933
8,Rows with missing values,100.0%
9,Preprocess,True


Collecting numpy<1.27,>=1.21 (from pycaret)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pandas<2.2.0 (from pycaret)
  Downloading pandas-2.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret)
  Downloading scipy-1.11.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib<3.8.0 (from pycaret)
  Downloading matplotlib-3.7.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)
Collecting scikit-learn>1.4.0 (from 