In [2]:
# | default_exp commons.stats

In [1]:
# | export
from itertools import groupby

import antropy as ant
import numpy as np
import pandas as pd
import pywt
import scipy.stats as stats
import statsmodels.api as sm
import torch
from nolds import dfa, hurst_rs
from scipy.fftpack import fft
from scipy.signal import find_peaks, welch
from scipy.stats import variation
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from statsmodels.tsa.stattools import acf, adfuller, kpss, pacf

In [2]:
# | hide
import warnings

from statsmodels.tools.sm_exceptions import InterpolationWarning

warnings.filterwarnings("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=InterpolationWarning)


# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module="scipy.signal._spectral_py")
warnings.filterwarnings("ignore", category=UserWarning, module="scipy")
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Suppress InterpolationWarning specifically
warnings.filterwarnings("ignore", category=InterpolationWarning)

In [3]:
# | export


def ensure_tensor(series):
    if isinstance(series, pd.Series):
        return torch.tensor(series.values, dtype=torch.float32)
    elif isinstance(series, np.ndarray):
        return torch.tensor(series, dtype=torch.float32)
    elif isinstance(series, torch.Tensor):
        return series.float()
    else:
        raise ValueError("Input must be a pandas Series, NumPy array, or PyTorch tensor")


def get_seasonality(series_name):
    mapping = {"D": 7, "W": 4, "M": 12, "Q": 4, "Y": 2}
    return mapping.get(series_name[0], 6)  # Default period is 6 if not found


def extract_stats_features(series, max_lag=10):
    if isinstance(series, pd.DataFrame):
        series = series.iloc[:, 0]  # Take first column if DataFrame
    if series.empty:
        return np.nan

    name, series = series.name, torch.tensor([i for i in series.values if i])
    series = ensure_tensor(series)
    series_np = series.cpu().numpy()
    features = {}

    # Basic Stats
    features["mean"] = torch.mean(series).item()
    features["std"] = torch.std(series).item()
    features["var"] = torch.var(series).item()
    features["skewness"] = stats.skew(series_np)
    features["kurtosis"] = stats.kurtosis(series_np)
    features["min"] = torch.min(series).item()
    features["max"] = torch.max(series).item()
    features["range"] = features["max"] - features["min"]
    features["median"] = torch.median(series).item()
    features["iqr"] = np.percentile(series_np, 75) - np.percentile(series_np, 25)
    features["mad"] = torch.mean(torch.abs(series - torch.mean(series))).item()
    features["medad"] = torch.median(torch.abs(series - torch.median(series))).item()
    features["cv"] = variation(series_np)

    # Normality Tests
    _, features["shapiro_p"] = stats.shapiro(series_np)
    _, features["ks_p"] = stats.kstest(series_np, "norm")
    _, features["jarque_bera_p"] = stats.jarque_bera(series_np)

    # Stationarity Tests
    features["adf_p"] = adfuller(series_np)[1]
    features["kpss_p"] = kpss(series_np, regression="c")[1]

    # Autocorrelation
    max_possible_lag = len(series_np) // 2
    max_lag = min(max_lag, max_possible_lag)
    acf_values = acf(series_np, nlags=max_lag)
    pacf_values = pacf(series_np, nlags=max_lag)
    for lag in range(1, max_lag + 1):
        features[f"acf_{lag}"] = abs(acf_values[lag])
        features[f"pacf_{lag}"] = abs(pacf_values[lag])

    # Ljung-Box Test
    features["ljungbox_p"] = abs(acorr_ljungbox(series_np, lags=[1]).iloc[0, 1])

    # Fourier Transform (Dominant Frequency)
    fft_values = np.abs(fft(series_np))
    features["fft_peak"] = np.max(fft_values)

    # Power Spectral Density (PSD) peak
    freqs, psd = welch(series_np)
    features["psd_peak"] = np.max(psd)

    # Hurst Exponent & Detrended Fluctuation Analysis (DFA)
    features["hurst_exponent"] = hurst_rs(series_np)
    features["dfa"] = dfa(series_np)

    # Sparsity Measure
    features["sparsity"] = np.sum(series_np == 0) / len(series_np)

    # Entropy Measures
    features["perm_entropy"] = ant.perm_entropy(series_np)
    features["spectral_entropy"] = ant.spectral_entropy(series_np, sf=1.0)
    features["svd_entropy"] = ant.svd_entropy(series_np)
    features["approx_entropy"] = ant.app_entropy(series_np)
    features["sample_entropy"] = ant.sample_entropy(series_np)

    # Change-point detection
    peaks, properties = find_peaks(series_np, prominence=1)
    features["num_peaks"] = len(peaks)
    features["peak_prominence_mean"] = (
        np.mean(properties["prominences"]) if len(properties["prominences"]) > 0 else 0
    )

    # Determine Seasonality Period
    seasonality_period = get_seasonality(name)

    if len(series_np) >= 2 * seasonality_period:  # Ensure enough data for decomposition
        # STL Decomposition
        stl = STL(series_np, period=seasonality_period).fit()
        features["stl_trend_std"] = np.std(stl.trend)
        features["stl_seasonal_std"] = np.std(stl.seasonal)
        features["stl_resid_std"] = np.std(stl.resid)

    # Longest flat segment
    features["longest_flat_segment"] = max(
        [len(list(g)) for k, g in groupby(series_np) if k == 0], default=0
    )

    # Wavelet Transform Features
    coeffs = pywt.wavedec(series_np, "db4", level=3)
    features["wavelet_energy"] = sum(np.sum(np.abs(c) ** 2) for c in coeffs)
    features["wavelet_entropy"] = stats.entropy(np.hstack(coeffs))

    # Higher Order Moments
    features["mean_abs_change"] = np.mean(np.abs(np.diff(series_np)))
    features["longest_positive_run"] = max(
        [sum(1 for _ in g) for k, g in groupby(series_np > 0) if k], default=0
    )

    return pd.Series(features)

In [7]:
# | hide
# Example Usage
if __name__ == "__main__":
    ts = pd.Series(np.random.randn(1000))  # Example time series
    features = extract_stats_features(ts, max_lag=10)
    print(features)

TypeError: 'NoneType' object is not subscriptable

In [5]:
df = pd.read_parquet("data/m4_preprocessed.parquet")
df.drop_duplicates(inplace=True)
X = df.drop(["no_of_datapoints", "best_model"], axis=1)
y = df["best_model"]


# Function to extract features for a single row
def extract_features(idx, name):
    features = extract_stats_features(X.iloc[idx].dropna(), max_lag=10)
    return name, features


# Sequential processing with progress bar
results = [extract_features(idx, name) for idx, name in tqdm(enumerate(X.index), total=len(X))]

# Convert results to DataFrame
df_features = pd.DataFrame(dict(results)).T  # Transpose to get features as columns

100%|█████████████████████████████████████| 47752/47752 [32:35<00:00, 24.42it/s]


Feature extraction completed and saved to 'extracted_features.parquet'


In [6]:
df_features.to_csv("extracted_features.csv", index=None)

In [11]:
'")

Feature extraction completed and saved to 'extracted_features.parquet'


Unnamed: 0,acf_1,acf_10,acf_2,acf_3,acf_4,acf_5,acf_6,acf_7,acf_8,acf_9,...,std,stl_resid_std,stl_seasonal_std,stl_trend_std,svd_entropy,var,wavelet_energy,wavelet_entropy,best_model,no_of_datapoints
Y22913,0.706185,,0.231034,0.010984,0.010708,0.037744,0.032304,0.092026,0.172112,0.001022,...,0.247295,0.059620,0.006553,0.202324,1.114601,0.061155,17.994615,-inf,1,19
Y22914,0.761659,,0.641802,0.500637,0.343435,0.207558,0.068684,0.056773,0.148922,0.263092,...,0.283854,0.026421,0.020242,0.270024,0.492154,0.080573,19.004846,-inf,5,19
Y22923,0.711719,,0.465551,0.200412,0.052367,0.062458,0.241425,0.395415,0.436295,,...,0.308721,0.049795,0.063825,0.282586,1.006097,0.095309,20.892659,-inf,5,19
Y22924,0.775289,,0.615189,0.480710,0.295660,0.175367,0.012067,0.165475,0.252303,0.331738,...,0.289991,0.030185,0.017386,0.279840,0.710574,0.084095,10.780327,-inf,6,19
Y22925,0.730553,,0.539781,0.451989,0.372102,0.231849,0.076516,0.141856,0.176206,0.215315,...,0.315652,0.049301,0.030795,0.298273,0.933972,0.099636,12.516983,-inf,2,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D588,0.999082,0.991035,0.998164,0.997260,0.996350,0.995444,0.994555,0.993680,0.992805,0.991919,...,0.269322,0.003482,0.001866,0.269225,0.086059,0.072534,1065.649835,-inf,1,4754
D585,0.998927,0.989803,0.997857,0.996821,0.995772,0.994729,0.993730,0.992761,0.991798,0.990801,...,0.272792,0.005606,0.003002,0.272595,0.109154,0.074415,1572.477462,-inf,1,4754
D2194,0.995312,0.952161,0.990666,0.986006,0.981097,0.976146,0.971283,0.966446,0.961689,0.956951,...,0.142995,0.006462,0.003409,0.142529,0.363630,0.020448,169.724502,-inf,3,7856
D2047,0.999095,0.993624,0.998150,0.997420,0.996803,0.996315,0.995946,0.995653,0.995084,0.994353,...,0.234797,0.006347,0.004600,0.234521,0.169960,0.055130,1428.450726,-inf,6,8533


In [1]:
from pycaret.classification import *

ImportError: cannot import name '_print_elapsed_time' from 'sklearn.utils' (/home/pranav-pc/.cache/pypoetry/virtualenvs/ts-EBaOKu-T-py3.11/lib/python3.11/site-packages/sklearn/utils/__init__.py)