In [256]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import math
import numpy as np
import math
from typing import Union, List, Tuple
import scipy
from numpy.random import standard_normal

In [257]:
# Class proportions handed to make_classification (two classes, 20% / 80%).
weights = [0.2, 0.8]

# Synthetic classification data: 100000 rows, 20 features of which
# 2 are informative and 2 are redundant.
X, y = make_classification(
    n_samples=100000,
    n_features=20,
    n_informative=2,
    weights=weights,
    random_state=42,
    n_redundant=2)

num_samples = X.shape[0]

# Use a dedicated, seeded generator so the categorical draws are identical on
# every Restart & Run All and do not depend on (or disturb) the global RNG state.
rng = np.random.default_rng(42)
categorical_col1 = rng.choice(['A', 'B', 'C'], size=num_samples)
categorical_col2 = rng.choice(['X', 'Y', 'Z'], size=num_samples)
# NOTE(review): neither categorical column (nor `y`) is used by the visible
# cells below -- confirm whether they were meant to be added to `df`.

df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])

# 80/20 split with a fixed seed.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [258]:
# Rebinds X (previously the full feature matrix) to the feature_0 column of
# the training split as a NumPy array.
# NOTE(review): the rows come out of train_test_split, so their order is
# presumably arbitrary -- confirm that treating this column as an ordered
# series for the Hurst estimates below is intended.
X = df_train["feature_0"].to_numpy()

In [259]:
"""A convenção de Hurst é dividir a série em subamostras menores, cujos tamanhos são potências de dois."""

'A convenção de Hurst é dividir a série em subamostras menores, cujos tamanhos são potências de dois.'

In [260]:
def hurst_v1(X):
    """
    Rescaled range (R/S) of each window for one power-of-two window length.

    The series is used as given (it is not differenced) and is cut into
    consecutive, non-overlapping windows whose length is the largest power of
    two (>= 2) that divides ``len(X)`` exactly. For every window the range of
    the cumulative demeaned values (R) is divided by the sample standard
    deviation (S, ``ddof=1``).

    Parameters
    ----------
    X : array-like
        Input series; expected to be 1D. Any trailing dimensions are carried
        through unchanged.

    Returns
    -------
    ndarray
        One R/S value per window, in window order. A window with zero
        dispersion yields NaN/inf, exactly as NumPy's division does.

    Raises
    ------
    ValueError
        If ``len(X)`` is odd or smaller than 2, because then no power-of-two
        window length divides the series evenly.
    """
    rolling = np.asarray(X)
    size = len(rolling)

    # The lowest set bit of `size` is the largest power of two dividing it.
    # Only that window length matters: it is the one whose R/S is returned.
    window = size & -size
    if window < 2:
        raise ValueError(
            "hurst_v1 needs an even, non-zero number of samples so that a "
            "power-of-two window length divides the series"
        )

    # Consecutive windows are plain row-major chunks, so a reshape yields the
    # same values as explicit index arrays without building those arrays.
    windows = rolling.reshape((size // window, window) + rolling.shape[1:])

    mean = np.mean(windows, axis=1, keepdims=True)
    S = np.std(windows, axis=1, ddof=1)
    cumsum = np.cumsum(windows - mean, axis=1)
    R = np.max(cumsum, axis=1) - np.min(cumsum, axis=1)
    return R / S


In [261]:
def hurst_v2(X: Union[np.ndarray, List[float]]) -> float:
    """
    Estimate the Hurst exponent of a 1D series by rescaled-range (R/S) analysis.

    The series is differenced, the increments are cut into non-overlapping
    windows whose lengths are powers of two, the mean R/S statistic is computed
    for every window length, and the exponent is the slope of log(R/S)
    regressed on log(window length).

    Parameters
    ----------
    X : ndarray of shape (n_samples,) or list of float
        Input 1D series. It is differenced internally, so pass the level
        (cumulative) series rather than its increments.

    Returns
    -------
    float
        The estimated Hurst exponent (slope of the log-log regression).
        NaN is returned when the estimate is undefined: the increments
        contain non-finite values, or fewer than two window lengths have at
        least one window with non-zero dispersion (e.g. a constant series).
        Usual interpretation:
        - 0 < H < 0.5: mean-reverting (anti-persistent) increments
        - H = 0.5: uncorrelated increments (random walk / Brownian motion)
        - 0.5 < H < 1: trending (persistent) increments with long-term memory
        - H = 1: perfectly trending series

    Raises
    ------
    ValueError
        If fewer than 10 increments are available, i.e. X has fewer than
        11 samples.

    Notes
    -----
    Steps:
    1. Difference the input series.
    2. For each window length 2, 4, ..., 2**floor(log2(n_increments)), split
       the increments into consecutive windows (the incomplete tail is dropped).
    3. Per window, R is the range of the cumulative demeaned increments and
       S the sample standard deviation (ddof=1).
    4. Average R/S over the windows of each length, skipping windows whose
       dispersion is zero (their R/S is 0/0 and would otherwise turn the
       whole estimate into NaN).
    5. Regress log(mean R/S) on log(window length); the slope is the estimate.
    """
    rolling = np.diff(np.asarray(X, dtype=float))
    size = len(rolling)

    if size < 10:
        raise ValueError("Dados insuficientes para calcular o expoente de Hurst.")

    # Non-finite data makes every statistic below undefined; report NaN
    # directly instead of estimating from whatever windows happen to be clean.
    if not np.isfinite(rolling).all():
        return float("nan")

    max_pow = int(np.floor(math.log2(size)))
    subsamples = np.array([2 ** power for power in range(1, max_pow + 1)])

    # NaN marks a window length for which no window was informative.
    r_s = np.full(len(subsamples), np.nan, dtype=float)

    for i, window in enumerate(subsamples):
        n_windows = size // window
        windows = rolling[:n_windows * window].reshape(n_windows, window)

        mean = np.mean(windows, axis=1, keepdims=True)
        S = np.std(windows, axis=1, ddof=1)
        cumsum = np.cumsum(windows - mean, axis=1)
        R = np.max(cumsum, axis=1) - np.min(cumsum, axis=1)

        # Flat windows (S == 0) carry no information and would give 0/0.
        informative = S > 0
        if informative.any():
            r_s[i] = np.mean(R[informative] / S[informative])

    # Only window lengths with a finite, positive mean R/S can be log-regressed.
    usable = np.isfinite(r_s) & (r_s > 0)
    if np.count_nonzero(usable) < 2:
        return float("nan")

    log_sizes = np.log(subsamples[usable])
    log_r_s = np.log(r_s[usable])
    slope, _, _, _, _ = scipy.stats.linregress(log_sizes, log_r_s)

    return slope

In [262]:
# Hurst estimate for X (the feature_0 training column); hurst_v2 differences it internally.
hurst_v2(X)

np.float64(0.1586313770243084)

In [263]:
def hurst_v3(X: Union[np.ndarray, List[float]]) -> Tuple[float, float]:
    """
    Estimate the Hurst exponent by rescaled-range (R/S) analysis and test H = 0.5.

    The series is differenced, the increments are split into non-overlapping
    windows whose lengths are powers of two, the mean R/S statistic is computed
    for every window length, and log(R/S) is regressed on log(window length).
    The slope is the Hurst estimate; its standard error feeds a two-sided
    t-test of the null hypothesis "slope = 0.5".

    Parameters
    ----------
    X : Union[np.ndarray, List[float]]
        Input 1D series. It is differenced internally, so pass the level
        (cumulative) series rather than its increments. At least 11 samples
        (10 increments) are required.

    Returns
    -------
    Tuple[float, float]
        (Hurst exponent, p-value for the H = 0.5 hypothesis).
        Both are NaN when the increments contain non-finite values or when
        fewer than three window lengths yield a finite, positive R/S (the
        regression then has no residual degrees of freedom for the test).
        Usual interpretation of the exponent:
        - 0 < H < 0.5: mean-reverting (anti-persistent) increments
        - H = 0.5: uncorrelated increments (random walk)
        - 0.5 < H < 1: trending (persistent) increments with long-term memory
        - H = 1: perfectly trending series
        p-value interpretation:
        - p < threshold: reject the random walk hypothesis
        - p >= threshold: cannot reject the random walk hypothesis

    Raises
    ------
    ValueError
        If fewer than 10 increments are available (X has fewer than 11 samples).
        A ValueError or TypeError may also propagate from ``np.asarray`` when
        the input cannot be converted to float64.

    Notes
    -----
    NOTE(review): the p-value only reflects the fit of the log-log regression
    line through a handful of window lengths. The simulated random walk in
    this notebook comes out at H of about 0.60 with p of about 0.04, which
    suggests the test may over-reject -- confirm its calibration before
    relying on it.
    """
    X = np.asarray(X, dtype=np.float64)
    rolling = np.diff(X)
    size = len(rolling)

    if size < 10:
        raise ValueError("Insufficient data points (minimum 10 required)")

    # Non-finite data makes the statistics undefined; report NaN explicitly.
    if not np.isfinite(rolling).all():
        return float("nan"), float("nan")

    max_power = int(np.floor(math.log2(size)))
    window_sizes = [2 ** power for power in range(1, max_power + 1)]

    rescaled_ranges = np.asarray(
        _calculate_rescaled_ranges(rolling, window_sizes), dtype=np.float64
    )

    # A window length whose R/S is NaN or non-positive (e.g. produced by flat
    # windows) cannot be log-transformed; drop it instead of letting it turn
    # the whole regression into NaN.
    usable = np.isfinite(rescaled_ranges) & (rescaled_ranges > 0)
    n_usable = int(np.count_nonzero(usable))
    if n_usable < 3:
        return float("nan"), float("nan")

    log_sizes = np.log(np.asarray(window_sizes, dtype=np.float64)[usable])
    log_r_s = np.log(rescaled_ranges[usable])
    slope, _, _, _, se = scipy.stats.linregress(log_sizes, log_r_s)

    # Degrees of freedom must reflect the points actually used in the fit.
    p_value = _hypothesis_test_random_walk(slope, se, n_usable)

    return float(slope), float(p_value)


def _calculate_rescaled_ranges(
    rolling: np.ndarray,
    window_sizes: List[int]
) -> np.ndarray:
    """Helper function to calculate rescaled ranges (R/S) for each window size."""
    r_s = np.zeros(len(window_sizes), dtype=np.float64)

    for i, window_size in enumerate(window_sizes):
        n_windows = len(rolling) // window_size
        truncated_size = n_windows * window_size
        
        windows = rolling[:truncated_size].reshape(n_windows, window_size)
        
        means = np.mean(windows, axis=1, keepdims=True)
        std_devs = np.std(windows, axis=1, ddof=1)
        demeaned = windows - means
        cumulative_sums = np.cumsum(demeaned, axis=1)
        ranges = np.max(cumulative_sums, axis=1) - np.min(cumulative_sums, axis=1)
        
        r_s[i] = np.mean(ranges / std_devs)

    return r_s


def _hypothesis_test_random_walk(hurst: float, se: float, n: int) -> float:
    """Helper function to test if Hurst exponent is significantly different from random_walk (0.5)"""
    random_walk = 0.5
    t_stat = (hurst - random_walk) / se 
    ddof = n - 2
    return 2 * scipy.stats.t.sf(abs(t_stat), ddof)

In [264]:
# Hurst estimate and H = 0.5 p-value for X (the feature_0 training column).
hurst_v3(X)

(0.1586313770243084, 6.899049706887254e-10)

# Na hora do teste

In [265]:
# Simulation setup: seed the legacy global NumPy RNG used by `standard_normal`.
np.random.seed(42)
n_points = 1000

# Linear ramp from 0 to 10.
# NOTE(review): `trend` is not used by any visible cell -- confirm whether it
# was meant to be added to `noise_trend`.
trend = np.linspace(0, 10, n_points)

# Cumulative sum of Gaussian steps scaled by 0.1.
noise_trend = np.cumsum(0.1 * standard_normal(n_points))

# AR(1) recursion x[t] = 0.6 * x[t-1] + 0.5 * eps[t], started at x[0] = 0.
# The shocks are drawn in one call, in the same position of the random stream
# as the per-step draws they replace.
ar_shocks = 0.5 * standard_normal(n_points - 1)
mean_reversion = np.zeros(n_points)
for step, shock in enumerate(ar_shocks, start=1):
    mean_reversion[step] = 0.6 * mean_reversion[step - 1] + shock

# Cumulative sum of standard-normal steps (random walk).
brownian = np.cumsum(standard_normal(n_points))

In [266]:
# noise_trend is a cumulative sum of Gaussian steps scaled by 0.1.
hurst_v3(noise_trend)

(0.6419241912818302, 0.000952023886777138)

In [267]:
# mean_reversion follows x[t] = 0.6 * x[t-1] + 0.5 * eps[t].
hurst_v3(mean_reversion)

(0.3685698852341455, 0.05402692976986088)

In [268]:
# brownian is a cumulative sum of standard-normal steps, i.e. the H = 0.5 reference case.
hurst_v3(brownian)

(0.5970314294136879, 0.04084100480394357)