In [211]:
import numpy as np
from tinyshift.stats import StatisticalInterval
from sklearn.utils import check_array
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import pandas as pd

In [212]:

weights = [0.2, 0.8]

X, y = make_classification(
    n_samples=100000, 
    n_features=20, 
    n_informative=2,      
    weights=weights, 
    random_state=42,
    n_redundant=2)

num_samples = X.shape[0]

categorical_col1 = np.random.choice(['A', 'B', 'C'], size=num_samples)
categorical_col2 = np.random.choice(['X', 'Y', 'Z'], size=num_samples)

df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [585]:
def hampel_filter_v1(X, rolling_window=3, factor=3):
    X = np.asarray(X)
    is_outlier = np.zeros(X.shape[0], dtype=bool)

    for i in range(rolling_window, X.shape[0] - rolling_window):
        window = X[i - rolling_window : i]
        median = np.median(window)
        mad = np.median(np.abs(window - np.median(window)))
        threshold = factor * mad
        
        if abs(X[i - rolling_window] - median) > threshold:
            is_outlier[i - rolling_window] = True

    
    return is_outlier

In [580]:
def hampel_filter_v2(X, rolling_window=3, factor=3):
    X = np.asarray(X)
    is_outlier = np.zeros(X.shape[0], dtype=bool)
    window_indeces = [np.arange(i - rolling_window, i) for i in range(rolling_window, X.shape[0] - rolling_window)]
    medians = np.median(X[window_indeces], axis=1)[:, None]
    mad = np.median(np.abs(X[window_indeces] - medians), axis=1)
    thresholds = factor * mad 
    for i, indeces in enumerate(window_indeces):
        is_outlier[indeces] = abs(X[i] - medians[i]) > thresholds[i]
    return is_outlier

In [586]:
%%time
outlier_v1 = hampel_filter(df_train["feature_0"], rolling_window=3)

CPU times: user 2.89 s, sys: 19.6 ms, total: 2.91 s
Wall time: 2.96 s


In [587]:
%%time
outlier_v2 = hampel_filter_v2(df_train["feature_0"], rolling_window=3)

CPU times: user 250 ms, sys: 8.95 ms, total: 259 ms
Wall time: 264 ms


In [588]:
dados = np.array([1, 2, 3, 10, 4, 3, 2, 1])  # 10 é um outlier

In [591]:
hampel_filter_v2(dados)

array([False, False, False, False, False, False, False, False])

In [422]:
filtered_v2 

array([False,  True, False, ..., False, False, False], shape=(80000,))