### Imports

In [25]:
import sys
import os
sys.path.append(os.path.abspath("../.."))
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime
from tinyshift.drift import CatDrift, ConDrift
import numpy as np
from tinyshift.stats import chebyshev_guaranteed_percentage
from tinyshift.series import hurst_exponent, foreca
from numpy.random import standard_normal

### Data

In [26]:
# Synthetic dataset configuration. `weights` is only used as the sampling
# probabilities for `category_col`; it is not passed to make_classification.
n_samples, n_features = 100_000, 20
n_informative = n_redundant = 2
weights = [0.2, 0.8]

# Seed the global NumPy RNG that drives the two np.random.choice draws below;
# make_classification is seeded separately through random_state.
np.random.seed(42)

classification_kwargs = dict(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    flip_y=0.05,
    random_state=42,
)
X, y = make_classification(**classification_kwargs)

# Binary categorical column drawn with probabilities `weights`.
category_col = np.random.choice([0, 1], size=n_samples, p=weights)

# Five-level categorical column drawn uniformly.
diverse_category_col = np.random.choice(list(range(5)), size=n_samples)


In [27]:
def generate_random_dates_within_interval(start_date, years, n_samples):
    """
    Generates a set of random dates laid out in consecutive weekly segments.

    The output is filled segment by segment: segment k holds dates drawn
    uniformly (with replacement) from the 7 days starting at
    ``start_date + k weeks``. Each segment has ``n_samples // (years * 52)``
    entries, clamped to at least 1 so that fewer samples than weeks still
    works (in that case only the first ``n_samples`` weeks are used). When
    ``n_samples`` is not a multiple of the week count, the remainder spills
    into additional week(s) after ``years * 52`` weeks.

    Note: this function reseeds the global NumPy RNG with 42 on every call,
    so the result is deterministic and the global RNG state is changed.

    Parameters:
    - start_date: The starting date (datetime object).
    - years: The number of years for the interval (positive int).
    - n_samples: The number of random date samples to generate (int).

    Returns:
    - np.array: Array (dtype datetime64[s]) of randomly sampled dates.

    Raises:
    - ValueError: If `years` does not yield a positive number of weeks.
    """
    weeks = years * 52
    if weeks <= 0:
        raise ValueError("years must be a positive number of years")

    np.random.seed(42)

    # Clamp to 1: with n_samples < weeks the integer division is 0, which
    # would be an invalid step for range().
    per_week = max(1, n_samples // weeks)
    result = np.empty(n_samples, dtype="datetime64[s]")

    for week_idx, offset in enumerate(range(0, n_samples, per_week)):
        week_start = start_date + timedelta(weeks=week_idx)
        date_range = pd.date_range(start=week_start, periods=7, freq="D")
        # The final segment may be shorter than per_week.
        segment_size = min(per_week, n_samples - offset)
        result[offset : offset + segment_size] = np.random.choice(
            date_range, size=segment_size, replace=True
        )

    return result

In [28]:
# Assemble the modelling frame: the generated features followed, in order, by
# unique_id, the two discrete columns, the target and the event datetime.
feature_names = [f"feature_{i}" for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names).assign(
    unique_id=1,
    discrete_0=category_col,
    discrete_1=diverse_category_col,
    target=y,
    datetime=generate_random_dates_within_interval(datetime(2021, 1, 1), 4, n_samples),
)

# Time-based split: everything before 2024 trains the model, the first half of
# 2024 is the drift reference window and the rest is the test window.
event_time = df["datetime"]
in_reference_window = (event_time >= "2024-01-01") & (event_time < "2024-07-01")

df_train = df[event_time < "2024"]
df_reference = df[in_reference_window].copy()
df_test = df[event_time >= "2024-07-01"].copy()

In [None]:
# Relabel a reproducible 5% sample of rows as unique_id == 2, then rebuild the
# three time windows so they pick up the updated column.
np.random.seed(42)
n_replace = int(len(df) * 0.05)
idx = df.sample(n=n_replace, random_state=42).index
df.loc[idx, "unique_id"] = 2

event_time = df["datetime"]
in_reference_window = (event_time >= "2024-01-01") & (event_time < "2024-07-01")

df_train = df[event_time < "2024"]
df_reference = df[in_reference_window].copy()
df_test = df[event_time >= "2024-07-01"].copy()

Atribuídos unique_id=2 a 5000 linhas (5.00%)
unique_id
1    95000
2     5000
Name: count, dtype: int64


In [30]:
# Training design matrix: every column except the label and the timestamp.
# Dropping by name (rather than slicing off the last two columns positionally)
# keeps the selection correct even if columns are appended or reordered.
# Note that unique_id and the discrete_* columns remain model inputs.
X_train = df_train.drop(columns=["target", "datetime"])
y_train = df_train["target"]

### Training

In [31]:
# Fit the classifier on the pre-2024 window. The fit call is left as the
# cell's last expression so the fitted estimator is displayed.
rf = RandomForestClassifier(
    n_jobs=-1,
    oob_score=True,
    class_weight="balanced",
    random_state=42,
)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### df_test

In [32]:
# Score the test window.
# Select inputs by the feature names the forest was fitted on instead of
# `columns[:-2]`: once this cell has appended `y_prob`/`prediction`, a
# positional slice would pick up `target`/`datetime` on a re-run.
feature_cols = list(rf.feature_names_in_)

# One forest pass: the predicted class is the one with the highest mean
# probability, so both the label and its probability come from predict_proba.
test_proba = rf.predict_proba(df_test[feature_cols])
test_top = test_proba.argmax(axis=1)

y_pred = rf.classes_[test_top]
y_prob = test_proba[np.arange(len(df_test)), test_top]

df_test["y_prob"] = y_prob
df_test["prediction"] = y_pred

### df_reference

In [33]:
# Score the reference window.
# Select inputs by the feature names the forest was fitted on instead of
# `columns[:-2]`: once this cell has appended `y_prob`/`prediction`, a
# positional slice would pick up `target`/`datetime` on a re-run.
feature_cols = list(rf.feature_names_in_)

# One forest pass: the predicted class is the one with the highest mean
# probability, so both the label and its probability come from predict_proba.
reference_proba = rf.predict_proba(df_reference[feature_cols])
reference_top = reference_proba.argmax(axis=1)

y_pred = rf.classes_[reference_top]
y_prob = reference_proba[np.arange(len(df_reference)), reference_top]

df_reference["y_prob"] = y_prob
df_reference["prediction"] = y_pred

# Discrete Data Drift

### Median Absolute Deviation (MAD)

In [46]:
%%time
# Categorical drift tracker over `discrete_1` on the reference window, weekly
# frequency, "mad" drift limit. `id_col` is passed explicitly so this tracker
# is configured like the other trackers and the predict calls in this notebook.
tracker = CatDrift(
    df_reference,
    func="chebyshev",
    freq="W",
    drift_limit="mad",
    time_col="datetime",
    id_col="unique_id",
    target_col="discrete_1",
)

CPU times: user 9.45 ms, sys: 0 ns, total: 9.45 ms
Wall time: 8.36 ms


In [47]:
%%time
# Evaluate the tracker on the reference window itself; the resulting table is
# the cell output.
tracker.predict(
    df_reference,
    id_col="unique_id",
    time_col="datetime",
    target_col="discrete_1",
)

CPU times: user 5.34 ms, sys: 3.39 ms, total: 8.73 ms
Wall time: 7.72 ms


Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-01-07,0.023623,False
1,1,2024-01-14,0.032022,False
2,1,2024-01-21,0.027684,False
3,1,2024-01-28,0.04336,False
4,1,2024-02-04,0.032039,False
5,1,2024-02-11,0.026642,False
6,1,2024-02-18,0.042232,False
7,1,2024-02-25,0.031039,False
8,1,2024-03-03,0.018809,False
9,1,2024-03-10,0.036373,False


#### Reference Plot

In [36]:
# Evaluate the same tracker on the held-out test window.
tracker.predict(
    df_test,
    id_col="unique_id",
    time_col="datetime",
    target_col="discrete_1",
)

Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-07-07,0.024383,False
1,1,2024-07-14,0.026516,False
2,1,2024-07-21,0.034076,False
3,1,2024-07-28,0.029449,False
4,1,2024-08-04,0.039414,False
5,1,2024-08-11,0.034282,False
6,1,2024-08-18,0.027887,False
7,1,2024-08-25,0.040502,False
8,1,2024-09-01,0.032947,False
9,1,2024-09-08,0.036093,False


### Interquartile Range (IQR)

In [37]:
%%time
# Categorical drift tracker over `discrete_1` using the "iqr" drift limit and
# the "expanding" method, fitted on the reference window at weekly frequency.
tracker = CatDrift(
    df_reference,
    id_col="unique_id",
    time_col="datetime",
    target_col="discrete_1",
    freq="W",
    func="chebyshev",
    drift_limit="iqr",
    method="expanding",
)

CPU times: user 7.33 ms, sys: 0 ns, total: 7.33 ms
Wall time: 6.53 ms


In [38]:
%%time
# Evaluate the IQR-limited tracker on the reference window; the resulting
# table is the cell output.
tracker.predict(
    df_reference,
    id_col="unique_id",
    time_col="datetime",
    target_col="discrete_1",
)

CPU times: user 8.63 ms, sys: 0 ns, total: 8.63 ms
Wall time: 7.84 ms


Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-01-07,0.023623,False
1,1,2024-01-14,0.032022,False
2,1,2024-01-21,0.027684,False
3,1,2024-01-28,0.04336,False
4,1,2024-02-04,0.032039,False
5,1,2024-02-11,0.026642,False
6,1,2024-02-18,0.042232,False
7,1,2024-02-25,0.031039,False
8,1,2024-03-03,0.018809,False
9,1,2024-03-10,0.036373,False


### Quantile Interval & Jackknife Method

In [39]:
%%time
# Categorical drift tracker with a quantile-based drift limit.
# NOTE(review): the tuple is presumably (kind, lower, upper), i.e. no lower
# bound and an upper bound at the 0.95 quantile - confirm against tinyshift.
quantile_limit = ("quantile", None, 0.95)
tracker = CatDrift(
    df_reference,
    id_col="unique_id",
    time_col="datetime",
    target_col="discrete_1",
    freq="W",
    func="chebyshev",
    drift_limit=quantile_limit,
    method="expanding",
)

CPU times: user 4.67 ms, sys: 3.79 ms, total: 8.46 ms
Wall time: 7.58 ms


# Continuous Data Drift

## Wasserstein Distance

In [40]:
%%time
# Continuous drift tracker for `feature_1` using the "ws" function (Wasserstein
# distance, per the section heading), weekly frequency, "auto" drift limit.
ws = ConDrift(
    df_reference,
    id_col="unique_id",
    time_col="datetime",
    target_col="feature_1",
    freq="W",
    func="ws",
    drift_limit="auto",
)

CPU times: user 48.6 ms, sys: 3 μs, total: 48.6 ms
Wall time: 47.9 ms


In [41]:
%%time
# Evaluate the continuous tracker on the reference window; the resulting table
# is the cell output.
ws.predict(
    df_reference,
    time_col="datetime",
    id_col="unique_id",
    target_col="feature_1",
)

CPU times: user 38.8 ms, sys: 3.99 ms, total: 42.8 ms
Wall time: 42 ms


Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-01-07,0.045129,False
1,1,2024-01-14,0.059187,False
2,1,2024-01-21,0.099801,False
3,1,2024-01-28,0.042846,False
4,1,2024-02-04,0.07665,False
5,1,2024-02-11,0.030873,False
6,1,2024-02-18,0.070656,False
7,1,2024-02-25,0.053244,False
8,1,2024-03-03,0.046958,False
9,1,2024-03-10,0.055509,False


In [42]:
%%time
# Rebuild the continuous tracker with the same configuration as the earlier
# ConDrift cell before scoring the test window.
ws = ConDrift(
    df_reference,
    id_col="unique_id",
    time_col="datetime",
    target_col="feature_1",
    freq="W",
    func="ws",
    drift_limit="auto",
)

CPU times: user 54 ms, sys: 23 μs, total: 54 ms
Wall time: 52.9 ms


In [43]:
%%time
# Evaluate the continuous tracker on the held-out test window.
ws.predict(
    df_test,
    time_col="datetime",
    id_col="unique_id",
    target_col="feature_1",
)

CPU times: user 39.8 ms, sys: 3 μs, total: 39.8 ms
Wall time: 38.9 ms


Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-07-07,0.050105,False
1,1,2024-07-14,0.079363,False
2,1,2024-07-21,0.141737,True
3,1,2024-07-28,0.06283,False
4,1,2024-08-04,0.027785,False
5,1,2024-08-11,0.068626,False
6,1,2024-08-18,0.038329,False
7,1,2024-08-25,0.047665,False
8,1,2024-09-01,0.111672,True
9,1,2024-09-08,0.039121,False
