### Imports

In [1]:
import sys
import os
sys.path.append(os.path.abspath("../.."))
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime
from tinyshift.tracker import CatDrift, ConDrift
import numpy as np
from tinyshift.stats import chebyshev_guaranteed_percentage
from tinyshift.series import hurst_exponent, foreca
from numpy.random import standard_normal

### Data

In [2]:
# --- Synthetic dataset configuration -------------------------------------
n_samples = 100000
n_features = 20
n_informative = 2
n_redundant = 2
weights = [0.2, 0.8]  # probabilities of values 0 / 1 in `category_col` below

# Seed NumPy's global RNG; the two np.random.choice draws further down
# consume it in this exact order.
np.random.seed(42)

# make_classification is seeded separately through random_state.
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    flip_y=0.05,
    random_state=42,
)

# Two-level categorical column drawn with the probabilities in `weights`.
category_col = np.random.choice([0, 1], size=n_samples, p=weights)

# Five-level categorical column; no `p` is given, so levels are equally likely.
diverse_category_col = np.random.choice([0, 1, 2, 3, 4], size=n_samples)


In [3]:
def generate_random_dates_within_interval(start_date, years, n_samples):
    """
    Generates random dates that advance week by week from ``start_date``.

    The output is filled in consecutive segments of
    ``max(1, n_samples // (years * 52))`` elements. Each segment is drawn
    uniformly, with replacement, from a 7-day window; the window moves forward
    by one week after every segment. Dates are therefore non-decreasing from
    one segment to the next, but unordered inside a segment.

    Parameters:
    - start_date: First day of the first weekly window (datetime object).
    - years: The number of years for the interval, counted as 52 weeks each
      (positive int).
    - n_samples: The number of random date samples to generate (int).

    Returns:
    - np.array: ``datetime64[s]`` array of length ``n_samples``.

    Raises:
    - ValueError: If ``years`` is not positive.

    Notes:
    - NumPy's global RNG is reseeded with 42 on every call, so the result is
      deterministic and subsequent ``np.random`` draws are affected.
    - If ``n_samples`` is not a multiple of the number of weeks, the leftover
      samples spill into additional weeks past the nominal interval.
    - If ``n_samples`` is smaller than the number of weeks, one sample is
      placed in each of the first ``n_samples`` weeks.
    """
    weeks = years * 52
    if weeks <= 0:
        # A zero value used to fail with ZeroDivisionError and a negative one
        # silently returned the uninitialised buffer; reject both up front.
        raise ValueError(f"years must be a positive number, got {years!r}")

    np.random.seed(42)

    # Clamp to 1 so range() never receives a zero step when there are fewer
    # samples than weeks.
    segment_len = max(1, n_samples // weeks)
    result = np.empty(n_samples, dtype="datetime64[s]")
    window_start = start_date

    for offset in range(0, n_samples, segment_len):
        week_days = pd.date_range(start=window_start, periods=7, freq="D")
        # The final segment may be shorter than the others.
        count = min(segment_len, n_samples - offset)
        result[offset : offset + count] = np.random.choice(
            week_days, size=count, replace=True
        )
        window_start += timedelta(7)

    return result

In [4]:
# Assemble the full dataset. Column order matters: the training cell selects
# features as "every column except the last two", so `target` and `datetime`
# must stay at the end.
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)])
df["unique_id"] = 1
df['discrete_0'] = category_col
df['discrete_1'] = diverse_category_col
df['target'] = y
df['datetime'] = generate_random_dates_within_interval(datetime(2021, 1, 1), 4, n_samples)

# Chronological split: train (< 2024), reference (2024 H1), test (>= 2024-07-01).
is_train = df["datetime"] < '2024-01-01'
is_reference = (df["datetime"] >= '2024-01-01') & (df["datetime"] < '2024-07-01')
is_test = df["datetime"] >= '2024-07-01'

# Every split is an independent copy so later column assignments never touch
# (or warn about) the parent frame.
df_train = df[is_train].copy()
df_reference = df[is_reference].copy()
df_test = df[is_test].copy()

# The three windows must partition the data with no gaps or overlaps.
assert len(df_train) + len(df_reference) + len(df_test) == len(df), "splits do not partition df"

In [5]:
# Model inputs: every column except the trailing two; the label is `target`.
X_train = df_train.loc[:, df_train.columns[:-2]]
y_train = df_train.loc[:, "target"]

### Training

In [None]:
# Random forest with a fixed seed, out-of-bag scoring enabled, all cores used
# and balanced class weights.
rf = RandomForestClassifier(
    class_weight="balanced",
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)

### df_test

In [None]:
# Score the test window with a single pass through the forest.
# Features are selected by name (the columns the model was trained on) rather
# than positionally: this cell appends two columns to df_test, so a positional
# `columns[:-2]` slice would pick the wrong inputs if the cell were re-run.
class_proba = rf.predict_proba(df_test[X_train.columns])

# Column index of the most probable class for each row.
best_idx = class_proba.argmax(axis=1)

# Map column positions back to class labels via rf.classes_ instead of
# assuming the labels are 0..k-1.
y_pred = rf.classes_[best_idx]
# Probability the model assigned to the class it predicted.
y_prob = class_proba[np.arange(len(df_test)), best_idx]

df_test["y_prob"] = y_prob
df_test["prediction"] = y_pred

### df_reference

In [None]:
# Score the reference window with a single pass through the forest.
# Features are selected by name (the columns the model was trained on) rather
# than positionally: this cell appends two columns to df_reference, so a
# positional `columns[:-2]` slice would pick the wrong inputs on a re-run.
class_proba = rf.predict_proba(df_reference[X_train.columns])

# Column index of the most probable class for each row.
best_idx = class_proba.argmax(axis=1)

# Map column positions back to class labels via rf.classes_ instead of
# assuming the labels are 0..k-1.
y_pred = rf.classes_[best_idx]
# Probability the model assigned to the class it predicted.
y_prob = class_proba[np.arange(len(df_reference)), best_idx]

df_reference["y_prob"] = y_prob
df_reference["prediction"] = y_pred

# Discrete Data Drift

### Median Absolute Deviation (MAD)

In [None]:
# Categorical drift tracker on `discrete_1`: weekly buckets, "chebyshev"
# metric, threshold chosen with the 'mad' rule. `id_col` is passed explicitly
# so construction uses the same identifier column as predict() and as the
# other trackers in this notebook.
tracker = CatDrift(
    df_reference,
    func="chebyshev",
    freq="W",
    drift_limit='mad',
    time_col="datetime",
    id_col="unique_id",
    target_col="discrete_1",
)
# Sanity check: evaluate the tracker on the same data it was built from.
tracker.predict(df_reference, time_col="datetime", id_col="unique_id", target_col="discrete_1")

Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-01-07,0.000104,False
1,1,2024-01-14,0.003061,False
2,1,2024-01-21,0.024436,False
3,1,2024-01-28,0.03349,False
4,1,2024-02-04,0.018948,False
5,1,2024-02-11,0.000282,False
6,1,2024-02-18,0.005303,False
7,1,2024-02-25,0.029938,False
8,1,2024-03-03,0.018149,False
9,1,2024-03-10,0.02896,False


#### Reference Plot

In [None]:
# Evaluate the current `tracker` on the held-out test window.
tracker.predict(
    df_test,
    target_col="discrete_1",
    id_col="unique_id",
    time_col="datetime",
)

Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-07-07,0.012021,False
1,1,2024-07-14,0.003653,False
2,1,2024-07-21,0.016244,False
3,1,2024-07-28,0.004718,False
4,1,2024-08-04,0.012901,False
5,1,2024-08-11,0.017629,False
6,1,2024-08-18,0.012513,False
7,1,2024-08-25,0.010431,False
8,1,2024-09-01,0.022417,False
9,1,2024-09-08,0.019443,False


### Interquartile Range (IQR)

In [None]:
# Column mapping shared by the constructor and predict().
drift_columns = dict(time_col="datetime", id_col="unique_id", target_col="discrete_1")

# Same weekly "chebyshev" tracker, now with the 'iqr' limit and the
# "expanding" method.
tracker = CatDrift(
    df_reference,
    func="chebyshev",
    drift_limit='iqr',
    freq="W",
    method="expanding",
    **drift_columns,
)
tracker.predict(df_reference, **drift_columns)

Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-01-07,0.000104,False
1,1,2024-01-14,0.003061,False
2,1,2024-01-21,0.024436,False
3,1,2024-01-28,0.03349,False
4,1,2024-02-04,0.018948,False
5,1,2024-02-11,0.000282,False
6,1,2024-02-18,0.005303,False
7,1,2024-02-25,0.029938,False
8,1,2024-03-03,0.018149,False
9,1,2024-03-10,0.02896,False


### Quantile Interval & Jackknife Method

In [None]:
# Column mapping shared by the constructor and predict().
drift_columns = dict(time_col="datetime", id_col="unique_id", target_col="discrete_1")

# Weekly "chebyshev" tracker with a ("quantile", None, 0.95) limit and the
# "jackknife" method.
tracker = CatDrift(
    df_reference,
    func="chebyshev",
    drift_limit=("quantile", None, 0.95),
    freq="W",
    method="jackknife",
    **drift_columns,
)
tracker.predict(df_reference, **drift_columns)

Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-01-07,0.000104,False
1,1,2024-01-14,0.003061,False
2,1,2024-01-21,0.024436,False
3,1,2024-01-28,0.03349,False
4,1,2024-02-04,0.018948,False
5,1,2024-02-11,0.000282,False
6,1,2024-02-18,0.005303,False
7,1,2024-02-25,0.029938,False
8,1,2024-03-03,0.018149,False
9,1,2024-03-10,0.02896,False


# Continuous Data Drift

## Wasserstein Distance

In [None]:
# Continuous drift tracker on `feature_1`: weekly buckets, "ws" metric,
# drift limit left on "auto".
ws = ConDrift(
    df_reference,
    func="ws",
    freq="W",
    drift_limit="auto",
    time_col="datetime",
    id_col="unique_id",
    target_col="feature_1",
)

In [None]:
# Evaluate the tracker on the reference window it was built from.
ws.predict(
    df_reference,
    time_col="datetime",
    id_col="unique_id",
    target_col="feature_1",
)

Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-01-07,0.036837,False
1,1,2024-01-14,0.060039,False
2,1,2024-01-21,0.096586,True
3,1,2024-01-28,0.035891,False
4,1,2024-02-04,0.069498,False
5,1,2024-02-11,0.032843,False
6,1,2024-02-18,0.067842,False
7,1,2024-02-25,0.057296,False
8,1,2024-03-03,0.03564,False
9,1,2024-03-10,0.056241,False


In [None]:
# Rebuilds `ws` with exactly the same arguments as the earlier Wasserstein
# cell, so the test-window evaluation starts from a freshly built tracker.
ws = ConDrift(
    df_reference,
    func="ws",
    freq="W",
    drift_limit="auto",
    time_col="datetime",
    id_col="unique_id",
    target_col="feature_1",
)

In [None]:
# Evaluate the reference-built tracker on the held-out test window.
ws.predict(
    df_test,
    time_col="datetime",
    id_col="unique_id",
    target_col="feature_1",
)

Unnamed: 0,unique_id,datetime,metric,drift
0,1,2024-07-07,0.063965,False
1,1,2024-07-14,0.076502,False
2,1,2024-07-21,0.126251,True
3,1,2024-07-28,0.071962,False
4,1,2024-08-04,0.030056,False
5,1,2024-08-11,0.076564,False
6,1,2024-08-18,0.037549,False
7,1,2024-08-25,0.046158,False
8,1,2024-09-01,0.108532,True
9,1,2024-09-08,0.036845,False
