# Monitoreo y detección de drift


Simulamos drift alterando la distribución de `test_preparation_course` y evaluamos el cambio con KS test.


In [1]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ks_2samp

CONFIG_PATH = Path("config.json")
if not CONFIG_PATH.exists():
    CONFIG_PATH = Path("../../config.json").resolve()

with CONFIG_PATH.open(encoding="utf-8") as cfg:
    config = json.load(cfg)

project_root = CONFIG_PATH.parent
clean_path = project_root / config["data"]["clean_dataset"]
drift_report_path = project_root / config["paths"]["drift_report"]
drift_plot_path = project_root / config["paths"]["drift_plot"]
drift_report_path.parent.mkdir(parents=True, exist_ok=True)

sns.set_theme(style="whitegrid")
df = pd.read_csv(clean_path)
target = config["features"]["target"]
print(f"Dataset base: {df.shape}")


Dataset base: (1000, 8)


In [2]:
rng = np.random.default_rng(seed=42)
drift_df = df.copy()

completed_mask = drift_df["test_preparation_course"].eq("completed")
if completed_mask.any():
    subset = drift_df[completed_mask].sample(frac=0.7, random_state=42)
    drift_df.loc[subset.index, "test_preparation_course"] = "none"
    drift_df.loc[subset.index, target] = (
        drift_df.loc[subset.index, target] - rng.normal(loc=7, scale=3, size=len(subset))
    ).clip(lower=0, upper=100)

print("Distribución original de test_preparation_course:")
print(df["test_preparation_course"].value_counts(normalize=True))
print("\nDistribución alterada:")
print(drift_df["test_preparation_course"].value_counts(normalize=True))


Distribución original de test_preparation_course:
test_preparation_course
none         0.642
completed    0.358
Name: proportion, dtype: float64

Distribución alterada:
test_preparation_course
none         0.893
completed    0.107
Name: proportion, dtype: float64


 74.61647879 60.94872778 35.05040347 58.55913178 46.36180608 64.66662419
 76.80190791 61.61827638 67.59747197 80.57787739 41.89374765 85.8766478
 42.3646491  58.14977773 80.55458709 45.04278863 64.33237598 72.46358845
 66.28498347 61.05640065 73.40307244 38.90366781 56.76180217 58.70753699
 65.5750572  53.21924505 47.53672819 52.44131818 59.15206173 64.61308312
 43.34184237 74.52046943 74.47344365 51.04822164 58.77023749 57.3705372
 65.99652912 73.30351603 64.64994257 73.34393421 48.38571367 57.32921335
 90.96325931 43.79726279 44.1326418  85.10613532 53.37146746 80.95901365
 55.41111796 74.91663354 71.82542675 55.51517607 75.59749335 50.09516494
 50.04860931 56.00465509 61.5117408  51.24133301 49.86632026 78.61995829
 58.04617522 57.38705538 75.42607236 57.57391297 63.82705897 70.39986164
 47.75835686 76.50851777 58.57272279 71.92854394 66.28175794 68.52438093
 49.12322882 38.92803962 78.62967429 75.98577782 43.08916154 83.14521368
 61.58751894 68.53908256 89.40820702 67.96251764 58.5

In [3]:
alpha = 0.05
statistic, pvalue = ks_2samp(df[target], drift_df[target])
drift_detected = pvalue < alpha
report = pd.DataFrame([
    {
        "metric": "ks_test_math_score",
        "statistic": statistic,
        "pvalue": pvalue,
        "alpha": alpha,
        "mean_original": df[target].mean(),
        "mean_drift": drift_df[target].mean(),
        "drift_detected": drift_detected,
    }
])
report.to_csv(drift_report_path, index=False, encoding="utf-8")
status = '⚠️ DRIFT' if drift_detected else '✅ Estable'
print(report)
print(f'Resultado KS: statistic={statistic:.4f}, pvalue={pvalue:.4f} -> {status} (alpha={alpha})')
print(f'Reporte guardado en {drift_report_path}')


               metric  statistic    pvalue  alpha  mean_original  mean_drift  \
0  ks_test_math_score      0.057  0.077611   0.05         66.089   64.369593   

   drift_detected  
0           False  
Resultado KS: statistic=0.0570, pvalue=0.0776 -> ✅ Estable (alpha=0.05)
Reporte guardado en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\reports\drift_report.csv


In [4]:
fig, ax = plt.subplots(figsize=(6, 4))
sns.kdeplot(df[target], label="Original", ax=ax)
sns.kdeplot(drift_df[target], label="Drift", ax=ax)
ax.set_title("Distribución de math_score")
ax.legend()
fig.tight_layout()
fig.savefig(drift_plot_path, dpi=200)
plt.close(fig)
print(f"Curva de densidad guardada en {drift_plot_path}")


Curva de densidad guardada en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\reports\drift_math_score.png
