# In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp, chisquare
from sklearn.metrics import mutual_info_score
from scipy.spatial.distance import jensenshannon

from ft_engineering import prepare_features, load_data
import joblib


# PSI (Population Stability Index)
def psi(expected, actual, buckets=10):
    """Compute the Population Stability Index (PSI) between two numeric samples.

    Bucket boundaries are taken from the quantiles of ``expected`` only, and
    both samples are counted against those same boundaries. (Bucketing each
    sample on its own quantiles would put roughly ``1 / buckets`` of the data
    in every bucket for both samples and always yield a PSI near zero.)

    The first and last buckets are open-ended, so the maximum of ``expected``
    and any ``actual`` values outside the reference range are still counted.
    Missing values are dropped before bucketing.

    Args:
        expected: Reference (baseline) sample; anything ``pd.Series`` accepts.
        actual: Sample compared against the reference.
        buckets: Number of quantile buckets requested. Fewer buckets are used
            when quantiles of ``expected`` coincide (heavily tied data).

    Returns:
        The PSI as a float, or NaN when either sample has no non-missing
        values.
    """
    expected_values = pd.Series(expected, dtype="float64").dropna().to_numpy()
    actual_values = pd.Series(actual, dtype="float64").dropna().to_numpy()

    # PSI is undefined without data on both sides.
    if expected_values.size == 0 or actual_values.size == 0:
        return float("nan")

    # Only the interior quantiles are needed as cut points: everything below
    # the first cut is bucket 0, everything at/above the last cut is the final
    # bucket. np.unique collapses repeated quantiles that would otherwise
    # create buckets that can never be populated.
    interior_probs = np.linspace(0, 1, buckets + 1)[1:-1]
    cut_points = np.unique(np.quantile(expected_values, interior_probs))
    n_bins = cut_points.size + 1

    def _bucket_shares(values):
        # side="right" makes bucket i cover [cut[i-1], cut[i]), i.e. the same
        # left-closed / right-open convention as a `>= lo & < hi` filter.
        bucket_ids = np.searchsorted(cut_points, values, side="right")
        shares = np.bincount(bucket_ids, minlength=n_bins) / values.size
        # Floor empty buckets so the logarithm below stays finite.
        return np.where(shares == 0, 0.0001, shares)

    expected_shares = _bucket_shares(expected_values)
    actual_shares = _bucket_shares(actual_values)

    return float(
        np.sum((expected_shares - actual_shares)
               * np.log(expected_shares / actual_shares))
    )


# DRIFT REPORT
def _js_distance(base_values, new_values, bins=10):
    """Return the Jensen-Shannon distance of two samples on shared histogram bins."""
    # Bin edges come from the pooled data so both histograms have the same
    # length and position i refers to the same value range in each of them.
    edges = np.histogram_bin_edges(
        np.concatenate([base_values, new_values]), bins=bins
    )
    base_hist, _ = np.histogram(base_values, bins=edges)
    new_hist, _ = np.histogram(new_values, bins=edges)
    # jensenshannon normalizes the count vectors to probabilities itself.
    return float(jensenshannon(base_hist, new_hist))


def compute_drift(base_df, new_df):
    """Build a per-column data-drift report comparing ``new_df`` to ``base_df``.

    Columns are selected from ``base_df`` by dtype; other dtypes are ignored:

    * ``int64`` / ``float64`` columns get the two-sample KS test p-value
      (``KS_pvalue``), the Population Stability Index (``PSI``) and the
      Jensen-Shannon distance over shared histogram bins (``JS``).
    * ``object`` / ``bool`` columns get a chi-square goodness-of-fit test of
      the new category counts against the baseline proportions (``Chi2``,
      ``Chi2_pvalue``).

    Missing values are dropped before every test. When a column has no
    non-missing values on either side, its metrics are reported as NaN.
    A category that never occurs in the baseline has an expected count of
    zero, which yields an infinite ``Chi2`` and a p-value of 0.

    Args:
        base_df: Reference (baseline) DataFrame.
        new_df: DataFrame to check; must contain the selected columns.

    Returns:
        A DataFrame indexed by column name with one column per metric;
        metrics that do not apply to a column are NaN.
    """
    numeric_cols = base_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = base_df.select_dtypes(include=['object', 'bool']).columns

    drift_results = {}

    # Numeric columns
    for col in numeric_cols:
        base_values = base_df[col].dropna()
        new_values = new_df[col].dropna()

        if base_values.empty or new_values.empty:
            drift_results[col] = {
                "KS_pvalue": np.nan,
                "PSI": np.nan,
                "JS": np.nan
            }
            continue

        drift_results[col] = {
            "KS_pvalue": ks_2samp(base_values, new_values).pvalue,
            "PSI": psi(base_values, new_values),
            "JS": _js_distance(base_values.to_numpy(dtype=float),
                               new_values.to_numpy(dtype=float))
        }

    # Categorical columns
    for col in categorical_cols:
        # A single align() call guarantees both count vectors share one
        # category index, with 0 for categories missing on either side.
        expected_counts, observed_counts = base_df[col].value_counts().align(
            new_df[col].value_counts(), fill_value=0
        )
        expected_total = expected_counts.sum()
        observed_total = observed_counts.sum()

        if expected_total == 0 or observed_total == 0:
            drift_results[col] = {
                "Chi2": np.nan,
                "Chi2_pvalue": np.nan
            }
            continue

        # chisquare requires expected and observed frequencies to have the
        # same total, so rescale the baseline counts to the new sample size.
        scaled_expected = (expected_counts.to_numpy(dtype=float)
                           * (observed_total / expected_total))
        # Zero expected counts (unseen categories) divide by zero inside
        # chisquare; the resulting inf statistic is intended, so silence
        # the floating-point warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            chi, p = chisquare(observed_counts.to_numpy(dtype=float),
                               scaled_expected)

        drift_results[col] = {
            "Chi2": float(chi),
            "Chi2_pvalue": float(p)
        }

    return pd.DataFrame(drift_results).T


if __name__ == "__main__":
    # Script entry point: compare the historical sample with a new batch,
    # print the per-column drift table and save it as CSV.
    print("\n=== Ejecutando monitoreo ===")

    # Input and output locations (relative paths).
    baseline_path, new_data_path = (
        "../../data/raw/telco_churn_baseline.csv",   # historical data
        "../../data/raw/telco_churn_new_batch.csv",  # new data (simulation)
    )
    output_path = "../monitoring/data_drift_report.csv"

    # Load the baseline first, then the new batch.
    baseline, new_data = (
        pd.read_csv(csv_path) for csv_path in (baseline_path, new_data_path)
    )

    # Compute the drift metrics and show them under their header.
    drift_table = compute_drift(baseline, new_data)
    print("\n=== Resultados Data Drift ===", drift_table, sep="\n")

    # Persist the report and tell the user where it went.
    drift_table.to_csv(output_path)
    print(f"\nReporte guardado en: {output_path}")
