In [3]:
# === Comparación de alertas reales vs simuladas (Actual) =======================
import json, pandas as pd
from pathlib import Path

# --- Adjust these paths if they differ in your repo ---
CSV_PATH = Path("alertas marzo-agosto 2025 enriched.csv")
COMPACT_JSON_PATH = Path("alerts_summary_compact.json")

# Counting window in UTC (created_at is parsed as tz-aware UTC before comparing).
# Both bounds are used inclusively by the date filter.
COUNT_FROM_UTC = pd.Timestamp("2025-03-01", tz="UTC")
# Last representable instant of 31 Aug 2025: ending at 23:59:59.000 would drop
# timestamps carrying fractional seconds within the final second of the window.
COUNT_TO_UTC   = pd.Timestamp("2025-09-01", tz="UTC") - pd.Timedelta(1, unit="ns")

# ------------------- Data load -------------------
df = pd.read_csv(CSV_PATH, dtype={"alert_id": "string"}, encoding="utf-8-sig", low_memory=False)

# Defensive key fields: guarantee that every column used downstream exists.
# DataFrame.get returns its scalar default (or None) when a column is absent,
# and a scalar has no `.astype`, so absent columns are created explicitly as
# all-missing instead.
for _col in ("customer_sub_sub_type", "rule_code", "alert_id"):
    if _col not in df.columns:
        df[_col] = pd.NA

df["customer_sub_sub_type"] = df["customer_sub_sub_type"].astype(str)

# Strip surrounding whitespace, but keep genuinely missing codes as NaN:
# astype(str) alone would turn them into the literal string "nan", which a
# later dropna/groupby could no longer recognise as missing.
_rule_code_present = df["rule_code"].notna()
df["rule_code"] = df["rule_code"].astype(str).str.strip().where(_rule_code_present)

df["alert_id"] = df["alert_id"].astype("string")

# ------------------- Date-range filter (UTC) -------------------
# Always parse with utc=True so every timestamp ends up tz-aware in UTC
# (naive values are treated as UTC). Values that cannot be parsed become NaT.
if "created_at" in df.columns:
    created_at_utc = pd.to_datetime(df["created_at"], utc=True, errors="coerce")
    # Inclusive on both ends; comparisons against NaT evaluate to False.
    inside_window = (created_at_utc >= COUNT_FROM_UTC) & (created_at_utc <= COUNT_TO_UTC)
    # Rows without a usable timestamp are kept rather than discarded.
    mask_time = inside_window | created_at_utc.isna()
    df = df.loc[mask_time].copy()

# ------------------- Retail sub-sub-type filter -------------------
# Only the two retail segments take part in the comparison.
retail_mask = df["customer_sub_sub_type"].isin(["R-Low", "R-High"])
df_retail = df.loc[retail_mask].copy()

# ------------------- Real counts: unique alerts per rule -------------------
# Rows missing either key are discarded first, then distinct alert ids are
# counted within each rule code.
retail_keyed = df_retail.dropna(subset=["rule_code", "alert_id"])
unique_alerts_per_rule = retail_keyed.groupby("rule_code")["alert_id"].nunique()
real_counts = unique_alerts_per_rule.rename("reales").reset_index()

# ------------------- Load simulated counts (compact summary) -------------------
# The JSON is read as a mapping of rule code -> dict; the "actual" entry of
# each dict is taken as the simulated alert count for that rule.
with open(COMPACT_JSON_PATH, "r", encoding="utf-8") as f:
    compact = json.load(f)

sim_rows = [
    {
        # Strip like the real-side rule codes so padded keys still match on merge.
        "rule_code": str(regla).strip(),
        # A missing or null "actual" counts as zero instead of raising in int().
        "simuladas_actual": int(vals.get("actual") or 0),
    }
    for regla, vals in compact.items()
]
# Explicit columns keep the frame mergeable on "rule_code" even when the JSON
# object is empty (an empty list of rows would otherwise yield no columns).
sim_df = pd.DataFrame(sim_rows, columns=["rule_code", "simuladas_actual"])

# ------------------- Merge + metrics -------------------
# Outer merge keeps rules present on only one side; their missing count is 0.
out = pd.merge(sim_df, real_counts, on="rule_code", how="outer").fillna({"simuladas_actual": 0, "reales": 0})
out["simuladas_actual"] = out["simuladas_actual"].astype(int)
out["reales"] = out["reales"].astype(int)

out["delta"] = out["simuladas_actual"] - out["reales"]
# Vectorised ratio: rules with no real alerts get NaN instead of dividing by
# zero. Unlike a row-wise apply, this also works when the frame is empty.
out["ratio_sim_vs_real"] = out["simuladas_actual"] / out["reales"].where(out["reales"] > 0)

# Order: largest |delta| first, ties broken alphabetically by rule code.
# Sorting on the signed delta would push large negative gaps to the bottom.
out = (
    out.assign(_abs_delta=out["delta"].abs())
       .sort_values(by=["_abs_delta", "rule_code"], ascending=[False, True])
       .drop(columns="_abs_delta")
       .reset_index(drop=True)
)

# Totals row appended below the per-rule comparison.
total_sim = out["simuladas_actual"].sum()
total_real = out["reales"].sum()
if total_real > 0:
    total_ratio = total_sim / total_real
else:
    # No real alerts at all: the overall ratio is undefined.
    total_ratio = None

totals = pd.DataFrame({
    "rule_code": ["TOTAL"],
    "simuladas_actual": [int(total_sim)],
    "reales": [int(total_real)],
    "delta": [int(out["delta"].sum())],
    "ratio_sim_vs_real": [total_ratio],
})

out_with_total = pd.concat([out, totals], ignore_index=True)
report_columns = ["rule_code", "reales", "simuladas_actual", "delta", "ratio_sim_vs_real"]
display(out_with_total[report_columns])


Unnamed: 0,rule_code,reales,simuladas_actual,delta,ratio_sim_vs_real
0,NUMCCI,0,13047,13047,
1,NUMCCO,0,12218,12218,
2,SUMCCI,0,3637,3637,
3,P-LBAL,192,3712,3520,19.333333
4,OUT>%IN,300,3685,3385,12.283333
5,HNR-OUT,488,3373,2885,6.911885
6,SUMCCO,0,2770,2770,
7,P-HSUMI,252,2520,2268,10.0
8,HNR-IN,529,2477,1948,4.68242
9,P-HSUMO,165,2013,1848,12.2
