In [1]:
import pandas as pd, numpy as np
pd.set_option("display.max_rows", 120)
pd.set_option("display.float_format", lambda x: f"{x:,.2f}")

PATH = "../data/high_alerts_enriched_with_tx.csv"  # ajusta si es necesario

alerts = pd.read_csv(PATH, encoding="utf-8-sig")
alerts.columns = [str(c).strip().replace("\ufeff","") for c in alerts.columns]

# Normalizaciones básicas
alerts["rule_code"] = alerts["rule_code"].astype(str).str.strip()
alerts["customer_sub_type"] = alerts["customer_sub_type"].astype(str).str.strip()
alerts["created_at"] = pd.to_datetime(alerts["created_at"], errors="coerce")
alerts["day"] = alerts["created_at"].dt.floor("D")

# Explode de subject_ids (clientes)
alerts["subject_ids"] = alerts["subject_ids"].astype(str)
alerts["subject_ids"] = alerts["subject_ids"].str.replace(r"[^\d,]", "", regex=True)
alerts["subject_ids"] = alerts["subject_ids"].str.split(",").apply(lambda xs: [x.strip() for x in xs if x.strip()!=''])
a = alerts.explode("subject_ids").rename(columns={"subject_ids":"customer_id"})

# Explode de external_transaction_ids (puede venir NA o varios separados por coma)
a["external_transaction_ids"] = a["external_transaction_ids"].astype(str)
a["external_transaction_ids"] = a["external_transaction_ids"].str.replace(r"[^\d,]", "", regex=True)
a["external_transaction_ids"] = a["external_transaction_ids"].str.split(",").apply(lambda xs: [x.strip() for x in xs if x.strip()!=''])
a = a.explode("external_transaction_ids").rename(columns={"external_transaction_ids":"tx_id"})

# Llaves de análisis
a["tx_key"] = np.where(a["tx_id"].notna() & (a["tx_id"]!=""),
                       a["customer_id"].astype(str).str.strip()+"|"+a["tx_id"].astype(str).str.strip(),
                       np.nan)

# Deja solo columnas útiles
cols = ["alert_id","rule_code","status","customer_sub_type","customer_id","tx_id","tx_key","day"]
a = a[cols].drop_duplicates()

print("Filas después del explode:", len(a))
print("Alertas únicas:", a["alert_id"].nunique(), "| Reglas distintas:", a["rule_code"].nunique())
a.head(8)


Filas después del explode: 2774
Alertas únicas: 2147 | Reglas distintas: 35


Unnamed: 0,alert_id,rule_code,status,customer_sub_type,customer_id,tx_id,tx_key,day
0,82271,AAD-LA,Not Suspicious,SIN_SEGMENTO,26375,201955096,26375|201955096,2025-03-07 00:00:00+00:00
0,82271,AAD-LA,Not Suspicious,SIN_SEGMENTO,26375,68816944,26375|68816944,2025-03-07 00:00:00+00:00
1,83230,P-TLO,Not Suspicious,Retail,24618,202249097,24618|202249097,2025-03-11 00:00:00+00:00
2,83231,PGAV-OUT,Not Suspicious,Retail,24618,202249097,24618|202249097,2025-03-11 00:00:00+00:00
3,83238,P-HVO,Suspicious,Retail,6144,202249095,6144|202249095,2025-03-11 00:00:00+00:00
4,83239,HNR-OUT,Suspicious,Retail,6144,202249095,6144|202249095,2025-03-11 00:00:00+00:00
5,83240,HASUMO,Not Suspicious,Retail,24618,202249096,24618|202249096,2025-03-11 00:00:00+00:00
5,83240,HASUMO,Not Suspicious,Retail,24618,202249097,24618|202249097,2025-03-11 00:00:00+00:00


In [2]:
def pair_counts(df_keys):
    """
    df_keys: DataFrame con columnas [key, rule_code]
    Retorna pares (rule_a, rule_b) con:
     - dice (2|A∩B| / (|A|+|B|))
     - support_A = |A|
     - support_B = |B|
     - co_hits   = |A∩B|
     - co_ratio_AtoB = |A∩B| / |A|
    """
    # Conjunto por key
    grp = df_keys.groupby("key")["rule_code"].apply(lambda s: sorted(set(s)))
    # Conteo individuales
    from collections import Counter
    cA = Counter([r for rules in grp for r in rules])

    # Conteo pares (combinaciones sin orden)
    from itertools import combinations
    cAB = Counter()
    for rules in grp:
        for a,b in combinations(rules, 2):
            if a > b: a,b = b,a
            cAB[(a,b)] += 1

    rows=[]
    for (a,b),ab in cAB.items():
        A = cA[a]; B = cA[b]
        dice = 2*ab/(A+B) if (A+B)>0 else 0
        rows.append({"rule_a":a,"rule_b":b,"dice":dice,"co_hits":ab,"A":A,"B":B})
    df = pd.DataFrame(rows).sort_values("dice", ascending=False)

    # ratios direccionales
    df["co_ratio_A→B"] = df["co_hits"]/df["A"]
    df["co_ratio_B→A"] = df["co_hits"]/df["B"]
    return df

# --- MISMA TRANSACCIÓN (cuando hay tx_id)
tx_df = a.dropna(subset=["tx_key"]).rename(columns={"tx_key":"key"})
pairs_tx = pair_counts(tx_df[["key","rule_code"]])
print("=== Similaridad (Dice) por MISMA transacción ===")
display(pairs_tx.sort_values(["dice","co_hits"], ascending=False).head(30))

# --- MISMO DÍA (clave = cliente x día)
day_df = a.copy()
day_df["key"] = day_df["customer_id"].astype(str)+"|"+day_df["day"].astype(str)
pairs_day = pair_counts(day_df[["key","rule_code"]])
print("\n=== Similaridad (Dice) por MISMO día ===")
display(pairs_day.sort_values(["dice","co_hits"], ascending=False).head(30))

# --- Siempre acompañada (A→B ≥ 0.95), con soporte mínimo
MIN_SUPP = 30  # ajusta umbral de cuántas veces aparece A
def always_with(pairs, min_supp=MIN_SUPP, thr=0.95):
    out = pairs.loc[(pairs["A"]>=min_supp) & (pairs["co_ratio_A→B"]>=thr),
                    ["rule_a","rule_b","A","co_hits","co_ratio_A→B"]]
    out = out.rename(columns={"rule_a":"rule_A","rule_b":"rule_B","A":"support_A",
                              "co_ratio_A→B":"co_ratio_A→B"})
    return out.sort_values(["co_ratio_A→B","support_A"], ascending=[False,False])

print("\n=== 'Siempre acompañada' por MISMA transacción ===")
display(always_with(pairs_tx))

print("\n=== 'Siempre acompañada' por MISMO DÍA ===")
display(always_with(pairs_day))


=== Similaridad (Dice) por MISMA transacción ===


Unnamed: 0,rule_a,rule_b,dice,co_hits,A,B,co_ratio_A→B,co_ratio_B→A
13,P-LBAL,P-TLI,0.63,40,42,85,0.95,0.47
101,P-HVI,RVT-IN,0.6,39,65,64,0.6,0.61
67,AAD-RD,AAD-RS,0.57,2,2,5,1.0,0.4
30,P-HSUMO,P-TLO,0.47,30,75,53,0.4,0.57
45,HASUMI,PGAV-IN,0.42,55,163,99,0.34,0.56
70,HNR-IN,RVT-IN,0.39,62,255,64,0.24,0.97
8,HNR-OUT,RVT-OUT,0.35,14,61,18,0.23,0.78
89,P-HSUMI,P-TLI,0.34,30,93,85,0.32,0.35
5,IN>%OUT,OUT>%IN,0.33,86,126,391,0.68,0.22
117,HANUMI,P-HVI,0.33,29,111,65,0.26,0.45



=== Similaridad (Dice) por MISMO día ===


Unnamed: 0,rule_a,rule_b,dice,co_hits,A,B,co_ratio_A→B,co_ratio_B→A
78,AAD-RD,AAD-RS,0.67,1,1,2,1.0,0.5
13,P-LBAL,P-TLI,0.63,40,42,84,0.95,0.48
134,P-HVI,RVT-IN,0.57,22,33,44,0.67,0.5
30,P-HSUMO,P-TLO,0.51,29,62,52,0.47,0.56
45,HASUMI,PGAV-IN,0.47,54,133,98,0.41,0.55
117,P-HSUMI,P-TLI,0.38,29,70,84,0.41,0.35
8,HNR-OUT,RVT-OUT,0.37,12,49,16,0.24,0.75
83,HNR-IN,RVT-IN,0.37,42,184,44,0.23,0.95
5,IN>%OUT,OUT>%IN,0.32,46,74,211,0.62,0.22
67,P-LVAL,P-TLI,0.3,25,83,84,0.3,0.3



=== 'Siempre acompañada' por MISMA transacción ===


Unnamed: 0,rule_A,rule_B,support_A,co_hits,co_ratio_A→B
13,P-LBAL,P-TLI,42,40,0.95



=== 'Siempre acompañada' por MISMO DÍA ===


Unnamed: 0,rule_A,rule_B,support_A,co_hits,co_ratio_A→B
13,P-LBAL,P-TLI,42,40,0.95


In [3]:
def solo_stats(level="tx"):
    """
    level: 'tx' para misma transacción, 'day' para mismo día
    Calcula, por regla:
      - total_hits (nº de keys donde aparece)
      - solo_hits   (keys donde solo está esa regla)
      - solo_rate   = solo_hits / total_hits
    """
    if level=="tx":
        df = a.dropna(subset=["tx_key"]).rename(columns={"tx_key":"key"})
    else:
        df = a.copy()
        df["key"] = df["customer_id"].astype(str)+"|"+df["day"].astype(str)

    g = df.groupby("key")["rule_code"].apply(lambda s: list(s))
    # total por regla
    total = df.groupby("rule_code")["key"].nunique().rename("total_hits")
    # solos
    solo_rules = []
    for rules in g:
        if len(set(rules))==1:
            solo_rules.append(rules[0])
    solo = pd.Series(solo_rules).value_counts().rename("solo_hits")
    out = (pd.concat([total, solo], axis=1).fillna(0).astype(int))
    out["solo_rate"] = out["solo_hits"] / out["total_hits"].replace(0,np.nan)
    return out.sort_values(["solo_rate","total_hits"], ascending=[True,False])

print("=== Reglas que NUNCA (o casi nunca) saltan solas — MISMA transacción ===")
solo_tx = solo_stats("tx"); display(solo_tx.head(25))

print("\n=== Reglas que NUNCA (o casi nunca) saltan solas — MISMO día ===")
solo_day = solo_stats("day"); display(solo_day.head(25))


=== Reglas que NUNCA (o casi nunca) saltan solas — MISMA transacción ===


Unnamed: 0,total_hits,solo_hits,solo_rate
AAD-RD,2,0,0.0
RVT-IN,64,1,0.02
P-LBAL,42,2,0.05
P-TLO,53,3,0.06
P-TLI,85,5,0.06
P-HVI,65,10,0.15
RVT-OUT,18,3,0.17
IN>%OUT,126,24,0.19
P-HSUMO,75,15,0.2
PGAV-IN,99,21,0.21



=== Reglas que NUNCA (o casi nunca) saltan solas — MISMO día ===


Unnamed: 0,total_hits,solo_hits,solo_rate
AAD-RD,1,0,0.0
RVT-IN,44,1,0.02
P-LBAL,42,1,0.02
P-TLO,52,3,0.06
P-TLI,84,5,0.06
P-HVI,33,2,0.06
RVT-OUT,16,3,0.19
P-HSUMO,62,12,0.19
P-HSUMI,70,14,0.2
IN>%OUT,74,15,0.2


In [4]:
def coverage_of(A, B, base_pairs):
    """
    Cobertura direccional A→B usando el dataframe de pares (tx o day).
    """
    row = base_pairs[(base_pairs["rule_a"]==min(A,B)) & (base_pairs["rule_b"]==max(A,B))]
    if row.empty:
        return None
    r = row.iloc[0]
    if A<=B:
        return float(r["co_ratio_A→B"])
    else:
        return float(r["co_ratio_B→A"])

def check_subset_pairs(pairs, tag):
    pairs_to_check = [
        ("P-LBAL","P-TLI"),
        ("RVT-IN","HNR-IN"),
        ("RVT-OUT","HNR-OUT"),
        ("OUT>AVG","P-TLO"),
        ("P-HSUMO","P-TLO"),
        ("P-HVI","P-HSUMI"),
        ("P-HVO","P-HSUMO"),
        ("PGAV-IN","P-TLI"),
        ("PGAV-OUT","P-TLO"),
        ("NBCOU","NCOU")
    ]
    rows=[]
    for A,B in pairs_to_check:
        cov_A_B = coverage_of(A,B,pairs)
        cov_B_A = coverage_of(B,A,pairs)
        rows.append({"pair":f"{A} vs {B}", f"{tag}_A→B":cov_A_B, f"{tag}_B→A":cov_B_A})
    return pd.DataFrame(rows)

print("=== Coberturas por MISMA transacción (A→B) en pares clave ===")
display(check_subset_pairs(pairs_tx, "tx"))

print("\n=== Coberturas por MISMO día (A→B) en pares clave ===")
display(check_subset_pairs(pairs_day, "day"))


=== Coberturas por MISMA transacción (A→B) en pares clave ===


Unnamed: 0,pair,tx_A→B,tx_B→A
0,P-LBAL vs P-TLI,0.95,0.47
1,RVT-IN vs HNR-IN,0.97,0.24
2,RVT-OUT vs HNR-OUT,0.78,0.23
3,OUT>AVG vs P-TLO,0.21,0.21
4,P-HSUMO vs P-TLO,0.4,0.57
5,P-HVI vs P-HSUMI,0.38,0.27
6,P-HVO vs P-HSUMO,0.26,0.08
7,PGAV-IN vs P-TLI,0.27,0.32
8,PGAV-OUT vs P-TLO,0.11,0.72
9,NBCOU vs NCOU,,



=== Coberturas por MISMO día (A→B) en pares clave ===


Unnamed: 0,pair,day_A→B,day_B→A
0,P-LBAL vs P-TLI,0.95,0.48
1,RVT-IN vs HNR-IN,0.95,0.23
2,RVT-OUT vs HNR-OUT,0.75,0.24
3,OUT>AVG vs P-TLO,0.22,0.21
4,P-HSUMO vs P-TLO,0.47,0.56
5,P-HVI vs P-HSUMI,0.36,0.17
6,P-HVO vs P-HSUMO,0.3,0.05
7,PGAV-IN vs P-TLI,0.28,0.32
8,PGAV-OUT vs P-TLO,0.12,0.73
9,NBCOU vs NCOU,,


In [5]:
# === Verificación: "apagar" reglas candidatas y chequear si se pierden alertas sospechosas ===
import pandas as pd
import numpy as np
from datetime import datetime

pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# --- Parámetros editables -----------------------------------------------------
PATH_ALERTS = "../data/high_alerts_enriched_with_tx.csv"

# Reglas a apagar (según propuesta)
RULES_OFF = [
    "P-LBAL",      # subset de P-TLI
    "RVT-IN",      # contenida por HNR-IN
    "RVT-OUT",     # contenida por HNR-OUT
    # Elegir UNA entre NBCOU / NCOU (deja comentada la que mantienes)
    "NBCOU",
    # "NCOU",
    # Si decides también apagar P-TLO porque te quedas con PGAV-OUT, destápala:
    # "P-TLO",
]

# (Opcional) filtrar por segmento. Usa None para "todos"
SEGMENTS = None
# Ejemplos:
# SEGMENTS = ["Investment Vehicle"]
# SEGMENTS = ["Retail", "Big Companies"]

# --- Carga y normalización ----------------------------------------------------
alerts = pd.read_csv(PATH_ALERTS, encoding="utf-8-sig")
alerts.columns = [str(c).strip().replace("\ufeff","") for c in alerts.columns]

# Normaliza
alerts["rule_code"] = alerts["rule_code"].astype(str).str.strip().str.upper()
alerts["status"]    = alerts["status"].astype(str).str.strip().str.title()  # "Suspicious" / "Not Suspicious"
alerts["customer_sub_type"] = alerts["customer_sub_type"].astype(str).str.strip()
alerts["created_at"] = pd.to_datetime(alerts["created_at"], errors="coerce")

# Estandariza tx ids y subject ids como strings limpias (para matching)
alerts["external_transaction_ids_norm"] = (
    alerts["external_transaction_ids"].astype(str).str.replace(" ", "", regex=False).str.upper()
)
alerts["subject_ids_norm"] = alerts["subject_ids"].astype(str).str.replace(" ", "", regex=False)

# Filtro por segmento (si aplica)
if SEGMENTS:
    alerts_f = alerts[alerts["customer_sub_type"].isin(SEGMENTS)].copy()
else:
    alerts_f = alerts.copy()

# --- Separar "apagadas" vs "activas" -----------------------------------------
off_mask   = alerts_f["rule_code"].isin([r.upper() for r in RULES_OFF])
alerts_off = alerts_f[off_mask].copy()
alerts_on  = alerts_f[~off_mask].copy()

# --- Análisis de pérdidas de alertas -----------------------------------------
# 1) ¿Cuántas alertas totales y sospechosas se "pierden" al apagar?
lost_total = len(alerts_off)
lost_susp  = int((alerts_off["status"] == "Suspicious").sum())

# 2) Entre las sospechosas "perdidas", ver si están cubiertas por otra regla
sus_lost = alerts_off[alerts_off["status"] == "Suspicious"].copy()

def covered_by_other_rule(row, on_df):
    sid = row["subject_ids_norm"]
    txs = row["external_transaction_ids_norm"]
    dt  = row["created_at"].normalize()  # solo la fecha
    # a) match exacto por (subject, tx_id) si existe un tx_id
    covered_tx = False
    if txs and txs != "NA" and txs != "NAN":
        covered_tx = (
            on_df[(on_df["subject_ids_norm"] == sid) &
                  (on_df["external_transaction_ids_norm"] == txs)]
            .shape[0] > 0
        )
    # b) si no hay tx_id usable, probar por misma fecha y sujeto
    covered_day = (
        on_df[(on_df["subject_ids_norm"] == sid) &
              (on_df["created_at"].dt.normalize() == dt)]
        .shape[0] > 0
    )
    return bool(covered_tx or covered_day)

if not sus_lost.empty:
    sus_lost["covered_by_other_rule"] = sus_lost.apply(covered_by_other_rule, axis=1, on_df=alerts_on)
else:
    sus_lost["covered_by_other_rule"] = []

truly_lost = sus_lost[~sus_lost["covered_by_other_rule"]].copy()

# --- Tablas resumen -----------------------------------------------------------
by_rule = (alerts_off
           .groupby("rule_code", as_index=False)
           .agg(lost_alerts=("alert_id","count"),
                lost_suspicious=("status", lambda s: int((s=="Suspicious").sum())),
                lost_not_suspicious=("status", lambda s: int((s=="Not Suspicious").sum())))
           .sort_values("lost_alerts", ascending=False))

by_rule_sus = (sus_lost
               .groupby("rule_code", as_index=False)
               .agg(sus_lost=("alert_id","count"),
                    sus_covered=("covered_by_other_rule", lambda s: int(s.sum())),
                    sus_truly_lost=("covered_by_other_rule", lambda s: int((~s).sum())))
               .sort_values("sus_lost", ascending=False))

# --- Output -------------------------------------------------------------------
print("=== Configuración de prueba ===")
print(f"Reglas 'apagadas': {', '.join(RULES_OFF)}")
print(f"Segmentos: {'Todos' if not SEGMENTS else ', '.join(SEGMENTS)}\n")

print("=== Impacto bruto de apagar reglas ===")
print(f"- Alertas que se irían: {lost_total:,}")
print(f"- Sospechosas que se irían (antes de chequear cobertura): {lost_susp:,}\n")

print("== Detalle por regla (impacto bruto) ==")
display(by_rule)

print("\n=== Cobertura de SOSPECHOSAS perdidas por otras reglas ===")
print(f"- Sospechosas perdidas (bruto): {len(sus_lost):,}")
print(f"- Cubiertas por otra regla (misma transacción o mismo día del mismo cliente): {int(sus_lost['covered_by_other_rule'].sum()):,}")
print(f"- Sospechosas realmente perdidas (sin cobertura): {len(truly_lost):,}\n")

display(by_rule_sus)

if len(truly_lost) > 0:
    print("\n>>> ATENCIÓN: hay sospechosas que se perderían SIN cobertura. Muestra de casos:")
    display(truly_lost[["alert_id","rule_code","subject_ids","subject_names","created_at",
                         "external_transaction_ids","status","customer_sub_type"]].head(20))
else:
    print("\n✅ OK: Al apagar este set de reglas, no se pierden alertas sospechosas sin que otra regla las cubra (por misma transacción o mismo día).")



=== Configuración de prueba ===
Reglas 'apagadas': P-LBAL, RVT-IN, RVT-OUT, NBCOU
Segmentos: Todos

=== Impacto bruto de apagar reglas ===
- Alertas que se irían: 103
- Sospechosas que se irían (antes de chequear cobertura): 6

== Detalle por regla (impacto bruto) ==


Unnamed: 0,rule_code,lost_alerts,lost_suspicious,lost_not_suspicious
1,RVT-IN,45,3,41
0,P-LBAL,42,3,39
2,RVT-OUT,16,0,16



=== Cobertura de SOSPECHOSAS perdidas por otras reglas ===
- Sospechosas perdidas (bruto): 6
- Cubiertas por otra regla (misma transacción o mismo día del mismo cliente): 5
- Sospechosas realmente perdidas (sin cobertura): 1



Unnamed: 0,rule_code,sus_lost,sus_covered,sus_truly_lost
0,P-LBAL,3,3,0
1,RVT-IN,3,2,1



>>> ATENCIÓN: hay sospechosas que se perderían SIN cobertura. Muestra de casos:


Unnamed: 0,alert_id,rule_code,subject_ids,subject_names,created_at,external_transaction_ids,status,customer_sub_type
1450,147315,RVT-IN,50297,JELVES CARVAJAL EMILIO ALFONSO,2025-07-09 13:48:57.479536+00:00,210184071,Suspicious,SIN_SEGMENTO
