# Análisis de Datos

Leer csv "high_alerts_enriched_with_tx.csv"

In [11]:
import pandas as pd
import numpy as np
import re

PATH = "../data/high_alerts_enriched_with_tx.csv"

def read_csv_smart(path):
    encs = ["utf-8-sig", "latin-1", "cp1252"]
    last_err = None
    for enc in encs:
        try:
            df = pd.read_csv(path, sep=None, engine="python", encoding=enc)
            print(f"Leído con encoding='{enc}'")
            return df
        except Exception as e:
            last_err = e
            continue
    raise last_err

def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = (out.columns
                   .str.replace("\ufeff", "", regex=False)
                   .str.strip()
                   .str.strip("\"'")
                   .str.rstrip(";"))
    return out

df = read_csv_smart(PATH)
df = normalize_cols(df)

# parsear fecha
if "created_at" in df.columns:
    df["created_at"] = pd.to_datetime(df["created_at"], utc=True, errors="coerce")

print("OK: archivo leído y columnas normalizadas.")


Leído con encoding='utf-8-sig'
OK: archivo leído y columnas normalizadas.


Chequeo forma, tipos de datos

In [12]:
print("shape:", df.shape)
print("\nDtypes:")
print(df.dtypes)

print("\nHead(3):")
display(df.head(3))

shape: (2147, 16)

Dtypes:
alert_id                                  int64
rule_code                                object
subject_ids                               int64
subject_names                            object
number_of_transactions                    int64
created_at                  datetime64[ns, UTC]
status                                   object
external_transaction_ids                 object
tx_direction                             object
tx_base_amount                           object
customer_type                            object
customer_account_balance                 object
customer_networth                        object
customer_income                          object
customer_expected_amount                 object
customer_sub_type                        object
dtype: object

Head(3):


Unnamed: 0,alert_id,rule_code,subject_ids,subject_names,number_of_transactions,created_at,status,external_transaction_ids,tx_direction,tx_base_amount,customer_type,customer_account_balance,customer_networth,customer_income,customer_expected_amount,customer_sub_type
0,82271,AAD-LA,26375,MOREL BULICIC JORGE RAFAEL,2,2025-03-07 12:27:52.142219+00:00,Not Suspicious,"201955096, 68816944","NA, NA","NA, NA","NA, NA","NA, NA","NA, NA","NA, NA","NA, NA",SIN_SEGMENTO
1,83230,P-TLO,24618,GARCIA LABORA WALDO,1,2025-03-11 12:33:06.388175+00:00,Not Suspicious,202249097,Outbound,230017346.0,Individual,366027051.0,SIN CLASIFICACION,Entre 5 y 10 millones,1500000000.0,Retail
2,83231,PGAV-OUT,24618,GARCIA LABORA WALDO,1,2025-03-11 12:33:06.388175+00:00,Not Suspicious,202249097,Outbound,230017346.0,Individual,366027051.0,SIN CLASIFICACION,Entre 5 y 10 millones,1500000000.0,Retail


% de alertas por segmento

In [13]:
seg_col = "customer_sub_type"
if seg_col not in df.columns:
    raise ValueError(f"Falta la columna '{seg_col}'")

seg_counts = df[seg_col].fillna("SIN_SEGMENTO").value_counts(dropna=False)
seg_pct = (seg_counts / seg_counts.sum() * 100).round(2)

res_seg = pd.DataFrame({"alertas": seg_counts, "%": seg_pct})

print("\nTop segmentos por cantidad de alertas:")
display(res_seg.head(10))



Top segmentos por cantidad de alertas:


Unnamed: 0_level_0,alertas,%
customer_sub_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Retail,1023,47.65
SIN_SEGMENTO,684,31.86
Investment Vehicle,181,8.43
Imports/Exports,142,6.61
LV Related- Organization,82,3.82
Institutional Clients,18,0.84
Big Companies,13,0.61
Non-Profit,3,0.14
"Retail, Imports/Exports",1,0.05


Top reglas

In [14]:
rule_counts = df["rule_code"].fillna("SIN_REGLA").value_counts()
rule_pct = (rule_counts / rule_counts.sum() * 100).round(2)
res_rules = pd.DataFrame({"alertas": rule_counts, "%": rule_pct})
display(res_rules.head(20))


Unnamed: 0_level_0,alertas,%
rule_code,Unnamed: 1_level_1,Unnamed: 2_level_1
PGAV-OUT,331,15.42
OUT>%IN,214,9.97
HNR-IN,187,8.71
HASUMI,135,6.29
PGAV-IN,99,4.61
P-LVAL,88,4.1
P-TLI,85,3.96
OCMC_1,84,3.91
HASUMO,76,3.54
IN>%OUT,74,3.45


Resumen por segmento: % alertas y % sospechosas

In [15]:
# Usamos df (ya leído/limpio)
seg_col = "customer_sub_type"
if seg_col not in df.columns:
    raise ValueError(f"Falta la columna '{seg_col}'")

# Segmento limpio para agrupar
df_seg = df.copy()
df_seg["segment"] = df_seg[seg_col].fillna("SIN_SEGMENTO").astype(str).str.strip()

# Flag de sospechosa (ojo: 'not suspicious' != 'suspicious')
status_norm = df_seg["status"].astype(str).str.strip().str.lower()
df_seg["is_suspicious"] = status_norm.eq("suspicious")

# Agregación por segmento
summary = (df_seg.groupby("segment", as_index=False)
                 .agg(alertas=("alert_id", "size"),
                      sospechosas=("is_suspicious", "sum")))

# Porcentajes
summary["%_alertas"] = (summary["alertas"] / summary["alertas"].sum() * 100).round(2)
summary["%_sospechosas"] = (summary["sospechosas"] / summary["alertas"] * 100).round(2)
summary["%_falsos_positivos"] = (100 - summary["%_sospechosas"]).round(2)

# Orden por tamaño
summary = summary.sort_values("alertas", ascending=False).reset_index(drop=True)

display(summary)


Unnamed: 0,segment,alertas,sospechosas,%_alertas,%_sospechosas,%_falsos_positivos
0,Retail,1023,70,47.65,6.84,93.16
1,SIN_SEGMENTO,684,25,31.86,3.65,96.35
2,Investment Vehicle,181,14,8.43,7.73,92.27
3,Imports/Exports,142,4,6.61,2.82,97.18
4,LV Related- Organization,82,7,3.82,8.54,91.46
5,Institutional Clients,18,0,0.84,0.0,100.0
6,Big Companies,13,0,0.61,0.0,100.0
7,Non-Profit,3,0,0.14,0.0,100.0
8,"Retail, Imports/Exports",1,1,0.05,100.0,0.0


Top segmentos por tasa de falsos positivos

In [16]:
umbral_min_alertas = 20  # ajusta si quieres
top_fp = (summary.loc[summary["alertas"] >= umbral_min_alertas]
                  .sort_values("%_falsos_positivos", ascending=False)
                  .reset_index(drop=True))

print(f"Segmentos con más falsos positivos (≥{umbral_min_alertas} alertas):")
display(top_fp.head(10))


Segmentos con más falsos positivos (≥20 alertas):


Unnamed: 0,segment,alertas,sospechosas,%_alertas,%_sospechosas,%_falsos_positivos
0,Imports/Exports,142,4,6.61,2.82,97.18
1,SIN_SEGMENTO,684,25,31.86,3.65,96.35
2,Retail,1023,70,47.65,6.84,93.16
3,Investment Vehicle,181,14,8.43,7.73,92.27
4,LV Related- Organization,82,7,3.82,8.54,91.46


Top 5 reglas por segmento

In [17]:
# Celda — Top-5 reglas por segmento (con % y tasa de FP)
seg_col = "customer_sub_type"
df_rules = df.copy()

# Segmento limpio y flag de sospechosa
df_rules["segment"] = df_rules[seg_col].fillna("SIN_SEGMENTO").astype(str).str.strip()
status_norm = df_rules["status"].astype(str).str.strip().str.lower()
df_rules["is_suspicious"] = status_norm.eq("suspicious")

# Totales por segmento
tot_seg = df_rules.groupby("segment")["alert_id"].size().rename("total_seg")

# Agregado Segmento × Regla
tbl = (df_rules.groupby(["segment", "rule_code"], as_index=False)
              .agg(alertas=("alert_id", "size"),
                   sospechosas=("is_suspicious", "sum")))

# Métricas dentro del segmento
tbl = tbl.merge(tot_seg.reset_index(), on="segment", how="left")
tbl["%_en_segmento"] = (tbl["alertas"] / tbl["total_seg"] * 100).round(2)
tbl["%_sospechosas"] = (tbl["sospechosas"] / tbl["alertas"] * 100).round(2)
tbl["%_falsos_positivos"] = (100 - tbl["%_sospechosas"]).round(2)

# Top-N por segmento (puedes ajustar estos parámetros)
top_n = 5
umbral_min_alertas_segmento = 20  # evita segmentos muy chicos
valid_segments = tot_seg[tot_seg >= umbral_min_alertas_segmento].index

top_rules_by_seg = (tbl[tbl["segment"].isin(valid_segments)]
                    .sort_values(["segment", "alertas"], ascending=[True, False])
                    .groupby("segment")
                    .head(top_n)
                    .reset_index(drop=True))

display(top_rules_by_seg)

# (Opcional) enfocar en un segmento específico
SEG_FOCUS = "Institutional Clients"
display(top_rules_by_seg[top_rules_by_seg["segment"] == SEG_FOCUS])


Unnamed: 0,segment,rule_code,alertas,sospechosas,total_seg,%_en_segmento,%_sospechosas,%_falsos_positivos
0,Imports/Exports,HASUMI,37,2,142,26.06,5.41,94.59
1,Imports/Exports,IN>%OUT,18,2,142,12.68,11.11,88.89
2,Imports/Exports,AAD-SMUR,16,0,142,11.27,0.0,100.0
3,Imports/Exports,OUT>%IN,11,0,142,7.75,0.0,100.0
4,Imports/Exports,OUT>AVG,11,0,142,7.75,0.0,100.0
5,Investment Vehicle,P-LVAL,31,3,181,17.13,9.68,90.32
6,Investment Vehicle,OUT>%IN,22,1,181,12.15,4.55,95.45
7,Investment Vehicle,PGAV-OUT,18,0,181,9.94,0.0,100.0
8,Investment Vehicle,NCU,14,1,181,7.73,7.14,92.86
9,Investment Vehicle,HANUMI,11,1,181,6.08,9.09,90.91


Unnamed: 0,segment,rule_code,alertas,sospechosas,total_seg,%_en_segmento,%_sospechosas,%_falsos_positivos


Ver cantidad de falsos positivos por segmento

In [18]:
# Celda — Distribución de falsos positivos por segmento
seg_col = "customer_sub_type"
if seg_col not in df.columns:
    raise ValueError(f"Falta la columna '{seg_col}'")

tmp = df.copy()
tmp["segment"] = tmp[seg_col].fillna("SIN_SEGMENTO").astype(str).str.strip()

# Normalizamos status y marcamos Falso Positivo (todo lo que NO sea 'Suspicious')
status_norm = tmp["status"].astype(str).str.strip().str.lower()
tmp["is_fp"] = ~status_norm.eq("suspicious")

total_alertas = len(tmp)
total_fp = int(tmp["is_fp"].sum())
pct_fp_global = (total_fp / total_alertas * 100) if total_alertas else 0.0

# Agregado por segmento
seg_summary_fp = (
    tmp.groupby("segment", as_index=False)
       .agg(alertas=("alert_id", "size"),
            falsos_positivos=("is_fp", "sum"))
)

# Métricas
seg_summary_fp["%_del_total_alertas"] = (seg_summary_fp["alertas"] / total_alertas * 100).round(2)
seg_summary_fp["%_del_total_FP"]      = (seg_summary_fp["falsos_positivos"] / total_fp * 100).round(2) if total_fp else 0
seg_summary_fp["tasa_FP_en_segmento"] = (seg_summary_fp["falsos_positivos"] / seg_summary_fp["alertas"] * 100).round(2)

# Orden principal: participación en el total de FP
seg_summary_fp = seg_summary_fp.sort_values("%_del_total_FP", ascending=False).reset_index(drop=True)

print(f"Total de alertas: {total_alertas:,}")
print(f"Falsos positivos (Not Suspicious): {total_fp:,} ({pct_fp_global:.2f}% del total)\n")

display(seg_summary_fp)


Total de alertas: 2,147
Falsos positivos (Not Suspicious): 2,026 (94.36% del total)



Unnamed: 0,segment,alertas,falsos_positivos,%_del_total_alertas,%_del_total_FP,tasa_FP_en_segmento
0,Retail,1023,953,47.65,47.04,93.16
1,SIN_SEGMENTO,684,659,31.86,32.53,96.35
2,Investment Vehicle,181,167,8.43,8.24,92.27
3,Imports/Exports,142,138,6.61,6.81,97.18
4,LV Related- Organization,82,75,3.82,3.7,91.46
5,Institutional Clients,18,18,0.84,0.89,100.0
6,Big Companies,13,13,0.61,0.64,100.0
7,Non-Profit,3,3,0.14,0.15,100.0
8,"Retail, Imports/Exports",1,0,0.05,0.0,0.0


In [21]:
import re
import pandas as pd

def normalize_name(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s).upper().strip()
    s = re.sub(r"\s+", " ", s)
    return s

# Copiamos dataset y preparamos claves
df_cli = df.copy()

# Clave por nombre (normalizada) y segmento limpio
df_cli["client_name_norm"] = df_cli["subject_names"].apply(normalize_name)
df_cli["segment"] = df_cli["customer_sub_type"].fillna("SIN_SEGMENTO").astype(str).str.strip()

# Flag de falso positivo (cualquier status != 'Suspicious')
status_norm = df_cli["status"].astype(str).str.strip().str.lower()
df_cli["is_fp"] = ~status_norm.eq("suspicious")

# Agregado por cliente (nombre) y segmento
cli_summary = (df_cli.groupby(["client_name_norm", "segment"], as_index=False)
                    .agg(alertas=("alert_id", "size"),
                         fp=("is_fp", "sum")))

cli_summary["fp_rate_pct"] = (cli_summary["fp"] / cli_summary["alertas"] * 100).round(2)

# Contribución de cada cliente al total de FP
total_fp = int(cli_summary["fp"].sum())
cli_summary["pct_fp_total"] = (cli_summary["fp"] / total_fp * 100).round(3) if total_fp else 0

# Nombre legible (modo del nombre original por clave normalizada)
name_mode = (df_cli.groupby("client_name_norm")["subject_names"]
                 .agg(lambda s: s.value_counts(dropna=False).index[0])
                 .rename("client_name"))
cli_summary = cli_summary.merge(name_mode, on="client_name_norm", how="left")

# Ranking por FP
cli_rank_fp = cli_summary.sort_values(["fp", "alertas"], ascending=False).reset_index(drop=True)

print("Top clientes por falsos positivos (aporte al total):")
display(cli_rank_fp[["client_name","segment","alertas","fp","fp_rate_pct","pct_fp_total"]].head(15))


Top clientes por falsos positivos (aporte al total):


Unnamed: 0,client_name,segment,alertas,fp,fp_rate_pct,pct_fp_total
0,AGRICOLA SIEMEL LIMITADA,SIN_SEGMENTO,57,57,100.0,2.813
1,SEGURO INVERSIONES SPA,SIN_SEGMENTO,54,54,100.0,2.665
2,NUTRICION Y ALIMENTO S.A.,SIN_SEGMENTO,45,43,95.56,2.122
3,CHILEAN MARRONI FROZEN SPA,SIN_SEGMENTO,38,38,100.0,1.876
4,MARTIN TARUD CAMILA,Retail,29,29,100.0,1.431
5,KURASZ ZAJACZKOWSKA ARTURO,Retail,28,28,100.0,1.382
6,GTD GRUPO TELEDUCTOS S.A.,Investment Vehicle,21,21,100.0,1.037
7,MARTIN TARUD VICENTE REINALDO,Retail,21,21,100.0,1.037
8,CAJA DE COMPENSACION 18 DE SEPTIEMBRE,Investment Vehicle,20,20,100.0,0.987
9,GARIBALDI FRITIS MAURIZIO,Retail,21,19,90.48,0.938


In [1]:
import pandas as pd
import re, unicodedata, csv

# --- Rutas (ajústalas si cambian) ---
SEG_PATH  = "../data/Segmentacion Clientes al 0209.csv"
ALERTS_PATH = "../data/high_alerts_enriched_with_tx.csv"
TX_PATH   = "../data/transacciones_cash_2025.csv"

MARCH_START_UTC = pd.Timestamp("2025-03-01", tz="UTC")
MARCH_START     = pd.Timestamp("2025-03-01")

def read_csv_smart(path, sep=None, dtype=None):
    for enc in ("utf-8-sig","latin-1","cp1252"):
        try:
            return pd.read_csv(path, sep=sep, engine="python", dtype=dtype, encoding=enc)
        except Exception:
            continue
    return pd.read_csv(path, sep=sep or ",", engine="python", dtype=dtype,
                       encoding="latin-1", on_bad_lines="skip")

def read_csv_robust_tx(path, dtype=None):
    attempts = [
        dict(sep=None, engine="python", dtype=dtype),
        dict(sep=",",   engine="python", dtype=dtype),
        dict(sep=",",   engine="python", dtype=dtype, on_bad_lines="skip"),
        dict(sep=",",   engine="python", dtype=dtype, quoting=csv.QUOTE_NONE, escapechar="\\", on_bad_lines="skip"),
    ]
    for enc in ("utf-8-sig","latin-1","cp1252"):
        for kw in attempts:
            try:
                return pd.read_csv(path, encoding=enc, **kw)
            except Exception:
                continue
    raise RuntimeError(f"No pude leer {path}")

def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = (out.columns
                   .str.replace("\ufeff","", regex=False)
                   .str.strip().str.strip("\"'").str.rstrip(";"))
    return out

def norm_name(s: str) -> str:
    if pd.isna(s): return ""
    s = unicodedata.normalize("NFKD", str(s))
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    return re.sub(r"\s+"," ", s.upper().strip())

# --- Carga y normalización ---

# Segmentación (snapshot)
seg = read_csv_smart(SEG_PATH, sep=";", dtype=str)
seg = normalize_cols(seg)
seg["NOMBRE_CLI_norm"] = seg["NOMBRE_CLI"].apply(norm_name)

# Alertas HIGH enriquecidas
alerts = read_csv_smart(ALERTS_PATH, sep=None)
alerts = normalize_cols(alerts)
alerts["created_at"] = pd.to_datetime(alerts.get("created_at"), utc=True, errors="coerce")
alerts = alerts.loc[alerts["created_at"] >= MARCH_START_UTC].copy()
alerts["subject_names_norm"] = alerts["subject_names"].apply(norm_name)
alerts["segment_alert"] = alerts["customer_sub_type"].astype(str).str.strip().replace({"nan":"SIN_SEGMENTO","": "SIN_SEGMENTO"})
alerts["_is_fp"] = alerts["status"].astype(str).str.strip().str.lower().ne("suspicious")  # True = FP

# Transacciones Cash (marzo+)
tx = read_csv_robust_tx(TX_PATH, dtype=str)
tx = normalize_cols(tx)
tx["tx_date_time"]      = pd.to_datetime(tx["tx_date_time"], errors="coerce")
tx = tx.loc[tx["tx_date_time"] >= MARCH_START].copy()
tx["customer_name_norm"]= tx["customer_name"].apply(norm_name)
tx["customer_sub_type"] = tx["customer_sub_type"].astype(str).str.strip().str.rstrip(";").replace({"nan":"SIN_SEGMENTO","": "SIN_SEGMENTO"})


In [None]:
clientes_activos = tx["customer_name_norm"].nunique()
print(f"Clientes ACTIVOS (≥1 transacción Cash desde 2025-03-01): {clientes_activos:,}")

# Cuántos de esos activos aparecen en alertas (≥1 alerta)
set_activos = set(tx["customer_name_norm"])
set_alertas = set(alerts["subject_names_norm"])
activos_con_alerta = len(set_activos & set_alertas)
print(f"Activos con ≥1 alerta: {activos_con_alerta:,} ({activos_con_alerta/clientes_activos*100:.2f}%)")



Clientes ACTIVOS (≥1 transacción Cash desde 2025-03-01): 16,234
Activos con ≥1 alerta: 745 (4.59%)


In [12]:
# === ¿De los clientes activos con alerta SIN SEGMENTO (132), cuántos tienen segmento en su ÚLTIMA transacción? ===
import pandas as pd
import unicodedata, re, csv

# ---------------- Rutas (ajusta si cambian) ----------------
SEG_PATH  = "../data/Segmentacion Clientes al 0209.csv"
ALERTS_PATH = "../data/high_alerts_enriched_with_tx.csv"
TX_PATH   = "../data/transacciones_cash_2025.csv"

# ---------------- Helpers ----------------
def _norm_name(s: str) -> str:
    if pd.isna(s): return ""
    s = unicodedata.normalize("NFKD", str(s))
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    return re.sub(r"\s+"," ", s.upper().strip())

def _normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = (out.columns
                   .str.replace("\ufeff","",regex=False)
                   .str.strip().str.strip("\"'")
                   .str.rstrip(";"))
    return out

def _read_csv(path, sep=None, dtype=None, robust_tx=False):
    if robust_tx:
        attempts = [
            dict(sep=None, engine="python", dtype=dtype),
            dict(sep=",", engine="python", dtype=dtype),
            dict(sep=",", engine="python", dtype=dtype, on_bad_lines="skip"),
            dict(sep=",", engine="python", dtype=dtype, quoting=csv.QUOTE_NONE, escapechar="\\", on_bad_lines="skip"),
        ]
        for enc in ("utf-8-sig","latin-1","cp1252"):
            for kw in attempts:
                try:
                    return pd.read_csv(path, encoding=enc, **kw)
                except Exception:
                    continue
        raise RuntimeError(f"No pude leer {path}")
    else:
        for enc in ("utf-8-sig","latin-1","cp1252"):
            try:
                return pd.read_csv(path, sep=sep, engine="python", dtype=dtype, encoding=enc)
            except Exception:
                continue
        return pd.read_csv(path, sep=sep or ",", engine="python", dtype=dtype, encoding="latin-1", on_bad_lines="skip")

def _is_valid_segment(x: str) -> bool:
    x = str(x).strip().rstrip(";")
    return (x not in ("", "nan", "None", "SIN_SEGMENTO")) and (x != "NA")

# ---------------- Cargar / normalizar ----------------
seg = _read_csv(SEG_PATH, sep=";", dtype=str)
seg = _normalize_cols(seg)
if "NOMBRE_CLI" not in seg.columns:
    raise ValueError("Segmentación no trae NOMBRE_CLI.")
seg["NOMBRE_CLI_norm"] = seg["NOMBRE_CLI"].apply(_norm_name)
set_seg = set(seg["NOMBRE_CLI_norm"])

alerts = _read_csv(ALERTS_PATH, sep=None)
alerts = _normalize_cols(alerts)
alerts["created_at"] = pd.to_datetime(alerts.get("created_at"), utc=True, errors="coerce")
alerts = alerts.loc[alerts["created_at"] >= pd.Timestamp("2025-03-01", tz="UTC")].copy()
alerts["subject_names_norm"] = alerts["subject_names"].apply(_norm_name)

tx = _read_csv(TX_PATH, dtype=str, robust_tx=True)
tx = _normalize_cols(tx)
tx["tx_date_time"] = pd.to_datetime(tx["tx_date_time"], errors="coerce")
tx = tx.loc[tx["tx_date_time"] >= pd.Timestamp("2025-03-01")]
tx["customer_name_norm"] = tx["customer_name"].apply(_norm_name)
tx["customer_sub_type"] = tx["customer_sub_type"].astype(str).str.strip().str.rstrip(";")

# ---------------- Universo: clientes ACTIVOS con ≥1 alerta ----------------
set_activos = set(tx["customer_name_norm"])
set_alertas = set(alerts["subject_names_norm"])
alert_clients = set_activos & set_alertas
n_alert_clients = len(alert_clients)

# Grupo A: dentro de esos, los que NO están en la tabla de segmentación (≈ 132)
alert_clients_no_seg = alert_clients - set_seg
n_no_seg = len(alert_clients_no_seg)

# ---------------- Última transacción de cada cliente NO segmentado ----------------
# Ordenamos por fecha y nos quedamos con la última fila por cliente
tx_no_seg = tx.loc[tx["customer_name_norm"].isin(alert_clients_no_seg)].copy()
tx_no_seg = tx_no_seg.sort_values(["customer_name_norm","tx_date_time"])
last_tx = tx_no_seg.groupby("customer_name_norm").tail(1).copy()

# ¿Su última tx trae un segmento válido?
last_tx["segmento_valido_en_ultima_tx"] = last_tx["customer_sub_type"].apply(_is_valid_segment)

# Conteos
con_segmento_ultima = int(last_tx["segmento_valido_en_ultima_tx"].sum())
sin_segmento_ultima = int((~last_tx["segmento_valido_en_ultima_tx"]).sum())

print("=== Clientes activos con alerta (marzo→hoy) y SIN segmento en la tabla de segmentación ===")
print(f"- Total en este grupo (esperado ~132): {n_no_seg:,}")
print(f"- Con segmento válido en su ÚLTIMA transacción: {con_segmento_ultima:,} ({con_segmento_ultima/n_no_seg*100:,.2f}%)")
print(f"- Sin segmento en su ÚLTIMA transacción       : {sin_segmento_ultima:,} ({sin_segmento_ultima/n_no_seg*100:,.2f}%)\n")

# Distribución de segmentos (última tx) para los que sí tienen uno
dist_seg = (last_tx.loc[last_tx["segmento_valido_en_ultima_tx"], "customer_sub_type"]
            .value_counts().rename("clientes"))
print("Top segmentos (última tx) dentro de los que sí traen segmento:")
display(dist_seg.head(10).to_frame())

# Muestras
ej_con = (last_tx.loc[last_tx["segmento_valido_en_ultima_tx"],
                      ["customer_name_norm","customer_sub_type","tx_date_time"]]
          .sort_values("tx_date_time", ascending=False)
          .head(15)
          .rename(columns={"customer_name_norm":"cliente", "customer_sub_type":"segmento_ultima_tx"}))

ej_sin = (last_tx.loc[~last_tx["segmento_valido_en_ultima_tx"],
                      ["customer_name_norm","customer_sub_type","tx_date_time"]]
          .sort_values("tx_date_time", ascending=False)
          .head(15)
          .rename(columns={"customer_name_norm":"cliente", "customer_sub_type":"valor_ultima_tx"}))

print("\nEjemplos con segmento en última tx:")
display(ej_con)

print("Ejemplos sin segmento en última tx (vacío/NA/SIN_SEGMENTO):")
display(ej_sin)


=== Clientes activos con alerta (marzo→hoy) y SIN segmento en la tabla de segmentación ===
- Total en este grupo (esperado ~132): 132
- Con segmento válido en su ÚLTIMA transacción: 17 (12.88%)
- Sin segmento en su ÚLTIMA transacción       : 115 (87.12%)

Top segmentos (última tx) dentro de los que sí traen segmento:


Unnamed: 0_level_0,clientes
customer_sub_type,Unnamed: 1_level_1
Retail,9
Imports/Exports,4
Institutional Clients,2
Investment Vehicle,2



Ejemplos con segmento en última tx:


Unnamed: 0,cliente,segmento_ultima_tx,tx_date_time
196517,CORREA MIGLIANO JUAN EDOARDO,Retail,2025-08-27
196470,VON UNGER DE LA CRUZ JUAN CA,Retail,2025-08-27
196366,PROA SPA,Imports/Exports,2025-08-27
195929,GRAYSON GROUP SPA,Imports/Exports,2025-08-27
195738,IAN TAYLOR CHILE S.A.,Imports/Exports,2025-08-27
191884,MIQUEL MACDONALD EDUARDO,Retail,2025-08-22
192443,FACTORING SECURITY S.A.,Institutional Clients,2025-08-22
190397,CHARPENTIER CANALAECHEVERR MAURICE,Retail,2025-08-21
186845,INMOBILIARIA COSTA AZUL SPA,Investment Vehicle,2025-08-19
188049,WENZ KUPFER SEBASTIAN,Retail,2025-08-19


Ejemplos sin segmento en última tx (vacío/NA/SIN_SEGMENTO):


Unnamed: 0,cliente,valor_ultima_tx,tx_date_time
195796,SEGURO INVERSIONES SPA,,2025-08-27
196443,ORDENES BERRIOS LUIS,,2025-08-27
195599,ROSAS MONTECINOS JORGE,,2025-08-27
196162,IMPORTADORA CAPRILE LTDA.,,2025-08-27
196135,TRAVERSO ROJAS PABLO,,2025-08-27
195887,COMERCIALIZADORA S Y A CHILE S.A.,,2025-08-27
195221,CHILEAN MARRONI FROZEN SPA,,2025-08-27
194419,VICENCIO SANTIBANEZ FELIPE,,2025-08-26
194444,RONDEAU . NICOLAS GABRIEL,,2025-08-26
193928,CONSTRUCTORA NR LTDA.,,2025-08-26


In [13]:
# === Breakdown de los 613 segmentados (activos con ≥1 alerta) por segmento ===
import pandas as pd
import re, unicodedata

# --- helpers mínimos (por si no están definidos) ---
def normalize_name(s: str) -> str:
    if pd.isna(s): return ""
    s = unicodedata.normalize("NFKD", str(s))
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(r"\s+", " ", s.upper().strip())
    return s

# --- asegurar columnas normalizadas ---
if "NOMBRE_CLI_norm" not in seg.columns and "NOMBRE_CLI" in seg.columns:
    seg["NOMBRE_CLI_norm"] = seg["NOMBRE_CLI"].apply(normalize_name)

if "subject_names_norm" not in alerts.columns and "subject_names" in alerts.columns:
    alerts["subject_names_norm"] = alerts["subject_names"].apply(normalize_name)

if "customer_name_norm" not in tx.columns:
    base_name_col = "customer_name" if "customer_name" in tx.columns else None
    if base_name_col is None:
        raise ValueError("No encuentro 'customer_name' en transacciones para normalizar.")
    tx["customer_name_norm"] = tx[base_name_col].apply(normalize_name)

# --- filtrar transacciones desde marzo para definir 'activos' ---
tx["tx_date_time"] = pd.to_datetime(tx["tx_date_time"], errors="coerce")
tx_mar = tx.loc[tx["tx_date_time"] >= pd.Timestamp("2025-03-01")].copy()

# --- conjuntos (nombres normalizados) ---
set_activos = set(tx_mar["customer_name_norm"])                 # activos (≥1 tx Cash)
set_alertas = set(alerts["subject_names_norm"])                 # clientes con ≥1 alerta
set_seg     = set(seg["NOMBRE_CLI_norm"])                       # universo con segmento

# clientes activos con alerta
activos_con_alerta = set_activos & set_alertas
# de esos, los que sí están en la tabla de segmentación (los ~613)
segmentados_con_alerta = sorted(list(activos_con_alerta & set_seg))

# --- preparar tabla de segmentación 'limpia' (última versión si hay fecha) ---
seg_clean = seg.copy()
if "Fecha_Actulizacion" in seg_clean.columns:
    seg_clean["_Fecha_Act_dt"] = pd.to_datetime(seg_clean["Fecha_Actulizacion"], dayfirst=True, errors="coerce")
    seg_clean = seg_clean.sort_values("_Fecha_Act_dt", na_position="last")
seg_clean = seg_clean.drop_duplicates(subset=["NOMBRE_CLI_norm"], keep="last")

# --- traer el segmento y contar únicos por segmento ---
df_segmentados = (
    pd.DataFrame({"NOMBRE_CLI_norm": segmentados_con_alerta})
      .merge(seg_clean[["NOMBRE_CLI_norm","Segmentos_CLI2"]], on="NOMBRE_CLI_norm", how="left")
)

df_segmentados["Segmentos_CLI2"] = (
    df_segmentados["Segmentos_CLI2"].astype(str).str.strip().str.rstrip(";").replace({"nan": pd.NA})
)

counts = df_segmentados["Segmentos_CLI2"].fillna("SIN_SEGMENTO").value_counts().rename("clientes")
pct    = (counts / counts.sum() * 100).round(2).rename("%")

print("=== Clientes SEGMENTADOS con ≥1 alerta (desde marzo): distribución por segmento ===")
display(pd.concat([counts, pct], axis=1))

# (Opcional) chequeo rápido de totales
print(f"\nChequeo: segmentados con alerta contados = {counts.sum():,} (esperado ~613)")


=== Clientes SEGMENTADOS con ≥1 alerta (desde marzo): distribución por segmento ===


Unnamed: 0_level_0,clientes,%
Segmentos_CLI2,Unnamed: 1_level_1,Unnamed: 2_level_1
Retail,438,71.45
Investment Vehicle,78,12.72
Imports/Exports,56,9.14
Institutional Clients,18,2.94
LV Related- Organization,15,2.45
Big Companies,7,1.14
Non-Profit,1,0.16



Chequeo: segmentados con alerta contados = 613 (esperado ~613)


In [1]:
# Top-15 por # alertas (ya lo tienes, lo reusamos aquí)
top15_ids = list(counts_cli.head(15).index)

# Regla más frecuente por cliente
rule_mode = (alerts.groupby(["subject_names_norm","rule_code"])["alert_id"].size()
                   .rename("n")
                   .reset_index()
                   .sort_values(["subject_names_norm","n"], ascending=[True, False])
                   .groupby("subject_names_norm").first()["rule_code"]
                   .rename("top_rule"))

# Tasa FP por cliente
cli_fp = (alerts.groupby("subject_names_norm")
                .agg(alertas=("alert_id","size"),
                     fp=("_is_fp","sum"))
                .assign(fp_rate_pct=lambda d: (d["fp"]/d["alertas"]*100).round(2)))

# Resumen mensual compacto: “AAAA-MM:count; …”
def month_compact(g):
    s = (g["created_at"].dt.tz_convert("UTC").dt.to_period("M").astype(str)
           .value_counts().sort_index())
    return "; ".join([f"{m}:{c}" for m,c in s.items()])

timeline = (alerts.groupby("subject_names_norm")
                  .apply(month_compact)
                  .rename("timeline"))

top15_table = (pd.Index(top15_ids, name="subject_names_norm").to_frame(index=False)
               .merge(name_mode, left_on="subject_names_norm", right_index=True, how="left")
               .merge(seg_mode,  left_on="subject_names_norm", right_index=True, how="left")
               .merge(rule_mode, left_on="subject_names_norm", right_index=True, how="left")
               .merge(cli_fp.reset_index(), on="subject_names_norm", how="left")
               .merge(timeline, left_on="subject_names_norm", right_index=True, how="left")
               .rename(columns={"subject_names_norm":"client_norm",
                                "segment_alert":"segmento",
                                "alertas":"n_alertas",
                                "fp":"n_fp"}))

print("Top-15 clientes — regla más frecuente, tasa FP y línea de tiempo mensual:")
display(top15_table[["client_name","segmento","top_rule","n_alertas","n_fp","fp_rate_pct","timeline"]])


NameError: name 'counts_cli' is not defined