# Generar tabla enriquecida uniendo las alertas high con las transacciones cash desde marzo de 2025

In [6]:
import pandas as pd

TX_PATH = "../data/transacciones_cash_2025.csv"
ALERTS_PATH = "../data/high_alerts_2025.csv"
OUT_PATH = "../data/high_alerts_enriched_with_tx.csv"

# --- util: normalizar nombres de columnas ---
def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = (out.columns
                     .str.replace("\ufeff", "", regex=False)  # BOM si lo hubiera
                     .str.strip()                              # espacios extremos
                     .str.strip("\"'")                         # comillas
                     .str.rstrip(";"))                         # ; finales (como customer_sub_type;)
    return out

# --- leer CSVs ---
tx = pd.read_csv(TX_PATH)                # suele venir con comas
alerts = pd.read_csv(ALERTS_PATH, sep=";")  # este viene con ; por lo que indico sep

tx = normalize_cols(tx)
alerts = normalize_cols(alerts)

# --- helpers para ids externos ---
def _normalize_single_id(token) -> str:
    if pd.isna(token): 
        return None
    t = str(token).strip().strip("[](){}\"'")
    if t.endswith(".0"):
        t = t[:-2]
    return t

def _split_external_ids(cell) -> list:
    if pd.isna(cell):
        return []
    s = str(cell)
    parts = s.split(",") if "," in s else [s]
    out = []
    for p in parts:
        norm = _normalize_single_id(p)
        if norm:
            out.append(norm)
    return out

# --- normalizar tx_id en transacciones para el join ---
tx = tx.assign(
    tx_id_norm=lambda d: d["tx_id"].astype(str)
                         .str.strip()
                         .str.strip("[](){}\"'")
                         .str.replace(r"\.0$", "", regex=True)
)

# columnas de interés a traer desde transacciones
cols_tx_interes = [
    "tx_direction",
    "tx_base_amount",
    "customer_type",
    "customer_account_balance",
    "customer_networth",
    "customer_income",
    "customer_expected_amount",
    "customer_sub_type",
]

present = [c for c in cols_tx_interes if c in tx.columns]
missing = [c for c in cols_tx_interes if c not in tx.columns]
if missing:
    print("Aviso: no se encontraron en transacciones las columnas ->", missing)

# --- desarmar ids externos preservando orden ---
alerts = alerts.copy()
alerts["ext_ids_list"] = alerts["external_transaction_ids"].apply(_split_external_ids)
exploded = alerts.explode("ext_ids_list", ignore_index=False)
exploded["seq"] = exploded.groupby("alert_id").cumcount()

# --- join (cada id externo -> fila de transacción Cash) ---
ex_join = exploded.merge(
    tx[["tx_id_norm"] + present],
    left_on="ext_ids_list",
    right_on="tx_id_norm",
    how="left"  # si no está (probable Buy/Sell), quedará NA
)

# --- limpiador de valores y agregador con colapso inteligente ---
def _clean_value(v) -> str:
    """Limpia cada valor: quita espacios, comillas y ';' finales. NaN -> 'NA'."""
    if pd.isna(v):
        return "NA"
    s = str(v).strip().strip("\"'").rstrip(";")
    return s

def _collapse_or_join(df_group: pd.DataFrame, value_col: str) -> str:
    """
    - Si TODOS los valores no-NA son iguales y no hay 'NA' -> devuelve ese único valor.
    - En otro caso -> devuelve los valores limpios separados por coma
      preservando el orden original (columna 'seq').
    """
    vals = df_group.sort_values("seq")[value_col].map(_clean_value).tolist()
    non_na = [x for x in vals if x not in ("", "NA")]
    if non_na and len(set(non_na)) == 1 and "NA" not in vals:
        return non_na[0]
    return ", ".join(vals)

# --- agregar por alerta usando el nuevo agregador ---
aggs = {col: (lambda g, c=col: _collapse_or_join(g, c)) for col in present}
by_alert = (
    ex_join.groupby("alert_id", group_keys=False)
           .apply(lambda g: pd.Series({k: fn(g) for k, fn in aggs.items()}))
           .reset_index()
)

# --- resultado final ---
df_high_alerts_enriched = alerts.drop(columns=["ext_ids_list"]).merge(by_alert, on="alert_id", how="left")

# guardar
df_high_alerts_enriched.to_csv(OUT_PATH, index=False)

print("Columnas añadidas:", present)
print(df_high_alerts_enriched.head(3))


  tx = pd.read_csv(TX_PATH)                # suele venir con comas


Columnas añadidas: ['tx_direction', 'tx_base_amount', 'customer_type', 'customer_account_balance', 'customer_networth', 'customer_income', 'customer_expected_amount', 'customer_sub_type']
   alert_id rule_code  subject_ids  \
0     82271    AAD-LA        26375   
1     83230     P-TLO        24618   
2     83231  PGAV-OUT        24618   

                                  subject_names  number_of_transactions  \
0  MOREL           BULICIC         JORGE RAFAEL                       2   
1         GARCIA          LABORA          WALDO                       1   
2         GARCIA          LABORA          WALDO                       1   

                         created_at          status external_transaction_ids  \
0  2025-03-07 12:27:52.142219+00:00  Not Suspicious      201955096, 68816944   
1  2025-03-11 12:33:06.388175+00:00  Not Suspicious                202249097   
2  2025-03-11 12:33:06.388175+00:00  Not Suspicious                202249097   

  tx_direction tx_base_amount custome

  .apply(lambda g: pd.Series({k: fn(g) for k, fn in aggs.items()}))
