In [1]:
# === P-TLO — Amount threshold (Outbound Cash) ==================================
# Rule: When an Outbound Cash transaction is over [Amount] {var.Amount} CLP, apply {action}.
# Rationale: pick the amount at the 95th percentile of the distribution.

import pandas as pd
import numpy as np

# -------- Editable parameters --------
PATH = "../../data/tx/datos_trx__with_subsub.csv"
SUBSUBSEGMENTS = "R-Low"         # <-- sub-subsegment label, or an iterable of labels
PCTS = [90, 95, 97, 99]          # percentiles to report
RECOMMENDED_PCT = 95             # percentile used for the recommended Amount

# -------- Minimal load --------
# low_memory=False lets pandas infer each column dtype from the whole file in one
# pass instead of chunk by chunk. NOTE(review): the saved output of this cell
# echoes this read_csv call, presumably from the mixed-type DtypeWarning — confirm.
df = pd.read_csv(
    PATH,
    dtype={"customer_id": "string"},
    encoding="utf-8-sig",
    low_memory=False,
)

# Filter by sub-subsegment: a single string is one label, anything else is
# treated as an iterable of labels.
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

# -------- Filter according to the rule --------
# Unparseable amounts become NaN and are excluded by the `> 0` comparison below.
df["tx_base_amount"] = pd.to_numeric(df["tx_base_amount"], errors="coerce")
mask = (
    (df["tx_direction"].astype(str).str.title() == "Outbound") &
    (df["tx_type"].astype(str).str.title() == "Cash") &
    (df["tx_base_amount"] > 0)  # strictly positive amounts only
)
g = df.loc[mask, ["tx_base_amount"]].dropna()

if g.empty:
    print("No hay transacciones elegibles para P-TLO con los filtros dados.")
else:
    s = g["tx_base_amount"].astype(float)
    stats = {f"p{p}": float(np.percentile(s, p)) for p in PCTS}

    # The recommendation is computed on its own rather than read back from
    # `stats`, so editing PCTS (e.g. dropping 95) cannot raise a KeyError here.
    # Rounded to an integer amount.
    recommended_amount = int(round(float(np.percentile(s, RECOMMENDED_PCT))))

    print("=== P-TLO — Percentiles de monto (CLP, Outbound Cash) ===")
    for p in PCTS:
        v = stats[f"p{p}"]
        print(f"p{p:>2}: {v:,.0f}")
    print(f"\nAmount recomendado (p{RECOMMENDED_PCT}): {recommended_amount:,.0f} CLP")


  df = pd.read_csv(PATH, dtype={"customer_id": "string"}, encoding="utf-8-sig")


=== P-TLO — Percentiles de monto (CLP, Outbound Cash) ===
p90: 13,269,254
p95: 26,768,268
p97: 44,068,121
p99: 100,000,000

Amount recomendado (p95): 26,768,268 CLP


# Simulación alertas

In [10]:
# === P-TLO — Matching (Actual vs Reales) + Escenarios (subsub) ==================
# Simulation logic:
#   tx_direction = Outbound
#   AND tx_base_amount > [Amount]
#   AND tx_date_time >= COUNT_FROM (inclusive; currently 2025-02-21 UTC)
#
# "Actual" runs on customer_sub_type == SUBSEGMENT_REAL (e.g., Retail)
# Scenarios (p95/p97/p99/...) run on customer_sub_sub_type in SUBSUBSEGMENTS_NEW (e.g., ["R-Low"])

import pandas as pd
import numpy as np
# Render floats in displayed frames with thousands separators and no decimals.
pd.set_option("display.float_format", "{:,.0f}".format)

# ---------------------------------------------------------------------
# Paths, parameters and segments
# ---------------------------------------------------------------------
TX_PATH = "../../data/tx/datos_trx__with_subsub_oficial.csv"
ALERTS_PATH = "../../data/alertas marzo-agosto 2025 enriched.csv"

# customer_sub_type used for the matching and for the "Actual" run.
SUBSEGMENT_REAL = "Investment Vehicle"
# customer_sub_sub_type labels used for the new scenarios.
SUBSUBSEGMENTS_NEW = ["R-High"]

# Direction (TLO = Outbound)
DIR = "Outbound"

# Scenario name -> Amount threshold; insertion order is the column order of the
# scenario table shown at the end of the cell.
_SCENARIO_AMOUNTS = (
    ("Actual", 1_349_692_960),
    ("p95", 444_228_552),
    ("p97", 561_770_153),
    ("p99", 948_148_200),
)
PARAMS = {name: {"Amount": amount} for name, amount in _SCENARIO_AMOUNTS}

# Inclusive bounds of the counting window; COUNT_TO = None means no upper bound.
COUNT_FROM = pd.Timestamp("2025-02-21", tz="UTC")
COUNT_TO = None

# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
def _to_dt_utc(s):
    return pd.to_datetime(s, errors="coerce", utc=True)

def _build_key(df, prefer_tx_id=True):
    has_txid = ("tx_id" in df.columns)
    if prefer_tx_id and has_txid:
        key = df["tx_id"].astype("string").str.strip()
        if key.notna().sum() == 0:
            has_txid = False
    if not prefer_tx_id or not has_txid:
        cust = df.get("customer_id", "").astype("string").fillna("").str.strip()
        ts   = _to_dt_utc(df.get("tx_date_time")).dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        amt  = pd.to_numeric(df.get("tx_base_amount"), errors="coerce").round(0).fillna(-1).astype("Int64").astype(str)
        d    = df.get("tx_direction", "").astype(str).str.title()
        key  = cust.str.cat([ts, amt, d], sep="|")
    return key

def _load_tx(path):
    """Load the transactions CSV and normalise the columns the simulation uses.

    Parameters
    ----------
    path : str
        Location of the CSV file (read as UTF-8 with optional BOM).

    Returns
    -------
    pandas.DataFrame
        The file contents with these columns guaranteed to exist:
        `customer_sub_type`, `customer_sub_sub_type` (str),
        `tx_direction` (str, title-cased), `tx_base_amount` (numeric,
        unparseable values as NaN) and `tx_date_time` (UTC datetimes,
        unparseable values as NaT). A column absent from the file is created
        empty ("" / NaN / NaT) instead of raising.
    """
    tx = pd.read_csv(path, dtype={"customer_id": "string"}, encoding="utf-8-sig", low_memory=False)

    def _col(name, default):
        # `tx.get(name, "")` returns the bare default when the column is absent,
        # and a str has no `.astype`; build an index-aligned Series instead.
        if name in tx.columns:
            return tx[name]
        return pd.Series(default, index=tx.index)

    tx["customer_sub_type"] = _col("customer_sub_type", "").astype(str)
    tx["customer_sub_sub_type"] = _col("customer_sub_sub_type", "").astype(str)
    tx["tx_direction"] = _col("tx_direction", "").astype(str).str.title()
    tx["tx_base_amount"] = pd.to_numeric(_col("tx_base_amount", float("nan")), errors="coerce")
    tx["tx_date_time"] = _to_dt_utc(_col("tx_date_time", pd.NaT))
    return tx

def _load_reales_ptlo(path):
    """Load the real-alerts CSV and normalise the columns used for matching.

    Parameters
    ----------
    path : str
        Location of the CSV file (read as UTF-8 with optional BOM).

    Returns
    -------
    pandas.DataFrame
        All alerts in the file (no rule filtering happens here), with
        `alert_id`, `rule_code`, `customer_id` and `tx_id` parsed as text;
        `rule_code` stripped; `customer_sub_type` as str; `tx_direction`
        title-cased; `tx_base_amount` numeric (NaN when unparseable) and
        `tx_date_time` as UTC datetimes (NaT when unparseable).
        Optional columns absent from the file are created empty.

    Raises
    ------
    KeyError
        If the file has no `rule_code` column.
    """
    dtypes = {"alert_id": "string", "rule_code": "string", "customer_id": "string", "tx_id": "string"}
    real = pd.read_csv(path, dtype=dtypes, encoding="utf-8-sig", low_memory=False)

    def _col(name, default):
        # `real.get(name, "")` returns the bare default when the column is absent,
        # and a str has no `.astype`; build an index-aligned Series instead.
        if name in real.columns:
            return real[name]
        return pd.Series(default, index=real.index)

    real["rule_code"] = real["rule_code"].astype(str).str.strip()
    real["customer_sub_type"] = _col("customer_sub_type", "").astype(str)
    real["tx_direction"] = _col("tx_direction", "").astype(str).str.title()
    real["tx_base_amount"] = pd.to_numeric(_col("tx_base_amount", float("nan")), errors="coerce")
    real["tx_date_time"] = _to_dt_utc(_col("tx_date_time", pd.NaT))
    return real

def _filter_dates(df, col="tx_date_time", start=COUNT_FROM, end=COUNT_TO):
    """Return a copy of the rows of `df` whose `col` lies in [start, end].

    Both bounds are inclusive. `end=None` disables the upper bound. Rows where
    the comparison is False (including missing dates) are dropped. The defaults
    are the values COUNT_FROM / COUNT_TO had when this function was defined.
    """
    in_window = df[col].ge(start)
    if end is not None:
        in_window = in_window & df[col].le(end)
    return df.loc[in_window].copy()

# ---------------------------------------------------------------------
# 1) "Actual" SIMULATION (segment: customer_sub_type == SUBSEGMENT_REAL)
# ---------------------------------------------------------------------
tx_all = _load_tx(TX_PATH)

in_real_segment = tx_all["customer_sub_type"].eq(SUBSEGMENT_REAL)
tx_actual = _filter_dates(tx_all.loc[in_real_segment].copy(), "tx_date_time", COUNT_FROM, COUNT_TO)

# Eligible rows: right direction and a usable amount.
eligible_actual = tx_actual["tx_direction"].eq(DIR) & tx_actual["tx_base_amount"].notna()
g_act = tx_actual.loc[eligible_actual].copy()
A_act = float(PARAMS["Actual"]["Amount"])
sim_hits_actual = g_act.loc[g_act["tx_base_amount"].gt(A_act)].copy()
sim_hits_actual["match_key"] = _build_key(sim_hits_actual, prefer_tx_id=True)

# ---------------------------------------------------------------------
# 2) REAL ALERTS (rule_code == P-TLO; customer_sub_type == SUBSEGMENT_REAL)
# ---------------------------------------------------------------------
real = _load_reales_ptlo(ALERTS_PATH)
is_ptlo_segment = real["rule_code"].eq("P-TLO") & real["customer_sub_type"].eq(SUBSEGMENT_REAL)
real = real.loc[is_ptlo_segment].copy()
# Alerts without a transaction date are kept; dated ones must fall in the window.
real = real.loc[real["tx_date_time"].isna() | real["tx_date_time"].ge(COUNT_FROM)]
if COUNT_TO is not None:
    real = real.loc[real["tx_date_time"].isna() | real["tx_date_time"].le(COUNT_TO)]
real["match_key"] = _build_key(real, prefer_tx_id=True)

# Matching: compare the sets of non-null keys on each side.
sim_keys = set(sim_hits_actual["match_key"].dropna())
real_keys = set(real["match_key"].dropna())
real_unmatched = ~real["match_key"].isin(sim_keys)
sim_unmatched = ~sim_hits_actual["match_key"].isin(real_keys)
reales_sin_sim = real.loc[real_unmatched].copy()
sim_sin_reales = sim_hits_actual.loc[sim_unmatched].copy()

# Summary log
print("=== P-TLO — Resumen matching (Actual vs Reales) ===")
for label, frame in (
    ("Alertas reales", real),
    ("Alertas simuladas", sim_hits_actual),
    ("Alertas reales que no están en la simulación", reales_sin_sim),
    ("Alertas en la simulación que no saltaron realmente", sim_sin_reales),
):
    print(f"{label}: {len(frame):,}")

tx_columns = ["customer_id", "tx_id", "tx_date_time", "tx_direction", "tx_base_amount", "match_key"]
alert_columns = ["alert_id", "rule_code"] + tx_columns

print("\n--- Reales sin simulación (head) ---")
display(reales_sin_sim[alert_columns].head(20))

print("\n--- Simulación sin reales (head) ---")
display(sim_sin_reales[tx_columns].head(20))

# ---------------------------------------------------------------------
# 3) SCENARIOS (percentiles) on sub-subsegments (customer_sub_sub_type in SUBSUBSEGMENTS_NEW)
# ---------------------------------------------------------------------
in_new_segments = tx_all["customer_sub_sub_type"].isin(SUBSUBSEGMENTS_NEW)
tx_new = _filter_dates(tx_all.loc[in_new_segments].copy(), "tx_date_time", COUNT_FROM, COUNT_TO)
eligible_new = tx_new["tx_direction"].eq(DIR) & tx_new["tx_base_amount"].notna()
g_new = tx_new.loc[eligible_new].copy()

# Number of eligible transactions strictly above each scenario's Amount.
scenario_counts = {
    esc: int(g_new["tx_base_amount"].gt(float(pars["Amount"])).sum())
    for esc, pars in PARAMS.items()
}

print("\n=== P-TLO — Escenarios (subsub) ===")
display(pd.DataFrame([scenario_counts]))


=== P-TLO — Resumen matching (Actual vs Reales) ===
Alertas reales: 134
Alertas simuladas: 137
Alertas reales que no están en la simulación: 0
Alertas en la simulación que no saltaron realmente: 3

--- Reales sin simulación (head) ---


Unnamed: 0,alert_id,rule_code,customer_id,tx_id,tx_date_time,tx_direction,tx_base_amount,match_key



--- Simulación sin reales (head) ---


Unnamed: 0,customer_id,tx_id,tx_date_time,tx_direction,tx_base_amount,match_key
664695,82606800,213118893,2025-08-22 00:00:00+00:00,Outbound,1494809244,213118893.0
665269,78966540,213138280,2025-08-22 00:00:00+00:00,Outbound,5586427427,213138280.0
665336,78966540,213145388,2025-08-22 00:00:00+00:00,Outbound,5586427427,213145388.0



=== P-TLO — Escenarios (subsub) ===


Unnamed: 0,Actual,p95,p97,p99
0,1,26,12,2
