In [1]:
# === P-HSUMO — Percentiles de la MÁXIMA suma 30d por cliente (Outbound Cash) ==
# LÓGICA EXACTA (parametrización):
#   tx_direction = Outbound AND tx_type = Cash
#   Por cliente: S30(t) = suma base en los últimos 30 días (incluye t)
#   Tomamos max_{t}(S30) por cliente y calculamos percentiles sobre esos máximos.

import pandas as pd, numpy as np

pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# -------- EDITA AQUÍ -----------------------------------------------------------
PATH = "../../data/tx/datos_trx__with_subsub.csv"
SUBSUBSEGMENTS = "R-Low"   # <-- ajusta el sub-subsegmento
PCTS = [0.85, 0.90, 0.95, 0.97, 0.99]
# ------------------------------------------------------------------------------

# Carga y limpieza mínima
df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]   = pd.to_datetime(df["tx_date_time"], errors="coerce")
df["tx_base_amount"] = pd.to_numeric(df["tx_base_amount"], errors="coerce")
df["tx_direction"]   = df.get("tx_direction","").astype(str).str.title()
df["tx_type"]        = df.get("tx_type","").astype(str).str.title()

# Filtrado por sub-subsegmento
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS)) 

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

# Filtro: OUT + Cash + datos válidos
g = df[
    df["tx_direction"].eq("Outbound")
    & df["tx_type"].eq("Cash")
    & df["tx_date_time"].notna()
    & df["tx_base_amount"].notna()
][["customer_id","tx_date_time","tx_base_amount"]].copy()

if g.empty:
    print("P-HSUMO: No hay transacciones elegibles.")
else:
    max_rows = []
    for cid, sub in g.groupby("customer_id", sort=False):
        daily = (sub.set_index("tx_date_time")["tx_base_amount"]
                   .abs()
                   .resample("D").sum())
        if daily.empty:
            continue
        S30 = daily.rolling("30D").sum()
        max_rows.append({"customer_id": cid, "S30_max": float(S30.max())})

    R = pd.DataFrame(max_rows)
    if R.empty:
        print("P-HSUMO: No se pudieron construir ventanas 30d.")
    else:
        s = R["S30_max"].astype(float)
        q = s.quantile(PCTS) if len(s) else pd.Series(index=PCTS, dtype=float)

        out = pd.DataFrame({
            "percentil": [f"p{int(p*100)}" for p in PCTS],
            "Amount_30d_max_per_customer_CLP": [q.get(p, np.nan) for p in PCTS]
        })

        print("=== P-HSUMO — Máxima suma CLP en 30 días por cliente (percentiles) ===")
        print(f"Clientes considerados: {s.shape[0]:,}")
        display(out)

        if pd.notna(q.get(0.95, np.nan)):
            print(f"\nSugerencia {{var.Amount}} (p95 máx-30d por cliente): {q.get(0.95):,.0f} CLP")


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== P-HSUMO — Máxima suma CLP en 30 días por cliente (percentiles) ===
Clientes considerados: 9,381


Unnamed: 0,percentil,Amount_30d_max_per_customer_CLP
0,p85,60000000
1,p90,92000000
2,p95,159910500
3,p97,243220480
4,p99,507635771



Sugerencia {var.Amount} (p95 máx-30d por cliente): 159,910,500 CLP


# Simulación alertas

In [None]:
# === P-HSUMO — Simulación (ventanas cliente–día) ===============================
# Regla: OUT + Cash; S30 > Amount
# Unidad = ventanas cliente–día. Opcional: colapsar rachas para 1 alerta por cliente.

import pandas as pd, numpy as np
pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# -------- EDITA AQUÍ -----------------------------------------------------------
PATH = "../../data/tx/transacciones_cash_2025__with_subsub.csv"
SUBSUBSEGMENTS = ["I-1"]                 # <-- ajusta el sub-subsegmento
PARAMS = {
    "Actual": {"Amount": 9_941_685_250},  # igual que p90
    "p95": {"Amount": 16_938_751_151},
    "p97": {"Amount": 17_758_890_427},
    "p99": {"Amount": 18_579_029_703},
}
COLAPSAR_RACHAS = False  # True => cuenta 1er día de cada racha por cliente
# ------------------------------------------------------------------------------

df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]   = pd.to_datetime(df["tx_date_time"], errors="coerce")
df["tx_base_amount"] = pd.to_numeric(df["tx_base_amount"], errors="coerce")
df["tx_direction"]   = df["tx_direction"].astype(str).str.title()
df["tx_type"]        = df["tx_type"].astype(str).str.title()

# Filtrado por sub-subsegmento
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

g = df[
    df["tx_direction"].eq("Outbound") &
    df["tx_type"].eq("Cash") &
    df["tx_date_time"].notna() &
    df["tx_base_amount"].notna()
][["customer_id","tx_date_time","tx_base_amount"]].copy()

parts=[]
for cid, sub in g.groupby("customer_id", sort=False):
    daily = (sub.set_index("tx_date_time")["tx_base_amount"]
                .abs()
                .resample("D").sum())
    if daily.empty: 
        continue
    S30 = daily.rolling("30D").sum()
    parts.append(pd.DataFrame({"customer_id": cid, "date": S30.index, "S30": S30.values}))

M = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=["customer_id","date","S30"])

def contar_alertas(dfm, amount, colapsar_rachas=False):
    m = dfm["S30"] > amount
    if not colapsar_rachas:
        return int(dfm.loc[m, ["customer_id","date"]].drop_duplicates().shape[0])
    # Colapsar rachas contiguas por cliente (1ª fecha de cada racha)
    df2 = dfm.loc[m, ["customer_id","date"]].sort_values(["customer_id","date"])
    # Una racha empieza cuando no hay día anterior consecutivo para el mismo cliente
    df2["prev_date"] = df2.groupby("customer_id")["date"].shift(1)
    df2["is_new_run"] = (df2["prev_date"].isna()) | ((df2["date"] - df2["prev_date"]).dt.days > 1)
    return int(df2.loc[df2["is_new_run"], ["customer_id","date"]].shape[0])

order = ["Actual","p90","p95","p97","p99"]
param_tbl = pd.DataFrame(PARAMS).T.loc[[k for k in order if k in PARAMS]].rename_axis("escenario").reset_index()
print("=== P-HSUMO — Parámetros (Amount) ==="); display(param_tbl)

counts = {k: contar_alertas(M, v["Amount"], COLAPSAR_RACHAS) for k,v in PARAMS.items()}
out = pd.DataFrame([{
    "alertas_" + k: counts.get(k, 0) for k in order
}])
print(f"=== P-HSUMO — Alertas por escenario (ventanas; colapsar_rachas={COLAPSAR_RACHAS}) ===")
display(out)


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== P-HSUMO — Parámetros (Amount) ===


Unnamed: 0,escenario,Amount
0,Actual,9941685250
1,p95,16938751151
2,p97,17758890427
3,p99,18579029703


=== P-HSUMO — Alertas por escenario (ventanas; colapsar_rachas=False) ===


Unnamed: 0,alertas_Actual,alertas_p90,alertas_p95,alertas_p97,alertas_p99
0,241,0,1,1,1
