In [12]:
# === PGAV-IN — Percentiles por parámetro (Amount / Factor / Number) ===========
import pandas as pd, numpy as np, math

pd.set_option("display.float_format", lambda x: f"{x:,.2f}")

# -------- Parámetros editables --------
PATH = "../../data/tx_retail_whale.csv"      # <-- ruta a tu CSV
AMOUNT_QS = [0.85, 0.90, 0.95, 0.97, 0.99]
FACTOR_QS = [0.90,0.95, 0.97, 0.99]
NUMBER_QS = [0.50, 0.75, 0.90]

# -------- Carga mínima --------
tx = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
tx["tx_date_time"]   = pd.to_datetime(tx["tx_date_time"], errors="coerce")
tx["tx_base_amount"] = pd.to_numeric(tx["tx_base_amount"], errors="coerce")
tx["tx_direction"]   = tx["tx_direction"].astype(str).str.title()
tx["tx_type"]        = tx["tx_type"].astype(str).str.title()

# Columna de grupo
GROUP_COL = "customer_sub_type" if "customer_sub_type" in tx.columns else "customer_type"
if GROUP_COL not in tx.columns:
    raise KeyError("No encontré ni 'customer_sub_type' ni 'customer_type' en el CSV.")

# -------- Filtro: INBOUND + CASH --------
g = tx[
    (tx["tx_direction"].eq("Inbound")) &
    (tx["tx_type"].eq("Cash")) &
    (tx["tx_base_amount"].notna()) &
    (tx["tx_date_time"].notna())
].copy()

if g.empty:
    print("No hay transacciones elegibles para PGAV-IN.")
else:
    g = g.sort_values([GROUP_COL, "tx_date_time"]).reset_index(drop=True)

    # Rolling 7D por grupo (incluye actual) -> excluir actual para promedio/contador previos
    rs = g.groupby(GROUP_COL, group_keys=False).rolling("7D", on="tx_date_time")["tx_base_amount"]
    roll_sum_incl = rs.sum()
    roll_cnt_incl = rs.count()

    g["prev_sum7"] = roll_sum_incl.values - g["tx_base_amount"].values
    g["prev_cnt7"] = roll_cnt_incl.values - 1
    g["peer_avg7_excl"] = np.where(g["prev_cnt7"] > 0, g["prev_sum7"] / g["prev_cnt7"], np.nan)
    g["factor"] = np.where(g["peer_avg7_excl"] > 0, g["tx_base_amount"] / g["peer_avg7_excl"], np.nan)
    g["number_prev7"] = g["prev_cnt7"].clip(lower=0)

    def qdict(series, qs):
        s = pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
        if len(s) == 0: return {q: np.nan for q in qs}
        q = s.quantile(qs)
        return {float(k): float(v) for k, v in q.items()}

    rows=[]
    for grp, sub in g.groupby(GROUP_COL):
        amt_q = qdict(sub["tx_base_amount"], AMOUNT_QS)
        fac_q = qdict(sub["factor"],        FACTOR_QS)
        num_q = qdict(sub["number_prev7"],  NUMBER_QS)

        row = {GROUP_COL: grp}
        for q,v in amt_q.items():
            row[f"Amount_p{int(q*100)}"] = v
        for q,v in fac_q.items():
            row[f"Factor_p{int(q*100)}_raw"] = v
            row[f"Factor_p{int(q*100)}"]     = int(math.ceil(v)) if np.isfinite(v) else np.nan
        for q,v in num_q.items():
            row[f"Number_p{int(q*100)}_raw"] = v
            row[f"Number_p{int(q*100)}"]     = int(math.floor(v)) if np.isfinite(v) else np.nan
        rows.append(row)

    out = pd.DataFrame(rows).sort_values(GROUP_COL).reset_index(drop=True)

    # Presentación
    pretty = out.copy()
    for q in AMOUNT_QS:
        pretty[f"Amount_p{int(q*100)}"] = pretty[f"Amount_p{int(q*100)}"].map(lambda x: f"{x:,.0f}" if pd.notna(x) else "—")
    for q in FACTOR_QS:
        pretty[f"Factor_p{int(q*100)}_raw"] = pretty[f"Factor_p{int(q*100)}_raw"].map(lambda x: f"{x:,.2f}" if pd.notna(x) else "—")
    for q in NUMBER_QS:
        pretty[f"Number_p{int(q*100)}_raw"] = pretty[f"Number_p{int(q*100)}_raw"].map(lambda x: f"{x:,.2f}" if pd.notna(x) else "—")

    cols = [GROUP_COL]
    cols += [f"Amount_p{int(q*100)}" for q in AMOUNT_QS]
    for q in FACTOR_QS: cols += [f"Factor_p{int(q*100)}_raw", f"Factor_p{int(q*100)}"]
    for q in NUMBER_QS: cols += [f"Number_p{int(q*100)}_raw", f"Number_p{int(q*100)}"]

    print("=== PGAV-IN — Percentiles por parámetro (por grupo) ===")
    display(pretty[cols])


=== PGAV-IN — Percentiles por parámetro (por grupo) ===


Unnamed: 0,customer_sub_type,Amount_p85,Amount_p90,Amount_p95,Amount_p97,Amount_p99,Factor_p90_raw,Factor_p90,Factor_p95_raw,Factor_p95,Factor_p97_raw,Factor_p97,Factor_p99_raw,Factor_p99,Number_p50_raw,Number_p50,Number_p75_raw,Number_p75,Number_p90_raw,Number_p90
0,Retail,334761019,400000000,617145971,873286902,1096447040,3.29,4,6.19,7,7.18,8,18.51,19,10.0,10,14.0,14,18.0,18


# Simulación alertas

In [13]:
# === PGAV-IN — Sensibilidad (Actual vs propuestos, por transacción) ===========
# LÓGICA EXACTA:
# tx_direction = Inbound
# AND tx_base_amount > [Amount]
# AND customer_type (o customer_sub_type) = [Type]   (aquí usamos el tipo que traiga el CSV)
# AND tx_base_amount > (promedio base_amount del grupo (mismo tx_direction & mismo grupo) en 7 días, excluyendo la tx) * [Factor]
# AND count del grupo en 7 días (excluyendo la tx) > [Number]
# Unidad = transacciones que cumplen

import pandas as pd, numpy as np, math

pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

PATH="../../data/tx_retail_whale.csv"
PARAMS={
    #"Actual":{"Amount":5_000_388, "Factor":4, "Number":139},
    "p95":   {"Amount":400_000_000, "Factor":4, "Number":139},
    "p97":   {"Amount":873_300_000, "Factor":7, "Number":139},
    "p99":   {"Amount":1_096_500_000, "Factor":18,"Number":139},
}

tx=pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
tx["tx_date_time"]=pd.to_datetime(tx["tx_date_time"], errors="coerce")
tx["tx_base_amount"]=pd.to_numeric(tx["tx_base_amount"], errors="coerce")
tx["tx_direction"]=tx["tx_direction"].astype(str).str.title()
tx["tx_type"]=tx["tx_type"].astype(str).str.title()

GROUP_COL = "customer_sub_type" if "customer_sub_type" in tx.columns else "customer_type"
g = tx[(tx["tx_direction"].eq("Inbound")) & tx["tx_base_amount"].notna() & tx["tx_date_time"].notna() & tx[GROUP_COL].notna()].copy()
g=g.sort_values([GROUP_COL,"tx_date_time"]).reset_index(drop=True)

# Rolling 7D por grupo (incluye actual) -> excluir actual
rs = g.groupby(GROUP_COL, group_keys=False).rolling("7D", on="tx_date_time")["tx_base_amount"]
sum_incl = rs.sum()
cnt_incl = rs.count()
g["prev_sum7"] = sum_incl.values - g["tx_base_amount"].values
g["prev_cnt7"] = cnt_incl.values - 1
g["avg7_excl"] = np.where(g["prev_cnt7"]>0, g["prev_sum7"]/g["prev_cnt7"], np.nan)

order=["Actual","p95","p97","p99"]
param_tbl=pd.DataFrame(PARAMS).T.loc[[k for k in order if k in PARAMS]].rename_axis("escenario").reset_index()
print("=== PGAV-IN — Parámetros (Amount, Factor, Number) ==="); display(param_tbl)

counts={}
for k,v in PARAMS.items():
    A,F,N=v["Amount"], v["Factor"], v["Number"]
    m=(g["tx_base_amount"]>A) & (g["prev_cnt7"]>N) & (g["avg7_excl"]>0) & (g["tx_base_amount"]>g["avg7_excl"]*F)
    counts[k]=int(m.sum())

out=pd.DataFrame([{
    "alertas_actual":counts.get("Actual",0),
    "alertas_p95":counts.get("p95",0),
    "alertas_p97":counts.get("p97",0),
    "alertas_p99":counts.get("p99",0),
}])
print("=== PGAV-IN — Alertas por escenario (tx) ==="); display(out)


=== PGAV-IN — Parámetros (Amount, Factor, Number) ===


Unnamed: 0,escenario,Amount,Factor,Number
0,p95,400000000,4,139
1,p97,873300000,7,139
2,p99,1096500000,18,139


=== PGAV-IN — Alertas por escenario (tx) ===


Unnamed: 0,alertas_actual,alertas_p95,alertas_p97,alertas_p99
0,0,0,0,0
