In [2]:
# === RVT-IN — Percentiles de Number y Amount (tx round) en la mejor ventana 30d ===
# LÓGICA EXACTA (parametrización):
#   Dirección = Inbound, Tipo = Cash, "Round" (tx_amount % 1000 == 0)
#   Number: máximo # de tx round en 30d (por cliente)
#   Amount: máxima suma base de tx round en 30d (por cliente)
# NOTA: aquí NO exigimos base > 1000 CLP (a diferencia de HNR).

import pandas as pd, numpy as np, math
pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# -------- Parámetros editables --------
PATH = "../../data/tx/transacciones_cash_2025__with_subsub.csv"
SUBSUBSEGMENTS = "I-2"     # <-- ajusta el sub-subsegmento
WINDOW_DAYS = 30
NUM_QS = [0.95, 0.97, 0.99]     # para Number
AMT_QS = [0.95, 0.97, 0.99]     # para Amount

# -------- Carga mínima --------
df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]   = pd.to_datetime(df.get("tx_date_time"), errors="coerce")
df["tx_amount"]      = pd.to_numeric(df.get("tx_amount"), errors="coerce")
df["tx_base_amount"] = pd.to_numeric(df.get("tx_base_amount"), errors="coerce")
df["tx_direction"]   = df.get("tx_direction", "").astype(str).str.title()
df["tx_type"]        = df.get("tx_type", "").astype(str).str.title()

# Filtrado por sub-subsegmento
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

is_round = np.isfinite(df["tx_amount"]) & np.isclose(df["tx_amount"] % 1000.0, 0.0, atol=1e-9)
m = (
    df["tx_direction"].eq("Inbound") &
    df["tx_type"].eq("Cash") &
    is_round &
    df["tx_date_time"].notna() &
    df["tx_base_amount"].notna() &
    df["customer_id"].notna()
)
g = df.loc[m, ["customer_id","tx_date_time","tx_base_amount"]] \
      .sort_values(["customer_id","tx_date_time"]).copy()
g["amt"] = g["tx_base_amount"].abs().astype(float)

def max_count_sum_30d(ts: np.ndarray, amts: np.ndarray, days=30):
    """Devuelve (max_count, max_sum) en cualquier ventana [t, t+days]."""
    if ts.size == 0: return (0, 0.0)
    ts = np.sort(ts)
    amts = amts[np.argsort(ts)]
    j = 0; best_c = 0; best_s = 0.0
    prefix = np.concatenate([[0.0], np.cumsum(amts)])
    delta = np.timedelta64(days, "D")
    for i in range(ts.size):
        end = ts[i] + delta
        while j < ts.size and ts[j] <= end:
            j += 1
        c = j - i
        s = prefix[j] - prefix[i]
        if c > best_c: best_c = c
        if s > best_s: best_s = s
    return (best_c, best_s)

if g.empty:
    print("RVT-IN: no hay transacciones round elegibles.")
else:
    out_rows = []
    for cid, sub in g.groupby("customer_id", sort=False):
        times = sub["tx_date_time"].values
        amts  = sub["amt"].values
        mc, ms = max_count_sum_30d(times, amts, WINDOW_DAYS)
        out_rows.append({"customer_id": cid, "max_count_30d": mc, "max_sum_30d": ms})

    res = pd.DataFrame(out_rows)
    sN = res["max_count_30d"].astype(float)
    sA = res["max_sum_30d"].astype(float)

    qN = {f"p{int(p*100)}": (float(np.percentile(sN, int(p*100))) if len(sN) else np.nan) for p in NUM_QS}
    qA = {f"p{int(p*100)}": (float(np.percentile(sA, int(p*100))) if len(sA) else np.nan) for p in AMT_QS}

    df_number = pd.DataFrame({
        "percentil":    [f"p{int(p*100)}" for p in NUM_QS],
        "Number_raw":   [qN[f"p{int(p*100)}"] for p in NUM_QS],
        "Number_ceil":  [int(math.ceil(qN[f"p{int(p*100)}"])) if np.isfinite(qN[f"p{int(p*100)}"]) else np.nan for p in NUM_QS],
    })
    df_amount = pd.DataFrame({
        "percentil": [f"p{int(p*100)}" for p in AMT_QS],
        "Amount_CLP":[qA[f"p{int(p*100)}"] for p in AMT_QS],
    })

    print("=== RVT-IN — Number (máx # tx round en 30d por cliente) ===")
    display(df_number)
    print("=== RVT-IN — Amount (máx suma base de tx round en 30d por cliente) ===")
    display(df_amount)


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== RVT-IN — Number (máx # tx round en 30d por cliente) ===


Unnamed: 0,percentil,Number_raw,Number_ceil
0,p95,15,15
1,p97,16,17
2,p99,17,18


=== RVT-IN — Amount (máx suma base de tx round en 30d por cliente) ===


Unnamed: 0,percentil,Amount_CLP
0,p95,13634301000
1,p97,14185870600
2,p99,14737440200


# Simulación alertas

In [3]:
# === RVT-IN — Simulación de alertas (Actual vs propuestos) =====================
# LÓGICA EXACTA:
# tx_direction = Inbound
# AND tx_type = Cash
# AND mod( tx_amount [default: 0.0001] , 1000 ) = 0     (redonda)
# AND count de tx redondas en 30 días > [Number]
# AND sum de tx_base_amount de las redondas en 30 días > [Amount]
# Unidad = ventanas cliente–día que cumplen (ambas condiciones)

import pandas as pd, numpy as np
pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# ---- EDITA AQUÍ ----------------------------------------------------------------
PATH = "../../data/tx/transacciones_cash_2025__with_subsub.csv"
SUBSUBSEGMENTS = ["I-2"]                # <-- ajusta el sub-subsegmento
PARAMS = {
    "Actual": {"Number": 13, "Amount": 3_834_223_610},
    "p95":    {"Number":  15, "Amount":   13_634_301_000},
    "p97":    {"Number":  16, "Amount":   14_185_870_600},
    "p99":    {"Number":  17, "Amount": 14_737_440_200},
}
# -------------------------------------------------------------------------------

df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]   = pd.to_datetime(df.get("tx_date_time"), errors="coerce")
df["tx_amount"]      = pd.to_numeric(df.get("tx_amount"), errors="coerce")
df["tx_base_amount"] = pd.to_numeric(df.get("tx_base_amount"), errors="coerce")
df["tx_direction"]   = df.get("tx_direction","").astype(str).str.title()
df["tx_type"]        = df.get("tx_type","").astype(str).str.title()

# Filtrado por sub-subsegmento
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

amt_orig = df["tx_amount"].fillna(0.0001)
is_round = np.isfinite(amt_orig) & np.isclose(amt_orig % 1000, 0, atol=1e-9)

m = (
    df["tx_direction"].eq("Inbound") &
    df["tx_type"].eq("Cash") &
    df["tx_date_time"].notna() &
    df["customer_id"].notna() &
    is_round &
    df["tx_base_amount"].notna()
)
g = df.loc[m, ["customer_id","tx_date_time","tx_base_amount"]].copy()
g["amt"] = g["tx_base_amount"].abs().astype(float)

if g.empty:
    print("No hay transacciones elegibles para RVT-IN.")
else:
    parts=[]
    for cid, sub in g.groupby("customer_id", sort=False):
        daily_cnt = (sub.set_index("tx_date_time")
                        .assign(x=1)["x"]
                        .resample("D").sum()
                        .fillna(0.0))
        daily_sum = (sub.set_index("tx_date_time")["amt"]
                        .resample("D").sum()
                        .fillna(0.0))
        N30 = daily_cnt.rolling("30D").sum()
        S30 = daily_sum.rolling("30D").sum()
        parts.append(pd.DataFrame({"customer_id": cid, "date": N30.index, "N30": N30.values, "S30": S30.values}))
    M = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=["customer_id","date","N30","S30"])

    order = ["Actual","p90","p95","p97","p99"]
    param_tbl = (pd.DataFrame(PARAMS).T
                   .loc[[k for k in order if k in PARAMS]]
                   .rename_axis("escenario")
                   .reset_index())
    print("=== RVT-IN — Parámetros (Number & Amount) ==="); display(param_tbl)

    counts={}
    for k,v in PARAMS.items():
        N, A = v["Number"], v["Amount"]
        m_ok = (M["N30"] > N) & (M["S30"] > A)
        counts[k] = int(M.loc[m_ok, ["customer_id","date"]].drop_duplicates().shape[0])

    out = pd.DataFrame([{
        "alertas_actual": counts.get("Actual",0),
        "alertas_p90":    counts.get("p90",0),
        "alertas_p95":    counts.get("p95",0),
        "alertas_p97":    counts.get("p97",0),
        "alertas_p99":    counts.get("p99",0),
    }])
    print("=== RVT-IN — Alertas por escenario (ventanas cliente–día) ==="); display(out)


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== RVT-IN — Parámetros (Number & Amount) ===


Unnamed: 0,escenario,Number,Amount
0,Actual,13,3834223610
1,p95,15,13634301000
2,p97,16,14185870600
3,p99,17,14737440200


=== RVT-IN — Alertas por escenario (ventanas cliente–día) ===


Unnamed: 0,alertas_actual,alertas_p90,alertas_p95,alertas_p97,alertas_p99
0,38,0,0,0,0
