# Definición parámetros

In [3]:
# === HNR-IN — Percentiles de Number (redondas & >1000 CLP) en la mejor ventana 30d ===
# LÓGICA EXACTA (para el dataset de parametrización):
#   Dirección = Inbound
#   Tipo      = Cash
#   "Round"   = tx_amount múltiplo exacto de 1.000 (en moneda original)
#   "High"    = tx_base_amount > 1.000 CLP
#   Métrica   = Por cliente, máximo # de tx en cualquier ventana de 30 días (timestamp a timestamp)
# Salida: percentiles de ese máximo (los de PCTS: p95/p97/p99) y sugerencia (ceil p95)

import pandas as pd, numpy as np, math

# -------- Editable parameters --------
PATH = "../../data/tx/datos_trx__with_subsub.csv"  # <-- your CSV
SUBSUBSEGMENTS = ["R-High"]              # <-- sub-subsegment label(s) to keep
WINDOW_DAYS = 30       # window length (days) handed to max_count_30d
BASE_MIN_CLP = 1000    # "High": tx_base_amount must be strictly greater than this
PCTS = [95, 97, 99]    # percentiles that get reported

# -------- Minimal load --------
def _text_col(frame, name):
    """Return column `name` cast to str, or an aligned column of "" when it is absent.

    `frame.get(name, "")` hands back the plain string "" for a missing column,
    and a str has no `.astype`, so the default has to be built as a Series.
    """
    if name in frame.columns:
        return frame[name].astype(str)
    return pd.Series("", index=frame.index, dtype="object")

# low_memory=False makes pandas infer each column's dtype from the whole file,
# matching the loaders of the simulation cell. The recorded output of this cell
# shows a warning raised at this call (presumably pandas' mixed-type
# DtypeWarning -- confirm).
df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig", low_memory=False)
df["tx_date_time"]   = pd.to_datetime(df.get("tx_date_time"), errors="coerce")
df["tx_amount"]      = pd.to_numeric(df.get("tx_amount"), errors="coerce")       # original currency
df["tx_base_amount"] = pd.to_numeric(df.get("tx_base_amount"), errors="coerce")  # base amount (CLP)
df["tx_direction"]   = _text_col(df, "tx_direction").str.title()
df["tx_type"]        = _text_col(df, "tx_type").str.title()

# Accept either a single label or an iterable of labels.
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

# -------- HNR-IN filters --------
# "Round": the original-currency amount is a finite multiple of 1000 (within 1e-9).
is_round = np.isfinite(df["tx_amount"]) & np.isclose(df["tx_amount"] % 1000.0, 0.0, atol=1e-9)
m = (
    df["tx_direction"].eq("Inbound") &
    df["tx_type"].eq("Cash") &
    is_round &
    df["tx_base_amount"].gt(BASE_MIN_CLP) &
    df["tx_date_time"].notna() &
    df["customer_id"].notna()
)
# Eligible transactions, ordered by customer and then by timestamp.
g = df.loc[m, ["customer_id", "tx_date_time"]].sort_values(["customer_id","tx_date_time"]).copy()

def max_count_30d(ts: np.ndarray, days=30) -> int:
    """Return the largest number of timestamps inside any closed window [t, t + days].

    Windows are anchored at each timestamp `t` in `ts`; both ends are inclusive.

    Args:
        ts: array of datetime64 timestamps (any order; a sorted copy is used).
        days: window length in whole days.

    Returns:
        The maximum count over all windows, or 0 when `ts` is empty.
    """
    total = ts.size
    if total == 0:
        return 0

    ordered = np.sort(ts)
    span = np.timedelta64(days, "D")

    # Two-pointer sweep: `right` only moves forward because window ends grow
    # together with their (sorted) start.
    right = 0
    peak = 0
    for left, start in enumerate(ordered):
        limit = start + span
        while right < total and ordered[right] <= limit:
            right += 1
        peak = max(peak, right - left)
    return peak

if g.empty:
    print("HNR-IN: no hay transacciones elegibles.")
else:
    # One row per customer: the busiest WINDOW_DAYS-day window of eligible transactions.
    out_rows = [
        {
            "customer_id": cid,
            "max_count_30d": max_count_30d(sub["tx_date_time"].values, WINDOW_DAYS),
        }
        for cid, sub in g.groupby("customer_id", sort=False)
    ]
    res = pd.DataFrame(out_rows)
    s = res["max_count_30d"].astype(float)

    # Requested percentiles of the per-customer maximum (NaN if there is nothing to rank).
    stats = {}
    for p in PCTS:
        stats[f"p{p}"] = float(np.percentile(s, p)) if len(s) else np.nan

    # Suggested threshold: p95 rounded up; stays NaN when p95 was not computed.
    if np.isfinite(stats.get("p95", np.nan)):
        recommended = int(math.ceil(stats["p95"]))
    else:
        recommended = np.nan

    print("=== HNR-IN — Máximo # redondas & >1000 CLP en 30 días (por cliente) ===")
    print(f"Clientes con tx elegibles: {len(res):,}")
    for p in PCTS:
        v = stats[f"p{p}"]
        if np.isfinite(v):
            print(f"p{p:>2}: {v:.2f}")
        else:
            print(f"p{p:>2}: NA")
    print(f"\nWhole Number recomendado (ceil p95): {recommended}")


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== HNR-IN — Máximo # redondas & >1000 CLP en 30 días (por cliente) ===
Clientes con tx elegibles: 154
p95: 4.35
p97: 5.00
p99: 5.47

Whole Number recomendado (ceil p95): 5


# Simulación alertas

In [6]:
# === HNR-IN — Simulación vs Reales (ventanas desde COUNT_FROM = 2025-02-21) ====
# Lógica simulación:
#   INBOUND + CASH + tx_base_amount > 1000 + monto original "redondo" (mod 1000 = 0)
#   CNT30 (rolling 30D por cliente) > Number
# Conteo de ventanas SOLO desde COUNT_FROM (2025-02-21); el rolling usa TODO el historial.

import pandas as pd
import numpy as np

# Show floats in DataFrame output with thousands separators and no decimals.
pd.set_option("display.float_format", "{:,.0f}".format)

# ------------------------------ Config ---------------------------------
TX_PATH           = "../../data/tx/datos_trx__with_subsub_oficial.csv"
ALERTS_PATH       = "../../data/alertas marzo-agosto 2025 enriched.csv"
CUSTOMER_SUB_TYPE = "Retail"

# Only windows dated inside [COUNT_FROM, COUNT_TO] are counted; COUNT_TO = None
# leaves the range open-ended.
COUNT_FROM = pd.Timestamp("2025-02-21").tz_localize("UTC")
COUNT_TO   = None  # e.g. pd.Timestamp("2025-08-31 23:59:59", tz="UTC") to cap the range

# Scenario name -> rule parameters; the simulation alerts when CNT30 > Number.
# NOTE(review): an earlier comment here said the threshold was 18 while the
# value is 6 -- confirm which one is the production parameter.
PARAMS = {
    "Actual": {"Number": 6},
}

# ------------------------------ Helpers --------------------------------
def _to_dt_utc(s):
    return pd.to_datetime(s, errors="coerce", utc=True)

def _is_round_thousands(series_amount):
    amt = pd.to_numeric(series_amount, errors="coerce").fillna(0.0001)
    return np.isfinite(amt) & np.isclose(amt % 1000, 0, atol=1e-9)

def _restrict_windows_after(df, date_col="date", count_from=COUNT_FROM, count_to=COUNT_TO):
    """Return a copy of the rows whose `date_col` lies in [count_from, count_to].

    The lower bound is always applied; the upper bound only when `count_to`
    is not None. Both bounds are inclusive.
    """
    dates = df[date_col]
    keep = dates >= count_from
    if count_to is not None:
        keep = keep & (dates <= count_to)
    return df.loc[keep].copy()

def _load_tx_retail(path):
    """Load the transaction CSV and keep the rows of CUSTOMER_SUB_TYPE.

    After filtering, the columns used by the simulation are normalised:
    direction/type are title-cased strings, amounts are numeric (unparseable
    values become NaN), `tx_date_time` goes through `_to_dt_utc` and
    `customer_id` uses the pandas "string" dtype.

    Args:
        path: location of the transactions CSV.

    Returns:
        The filtered, normalised DataFrame.
    """
    def _text_col(frame, name):
        # frame.get(name, "") returns the plain string "" when the column is
        # missing, and a str has no .astype -- so the fallback must be a Series.
        if name in frame.columns:
            return frame[name].astype(str)
        return pd.Series("", index=frame.index, dtype="object")

    tx = pd.read_csv(path, dtype={"customer_id":"string"}, encoding="utf-8-sig", low_memory=False)
    tx["customer_sub_type"] = _text_col(tx, "customer_sub_type")
    tx = tx[tx["customer_sub_type"] == CUSTOMER_SUB_TYPE].copy()

    tx["tx_direction"]   = _text_col(tx, "tx_direction").str.title()
    tx["tx_type"]        = _text_col(tx, "tx_type").str.title()
    tx["tx_amount"]      = pd.to_numeric(tx.get("tx_amount"), errors="coerce")
    tx["tx_base_amount"] = pd.to_numeric(tx.get("tx_base_amount"), errors="coerce")
    tx["tx_date_time"]   = _to_dt_utc(tx.get("tx_date_time"))
    tx["customer_id"]    = tx["customer_id"].astype("string")
    return tx

def _simulate_hnr_in(tx, number_threshold):
    """Simulate the HNR-IN windows that exceed `number_threshold`.

    A transaction is eligible when it is Inbound, Cash, has a timestamp and a
    customer id, a base amount above 1000 and an original amount accepted by
    `_is_round_thousands`. Per customer, eligible transactions are counted per
    calendar day and summed over a trailing "30D" rolling window. Days inside
    the COUNT_FROM/COUNT_TO range whose rolling count is strictly greater than
    `number_threshold` are returned.

    Args:
        tx: transactions frame (as produced by `_load_tx_retail`).
        number_threshold: the rule's Number parameter.

    Returns:
        DataFrame with columns `customer_id` and `window_day`.
    """
    is_inbound_cash = tx["tx_direction"].eq("Inbound") & tx["tx_type"].eq("Cash")
    has_keys = tx["tx_date_time"].notna() & tx["customer_id"].notna()
    amount_ok = (tx["tx_base_amount"] > 1000) & _is_round_thousands(tx["tx_amount"])

    eligible = tx.loc[is_inbound_cash & has_keys & amount_ok, ["customer_id", "tx_date_time"]].copy()
    if eligible.empty:
        return pd.DataFrame(columns=["customer_id", "window_day"])

    def _rolling_frame(customer, rows):
        # Daily counts between the customer's first and last eligible day
        # (days without transactions count 0), then the trailing 30D sum.
        per_day = (
            rows.set_index("tx_date_time")
                .assign(x=1)["x"]
                .resample("D").sum()
                .fillna(0.0)
        )
        trailing = per_day.rolling("30D").sum()
        return pd.DataFrame({"customer_id": customer, "date": trailing.index, "CNT30": trailing.values})

    frames = [
        _rolling_frame(customer, rows)
        for customer, rows in eligible.groupby("customer_id", sort=False)
    ]
    windows = pd.concat(frames, ignore_index=True)

    # The rolling sum saw the full history; only the counted range is kept here.
    windows = _restrict_windows_after(windows, "date", COUNT_FROM, COUNT_TO)
    over_threshold = windows["CNT30"] > number_threshold
    windows = windows.loc[over_threshold, ["customer_id", "date"]].drop_duplicates()
    windows["window_day"] = pd.to_datetime(windows["date"]).dt.normalize()
    return windows[["customer_id", "window_day"]]

def _load_reales_hnr_in(path):
    """Load the real alerts and reduce them to HNR-IN windows of CUSTOMER_SUB_TYPE.

    Each alert gets a reference timestamp `ref_dt`: its `tx_date_time`, or
    `created_at` where the former is missing. Alerts whose `ref_dt` falls
    outside the COUNT_FROM/COUNT_TO range are dropped, and `window_day` is
    `ref_dt` truncated to midnight.

    Args:
        path: location of the alerts CSV.

    Returns:
        A tuple `(real, real_win)`: the filtered alert rows, and the distinct
        `(customer_id, window_day)` pairs among them.
    """
    dtypes = {"alert_id":"string","rule_code":"string","customer_id":"string"}
    real = pd.read_csv(path, dtype=dtypes, encoding="utf-8-sig", low_memory=False)
    real["rule_code"] = real["rule_code"].astype(str).str.strip()

    # real.get("customer_sub_type", "") returns the plain string "" when the
    # column is missing, and a str has no .astype -- build the fallback as an
    # aligned Series of "" instead.
    if "customer_sub_type" in real.columns:
        sub_type = real["customer_sub_type"].astype(str)
    else:
        sub_type = pd.Series("", index=real.index, dtype="object")
    real = real[(real["rule_code"] == "HNR-IN") & (sub_type == CUSTOMER_SUB_TYPE)].copy()

    real["tx_date_time"] = _to_dt_utc(real.get("tx_date_time"))
    real["created_at"]   = _to_dt_utc(real.get("created_at"))
    # Prefer the transaction timestamp; fall back to the alert creation time.
    real["ref_dt"]       = real["tx_date_time"].where(real["tx_date_time"].notna(), real["created_at"])

    # Same inclusive date range as the simulated windows, via the shared helper.
    real = _restrict_windows_after(real, "ref_dt", COUNT_FROM, COUNT_TO)

    real["customer_id"] = real["customer_id"].astype("string").str.strip()
    real["window_day"]  = real["ref_dt"].dt.normalize()

    real_win = real[["customer_id","window_day"]].dropna().drop_duplicates().reset_index(drop=True)
    return real, real_win

# ------------------------------ Run ------------------------------------
tx_all = _load_tx_retail(TX_PATH)
N = int(PARAMS["Actual"]["Number"])

sim_win = _simulate_hnr_in(tx_all, N)
real_full, real_win = _load_reales_hnr_in(ALERTS_PATH)

# Comparison: windows are matched on the (customer_id, window_day) pair.
sim_keys = set(sim_win[["customer_id","window_day"]].itertuples(index=False, name=None))
real_keys = set(real_win[["customer_id","window_day"]].itertuples(index=False, name=None))

only_real = real_keys.difference(sim_keys)
only_sim = sim_keys.difference(real_keys)

reales_sin_sim = pd.DataFrame(
    list(only_real), columns=["customer_id","window_day"]
).sort_values(["customer_id","window_day"])
sim_sin_reales = pd.DataFrame(
    list(only_sim), columns=["customer_id","window_day"]
).sort_values(["customer_id","window_day"])

# ------------------------------ Summary logs ---------------------------
print("=== HNR-IN — Resumen ===")
print(f"Alertas reales: {len(real_win):,}")
print(f"Alertas simuladas: {len(sim_win):,}")
print(f"Alertas reales que no están en la simulación: {len(reales_sin_sim):,}")
print(f"Alertas en la simulación que no saltaron realmente: {len(sim_sin_reales):,}")

# Real alerts without a simulated counterpart, shown with their alert detail.
print("\n--- Reales sin simulación (head) ---")
unmatched_real_detail = real_full.merge(reales_sin_sim, on=["customer_id","window_day"], how="inner")
display(unmatched_real_detail.head(20)[
    ["alert_id","customer_id","window_day","tx_date_time","created_at","tx_amount","tx_base_amount","tx_direction","tx_type"]
])

# Simulated windows that have no real alert.
print("\n--- Simulación sin reales (head) ---")
display(sim_sin_reales.head(20))


=== HNR-IN — Resumen ===
Alertas reales: 419
Alertas simuladas: 1,995
Alertas reales que no están en la simulación: 419
Alertas en la simulación que no saltaron realmente: 1,995

--- Reales sin simulación (head) ---


Unnamed: 0,alert_id,customer_id,window_day,tx_date_time,created_at,tx_amount,tx_base_amount,tx_direction,tx_type
0,80319,11624949,2025-02-21 00:00:00+00:00,2025-02-21 00:00:00+00:00,2025-03-03 12:16:29.539270+00:00,5000000.0,5000000.0,Inbound,Cash
1,80321,11624873,2025-02-21 00:00:00+00:00,2025-02-21 00:00:00+00:00,2025-03-03 12:16:29.539270+00:00,5000000.0,5000000.0,Inbound,Cash
2,80323,16606579,2025-02-21 00:00:00+00:00,2025-02-21 00:00:00+00:00,2025-03-03 12:16:29.539270+00:00,4500000.0,4500000.0,Inbound,Cash
3,80741,19959568,2025-02-21 00:00:00+00:00,2025-02-21 00:00:00+00:00,2025-03-03 13:14:53.772211+00:00,500000.0,500000.0,Inbound,Cash
4,80850,11624949,2025-02-24 00:00:00+00:00,2025-02-24 00:00:00+00:00,2025-03-04 12:24:25.577398+00:00,5000000.0,5000000.0,Inbound,Cash
5,80864,11624873,2025-02-24 00:00:00+00:00,2025-02-24 00:00:00+00:00,2025-03-04 12:24:25.577398+00:00,4800000.0,4800000.0,Inbound,Cash
6,80884,16606579,2025-02-24 00:00:00+00:00,2025-02-24 00:00:00+00:00,2025-03-04 12:24:25.577398+00:00,5000000.0,5000000.0,Inbound,Cash
7,80972,12855410,2025-02-24 00:00:00+00:00,2025-02-24 00:00:00+00:00,2025-03-04 12:27:07.763672+00:00,4000000.0,4000000.0,Inbound,Cash
8,81254,12855410,2025-02-25 00:00:00+00:00,2025-02-25 00:00:00+00:00,2025-03-05 12:20:11.208560+00:00,4000000.0,4000000.0,Inbound,Cash
9,81291,19959568,2025-02-25 00:00:00+00:00,2025-02-25 00:00:00+00:00,2025-03-05 12:20:47.462205+00:00,500000.0,500000.0,Inbound,Cash



--- Simulación sin reales (head) ---


Unnamed: 0,customer_id,window_day
1936,10210630,2025-08-05 00:00:00+00:00
1156,10268897,2025-05-08 00:00:00+00:00
292,10268897,2025-05-09 00:00:00+00:00
286,10268897,2025-05-10 00:00:00+00:00
1509,10268897,2025-05-11 00:00:00+00:00
1741,10268897,2025-05-12 00:00:00+00:00
645,10268897,2025-05-13 00:00:00+00:00
911,10268897,2025-05-14 00:00:00+00:00
904,10268897,2025-05-15 00:00:00+00:00
1473,10268897,2025-05-16 00:00:00+00:00
