# Definición parámetros

In [3]:
# === HNR-IN — Percentiles de Number (redondas & >1000 CLP) en la mejor ventana 30d ===
# LÓGICA EXACTA (para el dataset de parametrización):
#   Dirección = Inbound
#   Tipo      = Cash
#   "Round"   = tx_amount múltiplo exacto de 1.000 (en moneda original)
#   "High"    = tx_base_amount > 1.000 CLP
#   Métrica   = Por cliente, máximo # de tx en cualquier ventana de 30 días (timestamp a timestamp)
# Salida: percentiles de ese máximo (p90/p95/p97/p99) y sugerencia (ceil p95)

import pandas as pd, numpy as np, math

# -------- Parámetros editables --------
PATH = "../../data/tx/datos_trx__with_subsub.csv"  # <-- tu CSV
SUBSUBSEGMENTS = ["R-High"]              # <-- ajusta el sub-subsegmento
WINDOW_DAYS = 30
BASE_MIN_CLP = 1000
PCTS = [95, 97, 99]

# -------- Carga mínima --------
df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]   = pd.to_datetime(df.get("tx_date_time"), errors="coerce")
df["tx_amount"]      = pd.to_numeric(df.get("tx_amount"), errors="coerce")       # moneda original
df["tx_base_amount"] = pd.to_numeric(df.get("tx_base_amount"), errors="coerce")  # CLP base
df["tx_direction"]   = df.get("tx_direction", "").astype(str).str.title()
df["tx_type"]        = df.get("tx_type", "").astype(str).str.title()

if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

# -------- Filtros HNR-IN --------
is_round = np.isfinite(df["tx_amount"]) & np.isclose(df["tx_amount"] % 1000.0, 0.0, atol=1e-9)
m = (
    df["tx_direction"].eq("Inbound") &
    df["tx_type"].eq("Cash") &
    is_round &
    df["tx_base_amount"].gt(BASE_MIN_CLP) &
    df["tx_date_time"].notna() &
    df["customer_id"].notna()
)
g = df.loc[m, ["customer_id", "tx_date_time"]].sort_values(["customer_id","tx_date_time"]).copy()

def max_count_30d(ts: np.ndarray, days=30) -> int:
    """Devuelve el máximo conteo de timestamps dentro de cualquier ventana [t, t+days]."""
    if ts.size == 0: return 0
    ts = np.sort(ts)
    j = 0
    best = 0
    delta = np.timedelta64(days, "D")
    for i in range(ts.size):
        end = ts[i] + delta
        while j < ts.size and ts[j] <= end:
            j += 1
        best = max(best, j - i)
    return best

if g.empty:
    print("HNR-IN: no hay transacciones elegibles.")
else:
    # Máximo por cliente
    out_rows = []
    for cid, sub in g.groupby("customer_id", sort=False):
        times = sub["tx_date_time"].values
        out_rows.append({"customer_id": cid, "max_count_30d": max_count_30d(times, WINDOW_DAYS)})

    res = pd.DataFrame(out_rows)
    s = res["max_count_30d"].astype(float)

    stats = {f"p{p}": (float(np.percentile(s, p)) if len(s) else np.nan) for p in PCTS}
    recommended = int(math.ceil(stats["p95"])) if np.isfinite(stats.get("p95", np.nan)) else np.nan

    print("=== HNR-IN — Máximo # redondas & >1000 CLP en 30 días (por cliente) ===")
    print(f"Clientes con tx elegibles: {len(res):,}")
    for p in PCTS:
        v = stats[f"p{p}"]
        print(f"p{p:>2}: {v:.2f}" if np.isfinite(v) else f"p{p:>2}: NA")
    print(f"\nWhole Number recomendado (ceil p95): {recommended}")


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== HNR-IN — Máximo # redondas & >1000 CLP en 30 días (por cliente) ===
Clientes con tx elegibles: 154
p95: 4.35
p97: 5.00
p99: 5.47

Whole Number recomendado (ceil p95): 5


# Simulación alertas

In [6]:
# === HNR-IN — Simulación de alertas (Actual vs propuestos) =====================
# LÓGICA EXACTA:
# tx_direction = Inbound
# AND tx_type = Cash
# AND tx_base_amount > 1000
# AND mod( tx_amount [default: 0.0001] , 1000 ) = 0   (redonda en moneda original)
# AND count de tx por {customer_id & tx_direction} en 30 días > [Number]
# Unidad = ventanas cliente–día que cumplen

import pandas as pd, numpy as np
pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# ---- EDITA AQUÍ ----------------------------------------------------------------
PATH = "../../data/tx/transacciones_cash_2025__with_subsub.csv"
SUBSUBSEGMENTS = ["I-2"]               # <-- ajusta el sub-subsegmento
PARAMS = {
    "Actual": {"Number": 18},
    "p95":    {"Number": 15},
    "p97":    {"Number": 16},
    "p99":    {"Number": 17},
}
# -------------------------------------------------------------------------------

df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]   = pd.to_datetime(df.get("tx_date_time"), errors="coerce")
df["tx_amount"]      = pd.to_numeric(df.get("tx_amount"), errors="coerce")
df["tx_base_amount"] = pd.to_numeric(df.get("tx_base_amount"), errors="coerce")
df["tx_direction"]   = df.get("tx_direction","").astype(str).str.title()
df["tx_type"]        = df.get("tx_type","").astype(str).str.title()

# Filtrado por sub-subsegmento
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

# "Round" en moneda original con default 0.0001
amt_orig = df["tx_amount"].fillna(0.0001)
is_round = np.isfinite(amt_orig) & np.isclose(amt_orig % 1000, 0, atol=1e-9)

m = (
    df["tx_direction"].eq("Inbound") &
    df["tx_type"].eq("Cash") &
    df["tx_date_time"].notna() &
    df["customer_id"].notna() &
    is_round &
    (df["tx_base_amount"] > 1000)
)
g = df.loc[m, ["customer_id","tx_date_time"]].copy()

if g.empty:
    print("No hay transacciones elegibles para HNR-IN.")
else:
    parts=[]
    for cid, sub in g.groupby("customer_id", sort=False):
        # conteo diario de redondas high-value
        daily_cnt = (sub.set_index("tx_date_time")
                        .assign(x=1)["x"]
                        .resample("D").sum()
                        .fillna(0.0))
        CNT30 = daily_cnt.rolling("30D").sum()
        parts.append(pd.DataFrame({"customer_id": cid, "date": CNT30.index, "CNT30": CNT30.values}))
    M = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=["customer_id","date","CNT30"])

    order = ["Actual","p90","p95","p97","p99"]
    param_tbl = (pd.DataFrame(PARAMS).T
                   .loc[[k for k in order if k in PARAMS]]
                   .rename_axis("escenario")
                   .reset_index())
    print("=== HNR-IN — Parámetros (Number) ==="); display(param_tbl)

    counts={}
    for k,v in PARAMS.items():
        N = v["Number"]
        m_ok = (M["CNT30"] > N)
        counts[k] = int(M.loc[m_ok, ["customer_id","date"]].drop_duplicates().shape[0])

    out = pd.DataFrame([{
        "alertas_actual": counts.get("Actual",0),
        "alertas_p90":    counts.get("p90",0),
        "alertas_p95":    counts.get("p95",0),
        "alertas_p97":    counts.get("p97",0),
        "alertas_p99":    counts.get("p99",0),
    }])
    print("=== HNR-IN — Alertas por escenario (ventanas cliente–día) ==="); display(out)


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== HNR-IN — Parámetros (Number) ===


Unnamed: 0,escenario,Number
0,Actual,18
1,p95,15
2,p97,16
3,p99,17


=== HNR-IN — Alertas por escenario (ventanas cliente–día) ===


Unnamed: 0,alertas_actual,alertas_p90,alertas_p95,alertas_p97,alertas_p99
0,0,0,14,4,2


In [3]:
# === HNR-IN — Simulación vs Reales (ventanas desde 2025-03-01) ================
# Lógica simulación:
#   INBOUND + CASH + tx_base_amount > 1000 + monto original "redondo" (mod 1000 = 0)
#   CNT30 (rolling 30D por cliente) > Number
# Conteo de ventanas SOLO desde 2025-03-01; el rolling usa TODO el historial.

import pandas as pd
import numpy as np

pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# ------------------------------ Config ---------------------------------
TX_PATH      = "../../data/tx/datos_trx__with_subsub_oficial.csv"
ALERTS_PATH  = "../../data/alertas marzo-agosto 2025 enriched.csv"
CUSTOMER_SUB_TYPE = "Retail"

COUNT_FROM   = pd.Timestamp("2025-03-01", tz="UTC")
COUNT_TO     = None  # p.ej., pd.Timestamp("2025-08-31 23:59:59", tz="UTC") si quieres acotar

PARAMS = {"Actual": {"Number": 6}}  # umbral CNT30 > 18

# ------------------------------ Helpers --------------------------------
def _to_dt_utc(s):
    return pd.to_datetime(s, errors="coerce", utc=True)

def _is_round_thousands(series_amount):
    amt = pd.to_numeric(series_amount, errors="coerce").fillna(0.0001)
    return np.isfinite(amt) & np.isclose(amt % 1000, 0, atol=1e-9)

def _restrict_windows_after(df, date_col="date", count_from=COUNT_FROM, count_to=COUNT_TO):
    m = df[date_col] >= count_from
    if count_to is not None:
        m &= (df[date_col] <= count_to)
    return df.loc[m].copy()

def _load_tx_retail(path):
    tx = pd.read_csv(path, dtype={"customer_id":"string"}, encoding="utf-8-sig", low_memory=False)
    tx["customer_sub_type"] = tx.get("customer_sub_type","").astype(str)
    tx = tx[tx["customer_sub_type"] == CUSTOMER_SUB_TYPE].copy()

    tx["tx_direction"]   = tx.get("tx_direction","").astype(str).str.title()
    tx["tx_type"]        = tx.get("tx_type","").astype(str).str.title()
    tx["tx_amount"]      = pd.to_numeric(tx.get("tx_amount"), errors="coerce")
    tx["tx_base_amount"] = pd.to_numeric(tx.get("tx_base_amount"), errors="coerce")
    tx["tx_date_time"]   = _to_dt_utc(tx.get("tx_date_time"))
    tx["customer_id"]    = tx["customer_id"].astype("string")
    return tx

def _simulate_hnr_in(tx, number_threshold):
    elig = (
        tx["tx_direction"].eq("Inbound") &
        tx["tx_type"].eq("Cash") &
        tx["tx_date_time"].notna() &
        tx["customer_id"].notna() &
        (tx["tx_base_amount"] > 1000) &
        _is_round_thousands(tx["tx_amount"])
    )
    g = tx.loc[elig, ["customer_id","tx_date_time"]].copy()
    if g.empty:
        return pd.DataFrame(columns=["customer_id","window_day"])

    parts = []
    for cid, sub in g.groupby("customer_id", sort=False):
        daily_cnt = (sub.set_index("tx_date_time")
                        .assign(x=1)["x"]
                        .resample("D").sum()
                        .fillna(0.0))
        cnt30 = daily_cnt.rolling("30D").sum()
        parts.append(pd.DataFrame({"customer_id": cid, "date": cnt30.index, "CNT30": cnt30.values}))

    M = pd.concat(parts, ignore_index=True)
    M = _restrict_windows_after(M, "date", COUNT_FROM, COUNT_TO)
    M = M.loc[M["CNT30"] > number_threshold, ["customer_id","date"]].drop_duplicates()
    M["window_day"] = pd.to_datetime(M["date"]).dt.normalize()
    return M[["customer_id","window_day"]]

def _load_reales_hnr_in(path):
    dtypes = {"alert_id":"string","rule_code":"string","customer_id":"string"}
    real = pd.read_csv(path, dtype=dtypes, encoding="utf-8-sig", low_memory=False)
    real["rule_code"] = real["rule_code"].astype(str).str.strip()
    real = real[(real["rule_code"] == "HNR-IN") & (real.get("customer_sub_type","").astype(str) == CUSTOMER_SUB_TYPE)].copy()

    real["tx_date_time"] = _to_dt_utc(real.get("tx_date_time"))
    real["created_at"]   = _to_dt_utc(real.get("created_at"))
    real["ref_dt"]       = real["tx_date_time"].where(real["tx_date_time"].notna(), real["created_at"])

    m = real["ref_dt"] >= COUNT_FROM
    if COUNT_TO is not None:
        m &= (real["ref_dt"] <= COUNT_TO)
    real = real.loc[m].copy()

    real["customer_id"] = real["customer_id"].astype("string").str.strip()
    real["window_day"]  = real["ref_dt"].dt.normalize()

    real_win = real[["customer_id","window_day"]].dropna().drop_duplicates().reset_index(drop=True)
    return real, real_win

# ------------------------------ Run ------------------------------------
tx_all = _load_tx_retail(TX_PATH)
N = int(PARAMS["Actual"]["Number"])

sim_win = _simulate_hnr_in(tx_all, N)
real_full, real_win = _load_reales_hnr_in(ALERTS_PATH)

# Comparación
sim_keys  = set(map(tuple, sim_win[["customer_id","window_day"]].itertuples(index=False, name=None)))
real_keys = set(map(tuple, real_win[["customer_id","window_day"]].itertuples(index=False, name=None)))

only_real = real_keys - sim_keys
only_sim  = sim_keys - real_keys

reales_sin_sim = pd.DataFrame(list(only_real), columns=["customer_id","window_day"]).sort_values(["customer_id","window_day"])
sim_sin_reales = pd.DataFrame(list(only_sim), columns=["customer_id","window_day"]).sort_values(["customer_id","window_day"])

# ------------------------------ Logs claros ----------------------------
print("=== HNR-IN — Resumen ===")
print(f"Alertas reales: {len(real_win):,}")
print(f"Alertas simuladas: {len(sim_win):,}")
print(f"Alertas reales que no están en la simulación: {len(reales_sin_sim):,}")
print(f"Alertas en la simulación que no saltaron realmente: {len(sim_sin_reales):,}")

print("\n--- Reales sin simulación (head) ---")
display(real_full.merge(reales_sin_sim, on=["customer_id","window_day"], how="inner").head(20)[
    ["alert_id","customer_id","window_day","tx_date_time","created_at","tx_amount","tx_base_amount","tx_direction","tx_type"]
])

print("\n--- Simulación sin reales (head) ---")
display(sim_sin_reales.head(20))


=== HNR-IN — Resumen ===
Alertas reales: 390
Alertas simuladas: 1,853
Alertas reales que no están en la simulación: 390
Alertas en la simulación que no saltaron realmente: 1,853

--- Reales sin simulación (head) ---


Unnamed: 0,alert_id,customer_id,window_day,tx_date_time,created_at,tx_amount,tx_base_amount,tx_direction,tx_type
0,83306,19244042,2025-03-03 00:00:00+00:00,2025-03-03 00:00:00+00:00,2025-03-11 12:36:25.387345+00:00,5000000.0,5000000.0,Inbound,Cash
1,83323,9002247,2025-03-03 00:00:00+00:00,2025-03-03 00:00:00+00:00,2025-03-11 12:36:29.252635+00:00,7000000.0,7000000.0,Inbound,Cash
2,83358,13952685,2025-03-03 00:00:00+00:00,2025-03-03 00:00:00+00:00,2025-03-11 12:36:38.068652+00:00,150000.0,150000.0,Inbound,Cash
3,83368,10920013,2025-03-03 00:00:00+00:00,2025-03-03 00:00:00+00:00,2025-03-11 12:36:38.068652+00:00,950000.0,950000.0,Inbound,Cash
4,83587,24654765,2025-03-03 00:00:00+00:00,2025-03-03 00:00:00+00:00,2025-03-11 12:47:02.055679+00:00,20000.0,20000.0,Inbound,Cash
5,83626,19959568,2025-03-03 00:00:00+00:00,2025-03-03 00:00:00+00:00,2025-03-11 12:50:30.969628+00:00,150000.0,150000.0,Inbound,Cash
6,83637,11624949,2025-03-03 00:00:00+00:00,2025-03-03 00:00:00+00:00,2025-03-11 12:50:41.586388+00:00,2500000.0,2500000.0,Inbound,Cash
7,83759,9002247,2025-03-04 00:00:00+00:00,2025-03-04 00:00:00+00:00,2025-03-12 12:27:52.991638+00:00,6500000.0,6500000.0,Inbound,Cash
8,83766,24654765,2025-03-04 00:00:00+00:00,2025-03-04 00:00:00+00:00,2025-03-12 12:27:52.991638+00:00,10000.0,10000.0,Inbound,Cash
9,83767,15385626,2025-03-04 00:00:00+00:00,2025-03-04 00:00:00+00:00,2025-03-12 12:27:52.991638+00:00,"2000000.0, 2500000.0","2000000.0, 2500000.0",Inbound,Cash



--- Simulación sin reales (head) ---


Unnamed: 0,customer_id,window_day
1797,10210630,2025-08-05 00:00:00+00:00
1081,10268897,2025-05-08 00:00:00+00:00
277,10268897,2025-05-09 00:00:00+00:00
271,10268897,2025-05-10 00:00:00+00:00
1410,10268897,2025-05-11 00:00:00+00:00
1622,10268897,2025-05-12 00:00:00+00:00
603,10268897,2025-05-13 00:00:00+00:00
851,10268897,2025-05-14 00:00:00+00:00
844,10268897,2025-05-15 00:00:00+00:00
1377,10268897,2025-05-16 00:00:00+00:00
