# Definición parámetros

In [None]:
# === HNR-IN — Percentiles de Number (redondas & >1000 CLP) en la mejor ventana 30d ===
# LÓGICA EXACTA (para el dataset de parametrización):
#   Dirección = Inbound
#   Tipo      = Cash
#   "Round"   = tx_amount múltiplo exacto de 1.000 (en moneda original)
#   "High"    = tx_base_amount > 1.000 CLP
#   Métrica   = Por cliente, máximo # de tx en cualquier ventana de 30 días (timestamp a timestamp)
# Salida: percentiles de ese máximo (p90/p95/p97/p99) y sugerencia (ceil p95)

import pandas as pd, numpy as np, math

# -------- Parámetros editables --------
PATH = "../../data/tx/transacciones_cash_2025__with_subsub.csv"  # <-- tu CSV
SUBSUBSEGMENTS = ["I-1", "I-2"]              # <-- ajusta el sub-subsegmento
WINDOW_DAYS = 30
BASE_MIN_CLP = 1000
PCTS = [95, 97, 99]

# -------- Carga mínima --------
df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]   = pd.to_datetime(df.get("tx_date_time"), errors="coerce")
df["tx_amount"]      = pd.to_numeric(df.get("tx_amount"), errors="coerce")       # moneda original
df["tx_base_amount"] = pd.to_numeric(df.get("tx_base_amount"), errors="coerce")  # CLP base
df["tx_direction"]   = df.get("tx_direction", "").astype(str).str.title()
df["tx_type"]        = df.get("tx_type", "").astype(str).str.title()

if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

# -------- Filtros HNR-IN --------
is_round = np.isfinite(df["tx_amount"]) & np.isclose(df["tx_amount"] % 1000.0, 0.0, atol=1e-9)
m = (
    df["tx_direction"].eq("Inbound") &
    df["tx_type"].eq("Cash") &
    is_round &
    df["tx_base_amount"].gt(BASE_MIN_CLP) &
    df["tx_date_time"].notna() &
    df["customer_id"].notna()
)
g = df.loc[m, ["customer_id", "tx_date_time"]].sort_values(["customer_id","tx_date_time"]).copy()

def max_count_30d(ts: np.ndarray, days=30) -> int:
    """Devuelve el máximo conteo de timestamps dentro de cualquier ventana [t, t+days]."""
    if ts.size == 0: return 0
    ts = np.sort(ts)
    j = 0
    best = 0
    delta = np.timedelta64(days, "D")
    for i in range(ts.size):
        end = ts[i] + delta
        while j < ts.size and ts[j] <= end:
            j += 1
        best = max(best, j - i)
    return best

if g.empty:
    print("HNR-IN: no hay transacciones elegibles.")
else:
    # Máximo por cliente
    out_rows = []
    for cid, sub in g.groupby("customer_id", sort=False):
        times = sub["tx_date_time"].values
        out_rows.append({"customer_id": cid, "max_count_30d": max_count_30d(times, WINDOW_DAYS)})

    res = pd.DataFrame(out_rows)
    s = res["max_count_30d"].astype(float)

    stats = {f"p{p}": (float(np.percentile(s, p)) if len(s) else np.nan) for p in PCTS}
    recommended = int(math.ceil(stats["p95"])) if np.isfinite(stats.get("p95", np.nan)) else np.nan

    print("=== HNR-IN — Máximo # redondas & >1000 CLP en 30 días (por cliente) ===")
    print(f"Clientes con tx elegibles: {len(res):,}")
    for p in PCTS:
        v = stats[f"p{p}"]
        print(f"p{p:>2}: {v:.2f}" if np.isfinite(v) else f"p{p:>2}: NA")
    print(f"\nWhole Number recomendado (ceil p95): {recommended}")


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== HNR-IN — Máximo # redondas & >1000 CLP en 30 días (por cliente) ===
Clientes con tx elegibles: 17
p95: 14.80
p97: 16.08
p99: 17.36

Whole Number recomendado (ceil p95): 15


# Simulación alertas

In [6]:
# === HNR-IN — Simulación de alertas (Actual vs propuestos) =====================
# LÓGICA EXACTA:
# tx_direction = Inbound
# AND tx_type = Cash
# AND tx_base_amount > 1000
# AND mod( tx_amount [default: 0.0001] , 1000 ) = 0   (redonda en moneda original)
# AND count de tx por {customer_id & tx_direction} en 30 días > [Number]
# Unidad = ventanas cliente–día que cumplen

import pandas as pd, numpy as np
pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# ---- EDITA AQUÍ ----------------------------------------------------------------
PATH = "../../data/tx/transacciones_cash_2025__with_subsub.csv"
SUBSUBSEGMENTS = ["I-2"]               # <-- ajusta el sub-subsegmento
PARAMS = {
    "Actual": {"Number": 18},
    "p95":    {"Number": 15},
    "p97":    {"Number": 16},
    "p99":    {"Number": 17},
}
# -------------------------------------------------------------------------------

df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]   = pd.to_datetime(df.get("tx_date_time"), errors="coerce")
df["tx_amount"]      = pd.to_numeric(df.get("tx_amount"), errors="coerce")
df["tx_base_amount"] = pd.to_numeric(df.get("tx_base_amount"), errors="coerce")
df["tx_direction"]   = df.get("tx_direction","").astype(str).str.title()
df["tx_type"]        = df.get("tx_type","").astype(str).str.title()

# Filtrado por sub-subsegmento
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

# "Round" en moneda original con default 0.0001
amt_orig = df["tx_amount"].fillna(0.0001)
is_round = np.isfinite(amt_orig) & np.isclose(amt_orig % 1000, 0, atol=1e-9)

m = (
    df["tx_direction"].eq("Inbound") &
    df["tx_type"].eq("Cash") &
    df["tx_date_time"].notna() &
    df["customer_id"].notna() &
    is_round &
    (df["tx_base_amount"] > 1000)
)
g = df.loc[m, ["customer_id","tx_date_time"]].copy()

if g.empty:
    print("No hay transacciones elegibles para HNR-IN.")
else:
    parts=[]
    for cid, sub in g.groupby("customer_id", sort=False):
        # conteo diario de redondas high-value
        daily_cnt = (sub.set_index("tx_date_time")
                        .assign(x=1)["x"]
                        .resample("D").sum()
                        .fillna(0.0))
        CNT30 = daily_cnt.rolling("30D").sum()
        parts.append(pd.DataFrame({"customer_id": cid, "date": CNT30.index, "CNT30": CNT30.values}))
    M = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=["customer_id","date","CNT30"])

    order = ["Actual","p90","p95","p97","p99"]
    param_tbl = (pd.DataFrame(PARAMS).T
                   .loc[[k for k in order if k in PARAMS]]
                   .rename_axis("escenario")
                   .reset_index())
    print("=== HNR-IN — Parámetros (Number) ==="); display(param_tbl)

    counts={}
    for k,v in PARAMS.items():
        N = v["Number"]
        m_ok = (M["CNT30"] > N)
        counts[k] = int(M.loc[m_ok, ["customer_id","date"]].drop_duplicates().shape[0])

    out = pd.DataFrame([{
        "alertas_actual": counts.get("Actual",0),
        "alertas_p90":    counts.get("p90",0),
        "alertas_p95":    counts.get("p95",0),
        "alertas_p97":    counts.get("p97",0),
        "alertas_p99":    counts.get("p99",0),
    }])
    print("=== HNR-IN — Alertas por escenario (ventanas cliente–día) ==="); display(out)


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


=== HNR-IN — Parámetros (Number) ===


Unnamed: 0,escenario,Number
0,Actual,18
1,p95,15
2,p97,16
3,p99,17


=== HNR-IN — Alertas por escenario (ventanas cliente–día) ===


Unnamed: 0,alertas_actual,alertas_p90,alertas_p95,alertas_p97,alertas_p99
0,0,0,14,4,2
