In [1]:
# === P-1st — Amount percentiles (first transaction within WINDOW_DAYS days of account opening) ===
# Purpose: for the selected sub-sub-segment(s), take each customer's earliest eligible
# transaction, keep those that happened 0..WINDOW_DAYS calendar days after the account
# was opened, and report the percentiles of their amount.
import pandas as pd
import numpy as np

pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

# Parameters
PATH = "../../data/tx/transacciones_cash_2025__with_subsub.csv"
SUBSUBSEGMENTS = "I-1"   # a single label (str) or an iterable of labels
PCTS = [0.85, 0.90, 0.95, 0.97, 0.99]
FILTER_TO_CASH = True
WINDOW_DAYS = 7   # inclusive upper bound; day 0 (the opening day) also counts

# --- Load ---
# low_memory=False makes pandas infer each column's dtype from the whole file in one
# pass. NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like a mixed-dtype warning from chunked inference — confirm it is gone.
df = pd.read_csv(PATH, dtype={"customer_id": "string"}, encoding="utf-8-sig", low_memory=False)
df["tx_date_time"] = pd.to_datetime(df["tx_date_time"], errors="coerce")
df["customer_account_creation_date"] = pd.to_datetime(df["customer_account_creation_date"], errors="coerce")
df["tx_base_amount"] = pd.to_numeric(df["tx_base_amount"], errors="coerce")

# Accept either one label or a collection of labels.
if isinstance(SUBSUBSEGMENTS, str):
    target_labels = {SUBSUBSEGMENTS}
else:
    target_labels = set(map(str, SUBSUBSEGMENTS))

df = df[df["customer_sub_sub_type"].astype(str).isin(target_labels)].copy()

# Title-case tx_type so the comparison against "Cash" below ignores letter case.
if FILTER_TO_CASH and "tx_type" in df.columns:
    df["tx_type"] = df["tx_type"].astype(str).str.title()

# Key used to decide which rows belong to the same customer. The identifier is
# preferred over the display name so that two customers who share a name are not
# merged into one; the name is the fallback when the file has no customer_id column.
CUSTOMER_KEY = "customer_id" if "customer_id" in df.columns else "customer_name"

# --- Basic filtering (parsable dates and amounts) ---
m = df["tx_date_time"].notna() & df["customer_account_creation_date"].notna() & df["tx_base_amount"].notna()
if FILTER_TO_CASH and "tx_type" in df.columns:
    m &= df["tx_type"].eq("Cash")

# dict.fromkeys de-duplicates while keeping order (CUSTOMER_KEY may be customer_name).
needed_cols = list(dict.fromkeys(
    [CUSTOMER_KEY, "customer_name", "tx_date_time", "customer_account_creation_date", "tx_base_amount"]
))
g = df.loc[m, needed_cols].copy()
if g.empty:
    print("No hay transacciones elegibles para P-1st.")
else:
    # Drop the time of day from both timestamps so the difference is in calendar days.
    g["tx_date"]   = g["tx_date_time"].dt.normalize()
    g["open_date"] = g["customer_account_creation_date"].dt.normalize()

    # Earliest eligible transaction of every customer (sorted by full timestamp).
    first = g.sort_values([CUSTOMER_KEY, "tx_date_time"]).groupby(CUSTOMER_KEY).head(1).copy()

    # Calendar days since opening (0 = same day as the opening).
    first["days_since_open"] = (first["tx_date"] - first["open_date"]).dt.days

    # Keep 0..WINDOW_DAYS inclusive and strictly positive amounts.
    within = first["days_since_open"].between(0, WINDOW_DAYS, inclusive="both") & (first["tx_base_amount"] > 0)
    first_in_window = first.loc[within].copy()

    # Report which customers qualify.
    if first_in_window.empty:
        # The message follows WINDOW_DAYS instead of hard-coding "7".
        print(f"No hay primeras transacciones dentro de los {WINDOW_DAYS} días (incluyendo el día de apertura).")
    else:
        first_in_window = first_in_window.rename(columns={
            "customer_name": "Customer",
            "open_date": "Account_Open_Date",
            "tx_date_time": "First_Tx_Timestamp",
            "tx_base_amount": "Amount_CLP"
        })[["Customer","Account_Open_Date","First_Tx_Timestamp","days_since_open","Amount_CLP"]] \
          .sort_values(["days_since_open","Amount_CLP"], ascending=[True, False])

        print(f"Clientes con 1ª transacción dentro de {WINDOW_DAYS} días (incluye día 0): {len(first_in_window):,}\n")
        display(first_in_window)

        # Amount percentiles. quantile() returns one value per requested level, in the
        # order of PCTS, so the values can be taken positionally.
        q = first_in_window["Amount_CLP"].astype(float).quantile(PCTS)
        out = pd.DataFrame({
            # round() before formatting: int(p*100) truncates, so e.g. 0.57 would be
            # labelled "p56" and 0.995 would collide with 0.99.
            "percentil": [f"p{round(p * 100, 6):g}" for p in PCTS],
            "Amount_CLP": q.to_numpy()
        })
        print(f"\n=== P-1st — Percentiles (n={len(first_in_window):,}) ===")
        display(out)


  df = pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")


No hay primeras transacciones dentro de los 7 días (incluyendo el día de apertura).


# Simulación de alertas

In [5]:
# === P-1st — Sensitivity (current threshold vs. proposed thresholds) ===
# Rule as implemented below; the unit counted is the qualifying (first) transaction:
#   0 <= calendar days from customer_account_creation_date to tx_date_time <= [Days]
#        (day 0, the opening day, is included)
#   AND the transaction is the customer's first one (earliest tx_date_time per customer_id)
#   AND tx_base_amount > [Amount]

import pandas as pd
import numpy as np
pd.set_option("display.float_format", lambda x: f"{x:,.0f}")

PATH="../../data/tx_retail_whale.csv"
# One entry per scenario: day window (inclusive) and amount threshold (exclusive).
PARAMS={
    "Actual":{"Days":7, "Amount":990_300_000},
    "p95":   {"Days":7, "Amount":202_000_000},
    "p97":   {"Days":7, "Amount":203_200_000},
    "p99":   {"Days":7, "Amount":204_400_000},
}

df=pd.read_csv(PATH, dtype={"customer_id":"string"}, encoding="utf-8-sig")
df["tx_date_time"]=pd.to_datetime(df["tx_date_time"], errors="coerce")
df["customer_account_creation_date"]=pd.to_datetime(df["customer_account_creation_date"], errors="coerce")
df["tx_base_amount"]=pd.to_numeric(df["tx_base_amount"], errors="coerce")

# Keep rows whose dates and amount parsed and that can be attributed to a customer.
valid = (
    df["customer_id"].notna()
    & df["tx_date_time"].notna()
    & df["customer_account_creation_date"].notna()
    & df["tx_base_amount"].notna()
)
g=df[valid].copy()
g=g.sort_values(["customer_id","tx_date_time"])
# 1 = the customer's earliest transaction, 2 = the next one, ...
g["tx_order"]=g.groupby("customer_id").cumcount()+1
# Elapsed time in fractional days (kept for inspection; not used by the rule below).
g["days_from_open"]= (g["tx_date_time"] - g["customer_account_creation_date"]).dt.total_seconds()/86400.0
# Calendar-day difference, the same definition the percentile cell uses to pick its
# population. With fractional days a transaction on day 7 at 10:00, or one on the
# opening day stamped earlier than the creation time, would fall outside the window.
g["days_since_open"]=(
    g["tx_date_time"].dt.normalize() - g["customer_account_creation_date"].dt.normalize()
).dt.days

param_tbl=pd.DataFrame(PARAMS).T.rename_axis("escenario").reset_index()
print("=== P-1st — Parámetros ==="); display(param_tbl)

# Count qualifying first transactions for every scenario.
counts={}
for k,v in PARAMS.items():
    D, A = v["Days"], v["Amount"]
    m=(g["tx_order"].eq(1) & g["days_since_open"].between(0, D, inclusive="both") & (g["tx_base_amount"]>A))
    counts[k]=int(m.sum())

# One column per scenario, derived from PARAMS so that adding or renaming a scenario
# needs no further edits here.
out=pd.DataFrame([{f"alertas_{name.lower()}": n for name, n in counts.items()}])
print("=== P-1st — Alertas por escenario (tx) ==="); display(out)


=== P-1st — Parámetros ===


Unnamed: 0,escenario,Days,Amount
0,Actual,7,990300000
1,p95,7,202000000
2,p97,7,203200000
3,p99,7,204400000


=== P-1st — Alertas por escenario (tx) ===


Unnamed: 0,alertas_actual,alertas_p95,alertas_p97,alertas_p99
0,0,0,0,0
