### Init Context

In [15]:
import logging
import uuid
import random
import numpy as np
import pandas as pd

import yaml
from datetime import datetime
from faker import Faker
from thetaray.api.context import init_context
from pyspark.sql import functions as f
from thetaray.common.data_environment import DataEnvironment


# Logging configuration: DEBUG level, message text only (no timestamp/level prefix)
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

# pandas configuration: never truncate columns when displaying a DataFrame
pd.set_option('display.max_columns', None)

# Load the Spark configuration and keep only the 'spark_config_a' profile.
# NOTE(review): yaml.FullLoader can construct more than plain data types; if this
# file only holds plain mappings/scalars, yaml.safe_load would be the safer call — confirm.
with open('/thetaray/git/solutions/domains/demo_digital_wallets/config/spark_config.yaml') as spark_config_file:
    spark_config = yaml.load(spark_config_file, yaml.FullLoader)['spark_config_a']


# Initialize the platform context used by the rest of the notebook
context = init_context(
    execution_date=datetime(1970, 2, 1),
    #spark_conf=spark_config,
    spark_conf=spark_config, # remove
    # spark_master='local[*]', # remove
    allow_type_changes=True,
    drop_undefined_datasets=True,
    delete_unused_columns=True
)

2025-08-19 09:49:53,277:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-08-19 09:49:53,667:INFO:thetaray.common.logging:load_risks took: 0.18817996978759766
2025-08-19 09:49:54,269:INFO:thetaray.common.logging:=== Started updating schema ===
2025-08-19 09:49:54,328:INFO:thetaray.common.logging:=== Started updating schema on Postgres ===
2025-08-19 09:50:09,971:INFO:thetaray.common.logging:found 161 tables in solution public schema
2025-08-19 09:50:09,983:INFO:thetaray.common.logging:found 161 tables in solution public schema
2025-08-19 09:50:09,992:INFO:thetaray.common.logging:found 161 tables in solution public schema
2025-08-19 09:50:10,002:INFO:thetaray.common.logging:found 161 tables in solution public schema
2025-08-19 09:50:10,012:INFO:thetaray.common.logging:found 161 tables in solution public schema
2025-08-19 09:50:10,020:INFO:thetaray.common.loggin

Added `alias` successfully.


2025-08-19 09:50:17,785:INFO:thetaray.common.logging:=== Finished updating schema for Evaluation Flows on Minio ===


Added `alias` successfully.


### Imports

In [16]:
from thetaray.api.dataset import dataset_functions

from domains.demo_digital_wallets.datasets.transactions import transactions_dataset
from domains.demo_digital_wallets.datasets.customer_monthly import customer_monthly_dataset
from domains.demo_digital_wallets.datasets.customer_insights import customer_insights_dataset 
from domains.demo_digital_wallets.datasets.customers import customers_dataset 

# Data Gen

### 1. Transactions Generation

In [17]:
import random
import string
from datetime import datetime, timedelta
from typing import Optional
import numpy as np
import pandas as pd
from itertools import count

In [18]:
from __future__ import annotations

import numpy as np
import pandas as pd
import random
import string
from itertools import count
from datetime import datetime, timedelta
from typing import Optional, Dict, List

# ---------- EU configuration ----------
# ISO 3166-1 alpha-2 codes of the 27 EU member states
EU_COUNTRIES = [
    "AT","BE","BG","HR","CY","CZ","DK","EE","FI","FR","DE",
    "GR","HU","IE","IT","LV","LT","LU","MT","NL","PL","PT",
    "RO","SK","SI","ES","SE"
]
# E.164 dialing prefixes per country (EU-27)
EU_DIAL = {
    "AT": "+43","BE": "+32","BG": "+359","HR": "+385","CY": "+357","CZ": "+420",
    "DK": "+45","EE": "+372","FI": "+358","FR": "+33","DE": "+49","GR": "+30",
    "HU": "+36","IE": "+353","IT": "+39","LV": "+371","LT": "+370","LU": "+352",
    "MT": "+356","NL": "+31","PL": "+48","PT": "+351","RO": "+40","SK": "+421",
    "SI": "+386","ES": "+34","SE": "+46"
}

# Name pools; _rand_name() combines one entry from each list
FIRST_NAMES = ["Maria","Jean","Anna","Pietro","Lukas","Beatriz","Gabriel","Marianne",
               "Rafael","Stefany","Luisa","Felipe","Bruna","Thierry","Carla"]
LAST_NAMES  = ["Garcia","Dubois","Romano","Muller","Fernandes","Rossi","Schmidt","Moreau",
               "Martinez","da Costa","Bernard","Fischer","Rodriguez","Ricci"]

# Categorical pools for generated transactions.
# Order matters for MERCHANT_CATS and PAYMENT_METHODS: the baseline generator slices
# them with [:-1], so the LAST entry (the crypto one) is excluded from normal traffic.
MERCHANT_CATS = ["Grocery","Electronics","Restaurants","Transport","Clothing","Utilities","Streaming","Crypto Exchange"]
PAYMENT_METHODS = ["wallet_balance","credit_card","bank_transfer","pix","crypto_wallet"]
FREE_TEXT = ["Dinner","Gift","Rent","Taxi","Groceries","Coffee","Lunch","Books","Cinema","Gym","Subscription","Pet supplies","School"]

# ---------- Utilidades ----------
def _rand_name() -> str:
    """Return a random full name: one FIRST_NAMES entry, a space, one LAST_NAMES entry."""
    # Draw order (first name, then last name) is kept so seeded runs are unchanged.
    given = random.choice(FIRST_NAMES)
    family = random.choice(LAST_NAMES)
    return given + " " + family

def _rand_phone_for_country(iso2: str, min_digits: int = 8, max_digits: int = 10) -> str:
    """Build a simple E.164-style number: +<country prefix><random digits>.

    Args:
        iso2: Two-letter country code looked up in EU_DIAL; unknown codes fall
            back to the "+33" prefix.
        min_digits: Minimum number of digits after the prefix (inclusive).
        max_digits: Maximum number of digits after the prefix (inclusive).

    Returns:
        The prefix followed by a uniformly random digit string.
    """
    dial_code = EU_DIAL.get(iso2, "+33")
    length = random.randint(min_digits, max_digits)
    subscriber = random.choices(string.digits, k=length)
    return dial_code + "".join(subscriber)

def _rand_eu_phone_and_country() -> tuple[str, str]:
    """Pick a random EU country and generate a phone number for it.

    Returns:
        A ``(phone, iso2)`` pair: the generated phone string and the two-letter
        code of the country it was generated for.
    """
    # The country is drawn first, then the phone digits; keep this order so
    # seeded runs reproduce the same sequence.
    iso2 = random.choice(EU_COUNTRIES)
    return _rand_phone_for_country(iso2), iso2

def _dt_month_starts(end_date: datetime, months_total: int) -> List[datetime]:
    anchor = datetime(end_date.year, end_date.month, 1)
    month_starts = [(anchor - pd.DateOffset(months=m)).to_pydatetime()
                    for m in range(months_total - 1, -1, -1)]
    return month_starts

# ---------- Función principal ----------
def generate_fake_transactions(
    n_clients: int = 300,
    avg_txns_per_active_month: float = 4.0,
    active_months_per_client: int = 12,
    months_total: int = 18,
    end_date: Optional[datetime] = None,
    seed: Optional[int] = 42,
    currency: str = "EUR",

    # --- Anomalies that drive features (toggle + share of clients affected) ---
    trigger_struct: bool = True,            pct_struct_clients: float = 0.12, reporting_threshold: float = 1000.0,
    trigger_rapid: bool = True,             pct_rapid_clients: float = 0.10,
    trigger_crypto: bool = True,            pct_crypto_clients: float = 0.10,
    trigger_mto: bool = True,               pct_mto_clients: float = 0.25,
    trigger_pct_domestic: bool = True,      pct_international_clients: float = 0.12,

    # --- Normality controls (act_spike and rev_ratio are not meant to trigger) ---
    base_amount_scale: float = 250.0,   # scale of "normal" amounts
    reversal_rate_low: float = 0.005,   # low, constant reversal probability
) -> pd.DataFrame:
    """Generate a synthetic transactions DataFrame for an EU digital wallet.

    Every client gets baseline traffic in each of its active months. Clients
    selected for an anomaly group additionally get that anomaly's transactions
    in every active month:

    - structuring: 3-6 inflows just below ``reporting_threshold``;
    - rapid spend: one large inflow followed 10-180 minutes later by an outflow
      of 82-100% of it;
    - crypto: 1-3 transactions with merchant category "Crypto Exchange";
    - many-to-one: one outflow to a hub counterparty fixed per client;
    - international: a lower per-transaction probability of a domestic destination.

    Args:
        n_clients: Number of unique clients. Client ids are generated phone strings.
        avg_txns_per_active_month: Mean of the per-client Poisson rate for
            baseline transactions in an active month.
        active_months_per_client: Active months per client (capped at ``months_total``).
        months_total: Length of the window, in months.
        end_date: Any moment in the last month of the window. Defaults to
            ``datetime.utcnow()``.
        seed: If not None, seeds BOTH the global ``random`` and ``numpy.random``
            generators (a side effect visible to the rest of the notebook).
        currency: Value written to the ``currency`` column.
        trigger_* / pct_*: Toggle and share of clients for each anomaly group.
            When a toggle is on and there is at least one client, at least one
            client is selected even if the share rounds down to zero; the share
            is capped at all clients.
        reporting_threshold: Threshold the structuring inflows stay just under.
        base_amount_scale: Scale of baseline amounts.
        reversal_rate_low: Probability that a baseline transaction is a reversal.

    Returns:
        DataFrame sorted by ``transaction_datetime`` with exactly these columns:
        transaction_id, client_id, client_name, counterparty_id, counterparty_name,
        transaction_datetime, amount, currency, transaction_type, direction,
        payment_method, country_origin, country_destination, is_reversal,
        free_text, merchant_id, merchant_category. The frame is empty (same
        columns) when no transaction is generated.
    """
    EXPECTED_COLS = [
        "transaction_id","client_id","client_name","counterparty_id","counterparty_name",
        "transaction_datetime","amount","currency","transaction_type","direction",
        "payment_method","country_origin","country_destination","is_reversal",
        "free_text","merchant_id","merchant_category"
    ]

    # Both RNG families are used below, so both are seeded.
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    if end_date is None:
        end_date = datetime.utcnow()

    # (first instant, last second) of every month in the window
    month_starts = _dt_month_starts(end_date, months_total)
    month_ranges = [
        (ms, (ms + pd.DateOffset(months=1)).to_pydatetime() - timedelta(seconds=1))
        for ms in month_starts
    ]

    # ---- Clients (phone id + home country + name) ----
    clients: List[str] = []
    client_country: Dict[str, str] = {}
    client_name: Dict[str, str] = {}
    for _ in range(n_clients):
        phone, iso2 = _rand_eu_phone_and_country()
        while phone in client_country:  # redraw on the rare id collision
            phone, iso2 = _rand_eu_phone_and_country()
        clients.append(phone)
        client_country[phone] = iso2
        client_name[phone] = _rand_name()

    # ---- Anomaly groups (random subsets of clients) ----
    def _pick_subset(pct: float) -> set:
        """Sample ~pct of the clients: at least one, never more than all of them."""
        if not clients:
            return set()
        k = min(len(clients), max(1, int(pct * len(clients))))
        return set(random.sample(clients, k))

    # A disabled toggle consumes no random numbers, exactly as before.
    g_struct = _pick_subset(pct_struct_clients)        if trigger_struct       else set()
    g_rapid  = _pick_subset(pct_rapid_clients)         if trigger_rapid        else set()
    g_crypto = _pick_subset(pct_crypto_clients)        if trigger_crypto       else set()
    g_mto    = _pick_subset(pct_mto_clients)           if trigger_mto          else set()
    g_intl   = _pick_subset(pct_international_clients) if trigger_pct_domestic else set()

    # ---- Per-client parameters (constant over time) ----
    # Poisson rate of baseline transactions per active month, floored at 1
    client_lambda = {
        cid: max(1.0, np.random.normal(avg_txns_per_active_month, 0.6)) for cid in clients
    }
    # Scale of "normal" amounts (lognormal around base_amount_scale, floored at 40)
    client_amount_scale = {
        cid: max(40.0, np.random.lognormal(mean=np.log(base_amount_scale), sigma=0.5)) for cid in clients
    }
    # Probability that a baseline transaction is domestic:
    # about 0.40 for the international group, about 0.90 for everyone else
    client_p_domestic = {}
    for cid in clients:
        if cid in g_intl:
            p = float(np.clip(np.random.normal(0.40, 0.05), 0.15, 0.60))
        else:
            p = float(np.clip(np.random.normal(0.90, 0.03), 0.75, 0.98))
        client_p_domestic[cid] = p

    # Hub counterparties for the many-to-one pattern
    n_hubs = max(2, int(0.02 * n_clients))
    hub_ids = [f"EXT_HUB_{i}" for i in range(1, n_hubs + 1)]
    hub_names = {hid: f"HubPay {i}" for i, hid in enumerate(hub_ids, start=1)}

    active_month_count = min(active_months_per_client, months_total)
    _tid = count(1)  # sequential transaction ids across the whole dataset

    def _make_row(
        common: Dict[str, str],
        *,
        counterparty_id: str,
        counterparty_name: str,
        transaction_datetime: datetime,
        amount: float,
        transaction_type: str,
        direction: str,
        payment_method: str,
        country_destination: str,
        free_text: str,
        merchant_category: str,
        merchant_id: Optional[str] = None,
        is_reversal: bool = False,
    ) -> dict:
        """Assemble one transaction record and assign it the next transaction id."""
        return {
            "transaction_id": f"TX{next(_tid):010d}",
            "client_id": common["client_id"],
            "client_name": common["client_name"],
            "counterparty_id": counterparty_id,
            "counterparty_name": counterparty_name,
            "transaction_datetime": transaction_datetime,
            "amount": amount,
            "currency": common["currency"],
            "transaction_type": transaction_type,
            "direction": direction,
            "payment_method": payment_method,
            "country_origin": common["country_origin"],
            "country_destination": country_destination,
            "is_reversal": is_reversal,
            "free_text": free_text,
            "merchant_id": merchant_id,
            "merchant_category": merchant_category,
        }

    rows = []

    # NOTE: the order of every random call below is kept identical to the
    # previous implementation so that a given seed reproduces the same dataset.
    for cid in clients:
        cname = client_name[cid]
        home_iso = client_country[cid]
        lam = client_lambda[cid]
        scale = client_amount_scale[cid]
        p_dom = client_p_domestic[cid]
        active_idx = set(random.sample(range(months_total), active_month_count))
        hub_choice = random.choice(hub_ids)
        foreign_isos = [c for c in EU_COUNTRIES if c != home_iso]
        common = {
            "client_id": cid,
            "client_name": cname,
            "currency": currency,
            "country_origin": home_iso,
        }

        for idx, (m_start, m_end) in enumerate(month_ranges):
            if idx not in active_idx:
                continue  # inactive month => no transactions at all

            # --- Baseline monthly traffic ---
            span_seconds = int((m_end - m_start).total_seconds())
            n_tx = max(1, np.random.poisson(lam))
            for _ in range(n_tx):
                ts = m_start + timedelta(seconds=random.randint(0, span_seconds))

                # Domestic vs. international destination
                is_domestic = (np.random.rand() < p_dom)
                if is_domestic:
                    dest_iso = home_iso
                else:
                    dest_iso = random.choice(foreign_isos) if foreign_isos else home_iso

                # Counterparty: usually another client, otherwise an external party.
                # With a single client there is no "other client" to pick, so fall
                # through to an external counterparty instead of looping forever.
                if random.random() < 0.70 and len(clients) > 1:
                    cpid = random.choice(clients)
                    while cpid == cid:
                        cpid = random.choice(clients)
                    cpname = client_name[cpid]
                elif random.random() < 0.6:
                    # External party identified by a phone number
                    cp_iso = random.choice(EU_COUNTRIES)
                    cpid = _rand_phone_for_country(cp_iso)
                    cpname = cpid
                else:
                    # External party identified by an EXT id and a name
                    cpid = f"EXT{random.randint(1000,9999)}"
                    cpname = _rand_name()

                # NOTE(review): direction is drawn independently of transaction_type,
                # so e.g. a "transfer_in" can be labelled "outflow". Left unchanged
                # because the downstream features are calibrated on this data.
                direction = random.choice(["inflow","outflow"])
                transaction_type = random.choices(
                    ["payment","transfer_in","transfer_out"],
                    weights=[0.55, 0.20, 0.25],
                    k=1
                )[0]

                payment_method = random.choice(PAYMENT_METHODS[:-1])  # no crypto in baseline
                merchant_id = None
                merchant_category = random.choice(MERCHANT_CATS[:-1])

                amount = max(5.0, np.random.gamma(shape=2.0, scale=scale/5.0))

                if transaction_type == "payment":
                    merchant_id = f"M{random.randint(1000,9999)}"
                    # Second category draw kept to preserve the seeded random stream
                    merchant_category = random.choice(MERCHANT_CATS[:-1])

                rows.append(_make_row(
                    common,
                    counterparty_id=cpid,
                    counterparty_name=cpname,
                    transaction_datetime=ts,
                    amount=round(float(amount), 2),
                    transaction_type=transaction_type,
                    direction=direction,
                    payment_method=payment_method,
                    country_destination=dest_iso,
                    is_reversal=(np.random.rand() < reversal_rate_low),
                    free_text=random.choice(FREE_TEXT) if transaction_type in ("payment","transfer_out") else "",
                    merchant_id=merchant_id,
                    merchant_category=merchant_category,
                ))

            # ----- Injected anomalies (none of them is ever a reversal) -----

            # 1) Structuring: several inflows just below the reporting threshold
            if cid in g_struct:
                n_struct = random.randint(3, 6)
                base_day = random.randint(5, 24)
                for i in range(n_struct):
                    # Day offset capped at 27 so the timestamp stays inside any month
                    ts = m_start + timedelta(days=min(base_day + i, 27),
                                             hours=random.randint(9, 19),
                                             minutes=random.randint(0, 59))
                    amt = np.clip(
                        np.random.normal(loc=reporting_threshold * 0.97, scale=reporting_threshold * 0.012),
                        reporting_threshold * 0.80,
                        reporting_threshold * 0.99
                    )
                    rows.append(_make_row(
                        common,
                        counterparty_id=f"EXT{random.randint(5000,9999)}",
                        counterparty_name=_rand_phone_for_country(home_iso),
                        transaction_datetime=ts,
                        amount=round(float(amt), 2),
                        transaction_type="transfer_in",
                        direction="inflow",
                        payment_method="bank_transfer",
                        country_destination=home_iso,
                        free_text="Deposit",
                        merchant_category="Utilities",
                    ))

            # 2) Rapid load & immediate spend: big inflow, then an outflow shortly after
            if cid in g_rapid:
                day = random.randint(6, 26)
                ts_in = m_start + timedelta(days=day, hours=10, minutes=random.randint(0, 40))
                load_amt = float(np.random.lognormal(mean=np.log(8000), sigma=0.35))
                rows.append(_make_row(
                    common,
                    counterparty_id=f"EXT{random.randint(2000,2999)}",
                    counterparty_name=_rand_phone_for_country(home_iso),
                    transaction_datetime=ts_in,
                    amount=round(load_amt, 2),
                    transaction_type="transfer_in",
                    direction="inflow",
                    payment_method="bank_transfer",
                    country_destination=home_iso,
                    free_text="Top-up",
                    merchant_category="Utilities",
                ))
                ts_out = ts_in + timedelta(minutes=random.randint(10, 180))
                out_amt = round(load_amt * np.random.uniform(0.82, 1.00), 2)
                rows.append(_make_row(
                    common,
                    counterparty_id=f"EXT{random.randint(7000,7999)}",
                    counterparty_name=_rand_phone_for_country(home_iso),
                    transaction_datetime=ts_out,
                    amount=out_amt,
                    transaction_type="transfer_out",
                    direction="outflow",
                    payment_method="wallet_balance",
                    country_destination=home_iso,
                    free_text="Transfer",
                    merchant_category="Utilities",
                ))

            # 3) Crypto usage: purchases (outflow) or sales (inflow) at an exchange
            if cid in g_crypto:
                n_crypto = random.randint(1, 3)
                for _ in range(n_crypto):
                    ts = m_start + timedelta(days=random.randint(2, 26),
                                             hours=random.randint(9, 21),
                                             minutes=random.randint(0, 59))
                    typ = random.choice(["crypto_purchase","crypto_sale"])
                    crypto_direction = "outflow" if typ == "crypto_purchase" else "inflow"
                    amt = float(np.random.lognormal(mean=np.log(1500), sigma=0.5))
                    rows.append(_make_row(
                        common,
                        counterparty_id=f"EXT_EXCH_{random.randint(1,6)}",
                        counterparty_name="CryptoXchange",
                        transaction_datetime=ts,
                        amount=round(amt, 2),
                        transaction_type=typ,
                        direction=crypto_direction,
                        payment_method="crypto_wallet",
                        country_destination=home_iso,
                        free_text="Crypto",
                        merchant_id=f"M{random.randint(9000,9999)}",
                        merchant_category="Crypto Exchange",
                    ))

            # 4) Many-to-one: one outflow per month to this client's fixed hub
            if cid in g_mto:
                ts = m_start + timedelta(days=random.randint(6, 27),
                                         hours=random.randint(9, 21),
                                         minutes=random.randint(0, 59))
                rows.append(_make_row(
                    common,
                    counterparty_id=hub_choice,
                    counterparty_name=hub_names[hub_choice],
                    transaction_datetime=ts,
                    amount=round(float(np.random.lognormal(mean=np.log(600), sigma=0.6)), 2),
                    transaction_type="transfer_out",
                    direction="outflow",
                    payment_method="wallet_balance",
                    country_destination=home_iso,
                    free_text=random.choice(["Rent","Services","Invoice","Payment"]),
                    merchant_category="Utilities",
                ))

    # ---- DataFrame, dtype coercion and ordering ----
    # Passing the column list keeps the schema even when no row was generated
    # (previously an empty result raised KeyError on the sort below).
    df = pd.DataFrame(rows, columns=EXPECTED_COLS)
    df = df.sort_values("transaction_datetime").reset_index(drop=True)

    str_cols = ["transaction_id","client_id","client_name","counterparty_id","counterparty_name",
                "currency","transaction_type","direction","payment_method","country_origin",
                "country_destination","free_text","merchant_category"]
    for c in str_cols:
        df[c] = df[c].astype("object")
    df["transaction_datetime"] = pd.to_datetime(df["transaction_datetime"])
    df["amount"] = df["amount"].astype(float)
    df["is_reversal"] = df["is_reversal"].astype(bool)
    df["merchant_id"] = df["merchant_id"].astype("object")

    # The second sort is retained on purpose: the default sort is not stable, so
    # removing it could reorder rows that share a timestamp relative to earlier runs.
    df = df[EXPECTED_COLS].sort_values("transaction_datetime").reset_index(drop=True)
    return df


In [19]:
# 2) DEMO dataset: inject anomalies into a small share of clients
#    (1-5% per anomaly type with the values below; the international group is disabled)
df_trx = generate_fake_transactions(
    n_clients=1000,
    avg_txns_per_active_month=4,
    active_months_per_client=18,
    months_total=18,
    end_date= datetime(2025, 5, 1),
    trigger_struct=True,   pct_struct_clients=0.02,
    trigger_rapid=True,    pct_rapid_clients=0.01,
    trigger_crypto=True,   pct_crypto_clients=0.05,
    trigger_mto=True,      pct_mto_clients=0.05,
    trigger_pct_domestic=False, #pct_international_clients=0.12
)


### 2. Aggregate Features Generation

In [7]:
import pandas as pd
import numpy as np
from pandas import Timedelta

def aggregate_customer_monthly(
    df_tx: pd.DataFrame,
    # ---- Definitions / thresholds ----
    reporting_threshold: float = 1000.0,
    struct_band_low: float = 0.80,   # structuring band: [low, high) * reporting_threshold
    struct_band_high: float = 1.00,
    rapid_window_hours: int = 3,
    mto_direction: str = "outflow",  # "outflow"; any other value means all directions
    lookback_months: int = 6,        # history length for act_spike and rev_ratio

    # ---- Triggering knobs (post-aggregation calibration) ----
    # Multiplicative amplification (>= 0). 1.0 = unchanged
    amplify_struct: float = 1.0,
    amplify_rapid: float = 1.0,
    amplify_crypto: float = 1.0,
    amplify_mto: float = 1.0,

    # Linear blend towards a target: new = (1-alpha)*orig + alpha*target, alpha in [0, 1]
    blend_struct_to: Optional[float] = None,   blend_struct_alpha: float = 0.0,
    blend_rapid_to: Optional[float] = None,    blend_rapid_alpha: float = 0.0,
    blend_crypto_to: Optional[float] = None,   blend_crypto_alpha: float = 0.0,
    blend_mto_to: Optional[float] = None,      blend_mto_alpha: float = 0.0,
    blend_pct_dom_to: Optional[float] = None,  blend_pct_dom_alpha: float = 0.0,

    # By default act_spike and rev_ratio are neutralised (forced to 1.0)
    blend_act_spike_to: float = 1.0, blend_act_spike_alpha: float = 1.0,
    blend_rev_ratio_to: float = 1.0, blend_rev_ratio_alpha: float = 1.0,
) -> pd.DataFrame:
    """Aggregate transactions to one row per client and calendar month.

    Features (before calibration):
        struct_score: inflows with amount in the structuring band / all inflows.
        rapid_spend: inflows followed by an outflow of the same client-month
            within ``rapid_window_hours`` / all transactions.
        crypto_score: transactions whose merchant category is "crypto exchange"
            (case-insensitive) / all transactions.
        mto_score: share of the most frequent counterparty among the outflows
            (or among all transactions when ``mto_direction`` is not "outflow").
        act_spike: transaction count / mean count of the previous
            ``lookback_months`` rows of that client. Months without any
            transaction have no row, so they are skipped rather than counted as 0.
            Without history the ratio is 1.0.
        pct_domestic: transactions with origin == destination / all transactions.
        rev_ratio: reversal count / mean reversal count over the same history;
            0.0 when that baseline is not positive.
        total_tx_amount, avg_tx_amount: sum and mean of ``amount``.

    Calibration: the four scores are multiplied by their ``amplify_*`` factor
    and clipped to [0, 1], then optionally blended towards ``blend_*_to``.
    ``pct_domestic`` is only blended. ``act_spike`` and ``rev_ratio`` are
    blended without clipping; with the defaults they become exactly 1.0.

    Args:
        df_tx: Transactions with at least client_id, client_name,
            transaction_datetime, amount, direction, counterparty_id,
            merchant_category, country_origin, country_destination and a
            boolean is_reversal column. It is not modified.

    Returns:
        DataFrame with exactly: client_id, client_name, year_month (datetime),
        year_month_str ("YYYY-MM"), struct_score, rapid_spend, crypto_score,
        mto_score, act_spike, pct_domestic, rev_ratio, total_tx_amount,
        avg_tx_amount (all floats), ordered by client and month.
    """
    df = df_tx.copy()

    # --- Time keys ---
    df["transaction_datetime"] = pd.to_datetime(df["transaction_datetime"])
    df["year_month"] = df["transaction_datetime"].dt.to_period("M").dt.to_timestamp()
    df["year_month_str"] = df["year_month"].dt.strftime("%Y-%m")

    by_cm = ["client_id", "year_month"]
    by_cm_name = ["client_id", "client_name", "year_month", "year_month_str"]

    # === Base metrics per client-month ===
    txn_counts = df.groupby(by_cm).size().rename("txn_count")
    month_index = txn_counts.index  # sorted (client_id, year_month); every feature is aligned to it

    def _on_month_index(s: pd.Series) -> pd.Series:
        """Align a per-client-month series to every client-month, missing -> 0.0."""
        return s.reindex(month_index).fillna(0.0)

    is_inflow = df["direction"].eq("inflow")
    inflow_counts = df[is_inflow].groupby(by_cm).size()
    total_tx_amount = df.groupby(by_cm)["amount"].sum()
    avg_tx_amount = df.groupby(by_cm)["amount"].mean()

    # === 1) Structuring score ===
    low = reporting_threshold * struct_band_low
    high = reporting_threshold * struct_band_high
    in_band = is_inflow & (df["amount"] >= low) & (df["amount"] < high)
    struct_score = _on_month_index(df[in_band].groupby(by_cm).size() / inflow_counts)

    # === 2) Rapid load & spend ===
    def _rapid_count_group(g: pd.DataFrame) -> int:
        """Count inflows that have an outflow at or after them within the window."""
        infl = (g.loc[g["direction"].eq("inflow"), ["transaction_datetime"]]
                .rename(columns={"transaction_datetime": "t_in"})
                .sort_values("t_in"))
        outf = (g.loc[g["direction"].eq("outflow"), ["transaction_datetime"]]
                .rename(columns={"transaction_datetime": "t_out"})
                .sort_values("t_out"))
        if infl.empty or outf.empty:
            return 0
        matched = pd.merge_asof(
            infl, outf,
            left_on="t_in", right_on="t_out",
            direction="forward",
            tolerance=pd.Timedelta(hours=rapid_window_hours),
        )
        return int(matched["t_out"].notna().sum())

    # Only the columns the helper needs are selected: applying on the whole frame
    # (grouping columns included) is deprecated in recent pandas and emits a warning.
    rapid_counts = (
        df.groupby(by_cm, sort=False)[["transaction_datetime", "direction"]]
        .apply(_rapid_count_group)
    )
    rapid_spend = _on_month_index(rapid_counts / txn_counts)

    # === 3) Crypto score ===
    crypto_mask = df["merchant_category"].str.lower().eq("crypto exchange")
    crypto_score = _on_month_index(df[crypto_mask].groupby(by_cm).size() / txn_counts)

    # === 4) Many-to-one score ===
    mto_source = df[df["direction"].eq("outflow")] if mto_direction == "outflow" else df
    mto_score = _on_month_index(
        mto_source.groupby(by_cm)["counterparty_id"]
        .apply(lambda s: s.value_counts(normalize=True).max() if len(s) else 0.0)
    )

    # === 5) / 7) Ratios against the client's own recent history ===
    def _ratio_to_trailing_mean(monthly: pd.Series) -> pd.Series:
        """Divide each month by the mean of the client's previous ``lookback_months`` rows.

        Relies on ``monthly`` being ordered by month within each client, which
        holds for anything aligned to ``month_index``. Replaces a per-row Python
        loop that rebuilt the client index on every iteration.
        """
        monthly = monthly.astype(float)
        if lookback_months > 0:
            trailing = monthly.groupby(level=0).transform(
                lambda s: s.shift(1).rolling(lookback_months, min_periods=1).mean()
            )
        else:
            # No lookback window: every month is treated as having no history
            trailing = pd.Series(np.nan, index=monthly.index)
        # Without history the month is compared with itself (ratio 1.0 if positive)
        baseline = trailing.fillna(monthly)
        return (monthly / baseline).where(baseline > 0, 0.0)

    act_spike = _ratio_to_trailing_mean(txn_counts)

    monthly_reversals = _on_month_index(df[df["is_reversal"]].groupby(by_cm).size())
    rev_ratio = _ratio_to_trailing_mean(monthly_reversals)

    # === 6) Domestic share ===
    same_country = df["country_origin"].eq(df["country_destination"])
    pct_domestic = _on_month_index(df[same_country].groupby(by_cm).size() / txn_counts)

    # === Combine ===
    features = pd.DataFrame(
        {
            "struct_score": struct_score,
            "rapid_spend": rapid_spend,
            "crypto_score": crypto_score,
            "mto_score": mto_score,
            "act_spike": act_spike,
            "pct_domestic": pct_domestic,
            "rev_ratio": rev_ratio,
            "total_tx_amount": total_tx_amount,
            "avg_tx_amount": avg_tx_amount,
        },
        index=month_index,
    ).reset_index()

    # One row per (client_id, client_name, month), in sorted key order
    keys = df.groupby(by_cm_name).size().index.to_frame(index=False)
    agg_df = keys.merge(features, on=by_cm, how="left")

    # ---------- Calibration (triggering knobs) ----------
    def _clamp01(s: pd.Series) -> pd.Series:
        return s.clip(lower=0.0, upper=1.0)

    def _amplify(s: pd.Series, k: float, clamp01: bool = True) -> pd.Series:
        out = s * float(k)
        return _clamp01(out) if clamp01 else out

    def _blend(s: pd.Series, target: Optional[float], alpha: float, clamp01: bool = True) -> pd.Series:
        # No target or a non-positive alpha leaves the series untouched
        if target is None or alpha <= 0.0:
            return s
        out = (1.0 - float(alpha)) * s + float(alpha) * float(target)
        return _clamp01(out) if clamp01 else out

    # Proportion-like scores: clamp to [0, 1] after amplifying / blending
    agg_df["struct_score"] = _blend(_amplify(agg_df["struct_score"], amplify_struct), blend_struct_to, blend_struct_alpha)
    agg_df["rapid_spend"]  = _blend(_amplify(agg_df["rapid_spend"],  amplify_rapid),  blend_rapid_to,  blend_rapid_alpha)
    agg_df["crypto_score"] = _blend(_amplify(agg_df["crypto_score"], amplify_crypto), blend_crypto_to, blend_crypto_alpha)
    agg_df["mto_score"]    = _blend(_amplify(agg_df["mto_score"],    amplify_mto),    blend_mto_to,    blend_mto_alpha)
    agg_df["pct_domestic"] = _blend(agg_df["pct_domestic"], blend_pct_dom_to, blend_pct_dom_alpha)  # blended only

    # Ratios are unbounded, so they are blended without clamping
    agg_df["act_spike"] = _blend(agg_df["act_spike"], blend_act_spike_to, blend_act_spike_alpha, clamp01=False)
    agg_df["rev_ratio"] = _blend(agg_df["rev_ratio"], blend_rev_ratio_to, blend_rev_ratio_alpha, clamp01=False)

    # ---------- Exact columns and dtypes ----------
    EXPECTED_COLS = [
        "client_id","client_name","year_month","year_month_str",
        "struct_score","rapid_spend","crypto_score","mto_score",
        "act_spike","pct_domestic","rev_ratio","total_tx_amount","avg_tx_amount"
    ]
    float_cols = [
        "struct_score","rapid_spend","crypto_score","mto_score",
        "act_spike","pct_domestic","rev_ratio","total_tx_amount","avg_tx_amount"
    ]
    # Explicit copy so the assignments below never write into a view
    agg_df = agg_df[EXPECTED_COLS].copy()
    for c in ("client_id", "client_name", "year_month_str"):
        agg_df[c] = agg_df[c].astype("object")
    agg_df["year_month"] = pd.to_datetime(agg_df["year_month"])
    for c in float_cols:
        agg_df[c] = agg_df[c].astype(float)

    return agg_df


In [8]:
agg_df = aggregate_customer_monthly(
    df_trx,
    amplify_struct=2.8, blend_struct_to=None,      # raises struct_score
    amplify_rapid=2.0,  blend_rapid_to=None,       # raises rapid_spend
    amplify_crypto=3.5, blend_crypto_to=None,      # raises crypto_score
    amplify_mto=2.0,    blend_mto_to=None,         # raises mto_score
    blend_pct_dom_to=0.3, blend_pct_dom_alpha=0.7, # lowers pct_domestic (more international)
    blend_act_spike_to=1.0, blend_act_spike_alpha=1.0,  # keeps it from triggering
    blend_rev_ratio_to=1.0, blend_rev_ratio_alpha=1.0   # keeps it from triggering
)


  rapid_counts = df.groupby(by_cm, sort=False).apply(_rapid_count_group).rename("rapid_count")


### 3. Anomalous KYC

In [9]:
import pandas as pd
import numpy as np
import random
from datetime import date, timedelta
import re

# --- Lightweight EU catalogues and helpers ---
# Two-letter country code -> English country name, one pair per line.
EU_CODE2NAME = dict([
    ("AT", "Austria"),
    ("BE", "Belgium"),
    ("BG", "Bulgaria"),
    ("HR", "Croatia"),
    ("CY", "Cyprus"),
    ("CZ", "Czechia"),
    ("DK", "Denmark"),
    ("EE", "Estonia"),
    ("FI", "Finland"),
    ("FR", "France"),
    ("DE", "Germany"),
    ("GR", "Greece"),
    ("HU", "Hungary"),
    ("IE", "Ireland"),
    ("IT", "Italy"),
    ("LV", "Latvia"),
    ("LT", "Lithuania"),
    ("LU", "Luxembourg"),
    ("MT", "Malta"),
    ("NL", "Netherlands"),
    ("PL", "Poland"),
    ("PT", "Portugal"),
    ("RO", "Romania"),
    ("SK", "Slovakia"),
    ("SI", "Slovenia"),
    ("ES", "Spain"),
    ("SE", "Sweden"),
])
# The codes alone, in the same order as the mapping above.
EU_COUNTRIES = list(EU_CODE2NAME)

def _rand_eu_birth_and_citizenships() -> tuple[str, list[str]]:
    """Draw a random EU country of birth plus one or two citizenships.

    Returns:
        A ``(cob, ctz)`` pair. ``cob`` is a code taken from ``EU_COUNTRIES``.
        ``ctz`` is a list of country names looked up in ``EU_CODE2NAME``: the
        first entry is always the birth country, and when the uniform draw is
        >= 0.7 a second, different country is appended.
    """
    cob = random.choice(EU_COUNTRIES)
    ctz = [EU_CODE2NAME[cob]]
    if random.random() >= 0.7:
        # Exclude the birth country so the same citizenship is never listed
        # twice (e.g. "France, France").
        others = [code for code in EU_COUNTRIES if code != cob]
        if others:
            ctz.append(EU_CODE2NAME[random.choice(others)])
    return cob, ctz

def _safe_email_from_client_id(cid: str) -> str:
    # usa el teléfono como base
    base = re.sub(r"[^0-9+]", "", str(cid))
    return f"{base.replace('+','plus')}@example.com"

def _stable_bucket(key: str, modulo: int) -> int:
    """Map ``key`` to a bucket in ``[0, modulo)`` that is stable across processes.

    The built-in ``hash()`` of a ``str`` is salted per interpreter process
    (see ``PYTHONHASHSEED``), so it cannot produce reproducible flags; a
    SHA-256 digest of the UTF-8 bytes is used instead.
    """
    import hashlib  # local import keeps this block self-contained

    digest = hashlib.sha256(key.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big") % modulo


def generate_wallets_kyc_from_transactions(df_tx: pd.DataFrame) -> pd.DataFrame:
    """
    Build one synthetic KYC record for every client present in ``df_tx``.

    - Keeps the existing ``client_id``; keeps ``client_name`` unless it is
      null/blank, in which case the id is used as the name.
    - Derives affinity / tolerance flags from the observed transactions.
    - Casts every output column to an explicit dtype. The original author's
      note says the field list targets the 'demo_digital_wallets_customers'
      dataset; that schema is not visible here, so treat it as an assumption.

    Parameters
    ----------
    df_tx : pd.DataFrame
        Transactions. Columns read: ``client_id``, ``client_name``,
        ``transaction_datetime``, ``country_origin``, ``country_destination``,
        ``merchant_category``, ``direction`` and ``amount``. Not modified.

    Returns
    -------
    pd.DataFrame
        One row per distinct ``client_id``, ordered by ``client_id``.

    Notes
    -----
    - Re-seeds the global ``random`` and ``np.random`` generators with 42 on
      every call (a side effect visible to the caller).
    - All dates are offsets from ``date.today()``, so date columns shift with
      the day the function is run.
    - PEP / sanctioned flags come from a stable digest of the client id, so
      they are identical across kernel restarts.
    """
    # Seed both RNGs so the random draws below repeat on every call.
    random.seed(42)
    np.random.seed(42)

    # Work on a copy with a normalised timestamp column.
    df = df_tx.copy()
    df["transaction_datetime"] = pd.to_datetime(df["transaction_datetime"])

    # Exactly one row per client_id: de-duplicating on the id alone avoids
    # emitting several KYC rows when the same id appears with different names
    # (the first name seen is kept).
    clients = (
        df[["client_id", "client_name"]]
        .drop_duplicates(subset="client_id")
        .sort_values("client_id")
        .reset_index(drop=True)
    )

    # Per-client derivations (from transactions)
    by_c = ["client_id"]
    txn_counts_c = df.groupby(by_c).size().rename("txn_count")

    # Share of international transactions (feeds intl_tolerance and
    # high_risk_exposure). Clients with no domestic transaction get 0.0.
    pct_domestic_c = (
        (df[df["country_origin"].eq(df["country_destination"])]
           .groupby(by_c).size()
           .div(txn_counts_c))
        .reindex(txn_counts_c.index).fillna(0.0)
    )
    pct_international_c = 1.0 - pct_domestic_c

    # Crypto affinity: at least one transaction whose merchant_category is
    # "Crypto Exchange" (case-insensitive).
    crypto_aff_c = (
        df[df["merchant_category"].str.lower().eq("crypto exchange")]
        .groupby(by_c).size()
        .reindex(txn_counts_c.index).fillna(0) > 0
    )

    # Rapid spend: an inflow followed by an outflow within 3 hours, evaluated
    # over the client's whole history (not per month).
    def _rapid_any(g: pd.DataFrame) -> bool:
        g = g.sort_values("transaction_datetime")
        infl = g[g["direction"].eq("inflow")]["transaction_datetime"].to_numpy()
        outf = g[g["direction"].eq("outflow")]["transaction_datetime"].to_numpy()
        if infl.size == 0 or outf.size == 0:
            return False
        j = 0
        for t_in in infl:
            # Advance to the first outflow strictly after this inflow; j never
            # moves backwards because both arrays are time-sorted.
            while j < len(outf) and outf[j] <= t_in:
                j += 1
            if j < len(outf):
                dt = (outf[j] - t_in) / np.timedelta64(1, "s")
                if 0 < dt <= 3*3600:
                    return True
        return False

    # Select only the columns the helper reads, so apply() does not operate on
    # the grouping column (which newer pandas versions warn about).
    rapid_any_c = (
        df.groupby("client_id", sort=False)[["direction", "transaction_datetime"]]
          .apply(_rapid_any)
          .reindex(txn_counts_c.index)
          .fillna(False)
    )

    # Structuring-like behaviour: inflows just below the 1000 threshold,
    # i.e. in the half-open range [800, 1000).
    reporting_threshold = 1000.0
    low, high = reporting_threshold*0.80, reporting_threshold
    struct_hits_c = (
        df[(df["direction"].eq("inflow")) &
           (df["amount"].between(low, high, inclusive="left"))]
        .groupby(by_c).size()
        .reindex(txn_counts_c.index).fillna(0)
    )
    struct_any_c = struct_hits_c > 0

    # Country of residence: most frequent country_origin as a proxy,
    # defaulting to "FR" when no mode is available.
    def _most_frequent_or_default(s: pd.Series) -> str:
        modes = s.mode()
        return modes.iat[0] if len(modes) else "FR"

    home_code_c = (
        df.groupby("client_id")["country_origin"]
          .agg(_most_frequent_or_default)
          .reindex(txn_counts_c.index)
    ).fillna("FR")
    home_name_c = home_code_c.map(lambda c: EU_CODE2NAME.get(c, c))

    rows = []
    today = date.today()

    # NOTE: the order of the random draws inside this loop determines the
    # generated values for a given seed; keep it stable when editing.
    for _, r in clients.iterrows():
        cid = r["client_id"]
        cname = r["client_name"] if pd.notnull(r["client_name"]) and str(r["client_name"]).strip() else cid

        # Demographics
        age = random.randint(25, 70)
        dob = today - timedelta(days=int(age * 365.25))
        cob, ctz_list = _rand_eu_birth_and_citizenships()

        # Behaviour-derived flags
        crypto_affinity = bool(crypto_aff_c.loc[cid])
        intl_tol = bool(pct_international_c.loc[cid] > 0.20)   # tolerant when >20% international
        rapid_tol = bool(rapid_any_c.loc[cid])
        struct_any = bool(struct_any_c.loc[cid])

        # High-risk exposure (demo rule): international activity, or
        # sanctioned (set further down).
        high_risk_exposure = intl_tol

        # Limits: a structuring-like client gets a daily top-up limit below 1000.
        daily_limit = 900.0 if struct_any else 1500.0
        monthly_limit = max(20000.0, daily_limit * random.randint(22, 40))

        # Low-prevalence PEP / sanctioned flags (~2% / ~0.5%), derived from a
        # process-independent digest so they are truly deterministic.
        pep = (_stable_bucket(str(cid), 50) == 1)
        sanctioned = (_stable_bucket("S" + str(cid), 200) == 3)
        if sanctioned:
            high_risk_exposure = True

        # Risk rating (1-5): random base around 2-3, +1 when any signal fires.
        base = int(np.clip(round(np.random.normal(2.5, 0.7)), 1, 5))
        bump = int(crypto_affinity) + int(intl_tol) + int(rapid_tol) + int(struct_any)
        risk = min(5, base + (1 if bump >= 1 else 0))
        if pep or sanctioned:
            risk = max(risk, 4)

        # KYC dates
        eff_date = today - timedelta(days=random.randint(400, 2000))
        last_review = today - timedelta(days=random.randint(30, 240))

        home_code = str(home_code_c.loc[cid])
        home_name = str(home_name_c.loc[cid])

        row = {
            # --- Identity ---
            "client_id": str(cid),
            "client_name": str(cname),

            # --- Demographics / citizenship ---
            "date_of_birth": dob,
            "country_of_birth": cob,                                  # two-letter code
            "citizenship_countries": ", ".join(ctz_list),            # country names

            # --- Residence / contact ---
            "country_of_residence_code": home_code,                  # two-letter code
            "country_of_residence": home_name,                       # country name
            "address": f"{home_name} - {random.randint(10,9999)} Main St",
            "phone_number": str(cid),                                # same as client_id (consistency)
            "email": _safe_email_from_client_id(cid),

            # --- Economic profile ---
            "occupation": random.choice(["Entrepreneur","Consultant","Analyst","Merchant","Self-employed","IT Professional"]),
            "primary_source_of_income": random.choice(["Salary","Business income","Investments","Services"]),
            "estimated_annual_income_eur": float(np.round(np.random.lognormal(np.log(120_000), 0.6), 2)),

            # --- AML classification ---
            "risk_rating": int(risk),
            "is_pep": bool(pep),
            "is_sanctioned": bool(sanctioned),
            "high_risk_country_exposure": bool(high_risk_exposure),

            # --- Wallet limits ---
            "daily_wallet_topup_limit_eur": float(np.round(daily_limit, 2)),
            "monthly_wallet_limit_eur": float(np.round(monthly_limit, 2)),

            # --- Operational affinities / tolerances ---
            "crypto_affinity": bool(crypto_affinity),
            "intl_transfer_tolerance": bool(intl_tol),
            "rapid_spend_tolerance": bool(rapid_tol),

            # --- Metadata ---
            "customer_effective_date": eff_date,
            "kyc_last_review_date": last_review,
            "segment_type": "PERS",
            "segment_type_description": "Personal – Digital Wallet",
            "sars_flag": False,
        }
        rows.append(row)

    df_kyc = pd.DataFrame(rows)

    # --- Exact dtypes ---
    str_cols = [
        "client_id","client_name","country_of_birth","citizenship_countries",
        "country_of_residence_code","country_of_residence","address",
        "phone_number","email","occupation","primary_source_of_income",
        "segment_type","segment_type_description"
    ]
    for c in str_cols:
        df_kyc[c] = df_kyc[c].astype("object")

    bool_cols = ["is_pep","is_sanctioned","high_risk_country_exposure",
                 "crypto_affinity","intl_transfer_tolerance","rapid_spend_tolerance","sars_flag"]
    for c in bool_cols:
        df_kyc[c] = df_kyc[c].astype(bool)

    # Dates are kept as plain `date` objects (no time component). The original
    # note says the catalogue declares them as TIMESTAMP - not verifiable here.
    for c in ["date_of_birth", "customer_effective_date", "kyc_last_review_date"]:
        df_kyc[c] = pd.to_datetime(df_kyc[c]).dt.date

    num_cols_float = ["estimated_annual_income_eur","daily_wallet_topup_limit_eur","monthly_wallet_limit_eur"]
    for c in num_cols_float:
        df_kyc[c] = pd.to_numeric(df_kyc[c], errors="coerce").astype(float)

    df_kyc["risk_rating"] = pd.to_numeric(df_kyc["risk_rating"], errors="coerce").astype("int64")

    return df_kyc


In [10]:
df_kyc_anomalos = generate_wallets_kyc_from_transactions(df_trx)

  .apply(_rapid_any)


# Write

### 1. Transactions Dataset

In [None]:
# Convert the pandas transactions frame (df_trx) to a Spark DataFrame and
# write it under the transactions dataset identifier in the PUBLIC data
# environment.
# NOTE(review): `dataset_functions` and `transactions_dataset` are not imported
# in the cells visible here - confirm an earlier cell brings them into scope.
dataset_functions.write(
    context,
    context.get_spark_session().createDataFrame(df_trx),
    transactions_dataset().identifier,
    data_environment=DataEnvironment.PUBLIC)

# Publish the same dataset identifier in the PUBLIC data environment.
dataset_functions.publish(context, 
                          transactions_dataset().identifier,
                          data_environment=DataEnvironment.PUBLIC)

### 2. Aggregate Features Dataset

In [None]:
# Convert the monthly aggregate frame (agg_df) to a Spark DataFrame and write
# it under the customer-monthly dataset identifier in the PUBLIC data
# environment.
# NOTE(review): `customer_monthly_dataset` is not imported in the cells visible
# here - confirm an earlier cell brings it into scope.
dataset_functions.write(context, 
                        context.get_spark_session().createDataFrame(agg_df), 
                        customer_monthly_dataset().identifier,
                        data_environment=DataEnvironment.PUBLIC)



# Publish the same dataset identifier in the PUBLIC data environment.
dataset_functions.publish(context, 
                          customer_monthly_dataset().identifier,
                          data_environment=DataEnvironment.PUBLIC)

### 3. Anomalous KYC Dataset

In [None]:
# Convert the generated KYC frame (df_kyc_anomalos) to a Spark DataFrame and
# write it under the customers dataset identifier in the PUBLIC data
# environment.
# NOTE(review): `customers_dataset` is not imported in the cells visible here -
# confirm an earlier cell brings it into scope.
dataset_functions.write(context, 
                        context.get_spark_session().createDataFrame(df_kyc_anomalos),
                        customers_dataset().identifier,
                        data_environment=DataEnvironment.PUBLIC)

# Publish the same dataset identifier in the PUBLIC data environment.
dataset_functions.publish(context, 
                          customers_dataset().identifier,
                          data_environment=DataEnvironment.PUBLIC)

In [None]:
# Close the context created by init_context() in the first cell; run this last,
# after all writes/publishes above have finished.
context.close()