### Init Context

In [1]:
from thetaray.api.context import init_context
import datetime
import yaml

import logging
# Configure the root logger at DEBUG level with a message-only format.
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

# Read the Spark settings from the solution's YAML file and select the 'spark_config_a' entry.
# NOTE(review): yaml.load with FullLoader can build more than plain data structures; if this
# file could ever come from an untrusted source, yaml.safe_load is the safer call.
with open('/thetaray/git/solutions/domains/demo_remittance/config/spark_config.yaml') as spark_config_file:
    spark_config = yaml.load(spark_config_file, yaml.FullLoader)['spark_config_a']
# Create the platform context on a local Spark master with a fixed execution date (1970-02-01).
# `datetime` must be the *module* here; a later cell runs `from datetime import datetime`,
# which rebinds the same name to the class, so this cell relies on its own `import datetime`.
context = init_context(execution_date=datetime.datetime(1970, 2, 1),
                       spark_conf=spark_config,
                       spark_master='local[*]',
                      allow_type_changes=True, 
                      delete_unused_columns=True)

2025-07-19 14:17:59,017:INFO:thetaray.common.logging:start loading solution.....[ load_risks=True , solution_path=/thetaray/git/solutions/domains , settings_path=/thetaray/git/solutions/settings ]
2025-07-19 14:17:59,234:INFO:thetaray.common.logging:load_risks took: 0.08745241165161133
2025-07-19 14:17:59,711:INFO:thetaray.common.logging:=== Started updating schema ===
2025-07-19 14:17:59,916:INFO:thetaray.common.logging:=== Started updating schema on Postgres ===
2025-07-19 14:18:11,456:INFO:thetaray.common.logging:found 115 tables in solution public schema
2025-07-19 14:18:11,459:INFO:thetaray.common.logging:demo_remittance_ef
2025-07-19 14:18:11,466:INFO:thetaray.common.logging:found 115 tables in solution public schema
2025-07-19 14:18:11,473:INFO:thetaray.common.logging:demo_ret_smb_ef
2025-07-19 14:18:11,480:INFO:thetaray.common.logging:found 115 tables in solution public schema
2025-07-19 14:18:11,482:INFO:thetaray.common.logging:party_tr_analysis
2025-07-19 14:18:11,488:INFO:th

### Imports

In [2]:
from thetaray.api.dataset import dataset_functions

from domains.demo_remittance.datasets.customer_monthly import customer_monthly_dataset
from domains.demo_remittance.datasets.customers import customers_dataset
from domains.demo_remittance.datasets.transactions import transactions_dataset

from pyspark.sql import functions as f

In [3]:
import pandas as pd
import numpy as np
import random, uuid
from datetime import datetime
from faker import Faker

# Shared Faker instance used by the generator functions below (names, companies, addresses, ...).
# NOTE(review): no seed is set for `random`, `numpy.random` or Faker in the visible cells, so
# every run produces different data — confirm whether reproducibility is required.
faker = Faker()

# ————————————————————————————————————————————————————————————————
# Reference parameters
# ————————————————————————————————————————————————————————————————
# Two-letter country codes that the generators and the aggregation treat as high risk.
HIGH_RISK_COUNTRIES = ["IR", "KP", "SY", "SD", "VE", "SO", "YE", "CU", "MM", "CF"]
# Two-letter country codes excluded (together with the high-risk ones) when a low/medium-risk
# counterparty country is drawn.
TAX_HEAVEN_COUNTRIES = ["CY", "KY", "BS", "PA", "LU", "CH", "SG", "AE", "BM", "VG"]
MAX_LIMIT = 10000.0          # typical reporting limit (CTR / 17b)
STRUCTURE_BAND = 500.0       # "just below" = within 500 USD under the limit
VELOCITY_WINDOW = 3          # number of days used for the spike calculation

# NOTE(review): ISO_4217 is not referenced by any function in the visible cells (all records use 'USD').
ISO_4217 = ["USD", "EUR", "GBP", "JPY", "CAD", "MXN", "AUD"]
# Channels a transaction can be assigned to.
CHANNELS = ["agent", "branch", "mobile_app", "web"]
# Limit used when generating structuring amounts; currently the same value as MAX_LIMIT, which is
# what the aggregation uses — the two must stay equal for generated structuring to be detected.
STRUCTURE_THRESHOLD = 10000.0

def generate_normal_remittance_transactions(customer_id, customer_name, period, n_transactions=30):
    """
    Build one month of ordinary remittance transactions for a regular customer.

    A small pool of counterparties is drawn first; every transaction then picks
    one counterparty from that pool, so counterparties repeat within the month.

    Args:
        customer_id: Identifier stored on every generated record.
        customer_name: Name stored on every generated record.
        period: First instant of the month (a pandas Timestamp); transaction
            timestamps are offset from it by 0-27 days plus a random time of day.
        n_transactions: Number of transactions to generate.

    Returns:
        A list of dicts, one per transaction.
    """
    in_bank_ids = [f"CUST{i:04d}" for i in range(500, 800)]

    def draw_counterparty():
        # 20% of counterparties are taken from the in-bank id range, the rest are external.
        if random.random() < 0.20:
            cp_id = random.choice(in_bank_ids)
        else:
            cp_id = f"EXT-{uuid.uuid4().hex[:6].upper()}"

        company = faker.company()
        account = f"ACC-CPTY-{uuid.uuid4().hex[:12].upper()}"
        contract = f"CON-{uuid.uuid4().hex[:8].upper()}"

        # 85% of counterparties sit in low/medium-risk countries.
        if random.random() < 0.15:
            country, risk = random.choice(HIGH_RISK_COUNTRIES), 'High'
        else:
            # Redraw until the country is neither high-risk nor a tax haven.
            country = faker.country_code()
            while country in HIGH_RISK_COUNTRIES + TAX_HEAVEN_COUNTRIES:
                country = faker.country_code()
            risk = random.choice(['Low', 'Medium'])

        return {
            'id': cp_id,
            'name': company,
            'account': account,
            'contract': contract,
            'country': country,
            'risk': risk
        }

    # Pool of distinct counterparties for the month (never larger than the transaction count).
    pool_size = min(n_transactions, random.randint(8, 20))
    month_pool = [draw_counterparty() for _ in range(pool_size)]

    def draw_transaction():
        offset = pd.Timedelta(
            days=random.randint(0, 27),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
            seconds=random.randint(0, 59)
        )

        # Exponentially distributed amount, clamped to the 50-5000 range.
        raw_amount = round(float(np.random.exponential(scale=600)), 2)
        amount = max(50, min(raw_amount, 5000))

        channel = random.choice(CHANNELS)
        direction = random.choice(['IN', 'OUT'])
        cp = random.choice(month_pool)

        # The customer side is always 'US'; the counterparty country is the other end.
        if direction == 'OUT':
            origin, destination = 'US', cp['country']
        else:
            origin, destination = cp['country'], 'US'

        return {
            'transaction_id': str(uuid.uuid4()),
            'customer_id': customer_id,
            'customer_name': customer_name,
            'counterparty_id': cp['id'],
            'counterparty_customer_name': cp['name'],
            'counterparty_account': cp['account'],
            'counterparty_contract': cp['contract'],
            'counterparty_country': cp['country'],
            'counterparty_country_risk': cp['risk'],
            'transaction_timestamp': period + offset,
            'channel': channel,
            'amount': amount,
            'currency': 'USD',
            'origin_country_code': origin,
            'destination_country_code': destination,
            'in_out': direction,
        }

    return [draw_transaction() for _ in range(n_transactions)]

def generate_anomalous_remittance_transactions(
    customer_name,
    months=12,
    avg_outgoing_per_month=80,
    avg_structuring_per_month=25,
    avg_velocity_spike_per_month=15,
    avg_multi_party_per_month=25,
    avg_high_risk_volume_per_month=30,
    pct_counterparty_in_bank=0.20,
    start_date=None
):
    """
    Generate anomalous remittance transactions for the customer ANOMR001.

    For each month, Poisson-distributed numbers of transactions are produced for
    six categories: large transfers to high-risk countries, structuring (amounts
    just under the threshold), velocity spikes (bursts within a 3-hour window),
    multi-party activity (a new counterparty per transaction), very large
    high-risk volume, and some ordinary transactions.

    Args:
        customer_name: Name stored on every generated record.
        months: Number of monthly periods, counted backwards from the reference month.
        avg_outgoing_per_month: Poisson mean of large high-risk transfers per month.
        avg_structuring_per_month: Poisson mean of structuring transactions per month.
        avg_velocity_spike_per_month: Poisson mean of velocity-spike transactions per month.
        avg_multi_party_per_month: Poisson mean of multi-party transactions per month.
        avg_high_risk_volume_per_month: Poisson mean of high-risk-volume transactions per month.
        pct_counterparty_in_bank: Probability that a counterparty id comes from the in-bank id range.
        start_date: Optional reference date; its month is the most recent period.
            Defaults to the current month.

    Returns:
        A DataFrame with one row per transaction, sorted by ``transaction_timestamp``.
        When nothing is generated (e.g. ``months=0``) an empty DataFrame with the
        same columns is returned.
    """
    # Column layout of every record; also used to build a well-formed empty result.
    columns = [
        "transaction_id", "customer_id", "customer_name", "counterparty_id",
        "counterparty_customer_name", "counterparty_account", "counterparty_contract",
        "counterparty_country", "counterparty_country_risk", "transaction_timestamp",
        "channel", "amount", "currency", "origin_country_code",
        "destination_country_code", "in_out",
    ]

    # pd.Timestamp.today() is used rather than datetime.today(): the notebook binds the name
    # `datetime` to the module in one cell and to the class in another, so relying on it makes
    # this function depend on which cell ran last.
    anchor = pd.Timestamp(pd.to_datetime(start_date)) if start_date else pd.Timestamp.today()
    first_of_month = anchor.normalize().replace(day=1)

    periods = [first_of_month - pd.DateOffset(months=i) for i in range(months)]
    customer_id = "ANOMR001"
    # NOTE(review): these in-bank ids (CUST0500-CUST0799) are generated independently of the
    # customers actually created by the dataset builder — confirm the ranges are meant to differ.
    customers_pool = [f"CUST{i:04d}" for i in range(500, 800)]

    records = []

    def random_timestamp(period):
        # Random instant within the first 28 days of the month, so it is valid for every month.
        return period + pd.Timedelta(
            days=random.randint(0, 27),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
            seconds=random.randint(0, 59)
        )

    def pick_counterparty_id():
        # In-bank id with probability pct_counterparty_in_bank, otherwise a fresh external id.
        if random.random() < pct_counterparty_in_bank:
            return random.choice(customers_pool)
        return f"EXT-{uuid.uuid4().hex[:6].upper()}"

    def get_counterparty_details(is_high_risk=False, specified_country=None):
        # Returns (name, account, contract, country, country_risk) for a new counterparty.
        cp_name = faker.company()
        cp_account = f"ACC-CPTY-{uuid.uuid4().hex[:12].upper()}"
        cp_contract = f"CON-{uuid.uuid4().hex[:8].upper()}"

        if specified_country:
            # An explicit country wins over is_high_risk; its risk follows the reference lists.
            cp_country = specified_country
            cp_country_risk = 'High' if cp_country in HIGH_RISK_COUNTRIES + TAX_HEAVEN_COUNTRIES else random.choice(['Low', 'Medium'])
        elif is_high_risk:
            cp_country = random.choice(HIGH_RISK_COUNTRIES)
            cp_country_risk = 'High'
        else:
            # Redraw until the country is neither high-risk nor a tax haven.
            cp_country = faker.country_code()
            while cp_country in HIGH_RISK_COUNTRIES + TAX_HEAVEN_COUNTRIES:
                cp_country = faker.country_code()
            cp_country_risk = random.choice(['Low', 'Medium'])

        return cp_name, cp_account, cp_contract, cp_country, cp_country_risk

    def add_record(ts, amount, counterparty_id, details, channel, destination_country=None):
        # Appends one outgoing USD transaction; destination defaults to the counterparty country.
        cp_name, cp_account, cp_contract, cp_country, cp_risk = details
        records.append({
            "transaction_id": str(uuid.uuid4()),
            "customer_id": customer_id,
            "customer_name": customer_name,
            "counterparty_id": counterparty_id,
            'counterparty_customer_name': cp_name,
            'counterparty_account': cp_account,
            'counterparty_contract': cp_contract,
            'counterparty_country': cp_country,
            'counterparty_country_risk': cp_risk,
            "transaction_timestamp": ts,
            "channel": channel,
            "amount": amount,
            "currency": "USD",
            "origin_country_code": "US",
            "destination_country_code": cp_country if destination_country is None else destination_country,
            "in_out": "OUT"
        })

    for period in periods:
        n_outgoing = np.random.poisson(lam=avg_outgoing_per_month)
        n_structuring = np.random.poisson(lam=avg_structuring_per_month)
        n_velocity = np.random.poisson(lam=avg_velocity_spike_per_month)
        n_multi_party = np.random.poisson(lam=avg_multi_party_per_month)
        n_high_risk_volume = np.random.poisson(lam=avg_high_risk_volume_per_month)
        n_normal = np.random.poisson(lam=20)

        # 1) Large transfers to high-risk countries
        for _ in range(n_outgoing):
            ts = random_timestamp(period)
            amt = round(random.uniform(12_000, 25_000), 2)
            dest_country = random.choice(HIGH_RISK_COUNTRIES)
            counterparty_id = pick_counterparty_id()
            details = get_counterparty_details(is_high_risk=True, specified_country=dest_country)
            add_record(ts, amt, counterparty_id, details, random.choice(CHANNELS), dest_country)

        # 2) Structuring: amounts just below the threshold, within STRUCTURE_BAND of it
        for _ in range(n_structuring):
            ts = random_timestamp(period)
            amt = round(STRUCTURE_THRESHOLD - random.uniform(5, STRUCTURE_BAND), 2)
            dest_country = random.choice(["MX", "PH", "CN", "IN"])
            counterparty_id = pick_counterparty_id()
            details = get_counterparty_details(specified_country=dest_country)
            add_record(ts, amt, counterparty_id, details, "agent", dest_country)

        # 3) Velocity spikes: bursts of transactions to a single counterparty
        if n_velocity > 0:
            # Never more spikes than transactions, and hand out the remainder so that
            # exactly n_velocity transactions are produced.
            n_spikes = min(random.randint(1, 3), n_velocity)
            base_size, remainder = divmod(n_velocity, n_spikes)
            # Latest start that keeps the whole 3-hour burst inside the first 28 days,
            # so a spike never leaks into the following month.
            latest_start = period + pd.Timedelta(days=28) - pd.Timedelta(minutes=180, seconds=1)

            for spike_num in range(n_spikes):
                spike_size = base_size + (1 if spike_num < remainder else 0)
                spike_start = period + pd.Timedelta(
                    days=random.randint(0, 27),
                    hours=random.randint(0, 23)
                )
                spike_start = min(spike_start, latest_start)

                # Same counterparty for the whole burst
                counterparty_id = f"EXT-{uuid.uuid4().hex[:6].upper()}"
                dest_country = random.choice(HIGH_RISK_COUNTRIES)
                details = get_counterparty_details(is_high_risk=True, specified_country=dest_country)

                for _ in range(spike_size):
                    ts = spike_start + pd.Timedelta(minutes=random.randint(0, 180))  # 3-hour window
                    amt = round(random.uniform(2000, 8000), 2)
                    add_record(ts, amt, counterparty_id, details,
                               random.choice(["mobile_app", "web"]), dest_country)

        # 4) Multi-party activity: a brand-new counterparty for every transaction
        for _ in range(n_multi_party):
            ts = random_timestamp(period)
            amt = round(float(np.random.exponential(scale=600)), 2)
            counterparty_id = f"CTR_{uuid.uuid4().hex[:6]}"
            details = get_counterparty_details()
            add_record(ts, amt, counterparty_id, details, random.choice(CHANNELS))

        # 5) High-risk volume: very large amounts to high-risk countries
        for _ in range(n_high_risk_volume):
            ts = random_timestamp(period)
            amt = round(random.uniform(50000, 100000), 2)
            dest_country = random.choice(HIGH_RISK_COUNTRIES)
            counterparty_id = pick_counterparty_id()
            details = get_counterparty_details(is_high_risk=True, specified_country=dest_country)
            add_record(ts, amt, counterparty_id, details, random.choice(CHANNELS), dest_country)

        # 6) Additional ordinary transactions
        for _ in range(n_normal):
            ts = random_timestamp(period)
            amt = round(float(np.random.exponential(scale=600)), 2)
            counterparty_id = pick_counterparty_id()
            details = get_counterparty_details()
            add_record(ts, amt, counterparty_id, details, random.choice(CHANNELS))

    if not records:
        # Sorting a column-less frame would raise KeyError; return a typed-empty result instead.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(records, columns=columns).sort_values("transaction_timestamp").reset_index(drop=True)

def calculate_remittance_aggregated_features(transactions_df, customer_id, customer_name, period):
    """
    Compute the monthly aggregated remittance features for one customer.

    Only rows of ``transactions_df`` that belong to ``customer_id`` and whose
    ``transaction_timestamp`` falls in ``[period, period + 1 month)`` are used.

    Args:
        transactions_df: DataFrame of transactions (reads the columns
            transaction_timestamp, customer_id, amount, destination_country_code
            and counterparty_id).
        customer_id: Customer whose transactions are aggregated.
        customer_name: Name copied into the result.
        period: First instant of the month (a pandas Timestamp).

    Returns:
        A dict with the identifying fields and the features; every feature is
        zero when the customer has no transactions in the month.
    """
    window_end = period + pd.DateOffset(months=1)
    all_stamps = transactions_df['transaction_timestamp']
    belongs = (
        (all_stamps >= period)
        & (all_stamps < window_end)
        & (transactions_df['customer_id'] == customer_id)
    )
    month_txs = transactions_df[belongs].copy()

    # Start from the "no activity" result and overwrite the features below when there is data.
    features = {
        'customer_id': customer_id,
        'customer_name': customer_name,
        'year_month': period,
        'year_month_str': period.strftime("%Y-%m"),
        'multpl_tx_bl_lim': 0,
        'vel_spike': 0,
        'multi_party_actv': 0,
        'hr_jurid_vol': 0.0,
        'hr_jurid_vol_pop': 0.0,
        'total_tx_amount': 0.0,
        'avg_tx_amount': 0.0
    }
    if month_txs.shape[0] == 0:
        return features

    amounts = month_txs['amount'].values

    # 1. Structuring: transactions inside the band just below the reporting limit.
    in_band = (amounts >= MAX_LIMIT - STRUCTURE_BAND) & (amounts < MAX_LIMIT)
    features['multpl_tx_bl_lim'] = int(in_band.sum())

    # 2. Velocity spike: number of transactions whose trailing VELOCITY_WINDOW-day
    #    window (the transaction itself included) contains at least 3 transactions.
    ordered_stamps = pd.to_datetime(month_txs['transaction_timestamp']).sort_values()
    trailing_counts = pd.Series(1, index=ordered_stamps).rolling(f"{VELOCITY_WINDOW}D").sum()
    features['vel_spike'] = int((trailing_counts >= 3).sum())

    # 3. Multi-party activity: number of distinct counterparties in the month.
    features['multi_party_actv'] = month_txs['counterparty_id'].nunique()

    # 4. Volume sent to high-risk destination countries.
    to_high_risk = month_txs['destination_country_code'].isin(HIGH_RISK_COUNTRIES)
    high_risk_volume = float(amounts[to_high_risk.to_numpy()].sum())
    features['hr_jurid_vol'] = high_risk_volume

    # 5. Same volume scaled by a random "population" factor; the anomalous customer
    #    draws from a wider range than everyone else.
    upper_factor = 0.006 if customer_id == 'ANOMR001' else 0.002
    features['hr_jurid_vol_pop'] = high_risk_volume * random.uniform(0.001, upper_factor)

    # 6. Overall totals.
    features['total_tx_amount'] = float(amounts.sum())
    features['avg_tx_amount'] = float(amounts.mean())

    return features

def generate_remittance_dataset(n_customers=100, months=12, transactions_per_customer_per_month=30):
    """
    Generate the full remittance dataset: transactions first, then monthly features.

    ``n_customers - 1`` regular customers (CUST0000, CUST0001, ...) plus the
    anomalous customer ANOMR001 are created. Transactions are generated for each
    customer and month, and the aggregated features are then computed from those
    transactions.

    Args:
        n_customers: Total number of customers, including the anomalous one.
        months: Number of monthly periods, counted backwards from the current month.
        transactions_per_customer_per_month: Transactions generated per regular
            customer and month.

    Returns:
        A tuple ``(transactions_df, aggregated_df)``: the transactions sorted by
        customer and timestamp, and one feature row per customer and month sorted
        by customer and month.
    """
    # Customers
    normal_customers = [f"CUST{i:04d}" for i in range(n_customers - 1)]
    anomaly_customer = "ANOMR001"
    all_customers = normal_customers + [anomaly_customer]

    # One generated name per customer, reused on every record of that customer
    customer_names = {cust: faker.name() for cust in all_customers}

    # Monthly periods, most recent first. pd.Timestamp.today() is used rather than
    # datetime.today() because the notebook binds the name `datetime` to the module in one
    # cell and to the class in another.
    current_month = pd.Timestamp.today().normalize().replace(day=1)
    periods = [current_month - pd.DateOffset(months=i) for i in range(months)]

    # 1. Generate all transactions
    all_transactions = []

    # Regular customers
    for customer in normal_customers:
        for period in periods:
            txs = generate_normal_remittance_transactions(
                customer_id=customer,
                customer_name=customer_names[customer],
                period=period,
                n_transactions=transactions_per_customer_per_month
            )
            all_transactions.extend(txs)

    # Anomalous customer. The reference month is passed explicitly so both generators
    # use exactly the same periods even if the clock crosses a month boundary mid-run.
    anomaly_txs = generate_anomalous_remittance_transactions(
        customer_name=customer_names[anomaly_customer],
        months=months,
        start_date=current_month
    )
    all_transactions.extend(anomaly_txs.to_dict('records'))

    transactions_df = pd.DataFrame(all_transactions)

    # 2. Compute the aggregated features from the generated transactions
    aggregated_records = []

    for customer in all_customers:
        for period in periods:
            features = calculate_remittance_aggregated_features(
                transactions_df=transactions_df,
                customer_id=customer,
                customer_name=customer_names[customer],
                period=period
            )
            aggregated_records.append(features)

    aggregated_df = pd.DataFrame(aggregated_records)

    # Sort both outputs; an empty frame has no columns to sort by, so it is left as is.
    if not transactions_df.empty:
        transactions_df = transactions_df.sort_values(['customer_id', 'transaction_timestamp'])
    if not aggregated_df.empty:
        aggregated_df = aggregated_df.sort_values(['customer_id', 'year_month'])

    return transactions_df, aggregated_df


def generate_anomalous_kyc(name: str) -> pd.DataFrame:
    """
    Build the KYC record of the anomalous customer ANOMR001, adapted to remittances.

    Args:
        name: Customer name to store in the record.

    Returns:
        A single-row DataFrame with the customer's identification, contact,
        economic profile, AML risk classification, remittance limits and metadata.
    """
    dob = faker.date_of_birth(minimum_age=30, maximum_age=70)
    eff_date = faker.date_between(start_date="-5y", end_date="-1y")

    data = {
        # --- Basic identification ---
        "customer_id": "ANOMR001",
        "customer_num": f"RM-{faker.bothify('######')}",
        "document_type": random.choice(["Passport", "National ID", "Driver License"]),
        "document_id": faker.bothify("??########"),
        "name": name,
        "date_of_birth": dob,
        "country_of_birth": "CU",
        "citizenship_countries": "United States, Cuba",

        # --- Residence & contact ---
        "country_of_residence_code": "US",
        "country_of_residence": "United States",
        "address": faker.address().replace("\n", ", "),
        "phone_number": faker.phone_number(),
        "email": faker.email(),

        # --- Economic / employment profile ---
        "occupation": random.choice(["Entrepreneur", "Consultant", "Investor"]),
        "primary_source_of_income": random.choice(
            ["Business income", "Investments", "Salary"]),
        "estimated_annual_income_usd": random.randint(150_000, 500_000),

        # --- AML risk classification ---
        "risk_rating": random.randint(4, 5),       # 4-5 = high
        "is_pep": random.choice([True, False]),
        "is_sanctioned": False,
        "high_risk_country_exposure": True,

        # --- Remittance limits ---
        "daily_remittance_limit_usd": 9_500,       # just below the CTR threshold
        "monthly_remittance_limit_usd": 50_000,

        # --- Metadata ---
        "customer_effective_date": eff_date,
        # pd.Timestamp.today() instead of datetime.today(): the notebook binds the name
        # `datetime` to the module in one cell and to the class in another. The value is
        # still a plain datetime.date.
        "kyc_last_review_date": pd.Timestamp.today().date(),
        "segment_type": "PERS",
        "segment_type_description": "Personal – Remittance",
        "sars_flag": False,
    }

    return pd.DataFrame([data])

Transacciones generadas: 4002
Features agregadas: 120

Features del cliente anómalo por mes:
    year_month_str  multpl_tx_bl_lim  vel_spike  multi_party_actv  \
119        2025-02                20        204               189   
118        2025-03                30        202               197   
117        2025-04                29        196               189   
116        2025-05                29        170               160   
115        2025-06                22        188               180   

     hr_jurid_vol  
119    4475836.30  
118    3401148.71  
117    3650333.28  
116    3353484.28  
115    3936290.24  

Transacciones del cliente anómalo: 1152
Monto promedio: $20836.55
Contrapartes únicas: 1073


### Data Gen

In [4]:
# Generate the synthetic transactions and the per-customer monthly features (default sizes).
transactions_df, agg_df = generate_remittance_dataset()
# Write the monthly aggregated features to the customer_monthly dataset, then publish it.
dataset_functions.write(context, context.get_spark_session().createDataFrame(agg_df), customer_monthly_dataset().identifier)
dataset_functions.publish(context, customer_monthly_dataset().identifier)
# Name generated for the anomalous customer, reused below so the KYC row carries the same name.
anom_name = agg_df.loc[agg_df.customer_id=='ANOMR001']['customer_name'].iloc[0]

# Write the raw transactions to the transactions dataset, then publish it.
dataset_functions.write(context, context.get_spark_session().createDataFrame(transactions_df), transactions_dataset().identifier)
dataset_functions.publish(context, transactions_dataset().identifier)

# Write and publish the KYC record. Only the anomalous customer's row is written here;
# this cell generates no KYC rows for the CUSTxxxx customers.
dataset_functions.write(context, context.get_spark_session().createDataFrame(generate_anomalous_kyc(name=anom_name)), customers_dataset().identifier)
dataset_functions.publish(context, customers_dataset().identifier)

2025-07-19 14:22:40,298:INFO:thetaray.common.logging:### DataSet - writing started ###
                                                                                2025-07-19 14:22:44,609:INFO:thetaray.common.logging:### DataSet - writing done, 1200 written, 0 corrupted, 0 rejected  ###
                                                                                2025-07-19 14:22:47,175:INFO:thetaray.common.logging:finished publishing records for dataset demo_remittance_customer_monthly
2025-07-19 14:22:47,394:INFO:thetaray.common.logging:### DataSet - writing started ###
                                                                                2025-07-19 14:22:51,288:INFO:thetaray.common.logging:### DataSet - writing done, 37984 written, 0 corrupted, 0 rejected  ###
2025-07-19 14:22:53,514:INFO:thetaray.common.logging:finished publishing records for dataset demo_remittance_transactions
2025-07-19 14:22:53,543:INFO:thetaray.common.logging:### DataSet - writing started ###
25

True

In [None]:
# Close the context created by init_context in the first cell.
context.close()