### Init Context

In [None]:
from thetaray.api.context import init_context
import datetime
import yaml

import logging
# Emit every log record from DEBUG upwards as the bare message text.
logging.basicConfig(format='%(message)s', level=logging.DEBUG)

# Load the domain's Spark settings; only the 'spark_config_a' profile is used.
with open('/thetaray/git/solutions/domains/demo_ret_indiv/config/spark_config.yaml') as spark_config_file:
    spark_profiles = yaml.load(spark_config_file, Loader=yaml.FullLoader)
spark_config = spark_profiles['spark_config_a']

# NOTE: `datetime` is the module here (from `import datetime` above). A later
# cell runs `from datetime import datetime`, which rebinds the name to the
# class, so re-running this cell after that one would fail on
# `datetime.datetime`.
context = init_context(
    execution_date=datetime.datetime(1970, 2, 1),
    spark_conf=spark_config,
    spark_master='local[*]',
)

### Imports

In [None]:
from thetaray.api.dataset import dataset_functions

from domains.demo_ret_indiv.datasets.customer_monthly import customer_monthly_dataset
from domains.demo_ret_indiv.datasets.customers import customers_dataset
from domains.demo_ret_indiv.datasets.transactions import transactions_dataset

from pyspark.sql import functions as f

### Data Gen

In [None]:

import pandas as pd
import numpy as np
import random, uuid
from datetime import datetime
from faker import Faker

# Shared Faker instance used by the generators below for names, addresses,
# phone numbers, dates and pattern-based identifiers.
faker = Faker()

# Constants
# Two-letter country codes used as counterparty countries for the anomalous
# customer's wire transfers and as the high-risk filter in the normal-customer
# monthly features.
HIGH_RISK_COUNTRIES = ['IR', 'KP', 'SY', 'SD', 'VE', 'SO', 'YE', 'CU', 'MM', 'CF']
# Reporting threshold; structuring deposits are generated 1-50 below this amount.
STRUCTURE_THRESHOLD = 10000.0

def generate_anomalous_transactions(months=12, avg_atm_per_month=15, avg_transfer_per_month=8, avg_structuring_per_month=12, avg_round_amounts_per_month=12, start_date=None):
    """
    Generate synthetic transactions for the anomalous customer ANOM001.

    For each of ``months`` calendar months (the anchor month and the months
    before it), the number of transactions of each kind is drawn from a
    Poisson distribution with the given mean. Every transaction receives a
    random timestamp inside its month. Four kinds are produced: ATM
    withdrawals, wire transfers with a high-risk counterparty country, cash
    deposits just below ``STRUCTURE_THRESHOLD`` and round-amount transactions.

    Args:
        months: Number of monthly periods, counted backwards from the anchor
            month (inclusive).
        avg_atm_per_month: Poisson mean of ATM withdrawals per month.
        avg_transfer_per_month: Poisson mean of high-risk wire transfers per
            month.
        avg_structuring_per_month: Poisson mean of structuring deposits per
            month.
        avg_round_amounts_per_month: Poisson mean of round-amount transactions
            per month.
        start_date: Any value accepted by ``pd.to_datetime``; its month is the
            most recent period. Defaults to the current month.

    Returns:
        pandas.DataFrame with the full transaction schema, sorted by
        ``transaction_timestamp`` and re-indexed from 0. If no transaction is
        generated the frame is empty but still carries every column.
    """
    # Explicit schema so an empty result still has the expected columns
    # (sorting a column-less frame by 'transaction_timestamp' raises KeyError).
    columns = [
        'transaction_id', 'customer_id', 'account_id', 'transaction_timestamp',
        'transaction_type_code', 'transaction_type_description', 'channel',
        'original_trx_amount', 'original_trx_currency', 'reference_trx_amount',
        'normalized_country_amount', 'counterparty_customer_name',
        'counterparty_account', 'counterparty_contract', 'counterparty_country',
        'counterparty_country_risk', 'internal', 'in_out', 'atm_id', 'branch_id',
        'transaction_description', 'banker_id',
    ]

    # Anchor on the first instant of the starting month.
    anchor = datetime.today() if start_date is None else pd.to_datetime(start_date)
    first_month = pd.Timestamp(anchor.replace(day=1, hour=0, minute=0, second=0, microsecond=0))
    periods = [first_month - pd.DateOffset(months=i) for i in range(months)]

    customer_id = 'ANOM001'
    account_id = f"ACC-{customer_id}"

    def random_time_in(month_start):
        # The day offset is capped at 27 so the timestamp stays inside even a
        # 28-day month.
        return month_start + pd.Timedelta(days=random.randint(0, 27),
                                          hours=random.randint(0, 23),
                                          minutes=random.randint(0, 59),
                                          seconds=random.randint(0, 59))

    def random_atm_id():
        return f"ATM-{random.randint(100,999)}"

    def build_record(month_start, type_code, type_description, channel, amount,
                     in_out, description, atm_id=None, counterparty_name=None,
                     counterparty_account=None, counterparty_country=None,
                     counterparty_country_risk=None):
        # One transaction row; all amounts are in USD, so the reference and
        # normalized amounts equal the original amount.
        return {
            'transaction_id': str(uuid.uuid4()),
            'customer_id': customer_id,
            'account_id': account_id,
            'transaction_timestamp': random_time_in(month_start),
            'transaction_type_code': type_code,
            'transaction_type_description': type_description,
            'channel': channel,
            'original_trx_amount': amount,
            'original_trx_currency': 'USD',
            'reference_trx_amount': amount,
            'normalized_country_amount': amount,
            'counterparty_customer_name': counterparty_name,
            'counterparty_account': counterparty_account,
            'counterparty_contract': '',
            'counterparty_country': counterparty_country,
            'counterparty_country_risk': counterparty_country_risk,
            'internal': False,
            'in_out': in_out,
            'atm_id': atm_id,
            'branch_id': '',
            'transaction_description': description,
            'banker_id': ''
        }

    records = []
    for period in periods:
        # Number of each transaction type for this month
        num_atm = np.random.poisson(lam=avg_atm_per_month)
        num_transfers = np.random.poisson(lam=avg_transfer_per_month)
        num_structures = np.random.poisson(lam=avg_structuring_per_month)
        num_round_amounts = np.random.poisson(lam=avg_round_amounts_per_month)

        # ATM withdrawals: sizeable amounts, 2,000-8,000 in steps of 10.
        for _ in range(num_atm):
            records.append(build_record(
                period, 'ATM', 'ATM Withdrawal', 'ATM',
                amount=float(np.random.randint(200, 801) * 10),
                in_out='OUT',
                description='Cash withdrawal at ATM',
                atm_id=random_atm_id(),
            ))

        # Large wire transfers whose counterparty sits in a high-risk country.
        for _ in range(num_transfers):
            records.append(build_record(
                period, 'TRF', 'Wire Transfer', 'internet',
                amount=round(float(np.random.uniform(15000, 50000)), 2),
                in_out=random.choice(['IN', 'OUT']),
                description='Wire transfer',
                counterparty_name=faker.name(),
                counterparty_account=f"ACCT-{faker.bothify('????-########')}",
                counterparty_country=random.choice(HIGH_RISK_COUNTRIES),
                counterparty_country_risk='High',
            ))

        # Structuring: cash deposits just below the reporting threshold.
        for _ in range(num_structures):
            channel = random.choice(['TELLER', 'ATM'])
            records.append(build_record(
                period, 'STRUCTURING', 'Structuring', channel,
                amount=float(round(STRUCTURE_THRESHOLD - np.random.uniform(1, 50), 0)),
                in_out='IN',
                description='Cash deposit',
                atm_id=random_atm_id() if channel == 'ATM' else None,
            ))

        # Round amounts: whole multiples of 1,000 between 1,000 and 9,000.
        for _ in range(num_round_amounts):
            channel = random.choice(['TELLER', 'ATM'])
            records.append(build_record(
                period, 'ROUND_AMOUNT', 'Round Amount', channel,
                amount=float(np.random.randint(1, 10) * 1000),
                # random.choice keeps the value a plain str like the other
                # generators (np.random.choice would yield numpy.str_).
                in_out=random.choice(['IN', 'OUT']),
                description='Round amount transaction',
                atm_id=random_atm_id() if channel == 'ATM' else None,
            ))

    df = pd.DataFrame(records, columns=columns)
    # Ensure chronological ordering with a clean positional index.
    return df.sort_values('transaction_timestamp').reset_index(drop=True)

def calculate_aggregated_features_from_transactions(df_trx, customer_id, customer_name):
    """
    Build one row of monthly features per calendar month present in ``df_trx``.

    The structuring, ATM, high-risk, round-amount, pipe and spike features are
    computed from the transactions themselves. The two ``*_pop`` columns are
    the matching feature multiplied by a random factor drawn once per month
    from [0.002, 0.006].

    Args:
        df_trx: Transactions with at least ``transaction_timestamp``
            (datetime), ``transaction_type_code``, ``original_trx_amount`` and
            ``in_out`` columns. The frame is not modified.
        customer_id: Value stored in the ``customer_id`` column of every row.
        customer_name: Value stored in the ``customer_name`` column of every row.

    Returns:
        pandas.DataFrame with one row per month, or an empty column-less
        DataFrame when ``df_trx`` is empty.
    """
    if df_trx.empty:
        return pd.DataFrame()

    # Work on a copy so the caller's frame does not gain the helper column.
    monthly = df_trx.copy()
    # Month bucket expressed as the timestamp of the first day of the month.
    monthly['year_month'] = monthly['transaction_timestamp'].dt.to_period('M').dt.to_timestamp()

    rows = []
    for month_start, month_txs in monthly.groupby('year_month'):
        type_codes = month_txs['transaction_type_code']
        amounts = month_txs['original_trx_amount']

        # Total amount of STRUCTURING transactions.
        structuring_total = float(amounts[type_codes == 'STRUCTURING'].sum())
        # Number of ATM transactions (a count of rows, not of distinct ATM ids).
        atm_count = int((type_codes == 'ATM').sum())
        # Total amount of TRF transactions.
        high_risk_total = float(amounts[type_codes == 'TRF'].sum())
        # Number of ROUND_AMOUNT transactions.
        round_count = int((type_codes == 'ROUND_AMOUNT').sum())

        # Deposit/withdrawal pipe: share of consecutive transactions (in time
        # order) whose direction flips between IN and OUT.
        directions = month_txs.sort_values('transaction_timestamp')['in_out'].tolist()
        if len(directions) > 1:
            flips = sum(1 for prev, cur in zip(directions, directions[1:]) if cur != prev)
            pipe_ratio = flips / max(1, len(directions) - 1)
        else:
            pipe_ratio = 0.0

        # Activity spike: number of days whose transaction count exceeds the
        # month's daily mean by more than two standard deviations.
        per_day = month_txs.groupby(month_txs['transaction_timestamp'].dt.date).size()
        spike_days = 0
        if len(per_day) > 0:
            spike_days = int((per_day > per_day.mean() + 2 * per_day.std()).sum())

        # Random population factor for the anomalous customer.
        pop_factor = random.uniform(0.002, 0.006)

        rows.append(dict(
            customer_id=customer_id,
            customer_name=customer_name,
            year_month=month_start,
            year_month_str=month_start.strftime('%Y-%m'),
            structuring=structuring_total,
            cnt_distinct_atm=atm_count,
            cnt_distinct_atm_pop=atm_count * pop_factor,
            sum_trx_high_risk=high_risk_total,
            sum_trx_high_risk_pop=high_risk_total * pop_factor,
            deposit_withdrawal_pipe=pipe_ratio,
            overall_activity_spike=float(spike_days),
            crypto_activity=0.0,  # no crypto transactions are generated
            check_deposit_value=0.0,  # no check deposits are generated
            round_amounts=float(round_count),
        ))

    return pd.DataFrame(rows)

def generate_normal_individual_transactions(customer_id, customer_name, period, n_transactions=20):
    """
    Generate ordinary-looking transactions for one regular customer in one month.

    Each transaction gets a random timestamp within the first 28 days after
    ``period``, a random type (ATM, TRANSFER, DEPOSIT or PURCHASE) and a random
    channel. The channel is drawn independently of the type. Amounts are
    smaller than the anomalous customer's, and transfers always use a low-risk
    counterparty country.

    Args:
        customer_id: Customer identifier; the account id is derived from it.
        customer_name: Name stored on every record.
        period: Timestamp of the first instant of the month.
        n_transactions: Number of records to generate.

    Returns:
        List of transaction dicts (one per record).
    """
    account_id = f"ACC-{customer_id}"
    records = []

    for _ in range(n_transactions):
        # Random moment inside the month (day offset capped at 27).
        offset = pd.Timedelta(
            days=random.randint(0, 27),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
            seconds=random.randint(0, 59)
        )
        ts = period + offset

        tx_type = random.choice(['ATM', 'TRANSFER', 'DEPOSIT', 'PURCHASE'])
        channel = random.choice(['ATM', 'internet', 'branch', 'mobile'])
        is_transfer = tx_type == 'TRANSFER'
        via_atm = channel == 'ATM'
        via_branch = channel == 'branch'

        # Amount ranges: ATM 200-990 in steps of 10, transfers 500-5,000,
        # everything else 100-2,000.
        if tx_type == 'ATM':
            amt = float(np.random.randint(20, 100) * 10)
        else:
            low, high = (500, 5000) if is_transfer else (100, 2000)
            amt = round(float(np.random.uniform(low, high)), 2)

        # Only transfers carry a counterparty country, always a low-risk one.
        country = random.choice(['CA', 'GB', 'FR', 'DE', 'AU', 'JP']) if is_transfer else None
        country_risk = 'Low' if is_transfer else None

        record = {
            'transaction_id': str(uuid.uuid4()),
            'customer_id': customer_id,
            'customer_name': customer_name,
            'account_id': account_id,
            'transaction_timestamp': ts,
            'transaction_type_code': tx_type,
            'transaction_type_description': f'{tx_type} Transaction',
            'channel': channel,
            'original_trx_amount': amt,
            'original_trx_currency': 'USD',
            'reference_trx_amount': amt,
            'normalized_country_amount': amt,
            'counterparty_customer_name': faker.name() if is_transfer else None,
            'counterparty_account': f"ACCT-{faker.bothify('????-########')}" if is_transfer else None,
            'counterparty_contract': '',
            'counterparty_country': country,
            'counterparty_country_risk': country_risk,
            'internal': False,
            'in_out': random.choice(['IN', 'OUT']),
            'atm_id': f"ATM-{random.randint(100,999)}" if via_atm else None,
            'branch_id': f"BR-{random.randint(100,999)}" if via_branch else '',
            'transaction_description': f'Normal {tx_type.lower()} transaction',
            'banker_id': f"BNK-{random.randint(1000,9999)}" if via_branch else ''
        }
        records.append(record)

    return records

def calculate_normal_aggregated_features(transactions_df, customer_id, customer_name, period):
    """
    Compute one month of aggregated features for a regular customer.

    Only rows of ``transactions_df`` that belong to ``customer_id`` and fall in
    the half-open interval [period, period + 1 month) are considered.

    Args:
        transactions_df: Transactions of any number of customers.
        customer_id: Customer whose rows are aggregated.
        customer_name: Name copied into the result.
        period: Timestamp of the first instant of the month.

    Returns:
        Dict of feature values. All numeric features are 0.0 when the customer
        has no transactions in the month. The ``*_pop`` values use a random
        factor drawn from [0.001, 0.002].
    """
    month_end = period + pd.DateOffset(months=1)
    stamps = transactions_df['transaction_timestamp']
    in_month = (stamps >= period) & (stamps < month_end)
    month_txs = transactions_df[in_month & (transactions_df['customer_id'] == customer_id)].copy()

    # Start from the all-zero row; it is returned as-is for an inactive month.
    features = {
        'customer_id': customer_id,
        'customer_name': customer_name,
        'year_month': period,
        'year_month_str': period.strftime("%Y-%m"),
        'structuring': 0.0,
        'cnt_distinct_atm': 0.0,
        'cnt_distinct_atm_pop': 0.0,
        'sum_trx_high_risk': 0.0,
        'sum_trx_high_risk_pop': 0.0,
        'deposit_withdrawal_pipe': 0.0,
        'overall_activity_spike': 0.0,
        'crypto_activity': 0.0,
        'check_deposit_value': 0.0,
        'round_amounts': 0.0
    }
    if len(month_txs) == 0:
        return features

    type_codes = month_txs['transaction_type_code']
    amounts = month_txs['original_trx_amount']

    # Total amount of STRUCTURING transactions.
    structuring_total = float(amounts[type_codes == 'STRUCTURING'].sum())

    # Number of ATM transactions.
    atm_count = float(int((type_codes == 'ATM').sum()))

    # Total amount of transfers whose counterparty country is high-risk.
    to_high_risk = (type_codes == 'TRANSFER') & month_txs['counterparty_country'].isin(HIGH_RISK_COUNTRIES)
    high_risk_total = float(amounts[to_high_risk].sum())

    # Number of ROUND_AMOUNT transactions.
    round_count = float(int((type_codes == 'ROUND_AMOUNT').sum()))

    # Pipe behaviour: share of consecutive transactions (in time order) whose
    # direction flips between IN and OUT.
    pipe_ratio = 0.0
    if len(month_txs) > 1:
        directions = month_txs.sort_values('transaction_timestamp')['in_out'].tolist()
        flips = sum(1 for prev, cur in zip(directions, directions[1:]) if cur != prev)
        pipe_ratio = flips / max(1, len(directions) - 1)

    # Activity spike: days whose transaction count exceeds mean + 2 * std.
    per_day = month_txs.groupby(month_txs['transaction_timestamp'].dt.date).size()
    spike_days = 0
    if len(per_day) > 0:
        spike_days = int((per_day > per_day.mean() + 2 * per_day.std()).sum())

    # Random population factor for regular customers.
    pop_factor = random.uniform(0.001, 0.002)

    features.update({
        'structuring': structuring_total,
        'cnt_distinct_atm': atm_count,
        'cnt_distinct_atm_pop': atm_count * pop_factor,
        'sum_trx_high_risk': high_risk_total,
        'sum_trx_high_risk_pop': high_risk_total * pop_factor,
        'deposit_withdrawal_pipe': pipe_ratio,
        'overall_activity_spike': float(spike_days),
        'round_amounts': round_count,
    })
    return features

def generate_aggregated_dataset(n_customers=100, months=12, transactions_per_customer_per_month=20):
    """
    Generate the full individual-customer dataset: transactions plus monthly features.

    ``n_customers - 1`` regular customers (IND0000, IND0001, ...) and the single
    anomalous customer ANOM001 are simulated over ``months`` calendar months
    ending with the current month.

    Args:
        n_customers: Total number of customers, including the anomalous one.
        months: Number of monthly periods, counted backwards from the current
            month (inclusive).
        transactions_per_customer_per_month: Transactions generated for each
            regular customer in each month.

    Returns:
        Tuple ``(transactions_df, aggregated_df)``: the transactions sorted by
        customer and timestamp, and the monthly features sorted by customer
        and month.
    """
    # Customers
    normal_customers = [f"IND{i:04d}" for i in range(n_customers - 1)]
    anomaly_customer = "ANOM001"
    all_customers = normal_customers + [anomaly_customer]

    # One generated name per customer, shared by transactions and features.
    customer_names = {cust: faker.name() for cust in all_customers}

    # Monthly periods, most recent first.
    today = pd.Timestamp(datetime.today().replace(day=1, hour=0, minute=0, second=0, microsecond=0))
    periods = [today - pd.DateOffset(months=i) for i in range(months)]

    # 1. Generate all transactions
    all_transactions = []

    # Regular customers
    for customer in normal_customers:
        for period in periods:
            txs = generate_normal_individual_transactions(
                customer_id=customer,
                customer_name=customer_names[customer],
                period=period,
                n_transactions=transactions_per_customer_per_month
            )
            all_transactions.extend(txs)

    # Anomalous customer. The anchor month is passed explicitly so both
    # generators use exactly the same periods, even if the clock crosses a
    # month boundary while this function is running.
    anomaly_txs = generate_anomalous_transactions(months=months, start_date=today)
    # Regular records carry 'customer_name'; stamp it on the anomalous rows too
    # so the merged frame has no missing values in that column.
    anomaly_txs = anomaly_txs.assign(customer_name=customer_names[anomaly_customer])
    all_transactions.extend(anomaly_txs.to_dict('records'))

    transactions_df = pd.DataFrame(all_transactions)

    # 2. Compute the aggregated monthly features
    aggregated_records = []

    # Regular customers: one row per customer and month.
    for customer in normal_customers:
        for period in periods:
            features = calculate_normal_aggregated_features(
                transactions_df=transactions_df,
                customer_id=customer,
                customer_name=customer_names[customer],
                period=period
            )
            aggregated_records.append(features)

    # Anomalous customer: one row per month that has transactions.
    anomaly_features = calculate_aggregated_features_from_transactions(
        df_trx=anomaly_txs,
        customer_id=anomaly_customer,
        customer_name=customer_names[anomaly_customer]
    )
    aggregated_records.extend(anomaly_features.to_dict('records'))

    aggregated_df = pd.DataFrame(aggregated_records)

    # Sort both datasets
    transactions_df = transactions_df.sort_values(['customer_id', 'transaction_timestamp'])
    aggregated_df = aggregated_df.sort_values(['customer_id', 'year_month'])

    return transactions_df, aggregated_df

def generate_anomalous_kyc(name):
    """
    Build the KYC record of the anomalous customer ANOM001.

    Args:
        name: Customer name stored in the record.

    Returns:
        pandas.DataFrame with exactly one row. Identifiers, dates, address,
        phone, occupation, risk segment and PEP flag are randomly generated;
        the remaining fields are fixed.
    """
    # Random values are drawn in a fixed order, then assembled into the row.
    birth_date = faker.date_of_birth(minimum_age=30, maximum_age=70)
    onboarding_date = faker.date_between(start_date='-5y', end_date='-1y')
    customer_num = f"NUM-{faker.bothify('######')}"
    document_type = random.choice(['Passport', 'Driver License', 'National ID'])
    document_number = faker.bothify('??########')
    address = faker.address()
    phone_number = faker.phone_number()
    occupation = random.choice(['Entrepreneur', 'Investor', 'Consultant', 'Executive'])
    risk_segment = random.randint(4,6)
    is_pep = random.choice([True, False])

    record = {
        'customer_id': 'ANOM001',
        'customer_num': customer_num,
        'type_of_document': document_type,
        'document_number_id_code': document_number,
        'name': name,
        'date_of_birth': birth_date,
        'country_of_birth_nationality': 'CU',
        'tax_residency_countries': 'United States',
        'country_of_residence_code': 'US',
        'country_of_residence': 'United States',
        'citizenship_countries_code': 'US',
        'citizenship_countries': 'United States, Cuba',
        'address': address,
        'phone_number': phone_number,
        'occupation': occupation,
        'is_unemployed': False,
        'customer_effective_date': onboarding_date,
        'aml_risk_segment': risk_segment,
        'pep': is_pep,
        'segment_type': 'PERS',
        'segment_type_description': 'Personal',
        'sars_flag': False
    }
    return pd.DataFrame([record])

In [None]:
# Generate the synthetic transactions and the per-customer monthly features.
df_trx, agg_df = generate_aggregated_dataset()

# Monthly features: written only; no publish call is made for this dataset here.
# NOTE(review): the other two datasets are published after writing - confirm
# that skipping publish for this one is intended.
monthly_sdf = context.get_spark_session().createDataFrame(agg_df)
dataset_functions.write(context, monthly_sdf, customer_monthly_dataset().identifier)

# Reuse the anomalous customer's generated name so the KYC row matches the features.
anom_name = agg_df.loc[agg_df['customer_id'] == 'ANOM001', 'customer_name'].iloc[0]

# Transactions: write, then publish.
trx_sdf = context.get_spark_session().createDataFrame(df_trx)
dataset_functions.write(context, trx_sdf, transactions_dataset().identifier)
dataset_functions.publish(context, transactions_dataset().identifier)

# Customers: only the anomalous customer's KYC record is written, then published.
kyc_sdf = context.get_spark_session().createDataFrame(generate_anomalous_kyc(name=anom_name))
dataset_functions.write(context, kyc_sdf, customers_dataset().identifier)
dataset_functions.publish(context, customers_dataset().identifier)

In [None]:
# Close the context created by init_context in the first cell
# (presumably this also releases the Spark session - confirm against the ThetaRay API).
context.close()