### Init Context

In [None]:
from thetaray.api.context import init_context
import datetime
import yaml

import logging
# Log everything from DEBUG upwards, printing only the message text.
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

# Read the Spark settings stored under the 'spark_config_a' key of the solution's YAML file.
with open('/thetaray/git/solutions/domains/demo_ret_smb/config/spark_config.yaml') as spark_config_file:
    # NOTE(review): FullLoader is only appropriate for trusted files; yaml.safe_load
    # would be the stricter choice if this file only holds plain mappings - confirm.
    spark_config = yaml.load(spark_config_file, yaml.FullLoader)['spark_config_a']
# Create the context with a fixed execution date and a local Spark master.
context = init_context(execution_date=datetime.datetime(1970, 2, 1),
                       spark_conf=spark_config,
                       spark_master='local[*]')

### Imports

In [None]:
from thetaray.api.dataset import dataset_functions

from domains.demo_ret_smb.datasets.customer_monthly import customer_monthly_dataset
from domains.demo_ret_smb.datasets.customers import customers_dataset
from domains.demo_ret_smb.datasets.transactions import transactions_dataset

from pyspark.sql import functions as f

### Data Gen

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import random
from faker import Faker
import uuid

# Shared Faker instance used by the generators in this cell (company names, country codes, dates).
faker = Faker()

# Sample list of tax haven countries (ISO 3166-1 alpha-2 codes).
# Transactions whose counterparty country is in this list are treated as tax-haven related.
TAX_HEAVEN_COUNTRIES = ["CY", "KY", "BS", "PA", "LU", "CH", "SG", "AE", "BM", "VG"]

# Human-readable description for each transaction_type_code emitted by the generators.
TRANSACTION_TYPE_DESCRIPTIONS = {
    'PIPE': 'Alternating incoming and outgoing transfers',
    'TAX_HEAVEN': 'Transfer involving a tax haven jurisdiction',
    'SPIKE': 'Unusual burst of transaction activity',
    'MANY_TO_ONE': 'Multiple incoming transfers from different counterparties to a single account',
    'ONE_TO_MANY': 'Single account making transfers to multiple counterparties',
    'NORMAL': 'Normal transaction'
}

def generate_normal_transactions(customer_id, customer_name, period, n_transactions=50):
    """
    Build synthetic "normal" transactions for a regular SMB customer.

    Args:
        customer_id: Identifier stored on every record; the account id is
            derived from it as ``ACC-<customer_id>``.
        customer_name: Name copied onto every record.
        period: Start of the month; each transaction is placed 0-27 days
            after it, at a random time of day.
        n_transactions: Number of records to generate.

    Returns:
        list[dict]: One dictionary per generated transaction.
    """
    account = f"ACC-{customer_id}"

    # "Real" IDs are numeric; placeholder IDs carry a "-P-" marker plus random hex digits.
    def real_atm():
        return f"ATM-{random.randint(1000,9999)}"

    def real_branch():
        return f"BR-{random.randint(100,999)}"

    def real_banker():
        return f"BNK-{random.randint(1000,9999)}"

    def placeholder_atm():
        return f"ATM-P-{uuid.uuid4().hex[:4].upper()}"

    def placeholder_branch():
        return f"BR-P-{uuid.uuid4().hex[:3].upper()}"

    def placeholder_banker():
        return f"BNK-P-{uuid.uuid4().hex[:4].upper()}"

    rows = []
    for _ in range(n_transactions):
        # Spread the transactions over the month.
        offset = pd.Timedelta(
            days=random.randint(0, 27),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
            seconds=random.randint(0, 59),
        )
        when = period + offset

        # Normally distributed amount, floored at 100 so it can never be negative.
        amount = max(100, round(float(np.random.normal(loc=5000, scale=2000)), 2))

        channel = random.choice(['internet', 'branch', 'mobile', 'ATM', 'wire'])
        direction = random.choice(['IN', 'OUT'])

        # 85% of the transactions are domestic.
        domestic = random.random() < 0.85

        counterparty_name = faker.company()
        counterparty_account = f"ACC-CPTY-{uuid.uuid4().hex[:12].upper()}"
        counterparty_contract = f"CON-{uuid.uuid4().hex[:8].upper()}"

        if not domestic:
            # Foreign counterparty: redraw until the country is not a tax haven.
            country = faker.country_code()
            while country in TAX_HEAVEN_COUNTRIES:
                country = faker.country_code()
            country_risk = random.choice(['Low', 'Medium'])
        else:
            # US is assumed to be the local country.
            country, country_risk = 'US', 'Low'

        # Start from placeholders, then swap in real IDs for the channel actually used.
        atm = placeholder_atm()
        branch = placeholder_branch()
        banker = placeholder_banker()
        if channel == 'branch':
            branch = real_branch()
            banker = real_banker()
        elif channel == 'ATM':
            atm = real_atm()

        row = {
            'transaction_id': str(uuid.uuid4()),
            'customer_id': customer_id,
            'customer_name': customer_name,
            'account_id': account,
            'transaction_timestamp': when,
            'transaction_type_code': 'NORMAL',
            'transaction_type_description': TRANSACTION_TYPE_DESCRIPTIONS['NORMAL'],
            'channel': channel,
            'original_trx_amount': amount,
            'original_trx_currency': 'USD',
            'reference_trx_amount': amount,
            'normalized_country_amount': amount,
            'counterparty_customer_name': counterparty_name,
            'counterparty_account': counterparty_account,
            'counterparty_contract': counterparty_contract,
            'counterparty_country': country,
            'counterparty_country_risk': country_risk,
            'internal': False,
            'in_out': direction,
            'atm_id': atm,
            'branch_id': branch,
            'transaction_description': 'Regular business transaction',
            'banker_id': banker,
        }
        rows.append(row)

    return rows

def generate_anomalous_transactions(customer_name, months=12,
                                   avg_pipe_tx_per_month=50,
                                   avg_tax_heaven_tx_per_month=5,
                                   avg_spike_tx_per_month=200,
                                   avg_many_to_one_per_month=10,
                                   avg_one_to_many_per_month=10,
                                   start_date=None,
                                   default_currency='USD'):
    """
    Generate synthetic transactions for the anomalous customer ``SMB_ANOM``.

    For every monthly period (the anchor month and the ``months - 1`` months
    before it) the number of transactions of each anomaly type is drawn from
    a Poisson distribution with the given mean, and the records are built:

    1. PIPE - IN/OUT pairs; the OUT leg follows 1-3 days later with 95% of the amount.
    2. TAX_HEAVEN - wire transfers with a counterparty in ``TAX_HEAVEN_COUNTRIES``.
    3. SPIKE - a burst of transactions inside one 3-hour window.
    4. MANY_TO_ONE - incoming wires, each from a newly generated counterparty.
    5. ONE_TO_MANY - outgoing wires, each to a newly generated counterparty.

    Args:
        customer_name: Name stored on every record.
        months: Number of monthly periods to generate.
        avg_pipe_tx_per_month: Poisson mean of pipe transactions per month
            (``n // 2`` IN/OUT pairs are produced).
        avg_tax_heaven_tx_per_month: Poisson mean of tax-haven transfers per month.
        avg_spike_tx_per_month: Poisson mean of spike transactions per month.
        avg_many_to_one_per_month: Poisson mean of many-to-one transfers per month.
        avg_one_to_many_per_month: Poisson mean of one-to-many transfers per month.
        start_date: Any date inside the most recent month to generate
            (anything ``pd.to_datetime`` accepts); defaults to the current month.
        default_currency: Currency code stored on every record.

    Returns:
        pd.DataFrame: One row per transaction, sorted by
        ``transaction_timestamp``; an empty frame when nothing was generated.
    """
    anchor = datetime.today() if start_date is None else pd.to_datetime(start_date)
    today = pd.Timestamp(anchor.replace(day=1, hour=0, minute=0, second=0, microsecond=0))
    periods = [today - pd.DateOffset(months=i) for i in range(months)]
    records = []
    cust = 'SMB_ANOM'
    account_id_main = f"ACC-{cust}"

    # ID generators: "real" IDs are numeric, placeholders carry a "-P-" marker.
    def generate_real_atm_id():
        return f"ATM-{random.randint(1000,9999)}"

    def generate_placeholder_atm_id():
        return f"ATM-P-{uuid.uuid4().hex[:4].upper()}"

    def generate_placeholder_branch_id():
        return f"BR-P-{uuid.uuid4().hex[:3].upper()}"

    def generate_placeholder_banker_id():
        return f"BNK-P-{uuid.uuid4().hex[:4].upper()}"

    def random_timestamp(period, max_day):
        # Random minute-resolution moment between day 0 and max_day of the period.
        return period + pd.Timedelta(days=random.randint(0, max_day),
                                     hours=random.randint(0, 23),
                                     minutes=random.randint(0, 59))

    def get_counterparty_details(is_tax_haven=False, specified_country=None):
        # Returns (name, account, contract, country, country_risk) for a new counterparty.
        cp_name = faker.company()
        cp_account = f"ACC-CPTY-{uuid.uuid4().hex[:12].upper()}"
        cp_contract = f"CON-{uuid.uuid4().hex[:8].upper()}"
        if specified_country:
            cp_country = specified_country
            cp_country_risk = 'High' if cp_country in TAX_HEAVEN_COUNTRIES else random.choice(['Low', 'Medium'])
        elif is_tax_haven:
            cp_country = random.choice(TAX_HEAVEN_COUNTRIES)
            cp_country_risk = 'High'
        else:
            # Redraw until the country is not a tax haven.
            cp_country = faker.country_code()
            while cp_country in TAX_HEAVEN_COUNTRIES:
                cp_country = faker.country_code()
            cp_country_risk = random.choice(['Low', 'Medium'])
        return cp_name, cp_account, cp_contract, cp_country, cp_country_risk

    def pipe_counterparty():
        # Pipe legs use a dedicated account prefix, any country and a fixed 'Low' risk.
        return (faker.company(),
                f"ACC-PIPE-{uuid.uuid4().hex[:12].upper()}",
                f"CON-{uuid.uuid4().hex[:8].upper()}",
                faker.country_code(),
                'Low')

    def build_record(ts, type_code, channel, amount, counterparty, in_out, description, atm_id=None):
        # Single place that defines the record layout (key order = column order).
        cp_name, cp_account, cp_contract, cp_country, cp_risk = counterparty
        return {
            'transaction_id': str(uuid.uuid4()),
            'customer_id': cust,
            'customer_name': customer_name,
            'account_id': account_id_main,
            'transaction_timestamp': ts,
            'transaction_type_code': type_code,
            'transaction_type_description': TRANSACTION_TYPE_DESCRIPTIONS[type_code],
            'channel': channel,
            'original_trx_amount': amount,
            'original_trx_currency': default_currency,
            'reference_trx_amount': amount,
            'normalized_country_amount': amount,
            'counterparty_customer_name': cp_name,
            'counterparty_account': cp_account,
            'counterparty_contract': cp_contract,
            'counterparty_country': cp_country,
            'counterparty_country_risk': cp_risk,
            'internal': False,
            'in_out': in_out,
            'atm_id': generate_placeholder_atm_id() if atm_id is None else atm_id,
            'branch_id': generate_placeholder_branch_id(),
            'transaction_description': description,
            'banker_id': generate_placeholder_banker_id(),
        }

    for period in periods:
        n_pipe = np.random.poisson(lam=avg_pipe_tx_per_month)
        n_tax = np.random.poisson(lam=avg_tax_heaven_tx_per_month)
        n_spike = np.random.poisson(lam=avg_spike_tx_per_month)
        n_many_one = np.random.poisson(lam=avg_many_to_one_per_month)
        n_one_many = np.random.poisson(lam=avg_one_to_many_per_month)

        # 1) Pipe behaviour - IN/OUT transaction pairs.
        for _ in range(n_pipe // 2):
            ts_in = random_timestamp(period, 25)
            amt = round(float(np.random.uniform(10000, 50000)), 2)
            records.append(build_record(ts_in, 'PIPE', 'internet', amt, pipe_counterparty(),
                                        'IN', 'Pipe account behaviour - IN'))

            # Outgoing leg 1-3 days later, minus a small 5% fee.
            # NOTE: for late-month IN legs the OUT leg can land in the next calendar month.
            ts_out = ts_in + pd.Timedelta(days=random.randint(1, 3),
                                          hours=random.randint(0, 12))
            # Rounded to cents like every other amount (avoids float noise such as 123.45000000000002).
            amt_out = round(amt * 0.95, 2)
            records.append(build_record(ts_out, 'PIPE', 'internet', amt_out, pipe_counterparty(),
                                        'OUT', 'Pipe account behaviour - OUT'))

        # 2) Tax haven transfers.
        for _ in range(n_tax):
            ts = random_timestamp(period, 27)
            tax_country = random.choice(TAX_HEAVEN_COUNTRIES)
            amt = round(float(np.random.uniform(50000, 200000)), 2)
            direction = random.choice(['IN', 'OUT'])
            counterparty = get_counterparty_details(is_tax_haven=True, specified_country=tax_country)
            records.append(build_record(ts, 'TAX_HEAVEN', 'wire', amt, counterparty, direction,
                                        f'Transfer to/from tax haven - {tax_country}'))

        # 3) Spike of transactions, all inside one 3-hour window.
        if n_spike > 0:
            spike_start_day = random.randint(0, 27)
            spike_start_hour = random.randint(0, 23)
            base_ts_spike = period + pd.Timedelta(days=spike_start_day, hours=spike_start_hour)

            for _ in range(n_spike):
                ts = base_ts_spike + pd.Timedelta(minutes=random.randint(0, 180))
                amt = round(float(np.random.uniform(100, 1000)*10), 0)
                channel = random.choice(['internet', 'mobile', 'ATM'])
                in_out = random.choice(['IN', 'OUT'])
                counterparty = get_counterparty_details()
                # Only ATM transactions carry a real ATM id.
                atm_id = generate_real_atm_id() if channel == 'ATM' else None
                records.append(build_record(ts, 'SPIKE', channel, amt, counterparty, in_out,
                                            'High frequency transaction - spike', atm_id=atm_id))

        # 4) Many-to-one (many counterparties sending to the account), then
        # 5) One-to-many (the account sending to many counterparties).
        fan_specs = (
            (n_many_one, 'MANY_TO_ONE', 'IN', 'Multiple sources to single account'),
            (n_one_many, 'ONE_TO_MANY', 'OUT', 'Single source to multiple destinations'),
        )
        for count, type_code, in_out, description in fan_specs:
            for _ in range(count):
                ts = random_timestamp(period, 27)
                amt = round(float(np.random.uniform(500, 5000)), 2)
                records.append(build_record(ts, type_code, 'wire', amt, get_counterparty_details(),
                                            in_out, description))

    df_tx = pd.DataFrame.from_records(records)
    if not df_tx.empty:
        df_tx.sort_values('transaction_timestamp', inplace=True)
        df_tx.reset_index(drop=True, inplace=True)
    return df_tx

def calculate_aggregated_features(transactions_df, customer_id, customer_name, period):
    """
    Compute the monthly aggregated features of one customer from raw transactions.

    Only rows of ``customer_id`` whose ``transaction_timestamp`` falls in
    ``[period, period + 1 month)`` are considered.

    Args:
        transactions_df: Transaction-level DataFrame. Must contain
            ``transaction_timestamp`` and ``customer_id``; when rows match it
            must also contain ``transaction_type_code``,
            ``counterparty_country``, ``original_trx_amount``, ``in_out``,
            ``counterparty_account`` and ``channel``.
        customer_id: Customer whose transactions are aggregated.
        customer_name: Name copied into the result.
        period: Start of the month (``pd.Timestamp``).

    Returns:
        dict: The same 13 keys whether or not the customer has transactions
        in the period, so every result maps onto the same columns. All
        feature values are 0.0 when no transaction matches.
    """
    start_date = period
    end_date = period + pd.DateOffset(months=1)
    mask = (transactions_df['transaction_timestamp'] >= start_date) & \
           (transactions_df['transaction_timestamp'] < end_date) & \
           (transactions_df['customer_id'] == customer_id)
    period_txs = transactions_df[mask]
    total = len(period_txs)

    # Defaults double as the "no transactions" result; key order defines column order.
    features = {
        'customer_id': customer_id,
        'customer_name': customer_name,
        'year_month': period,
        'year_month_str': period.strftime("%Y-%m"),
        'pipe_accnt_behv': 0.0,
        'tax_heaven_jurisd': 0.0,
        'tax_heaven_jurisd_pop': 0.0,
        'spike_of_trx': 0.0,
        'many_to_one': 0.0,
        'one_to_many': 0.0,
        'avg_tx_amount_monthly': 0.0,
        'pct_domestic_transactions': 0.0,
        'atm_withdrawal_ratio': 0.0,
    }
    if total == 0:
        return features

    # 1. Pipe account behaviour - share of PIPE transactions.
    pipe_ratio = (period_txs['transaction_type_code'] == 'PIPE').sum() / total

    # 2. Tax haven jurisdiction - total amount tagged TAX_HEAVEN or sent to/from a tax haven country.
    is_tax_haven = (period_txs['transaction_type_code'] == 'TAX_HEAVEN') | \
                   (period_txs['counterparty_country'].isin(TAX_HEAVEN_COUNTRIES))
    tax_heaven_amount = period_txs.loc[is_tax_haven, 'original_trx_amount'].sum()

    # 3. Tax haven amount scaled by a random population factor (wider range for the anomalous customer).
    pop_factor = random.uniform(0.001, 0.003) if customer_id == 'SMB_ANOM' else random.uniform(0.001, 0.002)
    tax_heaven_pop = tax_heaven_amount * pop_factor

    # 4. Spike of transactions - count the transactions in hours whose volume
    #    exceeds mean + 2 * std of the hourly counts. With a single active hour
    #    std is NaN, the comparison is False everywhere and the count is 0.
    #    'h' is the non-deprecated spelling of the hourly alias ('H' warns on pandas >= 2.2).
    hourly_counts = period_txs.groupby(period_txs['transaction_timestamp'].dt.floor('h')).size()
    spike_threshold = hourly_counts.mean() + 2 * hourly_counts.std()
    spike_count = hourly_counts[hourly_counts > spike_threshold].sum()

    # 5. Many-to-one - distinct counterparties sending money (IN).
    many_to_one_count = period_txs.loc[period_txs['in_out'] == 'IN', 'counterparty_account'].nunique()

    # 6. One-to-many - distinct counterparties receiving money (OUT).
    one_to_many_count = period_txs.loc[period_txs['in_out'] == 'OUT', 'counterparty_account'].nunique()

    # 7. Average transaction amount.
    avg_tx_amount = period_txs['original_trx_amount'].mean()

    # 8. Share of domestic transactions (US is assumed to be the local country).
    pct_domestic = (period_txs['counterparty_country'] == 'US').sum() / total

    # 9. Share of transactions made through the ATM channel.
    atm_ratio = (period_txs['channel'] == 'ATM').sum() / total

    features.update({
        'pipe_accnt_behv': float(pipe_ratio * 10),  # Scaled up so the signal is more visible
        'tax_heaven_jurisd': float(tax_heaven_amount),
        'tax_heaven_jurisd_pop': float(tax_heaven_pop),
        'spike_of_trx': float(spike_count),
        'many_to_one': float(many_to_one_count),
        'one_to_many': float(one_to_many_count),
        'avg_tx_amount_monthly': float(avg_tx_amount),
        'pct_domestic_transactions': float(pct_domestic),
        'atm_withdrawal_ratio': float(atm_ratio),
    })
    return features

def generate_smb_aggregated_dataset(n_customers=100, months=12, transactions_per_customer_per_month=50):
    """
    Generate the complete synthetic dataset: raw transactions first, then the
    monthly aggregated features derived from them.

    ``n_customers - 1`` regular customers (``SMB0000``, ``SMB0001``, ...) plus
    the anomalous customer ``SMB_ANOM`` are created, each with ``months``
    monthly periods ending at the current month.

    Args:
        n_customers: Total number of customers, including the anomalous one.
        months: Number of monthly periods to generate.
        transactions_per_customer_per_month: Transactions generated per
            regular customer and period.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(transactions_df, aggregated_df)``
        sorted by customer and then by timestamp / month respectively.
    """
    # Customers and their company names.
    normal_customers = [f"SMB{i:04d}" for i in range(n_customers - 1)]
    anomaly_customer = "SMB_ANOM"
    all_customers = normal_customers + [anomaly_customer]
    customer_names = {cust: faker.company() for cust in all_customers}

    # Monthly periods, most recent first.
    today = pd.Timestamp(datetime.today().replace(day=1, hour=0, minute=0, second=0, microsecond=0))
    periods = [today - pd.DateOffset(months=i) for i in range(months)]

    # 1. Generate all transactions.
    all_transactions = []
    for customer in normal_customers:
        for period in periods:
            all_transactions.extend(generate_normal_transactions(
                customer_id=customer,
                customer_name=customer_names[customer],
                period=period,
                n_transactions=transactions_per_customer_per_month
            ))

    # Pass the anchor month explicitly so the anomalous customer is generated for
    # exactly the same periods that are aggregated below (no second "today" lookup).
    anomaly_txs = generate_anomalous_transactions(
        customer_name=customer_names[anomaly_customer],
        months=months,
        start_date=today
    )
    all_transactions.extend(anomaly_txs.to_dict('records'))

    transactions_df = pd.DataFrame(all_transactions)

    # 2. Compute the aggregated features.
    # Split the transactions per customer once instead of re-scanning the full
    # frame for every customer/period pair.
    per_customer = {cust_id: frame for cust_id, frame in transactions_df.groupby('customer_id')}
    no_transactions = transactions_df.iloc[0:0]
    aggregated_records = [
        calculate_aggregated_features(
            transactions_df=per_customer.get(customer, no_transactions),
            customer_id=customer,
            customer_name=customer_names[customer],
            period=period
        )
        for customer in all_customers
        for period in periods
    ]
    aggregated_df = pd.DataFrame(aggregated_records)

    # Sort both datasets.
    transactions_df = transactions_df.sort_values(['customer_id', 'transaction_timestamp'])
    aggregated_df = aggregated_df.sort_values(['customer_id', 'year_month'])

    return transactions_df, aggregated_df

def generate_anomalous_kyc(name):
    """
    Build the KYC record of the anomalous customer ``SMB_ANOM``.

    The record holds randomly generated company data with an elevated risk
    profile: ``aml_risk_segment`` is drawn from 4-6 and the incorporation
    country and tax residence may be tax havens.

    Args:
        name: Business name stored on the record.

    Returns:
        pd.DataFrame: A single-row frame with the KYC attributes.
    """
    # Values are drawn in the same order as the columns they fill.
    incorporated_on = faker.date_between(start_date='-10y', end_date='-2y')
    registration = faker.bothify('SMB-#######')
    incorporation_country = random.choice(TAX_HEAVEN_COUNTRIES + ['US','GB','DE'])
    industry = random.choice(['Tech', 'Manufacturing', 'Trading', 'Services'])
    revenue = float(np.random.uniform(1e6, 5e7))
    headcount = random.randint(10, 500)
    risk_segment = random.randint(4,6)
    tax_residence = random.choice(TAX_HEAVEN_COUNTRIES + ['US','GB'])
    is_pep = random.choice([True, False])

    kyc_row = {
        'customer_id': 'SMB_ANOM',
        'business_name': name,
        'registration_number': registration,
        'incorporation_country': incorporation_country,
        'incorporation_date': incorporated_on,
        'industry': industry,
        'annual_revenue': revenue,
        'num_employees': headcount,
        'aml_risk_segment': risk_segment,
        'tax_residence': tax_residence,
        'pep': is_pep,
    }
    return pd.DataFrame([kyc_row])



# Example usage
# NOTE(review): the next cell calls the generator again with its default
# arguments and overwrites both variables, so this run looks redundant - confirm.

transactions_df, agg_df = generate_smb_aggregated_dataset(
    n_customers=10,  # Reduced for the example
    months=6,        # 6 months of history
    transactions_per_customer_per_month=30
)



In [None]:
# Generate the full dataset with the default sizes (100 customers, 12 months, 50 transactions per month).
transactions_df, agg_df = generate_smb_aggregated_dataset()

# Write the monthly aggregated features.
# NOTE(review): unlike the two datasets below, this one is written but never published - confirm this is intended.
dataset_functions.write(context, context.get_spark_session().createDataFrame(agg_df), customer_monthly_dataset().identifier)
# Reuse the company name generated for SMB_ANOM so the KYC record matches the transactions.
anom_name = agg_df.loc[agg_df.customer_id=='SMB_ANOM']['customer_name'].iloc[0]

# Write and publish the transaction-level data.
dataset_functions.write(context, context.get_spark_session().createDataFrame(transactions_df), transactions_dataset().identifier)
dataset_functions.publish(context, transactions_dataset().identifier)

# Write and publish the KYC record of the anomalous customer (the only customer row written here).
dataset_functions.write(context, context.get_spark_session().createDataFrame(generate_anomalous_kyc(name=anom_name)), customers_dataset().identifier)
dataset_functions.publish(context, customers_dataset().identifier)

In [None]:
# Close the context created in the first cell once all datasets are written.
context.close()