In [0]:
# Databricks Notebook
# Simulate core banking transactions (1000–2000 ledger entries per minute)

import uuid
import random
from datetime import datetime, date, timedelta
import decimal
import time
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, BooleanType, DateType, TimestampType, DoubleType

# === Configuration ===
TARGET_ENTRIES_PER_MINUTE = 100000
BATCH_INTERVAL_SECONDS = 60
TOTAL_MINUTES_TO_RUN = 10  # simulate for 10 minutes

# Sample Master Data Parameters
NUM_CUSTOMER_ACCOUNTS = 500
NUM_ATMS = 100
CUSTOMER_ID_START = 10000000
CUSTOMER_ACCOUNT_ID_START = 100000000
ATM_CASH_ACCOUNT_ID_START = 200000000

# === Master Data Containers ===
CUSTOMER_ACCOUNTS = []  # (customer_id, account_id, gl_code, currency)
ATM_DETAILS = []         # (atm_id, branch_id, atm_cash_account_id)

# === GL Codes and Constants ===
GL_CUSTOMER_SAVINGS = '201001'
GL_CASH_IN_ATM = '100102'
GL_BANK_VAULT = '100101'
GL_FEE_INCOME = '402001'

TXN_TYPE_WITHDRAWAL = '001'
TXN_TYPE_DEPOSIT = '002'
TXN_TYPE_FEE = '021'
CHANNEL_ATM = '100'
PROC_SYS_CORE = '200'
PROC_SYS_GATEWAY = '201'
TXN_STATUS_SUCCESS = '303'

# === Hourly Distribution ===
hourly_distribution_by_type = {
    "withdrawal": {hour: (40, 80) if 8 <= hour <= 20 else (10, 30) for hour in range(24)},
    "deposit": {hour: (10, 25) if 9 <= hour <= 18 else (0, 5) for hour in range(24)}
}

# === Generate Master Data ===
def generate_master_data():
    for i in range(NUM_CUSTOMER_ACCOUNTS):
        cid = CUSTOMER_ID_START + i
        accid = CUSTOMER_ACCOUNT_ID_START + i
        currency = random.choice(['BDT', 'USD', 'EUR'])
        CUSTOMER_ACCOUNTS.append((cid, accid, GL_CUSTOMER_SAVINGS, currency))

    for i in range(NUM_ATMS):
        atm_id = f"ATM{i:04d}"
        branch_id = f"B{i % 10 + 1:03d}"
        cash_acc = ATM_CASH_ACCOUNT_ID_START + i
        ATM_DETAILS.append((atm_id, branch_id, cash_acc))

# === Generate Single Transaction ===
def generate_transaction(txn_type, sim_date, hour, atm_id, atm_acc):
    entries = []
    customer = random.choice(CUSTOMER_ACCOUNTS)
    customer_id, customer_acc, gl_code, currency = customer
    txn_id = str(uuid.uuid4())
    
    # Native Python datetime, not ISO string
    ts = datetime.combine(sim_date, datetime.min.time()) + timedelta(hours=hour, minutes=random.randint(0, 59))
    now_ts = datetime.now()
    amt = float(random.choice(range(100, 2001, 100)))  # Use float, not Decimal

    if txn_type == "withdrawal":
        entries.append(Row(
            LEDGER_ENTRY_ID=f"{txn_id}-1",
            TRANSACTION_ID=txn_id,
            ENTRY_SEQUENCE_NO=1,
            TRANSACTION_TIMESTAMP=ts,
            PROCESSING_TIMESTAMP=now_ts,
            VALUE_DATE=ts.date(),
            ACCOUNT_ID=str(customer_acc),
            GL_ACCOUNT_CODE=gl_code,
            AMOUNT=amt,
            CURRENCY_CODE=currency,
            ENTRY_TYPE="DR",
            EQUIVALENT_BASE_AMOUNT=amt,
            FX_RATE=1.0,
            TRANSACTION_TYPE_CODE=TXN_TYPE_WITHDRAWAL,
            CHANNEL_CODE=CHANNEL_ATM,
            PROCESSING_SYSTEM_CODE=PROC_SYS_CORE,
            TRANSACTION_STATUS_CODE=TXN_STATUS_SUCCESS,
            ENTRY_DESCRIPTION=f"Withdrawal by {customer_id}",
            BATCH_ID=None,
            CORRELATION_ID=None,
            IS_REVERSAL_ENTRY=False,
            REVERSED_LEDGER_ENTRY_ID=None,
            RELATED_ENTITY_TYPE="CUSTOMER",
            RELATED_ENTITY_ID=str(customer_id),
            AUDIT_USER_ID=None,
            AUDIT_CLIENT_IP=None,
            AUDIT_HASH=None,
            CREATED_DATE=now_ts,
            LAST_UPDATED_DATE=now_ts
        ))

        entries.append(Row(
            LEDGER_ENTRY_ID=f"{txn_id}-2",
            TRANSACTION_ID=txn_id,
            ENTRY_SEQUENCE_NO=2,
            TRANSACTION_TIMESTAMP=ts,
            PROCESSING_TIMESTAMP=now_ts,
            VALUE_DATE=ts.date(),
            ACCOUNT_ID=str(atm_acc),
            GL_ACCOUNT_CODE=GL_CASH_IN_ATM,
            AMOUNT=amt,
            CURRENCY_CODE=currency,
            ENTRY_TYPE="CR",
            EQUIVALENT_BASE_AMOUNT=amt,
            FX_RATE=1.0,
            TRANSACTION_TYPE_CODE=TXN_TYPE_WITHDRAWAL,
            CHANNEL_CODE=CHANNEL_ATM,
            PROCESSING_SYSTEM_CODE=PROC_SYS_GATEWAY,
            TRANSACTION_STATUS_CODE=TXN_STATUS_SUCCESS,
            ENTRY_DESCRIPTION=f"ATM Dispense {atm_id}",
            BATCH_ID=None,
            CORRELATION_ID=None,
            IS_REVERSAL_ENTRY=False,
            REVERSED_LEDGER_ENTRY_ID=None,
            RELATED_ENTITY_TYPE="ATM",
            RELATED_ENTITY_ID=atm_id,
            AUDIT_USER_ID=None,
            AUDIT_CLIENT_IP=None,
            AUDIT_HASH=None,
            CREATED_DATE=now_ts,
            LAST_UPDATED_DATE=now_ts
        ))
    return entries

# Define the schema for the DataFrame
schema = StructType([
    StructField("LEDGER_ENTRY_ID", StringType(), True),
    StructField("TRANSACTION_ID", StringType(), True),
    StructField("ENTRY_SEQUENCE_NO", IntegerType(), True),
    StructField("TRANSACTION_TIMESTAMP", TimestampType(), True),
    StructField("PROCESSING_TIMESTAMP", TimestampType(), True),
    StructField("VALUE_DATE", DateType(), True),
    StructField("ACCOUNT_ID", StringType(), True),
    StructField("GL_ACCOUNT_CODE", StringType(), True),
    StructField("AMOUNT", DoubleType(), True),
    StructField("CURRENCY_CODE", StringType(), True),
    StructField("ENTRY_TYPE", StringType(), True),
    StructField("EQUIVALENT_BASE_AMOUNT", DoubleType(), True),
    StructField("FX_RATE", DoubleType(), True),
    StructField("TRANSACTION_TYPE_CODE", StringType(), True),
    StructField("CHANNEL_CODE", StringType(), True),
    StructField("PROCESSING_SYSTEM_CODE", StringType(), True),
    StructField("TRANSACTION_STATUS_CODE", StringType(), True),
    StructField("ENTRY_DESCRIPTION", StringType(), True),
    StructField("BATCH_ID", StringType(), True),
    StructField("CORRELATION_ID", StringType(), True),
    StructField("IS_REVERSAL_ENTRY", BooleanType(), True),
    StructField("REVERSED_LEDGER_ENTRY_ID", StringType(), True),
    StructField("RELATED_ENTITY_TYPE", StringType(), True),
    StructField("RELATED_ENTITY_ID", StringType(), True),
    StructField("AUDIT_USER_ID", StringType(), True),
    StructField("AUDIT_CLIENT_IP", StringType(), True),
    StructField("AUDIT_HASH", StringType(), True),
    StructField("CREATED_DATE", TimestampType(), True),
    StructField("LAST_UPDATED_DATE", TimestampType(), True)
])

# === Simulation Loop ===
def simulate_stream():
    generate_master_data()
    sim_date = date.today() - timedelta(days=1)

    for minute in range(TOTAL_MINUTES_TO_RUN):
        start = time.time()
        batch = []

        while len(batch) < TARGET_ENTRIES_PER_MINUTE:
            for _ in range(10):  # Generate in small chunks
                txn_type = random.choice(["withdrawal", "deposit"])
                atm = random.choice(ATM_DETAILS)
                batch.extend(generate_transaction(
                    txn_type,
                    sim_date,
                    random.randint(0, 23),
                    atm[0],
                    atm[2]
                ))
        try:
            df = spark.createDataFrame(batch, schema=schema)
        except Exception as e:
            print("Schema Mismatch Error:", e)
            print("Sample Row:", batch[0].asDict())
            raise
        df.write.format("delta").mode("append").save("/Volumes/bank_cbs/00_landing/financial_ledger")

        elapsed = time.time() - start
        print(f"Minute {minute+1}: Inserted {len(batch)} rows in {elapsed:.2f}s")
        # Simulate processing time
        time.sleep(max(0, BATCH_INTERVAL_SECONDS - (time.time() - start)))

# Run the simulation
simulate_stream()

Minute 1: Inserted 2004 rows in 25.26s
Minute 2: Inserted 2000 rows in 2.22s
Minute 3: Inserted 2004 rows in 2.03s
Minute 4: Inserted 2002 rows in 1.88s
Minute 5: Inserted 2002 rows in 1.93s
Minute 6: Inserted 2002 rows in 1.87s
Minute 7: Inserted 2008 rows in 2.20s
Minute 8: Inserted 2002 rows in 1.86s
Minute 9: Inserted 2008 rows in 1.77s
Minute 10: Inserted 2002 rows in 1.72s
