In [1]:
# 📦 Dependencies
import random
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path

from faker import Faker
from tqdm import tqdm

In [2]:
# 🎲 Faker setup
# A single seed value drives both random sources used by this notebook:
# the stdlib `random` module and Faker's generator.
SEED = 42
fake = Faker()
random.seed(SEED)
Faker.seed(SEED)

In [3]:
# 📂 Create SQLite DB
db_path = "../data/aml_simulation.db"

# sqlite3.connect() creates the database file itself but not a missing parent
# folder, so make sure the output directory exists before connecting.
Path(db_path).parent.mkdir(parents=True, exist_ok=True)

# `conn` and `cursor` are shared by every following cell.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [4]:
# 🧹 Drop existing tables
# Remove leftovers from a previous run so the CREATE TABLE statements in the
# schema cell do not fail on tables that already exist.
tables = ["customers", "accounts", "transactions", "watchlist_entities", "flagged_txns"]
for table_name in tables:
    drop_statement = "DROP TABLE IF EXISTS " + table_name
    cursor.execute(drop_statement)

In [5]:
# 📐 Schema creation
# Creates the five tables of the simulation in one script:
#   customers          - one row per customer (id, name, dob, country, risk_category)
#   accounts           - accounts, each carrying the customer_id of its owner
#   transactions       - transactions, each carrying the account_id it belongs to
#   watchlist_entities - named entities with a country
#   flagged_txns       - rule hits that reference a txn_id
# The *_id reference columns are plain INTEGERs: no FOREIGN KEY constraints are
# declared here. The statements use plain CREATE TABLE (no IF NOT EXISTS), so
# this cell fails if any of these tables already exists.
cursor.executescript("""
CREATE TABLE customers (
    customer_id INTEGER PRIMARY KEY,
    name TEXT,
    dob TEXT,
    country TEXT,
    risk_category TEXT
);

CREATE TABLE accounts (
    account_id INTEGER PRIMARY KEY,
    customer_id INTEGER,
    account_type TEXT,
    open_date TEXT
);

CREATE TABLE transactions (
    txn_id INTEGER PRIMARY KEY,
    account_id INTEGER,
    timestamp TEXT,
    amount REAL,
    currency TEXT,
    origin_country TEXT,
    dest_country TEXT,
    channel TEXT,
    counterparty TEXT
);

CREATE TABLE watchlist_entities (
    entity_id INTEGER PRIMARY KEY,
    name TEXT,
    country TEXT
);

CREATE TABLE flagged_txns (
    flagged_id INTEGER PRIMARY KEY,
    txn_id INTEGER,
    rule_triggered TEXT,
    reason TEXT
);
""")
conn.commit()

In [6]:
# 👤 Generate 10,000 customers
countries = ['US', 'DE', 'GB', 'RU', 'CN', 'IR', 'PL', 'FR', 'BR', 'NG']
risk_weights = ['Low', 'Medium', 'High']  # risk category labels

# One tuple per customer, in the column order of the `customers` table:
# (customer_id, name, dob, country, risk_category).
# Country is drawn uniformly; the risk category is drawn with weights
# 0.7 / 0.25 / 0.05 for Low / Medium / High.
customer_rows = [
    (
        customer_id,
        fake.name(),
        fake.date_of_birth(minimum_age=18, maximum_age=85).isoformat(),
        random.choice(countries),
        random.choices(risk_weights, weights=[0.7, 0.25, 0.05])[0],
    )
    for customer_id in range(1, 10001)
]

cursor.executemany("INSERT INTO customers VALUES (?, ?, ?, ?, ?)", customer_rows)
conn.commit()

In [7]:
# Generate accounts
# Every customer id 1..10000 receives between one and three accounts.
# Account ids are assigned sequentially starting at 1.
account_types = ['Checking', 'Savings', 'Investment']
account_rows = []
account_id = 1

for cust_id in tqdm(range(1, 10001)):
    accounts_for_customer = random.randint(1, 3)
    for _ in range(accounts_for_customer):
        chosen_type = random.choice(account_types)
        # Open date is relative to the day the notebook runs
        # (between ten years ago and yesterday).
        opened_on = fake.date_between(start_date='-10y', end_date='-1d')
        account_rows.append((account_id, cust_id, chosen_type, opened_on.isoformat()))
        account_id += 1

cursor.executemany("INSERT INTO accounts VALUES (?, ?, ?, ?)", account_rows)
conn.commit()

100%|██████████| 10000/10000 [00:00<00:00, 15392.92it/s]


In [8]:
# Generate transactions (50-200 per account)
#
# For every account a target count of 50-200 transactions is drawn. Each
# transaction samples a calendar month of the simulation window (later months
# are slightly more likely) and then a uniformly random second inside that
# month, never earlier than the account's open date.
#
# Caveats that follow from the sampling scheme:
#   * A draw whose month ends before the account was opened is dropped, so
#     accounts opened late in the window end up with fewer rows than drawn.
#   * Timestamps can fall anywhere in the final month, i.e. up to
#     2025-06-30 23:59:59.

# Define allowed date range
start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 6, 30)

txn_id = 1
channels = ['Online', 'ATM', 'Branch', 'Mobile']
currencies = ['USD', 'EUR', 'GBP', 'PLN']
txn_batch = []
insert_txn_sql = "INSERT INTO transactions VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"

# Country distribution: high-risk countries get explicit, limited shares and
# the remaining 0.75 is split evenly across the other countries.
high_risk_countries = ['IR', 'RU', 'CN']
low_risk_countries = [c for c in countries if c not in high_risk_countries]

country_weights = {
    'IR': 0.05,
    'RU': 0.10,
    'CN': 0.10
}
low_risk_weight_each = 0.75 / len(low_risk_countries)
for c in low_risk_countries:
    country_weights[c] = low_risk_weight_each

country_choices = list(country_weights.keys())
country_probabilities = list(country_weights.values())

# Month sampling tables. They depend only on start_date/end_date, so they are
# built once here instead of once per generated transaction.
total_months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month + 1)
month_indices = list(range(total_months))
# Linear weights rising from 1 towards 2 give later months a mild growth bias.
month_weights = [1 + (i / total_months) for i in month_indices]

# month_bounds[k] = (first instant, last second) of the k-th month of the window.
month_bounds = []
for month_idx in month_indices:
    months_since_january = start_date.month - 1 + month_idx
    first_of_month = datetime(
        start_date.year + months_since_january // 12,
        months_since_january % 12 + 1,
        1,
    )
    next_month = datetime(
        start_date.year + (months_since_january + 1) // 12,
        (months_since_january + 1) % 12 + 1,
        1,
    )
    month_bounds.append((first_of_month, next_month - timedelta(seconds=1)))

for acc in tqdm(account_rows):
    acc_id, _, _, open_date = acc
    # Drawn before the eligibility check so the random stream is consumed the
    # same way for every account, eligible or not.
    num_txns = random.randint(50, 200)
    open_dt = datetime.fromisoformat(open_date)

    # Transaction date cannot be before account open date or before start_date
    min_date = max(open_dt, start_date)
    if (end_date - min_date).days < 1:
        # Account opened too late to have any valid transaction date: skip it
        # once instead of looping num_txns times doing nothing.
        continue

    for _ in range(num_txns):
        chosen_month_idx = random.choices(month_indices, weights=month_weights)[0]
        first_of_month, max_txn_dt = month_bounds[chosen_month_idx]

        # Earliest valid instant in the chosen month for this account
        min_txn_dt = max(first_of_month, min_date)
        if min_txn_dt > max_txn_dt:
            # Chosen month ends before the account existed: drop this draw
            continue

        delta_seconds = int((max_txn_dt - min_txn_dt).total_seconds())
        random_second = random.randint(0, delta_seconds)
        txn_dt = min_txn_dt + timedelta(seconds=random_second)

        origin_country = random.choices(country_choices, weights=country_probabilities)[0]
        destination_country = random.choices(country_choices, weights=country_probabilities)[0]

        txn_batch.append((
            txn_id,
            acc_id,
            txn_dt.isoformat(),
            round(random.uniform(10, 10000), 2),
            random.choice(currencies),
            origin_country,
            destination_country,
            random.choice(channels),
            fake.company()
        ))
        txn_id += 1

        # Flush in large batches to keep memory bounded
        if len(txn_batch) > 100000:
            cursor.executemany(insert_txn_sql, txn_batch)
            txn_batch = []

# Final flush
if txn_batch:
    cursor.executemany(insert_txn_sql, txn_batch)
conn.commit()


100%|██████████| 20054/20054 [06:10<00:00, 54.12it/s]


In [9]:
# Generate watchlist entities
# Twenty rows of (entity_id, company name, country), with ids 1..20 and the
# country drawn uniformly from `countries`.
watchlist = []
for entity_id in range(1, 21):
    entity_name = fake.company()
    entity_country = random.choice(countries)
    watchlist.append((entity_id, entity_name, entity_country))

cursor.executemany("INSERT INTO watchlist_entities VALUES (?, ?, ?)", watchlist)
conn.commit()

In [10]:
# Close the SQLite connection and print the path of the database file.
conn.close()
print(f"✅ Database created: {db_path}")

✅ Database created: ../data/aml_simulation.db
