# Synthetic Remittance Data Generator
This notebook creates a demo dataset of **remittance transactions** that can be used to test AML rules, dashboards, or data pipelines.  

* A single table (`transactions`) with the essential columns:
  - `transaction_id`, `customer_id`, `counterparty_id`
  - `transaction_timestamp`, `transaction_type_code`, `channel`, `amount`, `currency`, `origin_country_code`, `destination_country_code`, and direction (`in_out`)
* Includes **one anomalous customer** (`ANOM001`) who shows higher‑risk behaviour (large amounts and high‑risk corridors).
* You can tweak the parameters `n_customers`, `months`, etc. to control the volume and shape of the data.

In [None]:

import pandas as pd
import numpy as np
import random, uuid
from datetime import datetime
from faker import Faker

# Shared Faker instance; the generator uses it for random country codes.
faker = Faker()

# Seed every seedable random source so reruns draw the same counts, amounts,
# timestamps and country codes. Faker keeps its own random generator, so
# seeding `random` and `numpy` alone leaves its output unseeded.
random.seed(42)
np.random.seed(42)
Faker.seed(42)

# ISO 4217 examples
ISO_4217 = ["USD", "EUR", "GBP", "JPY", "CAD", "MXN", "AUD"]

# Channels used by the remittance business
CHANNELS = ["agent", "branch", "mobile_app", "web"]

# FATF / OFAC style high‑risk country codes
HIGH_RISK_COUNTRIES = ["IR", "KP", "SY", "SD", "VE", "SO", "YE", "CU", "MM", "CF"]


In [None]:

def generate_remittance_dataset(
    n_customers: int = 100,
    months: int = 12,
    start_date: datetime | None = None,
    pct_counterparty_in_bank: float = 0.25,
) -> pd.DataFrame:
    """Create synthetic remittance transactions.

    Parameters
    ----------
    n_customers : int
        Total number of bank customers *including* the anomalous one.
    months : int
        How many months backward (from `start_date` or today) to simulate.
    start_date : datetime | None
        Reference month (defaults to first day of current month).
    pct_counterparty_in_bank : float
        Probability that the counter‑party is also a bank customer.

    Returns
    -------
    pandas.DataFrame
        One row per transaction, sorted chronologically.
    """

    if start_date is None:
        # first day of current month
        start_date = datetime.today().replace(day=1, hour=0, minute=0, second=0, microsecond=0)

    # build customer universe (last id reserved for anomalous customer)
    customers = [f"CUST{i:04d}" for i in range(n_customers - 1)] + ["ANOM001"]

    # pre‑compute monthly anchor dates (first of month going backwards)
    periods = [pd.Timestamp(start_date) - pd.DateOffset(months=m) for m in range(months)]

    records: list[dict] = []

    for period in periods:
        for cust in customers:
            # number of transactions the customer performs this month
            lam = 5 if cust != "ANOM001" else 25  # anomalous customer is more active
            n_tx = np.random.poisson(lam=lam)

            for _ in range(n_tx):
                # timestamp anywhere within the month
                ts = period + pd.Timedelta(
                    days=random.randint(0, 27),
                    hours=random.randint(0, 23),
                    minutes=random.randint(0, 59),
                    seconds=random.randint(0, 59),
                )

                # decide if counterparty is also a customer
                if random.random() < pct_counterparty_in_bank:
                    counterparty_id = random.choice(customers)
                else:
                    counterparty_id = f"EXT-{uuid.uuid4().hex[:6].upper()}"

                # base amount & channel
                amount = round(np.random.exponential(scale=400), 2)
                channel = random.choice(CHANNELS)
                currency = random.choice(ISO_4217)

                # anomalies: larger amounts + high‑risk corridors
                origin_country = faker.country_code()
                dest_country = faker.country_code()

                if cust == "ANOM001":
                    amount = round(random.uniform(8000, 15000), 2)
                    origin_country = random.choice(HIGH_RISK_COUNTRIES)

                records.append(
                    {
                        "transaction_id": str(uuid.uuid4()),
                        "customer_id": cust,
                        "counterparty_id": counterparty_id,
                        "transaction_timestamp": ts,
                        "transaction_type_code": "REMIT",
                        "channel": channel,
                        "amount": amount,
                        "currency": currency,
                        "origin_country_code": origin_country,
                        "destination_country_code": dest_country,
                        "in_out": random.choice(["IN", "OUT"]),
                    }
                )

    df = pd.DataFrame.from_records(records).sort_values("transaction_timestamp").reset_index(drop=True)
    return df


In [None]:

# Demo run: simulate six months of activity for a 120-customer bank
# (119 regular customers plus the anomalous one), report the row count,
# and preview the earliest transactions.
df = generate_remittance_dataset(months=6, n_customers=120)
print(f"Generated {df.shape[0]:,} transactions")
df.head(5)
