<a href="https://colab.research.google.com/github/Melaniemweru/NLP-warning-system/blob/main/notebooks/AML_dataset_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Connect to the GitHub repo in Colab

In [9]:
import os

REPO_URL = "https://github.com/Melaniemweru/NLP-warning-system.git"
REPO_NAME = "NLP-warning-system"

# Clone only if the folder does NOT exist
if not os.path.exists(REPO_NAME):
    !git clone {REPO_URL}

# Move into the repo
%cd /content/{REPO_NAME}

# Quick check of folders
!ls


remote: Enumerating objects: 52, done.[K
remote: Counting objects:   1% (1/52)[Kremote: Counting objects:   3% (2/52)[Kremote: Counting objects:   5% (3/52)[Kremote: Counting objects:   7% (4/52)[Kremote: Counting objects:   9% (5/52)[Kremote: Counting objects:  11% (6/52)[Kremote: Counting objects:  13% (7/52)[Kremote: Counting objects:  15% (8/52)[Kremote: Counting objects:  17% (9/52)[Kremote: Counting objects:  19% (10/52)[Kremote: Counting objects:  21% (11/52)[Kremote: Counting objects:  23% (12/52)[Kremote: Counting objects:  25% (13/52)[Kremote: Counting objects:  26% (14/52)[Kremote: Counting objects:  28% (15/52)[Kremote: Counting objects:  30% (16/52)[Kremote: Counting objects:  32% (17/52)[Kremote: Counting objects:  34% (18/52)[Kremote: Counting objects:  36% (19/52)[Kremote: Counting objects:  38% (20/52)[Kremote: Counting objects:  40% (21/52)[Kremote: Counting objects:  42% (22/52)[Kremote: Counting objects:  44% (23/52)[Kr

Make sure the data folders exist

In [10]:
import os

os.makedirs("data/raw/transactions", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)

print("✅ Folders ready:")
!ls data
!ls data/raw


✅ Folders ready:
annotations  processed	raw
regulations  transactions


In [11]:
import os
# Previously generated artefacts that are removed before regenerating the data.
paths = [
    "data/raw/transactions/synthetic_transactions.csv",
    "data/processed/X_train.csv",
    "data/processed/X_val.csv",
    "data/processed/X_test.csv",
    "data/processed/y_train.csv",
    "data/processed/y_val.csv",
    "data/processed/y_test.csv"
]

for stale_file in paths:
    # Report missing files instead of failing, so the cell can be re-run.
    if not os.path.exists(stale_file):
        print("Not found:", stale_file)
        continue
    os.remove(stale_file)
    print("Deleted:", stale_file)


Deleted: data/raw/transactions/synthetic_transactions.csv
Deleted: data/processed/X_train.csv
Deleted: data/processed/X_val.csv
Deleted: data/processed/X_test.csv
Deleted: data/processed/y_train.csv
Deleted: data/processed/y_val.csv
Deleted: data/processed/y_test.csv


Set up imports, random seeds, and output folders for a more realistic dataset

In [12]:
# ===============================
# CHUNK 1 — Imports & Setup
# ===============================
import pandas as pd
import numpy as np
import random
import datetime
import uuid
import os

RANDOM_SEED = 42

# Seed both generators used below: the stdlib `random` module and NumPy's
# legacy global RNG (in that order).
for seed_rng in (random.seed, np.random.seed):
    seed_rng(RANDOM_SEED)

# Create folders
for folder in ("data/raw/transactions", "data/processed"):
    os.makedirs(folder, exist_ok=True)


STEP 4 – Helper Functions

In [13]:
# ===============================
# CHUNK 2 — Helper Functions
# ===============================

def random_date(start=(2022,1,1), end=(2024,12,31)):
    """Return a random calendar date in the inclusive range [start, end].

    Args:
        start: (year, month, day) tuple for the earliest possible date.
        end: (year, month, day) tuple for the latest possible date.

    Returns:
        A ``datetime.date`` drawn with the stdlib ``random`` module.
    """
    first_day = datetime.date(*start)
    span_days = (datetime.date(*end) - first_day).days
    # randint is inclusive on both ends, so `end` itself can be returned.
    offset = random.randint(0, span_days)
    return first_day + datetime.timedelta(days=offset)

def choose_amount(label):
    """Draw a transaction amount from a label-dependent log-normal distribution.

    "Non-Compliant" uses (mean=14.5, sigma=0.9); any other label uses
    (mean=13.3, sigma=0.7). The draw is clipped to [2e4, 2e8] and returned
    as a plain Python float.
    """
    mu, spread = (14.5, 0.9) if label == "Non-Compliant" else (13.3, 0.7)
    draw = np.random.lognormal(mean=mu, sigma=spread)
    return float(np.clip(draw, 2e4, 2e8))

def add_noise(text):
    """Randomly inject misspellings into ``text`` and return the result.

    Each rule fires independently with its own probability and rewrites every
    exact (case-sensitive) occurrence of the word.
    """
    # (probability, correct spelling, misspelling). One RNG draw is made per
    # rule, in this order, whether or not the word is present.
    typo_rules = (
        (0.15, "customer", "custmer"),
        (0.15, "transfer", "tranfer"),
        (0.10, "account", "acount"),
    )
    for chance, word, typo in typo_rules:
        if random.random() < chance:
            text = text.replace(word, typo)
    return text


STEP 5 – Templates

In [14]:
# ===============================
# CHUNK 3 — Templates
# ===============================
# Categorical pools sampled per row by the dataset-building cell.
BRANCHES = ["Nairobi_Main", "Westlands", "Kisumu", "Mombasa", "Eldoret"]
CLIENT_TYPES = ["Retail", "SME", "Corporate"]
# NOTE(review): not referenced by the visible cells — the build loop writes the
# literal "KES" for the currency column. Confirm whether this is meant to be used.
CURRENCIES = ["KES"]

# Location pools: FOREIGN_LOCS fills the {foreign} placeholder of the
# non-compliant templates, KENYAN_LOCS fills {loc} in the compliant ones.
FOREIGN_LOCS = ["Dubai", "Hong Kong", "Mauritius", "Cyprus", "China", "UK"]
KENYAN_LOCS  = ["Nairobi", "Mombasa", "Kisumu", "Nakuru", "Thika"]

# Compliant templates — str.format patterns using {src}, {loc} and {doc}.
# Not every template uses every placeholder; unused keyword arguments are
# simply ignored by str.format.
COMPL_TEMPLATES = [
    "Transaction for {src} processed at {loc}; {doc}.",
    "Customer activity aligned with historical patterns; {doc}.",
    "Routine payment for {src} at {loc}; {doc}.",
    "Processed salary or bill payment at {loc}; {doc}.",
]

# Values substituted for {src} (payment purpose) and {doc} (check performed).
COMPL_SRC = ["salary", "school fees", "rent", "utilities", "loan repayment"]
COMPL_DOC = ["ID verified", "invoice checked", "statements reviewed"]

# Non-compliant templates — str.format patterns using {foreign}, {reason}
# and {action}; again, some templates omit {foreign}.
NONCOMPL_TEMPLATES = [
    "High-value transfer to {foreign}; {reason}; {action}.",
    "Irregular pattern detected; {reason}; {action}.",
    "Large foreign remittance to {foreign}; {reason}; {action}.",
    "Unusual outgoing activity; {reason}; {action}.",
]

# Values substituted for {reason}.
NONCOMPL_REASON = [
    "no valid KYC",
    "structuring suspected",
    "profile inconsistent with transactions",
    "third-party deposits observed",
]

# Values substituted for {action}.
NONCOMPL_ACTION = [
    "STR recommended",
    "EDD triggered",
    "escalate to compliance team",
]


Narrative Generators

In [15]:
# ===============================
# CHUNK 4 — Narrative Generators
# ===============================

def make_compliant():
    """Build one compliant narrative sentence from a randomly chosen template.

    Returns:
        The template filled with a random payment source, Kenyan location
        and documentation note.
    """
    pattern = random.choice(COMPL_TEMPLATES)
    # Draw order (src, loc, doc) is fixed so a seeded run consumes the RNG
    # identically every time.
    fields = {
        "src": random.choice(COMPL_SRC),
        "loc": random.choice(KENYAN_LOCS),
        "doc": random.choice(COMPL_DOC),
    }
    return pattern.format(**fields)

def make_noncompliant():
    """Build one non-compliant narrative sentence from a randomly chosen template.

    Returns:
        The template filled with a random foreign location, red-flag reason
        and follow-up action.
    """
    pattern = random.choice(NONCOMPL_TEMPLATES)
    # Draw order (foreign, reason, action) is fixed so a seeded run consumes
    # the RNG identically every time.
    fields = {
        "foreign": random.choice(FOREIGN_LOCS),
        "reason": random.choice(NONCOMPL_REASON),
        "action": random.choice(NONCOMPL_ACTION),
    }
    return pattern.format(**fields)

def finalize_narrative(sentence, amount):
    """Decorate a base narrative and return the final text.

    Steps, applied in order:
      1. With probability 0.4, append an approximate KES amount.
      2. With probability 0.25, reverse the order of the words.
      3. Inject random misspellings via ``add_noise``.
      4. Append a ``#`` tag of six random hex characters.

    Args:
        sentence: Base narrative text.
        amount: Transaction amount, formatted with thousands separators and
            no decimals when it is appended.

    Returns:
        The decorated narrative string.
    """
    # Optional amount
    if random.random() < 0.4:
        sentence += f" Approx amount KES {amount:,.0f}."

    # Optional word-order reversal (whole words are reordered, not characters)
    if random.random() < 0.25:
        sentence = " ".join(sentence.split()[::-1])

    sentence = add_noise(sentence)

    # Six-hex-character tag. It is drawn from the seeded `random` module
    # rather than uuid.uuid4(), which ignores random.seed() and therefore made
    # the dataset differ on every run. 24 bits rendered as 6 hex digits gives
    # the same tag width as uuid4().hex[:6].
    sentence += f" #{random.getrandbits(24):06x}"

    return sentence


In [16]:
# ===============================
# CHUNK 5 — Build Dataset
# ===============================

N = 5000                    # number of synthetic transactions to generate
NONCOMPLIANT_RATIO = 0.35   # probability that a row is labelled Non-Compliant
rows = []

for idx in range(N):
    # The RNG draws below happen in a fixed order: label, amount, narrative,
    # then the categorical fields in the order the dict lists them.
    is_flagged = random.random() < NONCOMPLIANT_RATIO
    label = "Non-Compliant" if is_flagged else "Compliant"
    amount = choose_amount(label)

    make_base = make_noncompliant if is_flagged else make_compliant
    narrative = finalize_narrative(make_base(), amount)

    record = {
        "transaction_id": f"TXN{idx:06d}",
        "branch": random.choice(BRANCHES),
        "client_id": f"CL{random.randint(100,999)}",
        "client_type": random.choice(CLIENT_TYPES),
        "transaction_date": random_date().isoformat(),
        "amount": amount,
        "currency": "KES",
        "narrative": narrative,
        "label": label
    }
    rows.append(record)

# Build the frame once from the collected records.
df = pd.DataFrame(rows)


In [17]:
# ===============================
# CHUNK 6 — Save + Summary
# ===============================
# Quick sanity summary before writing to disk.
row_count = len(df)
unique_narratives = df["narrative"].nunique()
print("Total rows:", row_count)
print("Unique narratives:", unique_narratives)

# NOTE(review): the cleanup cell earlier deletes synthetic_transactions.csv,
# while this writes synthetic_transactions_clean.csv — confirm which filename
# downstream code reads.
df.to_csv(
    "data/raw/transactions/synthetic_transactions_clean.csv",
    index=False,
)
print("✅ Saved clean synthetic dataset.")


Total rows: 5000
Unique narratives: 5000
✅ Saved clean synthetic dataset.


In [None]:
# Optional clean re-clone of the repository.
# NOTE(review): every path here is relative to the current working directory.
# Run from the directory that contains the checkout, `rm -rf` deletes the whole
# repo, including any files generated under data/ that have not been pushed.
# Run from inside the checkout (where the first cell's %cd leaves the kernel),
# it instead creates a nested clone. Confirm the intended working directory
# before executing.
!rm -rf NLP-warning-system   # optional: only if you want a clean clone
!git clone https://github.com/Melaniemweru/NLP-warning-system.git
%cd NLP-warning-system
!ls


remote: Enumerating objects: 43, done.[K
remote: Counting objects:   2% (1/43)[Kremote: Counting objects:   4% (2/43)[Kremote: Counting objects:   6% (3/43)[Kremote: Counting objects:   9% (4/43)[Kremote: Counting objects:  11% (5/43)[Kremote: Counting objects:  13% (6/43)[Kremote: Counting objects:  16% (7/43)[Kremote: Counting objects:  18% (8/43)[Kremote: Counting objects:  20% (9/43)[Kremote: Counting objects:  23% (10/43)[Kremote: Counting objects:  25% (11/43)[Kremote: Counting objects:  27% (12/43)[Kremote: Counting objects:  30% (13/43)[Kremote: Counting objects:  32% (14/43)[Kremote: Counting objects:  34% (15/43)[Kremote: Counting objects:  37% (16/43)[Kremote: Counting objects:  39% (17/43)[Kremote: Counting objects:  41% (18/43)[Kremote: Counting objects:  44% (19/43)[Kremote: Counting objects:  46% (20/43)[Kremote: Counting objects:  48% (21/43)[Kremote: Counting objects:  51% (22/43)[Kremote: Counting objects:  53% (23/43)[Kr