Connect to github repo in colab

In [1]:
import os

REPO_URL = "https://github.com/Melaniemweru/NLP-warning-system.git"
REPO_NAME = "NLP-warning-system"

# Clone only if the folder does NOT exist
if not os.path.exists(REPO_NAME):
    !git clone {REPO_URL}

# Move into the repo
%cd /content/{REPO_NAME}

# Quick check of folders
!ls


remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 39 (delta 11), reused 35 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (39/39), 85.56 KiB | 4.75 MiB/s, done.
Resolving deltas: 100% (11/11), done.
data  docs  notebooks  README.md  requirements.txt  src


Make sure data folder exists

In [2]:
import os

os.makedirs("data/raw/transactions", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)

print("✅ Folders ready:")
!ls data
!ls data/raw


✅ Folders ready:
annotations  processed	raw
regulations  transactions


Define controls and helper functions for a more realistic dataset

In [3]:
import pandas as pd
import numpy as np
import random, datetime

# ---------- Reproducibility ----------
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# ---------- Main controls ----------
N = 5000                # you can later increase to 10_000 or 20_000
NONCOMPLIANT_RATIO = 0.35

BRANCHES = ["Nairobi_Main", "Westlands", "Kisumu", "Mombasa", "Eldoret"]
CLIENT_TYPES = ["Retail", "SME", "Corporate"]
CURRENCIES = ["KES"]

# Some foreign locations commonly used in AML case examples (this is synthetic)
FOREIGN_LOCS = [
    "Dubai", "Hong Kong", "Mauritius", "Cyprus", "China",
    "UK", "Uganda", "Tanzania", "Somalia"
]

# Kenyan locations
KENYAN_LOCS = ["Nairobi", "Mombasa", "Kisumu", "Nakuru", "Eldoret", "Thika"]


def random_date(start=(2022, 1, 1), end=(2024, 12, 31)):
    """Sample a random date between start and end."""
    start_dt = datetime.date(*start)
    end_dt = datetime.date(*end)
    delta = (end_dt - start_dt).days
    return start_dt + datetime.timedelta(days=random.randint(0, delta))


def fmt_amount(x: float) -> float:
    """Round amount to 2 dp."""
    return float(np.round(x, 2))


def choose_amount(label: str) -> float:
    """
    Use lognormal distributions so that:
    - Non-Compliant tends to have larger / more extreme amounts
    - Compliant has more 'normal' amounts
    These parameters are chosen for synthetic realism, not from real bank data.
    """
    if label == "Non-Compliant":
        base = np.random.lognormal(mean=14.4, sigma=0.9)
    else:
        base = np.random.lognormal(mean=13.4, sigma=0.7)

    # clip to a reasonable KES range for this project
    amt = np.clip(base, 2e4, 2e8)
    return fmt_amount(amt)


STEP 4 – Define narrative building blocks (more variety, Kenyan context)

In [4]:
# Compliant fragments (ordinary, documented payments)
COMPLIANT_SOURCES = [
    "salary from employer",
    "school fees payment",
    "loan repayment",
    "rent payment",
    "local supplier invoice",
    "utilities bill",
    "sacco contribution"
]

COMPLIANT_DOCS = [
    "payslips on file",
    "invoice and delivery note on record",
    "employment letter and bank statement reviewed",
    "KRA PIN and tax returns verified",
    "loan agreement signed and updated",
    "school admission letter and fee structure reviewed"
]

# Non-compliant / suspicious reasons (still synthetic)
NONCOMPLIANT_REASONS = [
    "no updated KYC or source-of-funds documents",
    "customer declined to provide proof of income",
    "transactions inconsistent with stated business profile",
    "structuring suspected; several deposits just below reporting threshold",
    "dormant account suddenly active with large inward remittances",
    "third-party cash deposits followed by immediate outward transfers"
]

NONCOMPLIANT_ACTIONS = [
    "STR recommended to FRC",
    "relationship manager advised to file STR",
    "EDD triggered; escalate to compliance",
    "temporary account restriction recommended"
]


def small_typo_noise(text: str) -> str:
    """
    Add a bit of noise so narratives are not identical.
    This is deliberate synthetic noise, not real typos from data.
    """
    if random.random() < 0.25:
        text = text.replace(" customer", " custmer")
    if random.random() < 0.15:
        text = text.replace(" transfer", " tranfer")
    return text


def gen_compliant() -> str:
    """Generate a compliant narrative in Kenyan context."""
    src = random.choice(COMPLIANT_SOURCES)
    loc = random.choice(KENYAN_LOCS)
    doc = random.choice(COMPLIANT_DOCS)

    if random.random() < 0.5:
        template = (
            "Customer transaction consistent with historical activity; "
            "{src} processed via {loc} branch; {doc}."
        )
    else:
        template = (
            "Domestic payment for {src} through {loc} branch; "
            "{doc}; no anomalies detected."
        )

    return small_typo_noise(template.format(src=src, loc=loc, doc=doc))


def gen_noncompliant() -> str:
    """Generate a non-compliant / suspicious narrative."""
    foreign_loc = random.choice(FOREIGN_LOCS)
    reason = random.choice(NONCOMPLIANT_REASONS)
    action = random.choice(NONCOMPLIANT_ACTIONS)

    if random.random() < 0.5:
        template = (
            "High-value transfer to {foreign_loc}; {reason}. {action}."
        )
    else:
        template = (
            "Multiple transactions involving {foreign_loc}; {reason}. {action}."
        )

    return small_typo_noise(
        template.format(foreign_loc=foreign_loc, reason=reason, action=action)
    )


def maybe_insert_amount(sentence: str, amount: float) -> str:
    """
    In many narratives we explicitly mention KES amount.
    This helps the model learn amount-related patterns.
    """
    if random.random() < 0.7:
        amtk = f"KES {amount:,.0f}"
        if "transfer" in sentence:
            return sentence.replace("transfer", f"transfer of {amtk}", 1)
        if "payment" in sentence:
            return sentence.replace("payment", f"payment of {amtk}", 1)
        return sentence + f" Approximate value {amtk}."
    return sentence


STEP 5 – Generate the synthetic dataset

In [5]:
rows = []
n_non = int(N * NONCOMPLIANT_RATIO)
labels = ["Non-Compliant"] * n_non + ["Compliant"] * (N - n_non)
random.shuffle(labels)

for i, label in enumerate(labels, start=1):
    amount = choose_amount(label)
    base_sentence = gen_noncompliant() if label == "Non-Compliant" else gen_compliant()
    narrative = maybe_insert_amount(base_sentence, amount)

    rows.append({
        "transaction_id": f"TXN{i:06d}",
        "branch": random.choice(BRANCHES),
        "client_id": f"CL{random.randint(100, 999)}",
        "client_type": random.choice(CLIENT_TYPES),
        "transaction_date": random_date().isoformat(),
        "amount": amount,
        "currency": random.choice(CURRENCIES),
        "narrative": narrative,
        "label": label
    })

df = pd.DataFrame(rows)

print("Head:")
display(df.head(10))

print("\nLabel counts:")
print(df["label"].value_counts())

print("\nTotal rows:", len(df))
print("Unique narratives:", df["narrative"].nunique())


Head:


Unnamed: 0,transaction_id,branch,client_id,client_type,transaction_date,amount,currency,narrative,label
0,TXN000001,Kisumu,CL711,Retail,2023-09-09,2805360.86,KES,"High-value transfer of KES 2,805,361 to Maurit...",Non-Compliant
1,TXN000002,Westlands,CL633,Retail,2024-10-31,599118.68,KES,Customer transaction consistent with historica...,Compliant
2,TXN000003,Kisumu,CL469,SME,2023-03-24,1038597.68,KES,Customer transaction consistent with historica...,Compliant
3,TXN000004,Eldoret,CL349,Retail,2023-10-20,7065447.21,KES,"High-value transfer of KES 7,065,447 to Somali...",Non-Compliant
4,TXN000005,Kisumu,CL626,Corporate,2024-07-01,560224.37,KES,Customer transaction consistent with historica...,Compliant
5,TXN000006,Eldoret,CL646,Retail,2022-05-15,1453197.33,KES,High-value tranfer to Dubai; transactions inco...,Non-Compliant
6,TXN000007,Mombasa,CL762,Retail,2022-11-11,7431895.76,KES,Multiple transactions involving Cyprus; third-...,Non-Compliant
7,TXN000008,Nairobi_Main,CL688,Retail,2023-06-14,3579348.13,KES,"High-value transfer of KES 3,579,348 to Tanzan...",Non-Compliant
8,TXN000009,Eldoret,CL164,Corporate,2022-04-04,1175816.14,KES,"High-value transfer of KES 1,175,816 to Uganda...",Non-Compliant
9,TXN000010,Mombasa,CL490,SME,2022-03-05,2923534.94,KES,"High-value transfer of KES 2,923,535 to UK; do...",Non-Compliant



Label counts:
label
Compliant        3250
Non-Compliant    1750
Name: count, dtype: int64

Total rows: 5000
Unique narratives: 4261


STEP 6 – Save the raw CSV and create stratified train/val/test splits

In [6]:
from sklearn.model_selection import train_test_split
import os

# 6.1 Save full synthetic dataset
raw_path = "data/raw/transactions/synthetic_transactions.csv"
os.makedirs(os.path.dirname(raw_path), exist_ok=True)
df.to_csv(raw_path, index=False)
print(f"✅ Saved full synthetic dataset to {raw_path}")

# 6.2 Create stratified splits
X = df.drop(columns=["label"])
y = df["label"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.30,
    random_state=RANDOM_SEED,
    stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,
    random_state=RANDOM_SEED,
    stratify=y_temp
)

print("Shapes →",
      "Train:", X_train.shape,
      "| Val:", X_val.shape,
      "| Test:", X_test.shape)

# 6.3 Save splits to data/processed
os.makedirs("data/processed", exist_ok=True)

X_train.to_csv("data/processed/X_train.csv", index=False)
X_val.to_csv("data/processed/X_val.csv", index=False)
X_test.to_csv("data/processed/X_test.csv", index=False)

y_train.to_csv("data/processed/y_train.csv", index=False)
y_val.to_csv("data/processed/y_val.csv", index=False)
y_test.to_csv("data/processed/y_test.csv", index=False)

print("✅ Saved stratified splits in data/processed/")
!ls data/processed


✅ Saved full synthetic dataset to data/raw/transactions/synthetic_transactions.csv
Shapes → Train: (3500, 8) | Val: (750, 8) | Test: (750, 8)
✅ Saved stratified splits in data/processed/
X_test.csv  X_train.csv  X_val.csv  y_test.csv	y_train.csv  y_val.csv


In [7]:
%cd /content/NLP-warning-system

!git status

# Add the updated files
!git add data/raw/transactions/synthetic_transactions.csv
!git add data/processed/X_train.csv data/processed/X_val.csv data/processed/X_test.csv
!git add data/processed/y_train.csv data/processed/y_val.csv data/processed/y_test.csv

!git commit -m "Regenerate synthetic AML/KYC dataset with more varied Kenyan narratives"

# If your remote already works with stored credentials, this is enough:
!git push origin main

# If it fails asking for username/password, you will need to set the remote using:
# !git remote set-url origin https://<YOUR_TOKEN>@github.com/Melaniemweru/NLP-warning-system.git
# then run:
# !git push origin main


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   data/processed/X_test.csv[m
	[31mmodified:   data/processed/X_train.csv[m
	[31mmodified:   data/processed/X_val.csv[m
	[31mmodified:   data/processed/y_test.csv[m
	[31mmodified:   data/processed/y_train.csv[m
	[31mmodified:   data/processed/y_val.csv[m
	[31mmodified:   data/raw/transactions/synthetic_transactions.csv[m

no changes added to commit (use "git add" and/or "git commit -a")
Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@5fdadd3352bb.(none)')
fatal: could not rea