**Setup and sanity checks**

In [85]:
import os

# Ensure the output folder for processed data exists; an existing folder is
# not an error.
processed_dir = "data/processed"
os.makedirs(processed_dir, exist_ok=True)
print("Created data/processed ")

# Show the working directory so the relative paths used below can be checked.
cwd = os.getcwd()
print("Current path:", cwd)


Created data/processed 


Clone the GitHub repository

In [86]:
# Replace with your repo URL
!git clone https://github.com/Melaniemweru/NLP-warning-system.git
%cd NLP-warning-system
!ls


app.py	docs		    notebooks  requirements.txt


In [115]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned synthetic transactions table.
df = pd.read_csv("data/raw/transactions/synthetic_transactions_clean.csv")
print("Loaded dataset:", df.shape)

# The target is the "label" column; every other column is a feature.
y = df["label"]
X = df.drop(columns=["label"])

# Stage 1: keep 70% for training, hold out 30%, preserving label proportions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Stage 2: cut the held-out 30% in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

# Write each split to its own CSV (row index omitted), features first.
os.makedirs("data/processed", exist_ok=True)
for split_data, csv_path in (
    (X_train, "data/processed/X_train.csv"),
    (X_val, "data/processed/X_val.csv"),
    (X_test, "data/processed/X_test.csv"),
    (y_train, "data/processed/y_train.csv"),
    (y_val, "data/processed/y_val.csv"),
    (y_test, "data/processed/y_test.csv"),
):
    split_data.to_csv(csv_path, index=False)

print("Splits saved successfully.")


Loaded dataset: (5000, 9)
Splits saved successfully.


Verify the saved files and load them in Python

In [116]:
import os

import pandas as pd

# The six split files expected under data/processed (features, then labels).
files = [
    "X_train.csv", "X_val.csv", "X_test.csv",
    "y_train.csv", "y_val.csv", "y_test.csv",
]

# Print one "name → True/False" line per file; this reports only, it does not
# raise when a file is absent.
print("Checking processed files:")
for filename in files:
    is_present = os.path.exists(f"data/processed/{filename}")
    print(filename, "→", is_present)


Checking processed files:
X_train.csv → True
X_val.csv → True
X_test.csv → True
y_train.csv → True
y_val.csv → True
y_test.csv → True


**Validate required columns**

In [117]:
# Reload the feature splits from disk as DataFrames.
X_train, X_val, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/processed/X_train.csv",
        "data/processed/X_val.csv",
        "data/processed/X_test.csv",
    )
)

# Label files hold a single column; squeeze each one down to a Series.
y_train, y_val, y_test = (
    pd.read_csv(csv_path).squeeze("columns")
    for csv_path in (
        "data/processed/y_train.csv",
        "data/processed/y_val.csv",
        "data/processed/y_test.csv",
    )
)

print("Loaded splits.")
print("Train:", X_train.shape, "| Val:", X_val.shape, "| Test:", X_test.shape)

# Class balance of the training labels.
train_label_counts = y_train.value_counts()
print("\nTrain label distribution:\n", train_label_counts)


Loaded splits.
Train: (3500, 8) | Val: (750, 8) | Test: (750, 8)

Train label distribution:
 label
Compliant        2250
Non-Compliant    1250
Name: count, dtype: int64


In [118]:
# Columns every feature split must contain.
required_cols = {
    "transaction_id", "branch", "client_id", "client_type",
    "transaction_date", "amount", "currency", "narrative",
}

# Fail fast, naming the split and the absent columns, if any split is incomplete.
splits_to_check = (("X_train", X_train), ("X_val", X_val), ("X_test", X_test))
for split_name, split_df in splits_to_check:
    missing = required_cols.difference(split_df.columns)
    assert not missing, f"{split_name} missing columns: {missing}"

print("All required columns present.")


All required columns present.


Define a clean, deterministic text cleaner

In [119]:
import re, string

# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def clean_text(text):
    """Normalize a narrative string for vectorization.

    Steps, in order: lowercase; replace URLs (``http...`` / ``www....``) with a
    space; replace runs of ASCII digits with a space; replace ASCII punctuation
    with spaces; collapse whitespace and trim.

    Args:
        text: The raw narrative. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string. Missing values (``None`` or a float NaN, which is
        how pandas represents an empty CSV cell) yield ``""`` rather than the
        literal words ``"none"`` / ``"nan"``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"[0-9]+", " ", text)
    text = text.translate(_PUNCT_TO_SPACE)
    return re.sub(r"\s+", " ", text).strip()

# Add a "clean_narrative" column to every split by running clean_text over
# each row's "narrative" value.
for split_df in (X_train, X_val, X_test):
    split_df["clean_narrative"] = split_df["narrative"].map(clean_text)

# Preview raw vs. cleaned text for the first training rows.
X_train.loc[:, ["narrative", "clean_narrative"]].head()


Unnamed: 0,narrative,clean_narrative
0,verified. ID patterns; historical with aligned...,verified id patterns historical with aligned a...
1,Transaction for salary processed at Nairobi; I...,transaction for salary processed at nairobi id...
2,reviewed. statements Nairobi; at processed rep...,reviewed statements nairobi at processed repay...
3,Processed salary or bill payment at Thika; ID ...,processed salary or bill payment at thika id v...
4,checked. invoice Thika; at repayment loan for ...,checked invoice thika at repayment loan for pa...


Save

In [120]:
# Write the splits (features now include "clean_narrative") into a separate
# "clean" subfolder, row index omitted.
os.makedirs("data/processed/clean", exist_ok=True)

for split_data, csv_path in (
    (X_train, "data/processed/clean/X_train_clean.csv"),
    (X_val, "data/processed/clean/X_val_clean.csv"),
    (X_test, "data/processed/clean/X_test_clean.csv"),
    (y_train, "data/processed/clean/y_train_clean.csv"),
    (y_val, "data/processed/clean/y_val_clean.csv"),
    (y_test, "data/processed/clean/y_test_clean.csv"),
):
    split_data.to_csv(csv_path, index=False)

print("Cleaned datasets saved.")


Cleaned datasets saved.


6) TF-IDF vectorization (unigrams + bigrams)

In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF over the cleaned narratives: English stop words
# removed, terms seen in fewer than 2 documents dropped, at most 5000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=5000,
    stop_words="english",
)

# Fit on the training text only; validation and test reuse the fitted
# vectorizer so no statistics are learned from held-out data.
X_train_tfidf = tfidf.fit_transform(X_train["clean_narrative"])
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(split_df["clean_narrative"]) for split_df in (X_val, X_test)
)

X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape


((3500, 823), (750, 823), (750, 823))

7) Persist the vectorizer (so you can reuse it in training/inference)

In [122]:
import joblib, os

# Make sure the model folder exists, then serialize the fitted vectorizer.
# NOTE(review): a later cell saves the same vectorizer again as
# tfidf_vectorizer.pkl, and that .pkl is the file baseline_model.py loads;
# this .joblib copy looks redundant. Confirm before removing.
models_dir = "src/models"
os.makedirs(models_dir, exist_ok=True)
joblib.dump(tfidf, "src/models/tfidf_vectorizer.joblib")

print("Vectorizer saved.")


Vectorizer saved.


In [123]:
# Peek at the learned vocabulary: its size and the first 25 terms.
vocab = tfidf.get_feature_names_out()
vocab_size = len(vocab)
print("Vocab size:", vocab_size)
print("Sample terms:", vocab[:25])


Vocab size: 823
Sample terms: ['aa' 'aaa' 'aad' 'aae' 'ab' 'abb' 'abc' 'abe' 'abf' 'ac' 'aca' 'acc'
 'acd' 'ace' 'acf' 'activity' 'activity aligned' 'activity customer'
 'activity outgoing' 'activity party' 'activity profile'
 'activity structuring' 'activity valid' 'ad' 'adc']


Text-only baseline: train a logistic regression on the TF-IDF features

In [124]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Informational only: show the per-class weights that "balanced" works out to.
# The classifier below is given class_weight="balanced" directly and does not
# consume these arrays.
unique_classes = np.unique(y_train)
class_weights = compute_class_weight("balanced", classes=unique_classes, y=y_train)

label_counts = y_train.value_counts().to_dict()
weight_by_class = dict(zip(unique_classes, class_weights))
print("Class distribution:", label_counts)
print("Computed class weights:", weight_by_class)

# L2-regularized logistic regression on the TF-IDF matrix; fit() returns the
# estimator itself, so clf is the trained model.
clf = LogisticRegression(
    solver="liblinear",
    penalty="l2",
    class_weight="balanced",
    max_iter=200,
    random_state=42,
).fit(X_train_tfidf, y_train)

print("Model trained.")


Class distribution: {'Compliant': 2250, 'Non-Compliant': 1250}
Computed class weights: {'Compliant': np.float64(0.7777777777777778), 'Non-Compliant': np.float64(1.4)}
Model trained.


In [125]:
from sklearn.metrics import classification_report

# Predict labels for both held-out splits.
y_val_pred, y_test_pred = clf.predict(X_val_tfidf), clf.predict(X_test_tfidf)

# Print a per-class precision/recall/F1 report for each split.
# NOTE(review): the recorded run shows 1.00 everywhere on both splits; on
# synthetic data this may reflect templated narratives rather than real
# generalization. Worth checking before relying on it.
for heading, y_true, y_pred in (
    ("=== VALIDATION REPORT ===", y_val, y_val_pred),
    ("=== TEST REPORT ===", y_test, y_test_pred),
):
    print(heading)
    print(classification_report(y_true, y_pred))


=== VALIDATION REPORT ===
               precision    recall  f1-score   support

    Compliant       1.00      1.00      1.00       482
Non-Compliant       1.00      1.00      1.00       268

     accuracy                           1.00       750
    macro avg       1.00      1.00      1.00       750
 weighted avg       1.00      1.00      1.00       750

=== TEST REPORT ===
               precision    recall  f1-score   support

    Compliant       1.00      1.00      1.00       483
Non-Compliant       1.00      1.00      1.00       267

     accuracy                           1.00       750
    macro avg       1.00      1.00      1.00       750
 weighted avg       1.00      1.00      1.00       750



In [126]:
import joblib, os

# Serialize the fitted vectorizer and classifier; these two .pkl files are the
# ones baseline_model.py loads.
artifacts = (
    (tfidf, "src/models/tfidf_vectorizer.pkl"),
    (clf, "src/models/logreg_tfidf.pkl"),
)
for fitted_obj, pkl_path in artifacts:
    joblib.dump(fitted_obj, pkl_path)

print("Saved:")
print(" - src/models/tfidf_vectorizer.pkl")
print(" - src/models/logreg_tfidf.pkl")


Saved:
 - src/models/tfidf_vectorizer.pkl
 - src/models/logreg_tfidf.pkl


In [127]:
import json

# Recompute the reports as nested dicts so they can be stored as JSON.
val_report = classification_report(y_val, y_val_pred, output_dict=True)
test_report = classification_report(y_test, y_test_pred, output_dict=True)

# Split sizes plus both reports, tagged with the model type.
metrics = {
    "model_type": "baseline_logreg_tfidf",
    "n_train": int(len(y_train)),
    "n_val": int(len(y_val)),
    "n_test": int(len(y_test)),
    "val_report": val_report,
    "test_report": test_report,
}

with open("src/models/baseline_metrics.json", "w") as metrics_file:
    metrics_file.write(json.dumps(metrics, indent=2))

print("Saved baseline_metrics.json")


Saved baseline_metrics.json


In [128]:
import textwrap, pathlib, os

# Source of the inference helper written to src/models/baseline_model.py.
#
# The generated module normalizes text with the same steps as this notebook's
# clean_text() before vectorizing. The TF-IDF vocabulary was fitted on the
# cleaned narratives, so vectorizing raw text at inference would tokenize
# digits, URLs and underscores differently from training. Keep _clean_text in
# sync with clean_text() if either one changes.
#
# Regex backslashes are doubled because this outer string is not a raw string.
baseline_code = textwrap.dedent("""\
    import os
    import re
    import string

    import joblib

    THIS_DIR = os.path.dirname(__file__)
    VECT_PATH = os.path.join(THIS_DIR, "tfidf_vectorizer.pkl")
    MODEL_PATH = os.path.join(THIS_DIR, "logreg_tfidf.pkl")

    # Maps every ASCII punctuation character to a space.
    _PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))

    _VECTORIZER = None
    _MODEL = None

    def _clean_text(text):
        # Same steps as the training notebook: lowercase, drop URLs, drop
        # digit runs, punctuation -> spaces, collapse whitespace.
        text = str(text).lower()
        text = re.sub(r"http\\S+|www\\.\\S+", " ", text)
        text = re.sub(r"[0-9]+", " ", text)
        text = text.translate(_PUNCT_TO_SPACE)
        return re.sub(r"\\s+", " ", text).strip()

    def _load_artifacts():
        # Load the vectorizer and model from disk on first use, then reuse them.
        global _VECTORIZER, _MODEL
        if _VECTORIZER is None:
            _VECTORIZER = joblib.load(VECT_PATH)
        if _MODEL is None:
            _MODEL = joblib.load(MODEL_PATH)
        return _VECTORIZER, _MODEL

    def predict_narrative(text: str):
        # Returns the predicted label, the class probabilities, and the class
        # names in the same order as the probabilities.
        vect, model = _load_artifacts()
        X = vect.transform([_clean_text(text)])
        proba = model.predict_proba(X)[0]
        pred = model.predict(X)[0]

        return {
            "prediction": pred,
            "probabilities": proba.tolist(),
            "classes": model.classes_.tolist()
        }
""")

# Create the target folder if needed so this cell does not depend on an
# earlier cell having made it, and write with an explicit encoding.
path = pathlib.Path("src/models/baseline_model.py")
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(baseline_code, encoding="utf-8")

print("Created baseline_model.py")


Created baseline_model.py


In [129]:
from src.models.baseline_model import predict_narrative

# Two hand-written narratives used as a quick smoke test of the saved model.
samples = [
    "High-value transfer to Dubai; no updated KYC; STR recommended.",
    "Salary processed at Nairobi branch; documentation verified."
]

# Print each narrative with its predicted label and class probabilities.
for narrative in samples:
    result = predict_narrative(narrative)
    print("\nTEXT:", narrative)
    print(" → prediction:", result["prediction"])
    print(" → class probabilities:", result["probabilities"])



TEXT: High-value transfer to Dubai; no updated KYC; STR recommended.
 → prediction: Non-Compliant
 → class probabilities: [0.017570918242400868, 0.9824290817575991]

TEXT: Salary processed at Nairobi branch; documentation verified.
 → prediction: Compliant
 → class probabilities: [0.9740465717132776, 0.0259534282867224]


In [139]:
# Show the working-tree state of the repository (shell command).
!git status


On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

Untracked files:
  (use "git add <file>..." to include in what will be committed)

nothing added to commit but untracked files present (use "git add" to track)


In [141]:
# Append the NLP-warning-system/ folder to .gitignore.
# NOTE(review): `>>` appends on every run, so re-running adds a duplicate line.
!echo "NLP-warning-system/" >> .gitignore


In [142]:
# Remove NLP-warning-system/ from the git index only (--cached); files on disk
# are left in place.
!git rm -r --cached NLP-warning-system/




In [140]:
echo "NLP-warning-system/" >> .gitignore
!git rm -r --cached NLP-warning-system/


SyntaxError: invalid syntax (ipython-input-2407670164.py, line 1)

In [136]:
# Stage the generated artifacts: raw splits, cleaned splits, and everything
# under src/models.
!git add data/processed/*
!git add data/processed/clean/*
!git add src/models/*


In [137]:
# Check what is staged after the `git add` commands (shell command).
!git status


On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

Untracked files:
  (use "git add <file>..." to include in what will be committed)

nothing added to commit but untracked files present (use "git add" to track)


In [134]:
# Commit the staged splits and model files.
# NOTE(review): the recorded output says "nothing added to commit", so this
# run did not create a commit. Cell execution order is non-sequential here.
!git commit -m "Add processed splits, cleaned datasets, and baseline model files"


On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

Untracked files:
  (use "git add <file>..." to include in what will be committed)

nothing added to commit but untracked files present (use "git add" to track)


In [109]:
# Push local commits to the configured upstream remote.
!git push


Everything up-to-date


In [110]:
# List the contents of /content (absolute path; presumably the Colab workspace
# root, confirm when running elsewhere).
!ls /content




In [111]:
import notebook


In [112]:
# Switch to the repository checkout (hard-coded absolute path), then stage,
# commit and push the notebooks/ folder to origin/main.
# NOTE(review): the recorded push failed with "could not read Username for
# 'https://github.com'", i.e. no credentials were available to git here.
%cd /content/NLP-warning-system

!git add notebooks/
!git commit -m "Add updated model training notebook"
!git push origin main


On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

Untracked files:
  (use "git add <file>..." to include in what will be committed)

nothing added to commit but untracked files present (use "git add" to track)
fatal: could not read Username for 'https://github.com': No such device or address


In [113]:
# List the configured remotes with their fetch/push URLs.
!git remote -v


