# Fraud Model Training & Evaluation (Baseline + Trees + XGB)
This notebook handles class imbalance, runs walk-forward CV, tunes decision thresholds, exports the chosen model to ONNX, and measures inference speed.

In [1]:
from pathlib import Path

def resolve_project_relative(p: str | Path) -> Path:
    p = Path(p)
    if p.is_absolute():
        return p

    here = Path.cwd().resolve()
    last_with_ml = None
    root = here
    while True:
        if (root / 'ml-model').is_dir():
            last_with_ml = root
        if root == root.parent:
            break
        root = root.parent

    if last_with_ml is not None:
        return (last_with_ml / p).resolve()

    # Fallback: just resolve from CWD
    return (here / p).resolve()

In [2]:
import warnings, time, json
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, confusion_matrix, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.inspection import permutation_importance
import joblib, yaml

# strict deps – fail fast if anything is missing
import xgboost as xgb
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as ort
from onnxmltools.convert import convert_xgboost
from onnxconverter_common.data_types import FloatTensorType as FloatTensorType2  # for onnxmltools



# Suppress every warning for the rest of the session to keep cell output compact.
warnings.filterwarnings('ignore')


# --- CONFIG ---
# The training configuration is a YAML file located relative to the project root.
CFG_PATH = resolve_project_relative(Path('ml-model') / 'config' / 'training_config.yaml')
with CFG_PATH.open() as _cfg_stream:
    cfg = yaml.safe_load(_cfg_stream)

# Seed NumPy's global RNG from the config, then echo the effective settings.
np.random.seed(cfg['seed'])
print('Config:', json.dumps(cfg, indent=2))

Config: {
  "seed": 42,
  "input_csv": "ml-model/data/creditcard.csv",
  "use_time_feature": false,
  "use_smote": false,
  "test_size": 0.2,
  "cv_splits": 5,
  "scaler_amount_path": "ml-model/artifacts/amount_scaler.pkl",
  "export_onnx": true,
  "onnx_opset": 15,
  "timing_samples": 10000,
  "threshold": null,
  "use_processed": true,
  "processed_dir": "ml-model/processed",
  "artifacts_dir": "ml-model/artifacts"
}


In [3]:
# Input/artifact directories come from the config and are resolved against the project root.
P = resolve_project_relative(Path(cfg["processed_dir"]))
A = resolve_project_relative(Path(cfg["artifacts_dir"]))

# Pre-split arrays stored as .npy files.
# NOTE(review): the file names suggest a time-ordered 80/20 split produced by a
# separate preprocessing step - confirm, the walk-forward CV below relies on row order.
Xtr_np = np.load(P / "X_train_time80.npy").astype(np.float32)
y_train = np.load(P / "y_train_time80.npy").astype(np.int64)
Xte_np = np.load(P / "X_test_time20.npy").astype(np.float32)
y_test = np.load(P / "y_test_time20.npy").astype(np.int64)

# Column order of the feature matrices is recorded next to the other artifacts.
params = json.loads((A / "feature_params.json").read_text())
feature_names = params["order"]

# Fail early on malformed inputs: a silent X/y misalignment would corrupt every
# downstream metric without raising.
assert Xtr_np.ndim == 2 and Xte_np.ndim == 2, "Feature matrices must be 2-D."
assert Xtr_np.shape[1] == len(feature_names) == Xte_np.shape[1], "Feature dim/order mismatch."
assert len(Xtr_np) == len(y_train), "Train X/y row count mismatch."
assert len(Xte_np) == len(y_test), "Test X/y row count mismatch."

Xtr = pd.DataFrame(Xtr_np, columns=feature_names)
Xte = pd.DataFrame(Xte_np, columns=feature_names)

print({
    "Xtr": Xtr.shape, "Xte": Xte.shape,
    "fraud_train": int(y_train.sum()), "fraud_test": int(y_test.sum()),
    "features": feature_names[-5:]  # tail preview
})

{'Xtr': (227845, 31), 'Xte': (56962, 31), 'fraud_train': 417, 'fraud_test': 75, 'features': ['V27', 'V28', 'Amount_z', 'tod_sin', 'tod_cos']}


In [4]:
# Labels as a flat int64 vector so they can be counted and position-indexed.
y_train = np.asarray(y_train, dtype=np.int64).ravel()

# Class counts; minlength=2 guarantees both the negative and positive slot exist.
cnt = np.bincount(y_train, minlength=2)
neg = int(cnt[0])
pos = int(cnt[1])
# Negative-to-positive ratio, used as XGBoost's scale_pos_weight (guarded against pos == 0).
ratio = neg / max(pos, 1)

_seed = cfg["seed"]

# Candidate estimators, keyed by short name. Insertion order matters for later printouts.
MODELS = {}
MODELS["lr"] = LogisticRegression(
    class_weight="balanced",
    solver="liblinear",
    max_iter=1000,
    random_state=_seed,
)
MODELS["rf"] = RandomForestClassifier(
    n_estimators=500,
    class_weight="balanced",
    n_jobs=-1,
    random_state=_seed,
)
MODELS["xgb"] = xgb.XGBClassifier(
    n_estimators=700,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=ratio,
    tree_method="hist",
    n_jobs=-1,
    random_state=_seed,
)

# Optional SMOTE variant: oversampling happens inside the imblearn pipeline,
# so the logistic regression here is left unweighted.
if cfg.get("use_smote", False):
    _smote_step = ("smote", SMOTE(random_state=_seed, k_neighbors=5))
    _lr_step = ("clf", LogisticRegression(solver="liblinear", max_iter=1000, random_state=_seed))
    MODELS["lr_smote"] = ImbPipeline(steps=[_smote_step, _lr_step])


In [5]:
from sklearn.base import clone

def as_pipeline(est):
    """Return ``est`` unchanged if it already is a pipeline, else wrap it as a single ``"clf"`` step."""
    if isinstance(est, (Pipeline, ImbPipeline)):
        return est
    return Pipeline([("clf", est)])

def walk_forward_splits(n, k):
    """Yield expanding-window ``(train_idx, val_idx)`` index arrays for walk-forward CV.

    The ``n`` rows are cut into ``k + 1`` contiguous blocks. Fold ``i`` trains on
    blocks ``0..i`` and validates on block ``i + 1``; the last fold's validation
    block absorbs the remainder so it always ends at ``n``.

    Parameters
    ----------
    n : int
        Number of rows in the training window.
    k : int
        Requested number of folds.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        Positional train and validation indices, all strictly below ``n``.
        Fewer than ``k`` folds (possibly none) are produced when ``n`` is too
        small to give every fold at least one validation row.
    """
    block = max(n // (k + 1), 1)
    for i in range(k):
        tr_end = block * (i + 1)
        # With tiny n the block size is floored at 1, so the train window can
        # reach the end of the data; stop instead of emitting indices >= n.
        if tr_end >= n:
            break
        val_start = tr_end
        # Clamp so a non-final validation block can never run past the data either.
        val_end = min(block * (i + 2), n) if i < k - 1 else n
        yield np.arange(0, tr_end), np.arange(val_start, val_end)

# Walk-forward CV over the training window: every model sees the same folds.
n = len(Xtr)
wf = list(walk_forward_splits(n, cfg['cv_splits']))

cv_results = {}
for name, est in MODELS.items():
    recs, f1s, aucs = [], [], []
    for tr_idx, val_idx in wf:
        X_tr, y_tr = Xtr.iloc[tr_idx], y_train[tr_idx]      # labels use numpy indexing
        X_va, y_va = Xtr.iloc[val_idx], y_train[val_idx]    # labels use numpy indexing

        # Skip folds where either side contains a single class: classifiers cannot
        # be fitted on one class and ROC-AUC is undefined without both classes in
        # the validation labels. This also covers "no positives in validation".
        if np.unique(y_tr).size < 2 or np.unique(y_va).size < 2:
            continue

        # Always fit a fresh clone so the prototypes in MODELS stay untouched.
        mdl = clone(as_pipeline(est)).fit(X_tr, y_tr)
        yhat = mdl.predict(X_va)  # default decision rule of the estimator, no tuned threshold
        prec, rec, f1, _ = precision_recall_fscore_support(y_va, yhat, average="binary", zero_division=0)
        auc = roc_auc_score(y_va, mdl.predict_proba(X_va)[:, 1]) if hasattr(mdl, "predict_proba") else float("nan")
        recs.append(rec); f1s.append(f1); aucs.append(auc)

    # Fold averages; "folds_used" records how many folds survived the skip rule.
    cv_results[name] = {
        "recall": float(np.mean(recs)) if recs else 0.0,
        "f1":     float(np.mean(f1s))   if f1s else 0.0,
        "roc_auc": float(np.nanmean(aucs)) if aucs else float("nan"),
        "folds_used": len(recs)
    }
cv_results

{'lr': {'recall': 0.8761937601898712,
  'f1': 0.12807214041303389,
  'roc_auc': 0.9597639842630755,
  'folds_used': 5},
 'rf': {'recall': 0.643426510271133,
  'f1': 0.7575686810471371,
  'roc_auc': 0.9536663240101635,
  'folds_used': 5},
 'xgb': {'recall': 0.7906153034463668,
  'f1': 0.8145975240100854,
  'roc_auc': 0.9702198655298941,
  'folds_used': 5}}

In [6]:
# --- TRAIN ALL ---
# Fit every candidate on the full training window, each wrapped as a one-step pipeline.
# Note: the estimators from MODELS are fitted in place (no clone is made here).
trained = {}
for _name, _est in MODELS.items():
    trained[_name] = Pipeline([("clf", _est)]).fit(Xtr, y_train)

In [7]:
#p
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_fscore_support

def sweep_thr_timeaware(model, X_train_win, y_train_win, target=0.80, calibrate=False,
                        train_frac=0.75, grid=None):
    """Pick a decision threshold on the trailing part of the training window.

    A fresh clone of ``model`` is fitted on the first ``train_frac`` of the rows;
    the remaining rows form the validation slice on which candidate thresholds
    are scored. The split is positional, so the rows are assumed to be in
    chronological order (TODO confirm against the preprocessing step).

    Selection rule: among candidates whose validation recall is at least
    ``target`` the one with the highest precision wins; if no candidate reaches
    ``target`` the one with the highest recall is kept.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline; it is cloned, the passed object is not refitted.
    X_train_win : pandas.DataFrame
        Training-window features (sliced with ``.iloc``).
    y_train_win : array-like or pandas object
        Labels aligned with ``X_train_win``.
    target : float
        Minimum recall a threshold should reach.
    calibrate : bool
        If true, wrap the fitted clone in an isotonic ``CalibratedClassifierCV``.
        Caveat: the calibrator is fitted on the same validation slice that is
        then used to score thresholds, so the sweep is optimistic in that case.
    train_frac : float
        Fraction of leading rows used for fitting (default reproduces the
        original fixed 75/25 split).
    grid : iterable of float, optional
        Candidate thresholds. Defaults to 60 evenly spaced values in
        ``[0.01, 0.60]``; pass a wider grid when the optimum sits on the edge.

    Returns
    -------
    float or None
        The selected threshold, ``0.50`` if no candidate achieved a recall
        above zero, or ``None`` when the model exposes no ``predict_proba``.
    """
    n = len(X_train_win)
    cut = int(train_frac * n)
    Xh, Xv = X_train_win.iloc[:cut], X_train_win.iloc[cut:]
    if hasattr(y_train_win, "iloc"):
        yh, yv = y_train_win.iloc[:cut], y_train_win.iloc[cut:]
    else:
        yh, yv = y_train_win[:cut], y_train_win[cut:]

    base = clone(model).fit(Xh, yh)

    proba_model = base
    if calibrate:
        # NOTE(review): cv="prefit" is reported as deprecated in recent scikit-learn
        # releases - confirm against the pinned version before upgrading.
        proba_model = CalibratedClassifierCV(base, method="isotonic", cv="prefit")
        proba_model.fit(Xv, yv)

    if not hasattr(proba_model, "predict_proba"):
        return None

    if grid is None:
        grid = np.linspace(0.01, 0.60, 60)

    p = proba_model.predict_proba(Xv)[:, 1]
    best = (0.0, 0.0, 0.50)  # (recall, precision, threshold)
    for thr in grid:
        yhat = (p >= thr).astype(int)
        prec, rec, _, _ = precision_recall_fscore_support(yv, yhat, average="binary", zero_division=0)
        reaches_target_with_better_precision = rec >= target and prec > best[1]
        still_below_target_but_better_recall = best[0] < target and rec > best[0]
        if reaches_target_with_better_precision or still_below_target_but_better_recall:
            best = (rec, prec, thr)
    return best[2]


In [8]:
#p
# Fit every candidate on the full training window.
# NOTE(review): this repeats the fit already done in the "TRAIN ALL" cell on the
# same estimator objects; one of the two fits looks redundant - confirm which
# cell is meant to own training before removing either.
trained = {
    name: Pipeline([("clf", mdl)]).fit(Xtr, y_train)
    for name, mdl in MODELS.items()
}

# Coarse threshold sweep at a recall target of 0.80. Isotonic calibration is
# switched on for XGB only; LR and RF are swept on their raw predict_proba output.
_calibrate_flags = {"lr": False, "rf": False, "xgb": True}
swept = {}
for _name, _flag in _calibrate_flags.items():
    swept[_name] = sweep_thr_timeaware(trained[_name], Xtr, y_train, target=0.80, calibrate=_flag)
print("Chosen thresholds:", swept)


Chosen thresholds: {'lr': np.float64(0.6), 'rf': np.float64(0.03), 'xgb': np.float64(0.01)}


In [9]:
#p
from sklearn.base import clone
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

def refine_thr(model, Xtr_full, ytr_full, thr0, target=0.80, window=0.03, steps=150):
    """Search a fine threshold grid around ``thr0`` on the trailing 25% of the rows.

    A clone of ``model`` is fitted on the first 75% of the rows (positional
    split) and scored on the rest. ``steps`` evenly spaced thresholds in
    ``[thr0 - window, thr0 + window]`` (clipped to ``[0, 1]``) are tried in
    ascending order. Among thresholds with recall >= ``target`` the highest
    precision wins; if none reaches ``target`` the highest recall wins.
    Returns ``thr0`` unchanged when no candidate improves on the initial state.
    """
    split_at = int(0.75 * len(Xtr_full))
    X_fit, X_val = Xtr_full.iloc[:split_at], Xtr_full.iloc[split_at:]
    if hasattr(ytr_full, "iloc"):
        y_fit, y_val = ytr_full.iloc[:split_at], ytr_full.iloc[split_at:]
    else:
        y_fit, y_val = ytr_full[:split_at], ytr_full[split_at:]

    fitted = clone(model).fit(X_fit, y_fit)
    scores = fitted.predict_proba(X_val)[:, 1]

    lower = max(0.0, thr0 - window)
    upper = min(1.0, thr0 + window)

    best_rec, best_prec, best_thr = 0.0, 0.0, thr0
    for cand in np.linspace(lower, upper, steps):
        flagged = (scores >= cand).astype(int)
        prec, rec, _, _ = precision_recall_fscore_support(y_val, flagged, average="binary", zero_division=0)
        better_precision_at_target = rec >= target and prec > best_prec
        better_recall_below_target = best_rec < target and rec > best_rec
        if better_precision_at_target or better_recall_below_target:
            best_rec, best_prec, best_thr = rec, prec, cand
    return best_thr

# Fine search around each coarse threshold, in the order xgb, lr, rf.
# Each entry of `swept` is read and then overwritten, so re-running this cell
# refines the already refined values again.
_refine_kwargs = {
    "xgb": {"window": 0.03, "steps": 150},
    "lr": {"window": 0.01, "steps": 80},
    "rf": {},  # uses refine_thr's default window/steps
}
for _name, _kw in _refine_kwargs.items():
    swept[_name] = refine_thr(trained[_name], Xtr, y_train, swept[_name], target=0.80, **_kw)


In [10]:
#p
def eval_with_thr(m, X, y, thr):
    """Score model ``m`` on ``(X, y)`` using ``thr`` as the positive-class cut-off.

    Parameters
    ----------
    m : fitted estimator exposing ``predict_proba``.
    X : feature matrix accepted by ``m``.
    y : binary ground-truth labels (0/1).
    thr : float
        A row is flagged positive when its class-1 probability is >= ``thr``.

    Returns
    -------
    dict
        Plain-Python metrics: precision, recall, f1, roc_auc, the four
        confusion-matrix counts, the number of alerts (tp + fp), the alert
        rate (alerts / rows) and the threshold used.
    """
    p = m.predict_proba(X)[:, 1]
    yhat = (p >= thr).astype(int)
    # Pin the label set so the matrix is always 2x2; without it the 4-way unpack
    # fails when y and yhat happen to contain only one class.
    tn, fp, fn, tp = confusion_matrix(y, yhat, labels=[0, 1]).ravel()
    prec, rec, f1, _ = precision_recall_fscore_support(y, yhat, average="binary", zero_division=0)
    return {
        "precision": float(prec), "recall": float(rec), "f1": float(f1),
        "roc_auc": float(roc_auc_score(y, p)), "tp": int(tp), "fp": int(fp),
        "tn": int(tn), "fn": int(fn), "alerts": int(tp + fp),
        # Cast like every other entry so the dict is JSON-serializable as is.
        "alert_rate": float((tp + fp) / len(y)),
        "threshold": float(thr)
    }

# Held-out test metrics for every model at its refined threshold.
results = {name: eval_with_thr(trained[name], Xte, y_test, swept[name])
           for name in ["lr","rf","xgb"]}
print(results)

# CMs for both tree models (workload view). The counts are taken from `results`
# rather than re-scoring the test set, so they cannot drift from the metrics above.
for name in ["rf","xgb"]:
    r = results[name]
    cm = [[r["tn"], r["fp"]], [r["fn"], r["tp"]]]
    print(f"{name.upper()} cm [[TN,FP],[FN,TP]]:", cm,
          "| alerts=", r["alerts"])



{'lr': {'precision': 0.0387858347386172, 'recall': 0.92, 'f1': 0.0744336569579288, 'roc_auc': 0.9860065275604853, 'tp': 69, 'fp': 1710, 'tn': 55177, 'fn': 6, 'alerts': 1779, 'alert_rate': np.float64(0.031231347213932094), 'threshold': 0.609746835443038}, 'rf': {'precision': 0.5169491525423728, 'recall': 0.8133333333333334, 'f1': 0.6321243523316062, 'roc_auc': 0.9540391770820515, 'tp': 61, 'fp': 57, 'tn': 56830, 'fn': 14, 'alerts': 118, 'alert_rate': np.float64(0.002071556476247323), 'threshold': 0.03221476510067114}, 'xgb': {'precision': 0.5, 'recall': 0.8, 'f1': 0.6153846153846154, 'roc_auc': 0.986530724652967, 'tp': 60, 'fp': 60, 'tn': 56827, 'fn': 15, 'alerts': 120, 'alert_rate': np.float64(0.002106667602963379), 'threshold': 0.015838926174496646}}
RF cm [[TN,FP],[FN,TP]]: [[56830, 57], [14, 61]] | alerts= 118
XGB cm [[TN,FP],[FN,TP]]: [[56827, 60], [15, 60]] | alerts= 120


In [11]:
# --- FEATURE IMPORTANCE ---
def feat_importance(m, X, y, top=15):
    """Collect up to three top-``top`` feature rankings for a fitted model.

    If ``m`` has ``named_steps``, its ``'clf'`` step is used when present
    (otherwise ``m`` itself). Rankings returned, each a list of
    ``{"feature", "importance"}`` dicts sorted in descending order:

    - ``"coef_abs"``: absolute coefficients, only if the estimator has ``coef_``;
    - ``"gini"``: ``feature_importances_``, only if the estimator exposes it;
    - ``"perm_f1"``: mean permutation importance on ``(X, y)`` scored with F1
      (5 repeats, seeded from ``cfg['seed']``).
    """
    if hasattr(m, 'named_steps'):
        est = m.named_steps.get('clf', m)
    else:
        est = m
    names = list(X.columns)

    def _top_ranked(values):
        # Pair each value with its column name, order by importance, keep the head.
        rows = [{"feature": n, "importance": float(v)} for n, v in zip(names, values)]
        rows.sort(key=lambda d: d["importance"], reverse=True)
        return rows[:top]

    out = {}
    if hasattr(est, "coef_"):
        out["coef_abs"] = _top_ranked(np.abs(est.coef_).ravel())
    if hasattr(est, "feature_importances_"):
        out["gini"] = _top_ranked(est.feature_importances_)

    perm = permutation_importance(
        est, X, y,
        scoring="f1",
        n_repeats=5,
        random_state=cfg['seed'],
        n_jobs=-1,
    )
    out["perm_f1"] = _top_ranked(perm.importances_mean)
    return out

# Rankings for the random forest, computed on the held-out test split.
feat_imp = feat_importance(m=trained["rf"], X=Xte, y=y_test)
feat_imp

{'gini': [{'feature': 'V14', 'importance': 0.17726853459748598},
  {'feature': 'V10', 'importance': 0.12536484970292291},
  {'feature': 'V12', 'importance': 0.10591588178799394},
  {'feature': 'V17', 'importance': 0.09847160743382767},
  {'feature': 'V4', 'importance': 0.09429429965756099},
  {'feature': 'V11', 'importance': 0.07532776719700632},
  {'feature': 'V16', 'importance': 0.04384074747433529},
  {'feature': 'V7', 'importance': 0.035197457266418555},
  {'feature': 'V3', 'importance': 0.030966626460021376},
  {'feature': 'V2', 'importance': 0.0244631462516603},
  {'feature': 'V21', 'importance': 0.016953830745629412},
  {'feature': 'V18', 'importance': 0.01553854797398154},
  {'feature': 'V19', 'importance': 0.012377653977048252},
  {'feature': 'V9', 'importance': 0.012370803249994077},
  {'feature': 'Amount_z', 'importance': 0.011414255534963232}],
 'perm_f1': [{'feature': 'V12', 'importance': 0.787360132615002},
  {'feature': 'V14', 'importance': 0.7518590580732879},
  {'featu

In [12]:
# --- SAVE MODEL + ONNX (deterministic) ---
from pathlib import Path
import joblib, json
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnxmltools.convert import convert_xgboost
from onnxconverter_common.data_types import FloatTensorType as FloatTensorType2
import onnxruntime as ort
import time

# Output directory for serialized models; created (with parents) if missing.
models_dir = resolve_project_relative(Path("ml-model") / "models")
models_dir.mkdir(exist_ok=True, parents=True)

def _write_class_labels_sidecar(best_model, n_features: int, out_path: Path):
    est = best_model.named_steps["clf"] if hasattr(best_model, "named_steps") and "clf" in best_model.named_steps else best_model
    if not hasattr(est, "classes_"):
        raise RuntimeError("Model has no classes.")
    labels = est.classes_.tolist()
    labels = [int(x) if x in (0, 1) or str(x) in ("0", "1") else str(x) for x in labels]
    sidecar = {
        "class_labels": labels,
        "positive_label": 1 if 1 in labels else "fraud",
        "n_features": int(n_features)
    }
    out_path.write_text(json.dumps(sidecar, indent=2))

def export_onnx(best_name, best_model, X_ref, opset=None):
    """Persist ``best_model`` as joblib + ONNX and write the class-label sidecar.

    Files written into ``models_dir``: ``best_model.joblib``,
    ``best_model.onnx`` and ``class_labels.json``.

    Parameters
    ----------
    best_name : str
        Model key; ``"lr"``, ``"lr_smote"`` and ``"rf"`` go through skl2onnx,
        ``"xgb"`` through onnxmltools.
    best_model : fitted estimator or pipeline
        For ``"xgb"`` only the ``"clf"`` step (if any) is converted.
    X_ref : array-like with ``.shape``
        Only ``X_ref.shape[1]`` is used, to declare the ONNX input width.
    opset : int, optional
        Target ONNX opset. Defaults to ``cfg["onnx_opset"]`` (15 if the key is
        absent) so the export follows the training config.

    Returns
    -------
    str
        Path of the written ONNX file.

    Raises
    ------
    RuntimeError
        If ``best_name`` has no export route. Raised before anything is written.
    """
    # Validate first so an unsupported name does not leave a stray joblib behind.
    if best_name not in ("lr", "lr_smote", "rf", "xgb"):
        raise RuntimeError(f"ONNX export not implemented for model '{best_name}'")

    if opset is None:
        opset = cfg.get("onnx_opset", 15)

    n_features = X_ref.shape[1]
    onnx_path = models_dir / "best_model.onnx"
    joblib.dump(best_model, models_dir / "best_model.joblib")

    # unwrap pipeline if present
    est = best_model.named_steps["clf"] if hasattr(best_model, "named_steps") and "clf" in best_model.named_steps else best_model

    if best_name == "xgb":
        onx = convert_xgboost(
            est,
            initial_types=[("input", FloatTensorType2([None, n_features]))],
            target_opset=opset
        )
    else:
        onx = convert_sklearn(
            best_model,  # pipeline is fine here
            initial_types=[("input", FloatTensorType([None, n_features]))],
            target_opset=opset,
            options={"zipmap": False}  # emit a plain probability tensor instead of a list of dicts
        )
    onnx_path.write_bytes(onx.SerializeToString())

    _write_class_labels_sidecar(best_model, n_features, models_dir / "class_labels.json")
    return str(onnx_path)

# ---- choose RF explicitly ----
# The random forest and its refined threshold are what gets exported and benchmarked.
model_name = "rf"
best, best_thr = trained[model_name], float(swept[model_name])

onnx_file = export_onnx(best_name=model_name, best_model=best, X_ref=Xtr)

# sanity-load the ONNX: constructing a CPU session fails if the file is not loadable
sess = ort.InferenceSession(onnx_file, providers=["CPUExecutionProvider"])
print("ONNX OK ->", onnx_file)


ONNX OK -> /home/k1rel/programming/rt-fraud-detection/ml-model/models/best_model.onnx


In [13]:
def bench(m, X, n=None):
    """Return the wall-clock milliseconds per row of one batched ``m.predict`` call.

    Parameters
    ----------
    m : object with a ``predict`` method.
    X : pandas.DataFrame
        Rows to score; only the first ``n`` are used.
    n : int, optional
        Number of rows to time. Defaults to ``cfg['timing_samples']``, looked
        up at call time so a reloaded config is honoured. Capped at ``len(X)``.

    Raises
    ------
    ValueError
        If there are no rows to time.
    """
    if n is None:
        n = cfg['timing_samples']
    n = min(n, len(X))
    if n <= 0:
        raise ValueError("bench() needs at least one row to time")
    Xs = X.iloc[:n]
    _ = m.predict(Xs.iloc[:32])     # warm-up on a small batch, not timed
    t0 = time.perf_counter()
    _ = m.predict(Xs)
    t1 = time.perf_counter()
    return 1000.0 * (t1 - t0) / n

# scikit-learn latency for the chosen model on the test split.
sk_ms = bench(best, Xte)

# ONNX Runtime latency: same row budget, one warm-up call, one timed batched call.
onnx_ms = None
pname = sess.get_inputs()[0].name
Xarr = Xte.to_numpy().astype(np.float32)
n = min(cfg['timing_samples'], len(Xarr))

_ = sess.run(None, {pname: Xarr[:32]})  # warm-up

t0 = time.perf_counter()
_ = sess.run(None, {pname: Xarr[:n]})
t1 = time.perf_counter()
onnx_ms = 1000.0 * (t1 - t0) / n

print({'sk_ms': sk_ms, 'onnx_ms': onnx_ms, 'threshold': best_thr})

{'sk_ms': 0.012351150899939966, 'onnx_ms': 0.00531318690000262, 'threshold': 0.03221476510067114}


In [14]:
# --- REPORT (compact, serializable, reproducible) ---
from pathlib import Path
import json, datetime, numpy as np

# Report location, resolved against the project root; parent folders are created on demand.
rep = resolve_project_relative("ml-model/reports/model_evaluation.md")
rep.parent.mkdir(parents=True, exist_ok=True)

# Reuse the model picked in the export cell: the timings (`sk_ms`, `onnx_ms`) and
# the ONNX artifact were produced for `model_name`, so the report must name the
# same model instead of repeating the literal and risking drift.
best_name = model_name
best_thr  = float(swept[best_name])

def _to_native(o):
    if isinstance(o, (np.floating, np.integer)): return o.item()
    if isinstance(o, np.ndarray): return o.tolist()
    return o

# Timezone-aware "now" in UTC; datetime.utcnow() is deprecated. The rendered
# string keeps the previous layout: YYYY-MM-DDTHH:MM:SSZ.
_generated_at = datetime.datetime.now(datetime.timezone.utc)

report = {
    "timestamp_utc": _generated_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
    "chosen_model": best_name,
    "threshold": best_thr,
    "test_metrics": {k: _to_native(v) for k, v in results[best_name].items()},
    # Test metrics of every evaluated model, for side-by-side comparison.
    "baselines": {
        name: {k: _to_native(v) for k, v in results[name].items()}
        for name in ("lr", "rf", "xgb")
    },
    # CV results are optional: an empty dict is written if the CV cell was not run.
    "cv_results": {m: {k: _to_native(v) for k,v in d.items()} for m,d in cv_results.items()} if "cv_results" in globals() else {},
    "timing_ms_per_row": {"sklearn": float(sk_ms), "onnx": float(onnx_ms)},
    "thresholds": {k: float(v) for k, v in swept.items()},
    "feature_importance_top15": feat_imp
}

# The report is a Markdown file whose body is the JSON document in a fenced block.
rep.write_text(
    '# Model Evaluation\n\n```json\n'
    + json.dumps(report, indent=2)
    + '\n```\n',
    encoding='utf-8'
)
print('Report written ->', rep)


Report written -> /home/k1rel/programming/rt-fraud-detection/ml-model/reports/model_evaluation.md
