# Payment Fraud Detection (E-commerce)

Imbalanced classification with cost-aware thresholding.


In [None]:
import os
from pathlib import Path

def find_project_root(start: Path, marker: str = "03_payment_fraud_detection") -> Path:
    """Return the closest directory named ``marker`` at or above ``start``.

    ``start`` is resolved to an absolute path first. The resolved path itself
    and then each of its ancestors (nearest first) are checked by directory
    name. When no directory matches, the resolved ``start`` is returned
    unchanged.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    return next((folder for folder in candidates if folder.name == marker), origin)

# Anchor the working directory at the project root so the relative paths used
# in later cells ("data/", "models/", "reports/") resolve against it rather
# than against wherever the notebook kernel happened to start.
ROOT = find_project_root(start=Path.cwd())
os.chdir(ROOT)
print("Project root:", ROOT)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, precision_recall_curve, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import joblib


In [None]:
# Read the transactions CSV; if it is not on disk yet, ask the project's
# make_dataset script to produce it at that path first.
data_path = Path("data/transactions.csv")
if not data_path.exists():
    # Imported lazily: only needed when the file is missing.
    from data.make_dataset import main as make_data

    make_data(out_path=str(data_path))

df = pd.read_csv(data_path)

# Mean of the "fraud" column, shown as a percentage.
fraud_rate = df["fraud"].mean()
print("Fraud rate:", f"{fraud_rate:.2%}")
df.head()


In [None]:
target = "fraud"
y = df[target]
# Features are every column except the label and the "tx_id" column.
X = df.drop(columns=[target, "tx_id"])

# Columns in `cat` are one-hot encoded; all remaining feature columns are
# passed to the model unchanged.
cat = ["device", "payment_method", "country_risk"]
num = [col for col in X.columns if col not in cat]

# Hold out 25% of the rows for evaluation, keeping the label ratio the same
# in both parts (stratify=y).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

pre = ColumnTransformer(
    transformers=[
        # Categories not seen during fit are ignored instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
        ("num", "passthrough", num),
    ]
)

# Preprocessing and classifier live in one pipeline, so the encoder is fitted
# on the training rows only.
rf = Pipeline(
    steps=[
        ("prep", pre),
        (
            "clf",
            RandomForestClassifier(
                n_estimators=500,
                min_samples_leaf=4,
                class_weight="balanced_subsample",
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
)

rf.fit(X_train, y_train)

# Second probability column; assumes the labels are 0/1 so that this is the
# fraud class — TODO confirm against rf.classes_.
p = rf.predict_proba(X_test)[:, 1]

print("ROC-AUC:", round(roc_auc_score(y_test, p), 4))
print("PR-AUC:", round(average_precision_score(y_test, p), 4))


In [None]:
# Cost-based thresholding
# Cost charged per flagged transaction; it is added to transaction amounts
# below, so it is expressed in the same units as the "amount" column.
review_cost = 3.0
amounts = X_test["amount"].values

# Candidate thresholds come from the precision-recall curve of the test scores.
precision, recall, thr = precision_recall_curve(y_test, p)
# Add 1.0 as one more candidate (presumably so `thr` has the same length as
# `precision`/`recall`, which carry one extra trailing point — TODO confirm).
thr = np.concatenate((thr, [1.0]))

def expected_cost(t, scores=None, labels=None, tx_amounts=None,
                  cost_per_review=None):
    """Return the total cost of flagging every transaction scored >= ``t``.

    The cost is the sum of two parts:

    * missed fraud: the amounts of transactions labelled 1 that are NOT
      flagged at this threshold;
    * reviews: the number of flagged transactions times the per-review cost.

    Parameters
    ----------
    t : float
        Decision threshold; a transaction is flagged when its score is >= t.
    scores, labels, tx_amounts : array-like, optional
        Per-transaction scores, 0/1 labels and amounts, aligned by position.
        When omitted they default to the notebook globals ``p``, ``y_test``
        and ``amounts`` respectively.
    cost_per_review : float, optional
        Cost of one flagged transaction. Defaults to the global
        ``review_cost``.

    Returns
    -------
    float
        Missed-fraud amount plus review cost at threshold ``t``.
    """
    # Fall back to the notebook-level variables so existing one-argument
    # calls keep working exactly as before.
    scores = np.asarray(p if scores is None else scores)
    labels = np.asarray(y_test if labels is None else labels)
    tx_amounts = np.asarray(amounts if tx_amounts is None else tx_amounts)
    if cost_per_review is None:
        cost_per_review = review_cost

    flagged = scores >= t
    # No confusion matrix is needed: only the missed-fraud mask and the
    # number of flagged rows enter the cost.
    missed_fraud = (labels == 1) & ~flagged
    return tx_amounts[missed_fraud].sum() + flagged.sum() * cost_per_review

# Price every candidate threshold and keep the cheapest one; on ties argmin
# picks the first occurrence.
costs = np.fromiter(map(expected_cost, thr), dtype=float, count=len(thr))
best_idx = int(np.argmin(costs))
best_thr = float(thr[best_idx])
print("Best cost threshold:", round(best_thr, 4), "Expected cost:", round(float(costs[best_idx]), 2))

# NOTE(review): the threshold is chosen on the same test split that the report
# below is computed on, so these figures are likely optimistic.
y_pred = (p >= best_thr).astype(int)
print(classification_report(y_test, y_pred, digits=3))


In [None]:
# Draw the precision-recall curve computed earlier and save it as a PNG.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(recall, precision)
ax.set_title("Precision-Recall Curve (RandomForest)")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
fig.tight_layout()

# The output folder must exist before savefig writes into it.
Path("reports").mkdir(exist_ok=True)
fig.savefig("reports/pr_curve.png", dpi=200, bbox_inches="tight")
plt.show()


In [None]:
import json

Path("models").mkdir(exist_ok=True)
Path("reports").mkdir(exist_ok=True)

# Store the fitted pipeline together with its decision threshold so both are
# loaded from a single artifact.
joblib.dump({"model": rf, "threshold": best_thr}, "models/fraud_model.joblib")

metrics = {
    "roc_auc": float(roc_auc_score(y_test, p)),
    "pr_auc": float(average_precision_score(y_test, p)),
    "best_cost_threshold": float(best_thr),
    "expected_cost_min": float(costs.min()),
}
# Every value is already a plain float, so the stdlib encoder is enough; it
# writes the values at full precision, whereas pandas' to_json rounds floats
# to 10 decimals by default.
Path("reports/metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")

print("Saved models/fraud_model.joblib and reports/*")


## Recommendations
- Use the cost-optimized threshold to minimize total expected cost (missed-fraud amounts plus manual-review cost).
- Retrain monthly and monitor PR-AUC + cost.
- In production, add more signals (shipping mismatch, BIN, device fingerprint).
