In [6]:
# 📦 Export a concise leakage-audit summary as JSON

import json
from pathlib import Path
import numpy as np
import pandas as pd

In [9]:
# 1) Load the enriched dataset, or reuse `df` if an earlier cell already defined it.
# NOTE: the reuse path depends on kernel state; on a fresh kernel the CSV is always read.
DATA_PATH = Path("../data/processed/enriched.csv")
if "df" not in globals():
    # The file is only required when it actually has to be read; checking it
    # unconditionally would reject a perfectly usable in-memory `df`.
    assert DATA_PATH.exists(), "Enriched dataset missing. Generate it first."
    df = pd.read_csv(DATA_PATH)

In [10]:
# 2) Basic stats: frame shape, class balance, missing cells and duplicate rows
rows = df.shape[0]
cols = df.shape[1]

# Rows labelled 1 are counted as fraud, rows labelled 0 as legitimate.
fraud_count = int(df["Class"].eq(1).sum())
legit_count = int(df["Class"].eq(0).sum())

# Guard the division so an empty frame yields 0.0 instead of raising.
if rows:
    fraud_ratio = float(fraud_count / rows)
else:
    fraud_ratio = 0.0

# Total number of NaN cells across every column.
missing_total = int(df.isna().sum().sum())

# Rows that exactly repeat an earlier row.
duplicates = int(df.duplicated().sum())

In [11]:
# 3) Time-order sanity within device
# `ordered` holds the rows sorted by device and then by time.
ordered = df.sort_values(["device_id", "Time"])

# Count rows whose Time is earlier than the previous row of the same device,
# measured in the dataset's ORIGINAL row order. Running this check on `ordered`
# is vacuous: once rows are sorted by Time inside each device, no within-device
# difference can be negative, so the count would be 0 whatever the data looks like.
time_violations = int((df.groupby("device_id")["Time"].diff() < 0).sum())

In [13]:
# 5) Velocity sanity (optional but useful)
# Only the rolling-count columns that are actually present in the frame are checked.
vel_cols = [
    name
    for name in ["txn_count_5m", "txn_count_30m", "txn_count_60m"]
    if name in df.columns
]

# First row of every device in `ordered`.
first_txn = ordered.groupby("device_id").head(1)

# For each velocity column: share of those first rows whose count equals 1.
vel_first_eq1_rate = dict(
    (name, float(first_txn[name].eq(1).mean())) for name in vel_cols
)

In [12]:
# 4) Quick correlations with label (top 10, abs value)
# Correlation of every numeric column with the label column.
corr_with_label = df.corr(numeric_only=True)["Class"]

# Drop the label's self-correlation and keep magnitudes only (sign is discarded).
abs_corr = corr_with_label.drop("Class").abs()

# Ten largest magnitudes, rounded to 4 decimals, as a plain {column: value} dict.
strongest = abs_corr.sort_values(ascending=False).head(10)
top_corr = strongest.round(4).to_dict()

In [14]:
# 6) Build report dict
# Assembles the values computed by the earlier cells into one JSON-serialisable mapping.
# Key order is kept stable because it determines the layout of the exported file.
report = dict(
    rows=rows,
    cols=cols,
    fraud_count=fraud_count,
    legit_count=legit_count,
    fraud_ratio=round(fraud_ratio * 100, 4),  # percent
    missing_values_total=missing_total,
    duplicate_rows=duplicates,
    time_order_violations_per_device=time_violations,
    top_corr_with_Class_abs=top_corr,
    first_txn_counts_eq_1_rate=vel_first_eq1_rate,
    notes=[
        "All features should be pre-authorization only.",
        "Monitor correlations again after train/validation/test split.",
        "Time-order violations > 0 may indicate data ordering issues.",
    ],
)

In [15]:
# Save the report as JSON under <project root>/artifacts/eda
import json
from pathlib import Path

# The project root is ONE level above the directory this code lives in / runs from:
# - when executed as a script, `__file__` exists and parents[1] is the parent of
#   the directory containing the file;
# - inside a notebook kernel `__file__` is undefined, so fall back to the parent of
#   the current working directory (assumes the kernel was started in notebooks/).
ROOT_DIR = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parent
OUT_DIR = ROOT_DIR / "artifacts" / "eda"
OUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_PATH = OUT_DIR / "leakage_audit_report.json"

# Explicit UTF-8 so the written file does not depend on the platform's default encoding.
with open(OUT_PATH, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

print(f"✅ Leakage audit report saved to: {OUT_PATH}")
# Bare last expression: show the report in the cell output.
report

✅ Leakage audit report saved to: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/artifacts/eda/leakage_audit_report.json


{'rows': 284807,
 'cols': 51,
 'fraud_count': 492,
 'legit_count': 284315,
 'fraud_ratio': 0.1727,
 'missing_values_total': 0,
 'duplicate_rows': 0,
 'time_order_violations_per_device': 0,
 'top_corr_with_Class_abs': {'V17': 0.3265,
  'V14': 0.3025,
  'V12': 0.2606,
  'V10': 0.2169,
  'V16': 0.1965,
  'V3': 0.193,
  'V7': 0.1873,
  'V11': 0.1549,
  'V4': 0.1334,
  'V18': 0.1115},
 'first_txn_counts_eq_1_rate': {'txn_count_5m': 0.8985,
  'txn_count_30m': 0.8985,
  'txn_count_60m': 0.8985},
 'notes': ['All features should be pre-authorization only.',
  'Monitor correlations again after train/validation/test split.',
  'Time-order violations > 0 may indicate data ordering issues.']}