In [1]:
from pathlib import Path
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Repository root ---------------------------------------------------------
# Resolution order (first match wins), so the notebook is not tied to a single
# machine:
#   1. the PREAUTH_REPO_ROOT environment variable, when set;
#   2. the nearest ancestor of the current working directory (inclusive) that
#      contains a `data/processed` directory;
#   3. the original checkout path, kept as a last-resort default so the
#      existing setup keeps working unchanged.
_DEFAULT_REPO_ROOT = "/Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection"

_env_root = os.environ.get("PREAUTH_REPO_ROOT")
if _env_root:
    REPO_ROOT = Path(_env_root).expanduser().resolve()
else:
    _start = Path.cwd().resolve()
    REPO_ROOT = next(
        (d for d in (_start, *_start.parents) if (d / "data" / "processed").is_dir()),
        Path(_DEFAULT_REPO_ROOT).resolve(),
    )

# Fail with an actionable message instead of a bare chdir error.
if not REPO_ROOT.is_dir():
    raise FileNotFoundError(
        f"Repository root not found: {REPO_ROOT}. "
        "Set the PREAUTH_REPO_ROOT environment variable to your checkout, "
        "or start the notebook from inside the repository."
    )

# All relative paths used by later cells are interpreted from the repo root.
os.chdir(REPO_ROOT)
print("CWD:", Path.cwd())

# --- Input datasets ----------------------------------------------------------
ENRICHED = REPO_ROOT / "data/processed/enriched.csv"
TRAIN = REPO_ROOT / "data/processed/train.csv"              # processed (may include SMOTE)
VAL   = REPO_ROOT / "data/processed/val.csv"
TEST  = REPO_ROOT / "data/processed/test.csv"
TRAIN_NS = REPO_ROOT / "data/processed/train_nosmote.csv"   # processed, no SMOTE
VAL_NS   = REPO_ROOT / "data/processed/val_nosmote.csv"
TEST_NS  = REPO_ROOT / "data/processed/test_nosmote.csv"

FEATURES_META = REPO_ROOT / "artifacts/features.json"

# --- Figure output directory (created on demand) ------------------------------
FIG_DIR = REPO_ROOT / "docs/figures/data/03_feature_engineering_summary"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Report which inputs are present; later cells fall back or raise when a file
# is missing, so this is informational only.
for p in [ENRICHED, TRAIN, VAL, TEST, TRAIN_NS, VAL_NS, TEST_NS, FEATURES_META]:
    print(p.relative_to(REPO_ROOT), "exists=", p.exists())

print("FIG_DIR:", FIG_DIR.relative_to(REPO_ROOT))


CWD: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection
data/processed/enriched.csv exists= True
data/processed/train.csv exists= True
data/processed/val.csv exists= True
data/processed/test.csv exists= True
data/processed/train_nosmote.csv exists= True
data/processed/val_nosmote.csv exists= True
data/processed/test_nosmote.csv exists= True
artifacts/features.json exists= True
FIG_DIR: docs/figures/data/03_feature_engineering_summary


In [2]:
# Feature metadata: the numeric/categorical declarations made BEFORE encoding.
# A missing features.json yields an empty mapping, so the defaults below apply.
features_meta = (
    json.loads(FEATURES_META.read_text(encoding="utf-8"))
    if FEATURES_META.exists()
    else {}
)

target_col = features_meta.get("target_column", "Class")
time_col = features_meta.get("time_column", "Time")
num_cols = features_meta.get("numerical_features", [])
cat_cols = features_meta.get("categorical_features", [])

print("target_col:", target_col, "| time_col:", time_col)
print("num_cols:", len(num_cols), "cat_cols:", len(cat_cols))

# Enriched (pre-encoding) dataset; stays None when the file is absent.
if ENRICHED.exists():
    df_enriched = pd.read_csv(ENRICHED)
else:
    df_enriched = None
print("enriched:", df_enriched.shape if df_enriched is not None else None)

# Processed (post-encoding) training split: the no-SMOTE file is preferred for
# "clean" feature statistics, with train.csv as the fallback.
df_proc = pd.read_csv(TRAIN_NS if TRAIN_NS.exists() else TRAIN)
print("processed_train:", df_proc.shape)

# Every processed column except the target counts as a feature.
proc_feature_cols = [name for name in df_proc.columns if name != target_col]
print("processed feature count:", len(proc_feature_cols))


target_col: Class | time_col: Time
num_cols: 39 cat_cols: 10
enriched: (284807, 52)
processed_train: (199364, 103)
processed feature count: 102


In [3]:
# High-level grouping of the enriched (pre-encoding) columns.
def _present(candidates):
    """Return the candidates that are columns of df_enriched, in the given order.

    Yields an empty list when df_enriched was not loaded.
    """
    if df_enriched is None:
        return []
    return [name for name in candidates if name in df_enriched.columns]


enriched_groups = {
    "base_kaggle": _present(["Time", "Amount"] + [f"V{i}" for i in range(1, 29)] + [target_col]),
    "device":      _present(["device_id", "device_os", "browser", "is_new_device"]),
    "network":     _present(["ip_country", "is_proxy_vpn", "ip_reputation"]),
    "velocity":    _present(["txn_count_5m", "txn_count_30m", "txn_count_60m", "avg_amount_7d"]),
    "profile":     _present(["account_age_days", "token_age_days", "avg_spend_user_30d"]),
    "geo":         _present(["billing_country", "shipping_country", "geo_distance_km", "country_mismatch"]),
    "derived":     _present(["amount_zscore", "night_txn", "weekend_txn"]),
}

if df_enriched is None:
    print("enriched.csv not found — cannot summarize enriched feature groups.")
else:
    # One line per group: right-aligned name, column count, column list.
    for group_name, group_cols in enriched_groups.items():
        print(f"{group_name:>12}: {len(group_cols):2d}  {group_cols}")


 base_kaggle: 31  ['Time', 'Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Class']
      device:  4  ['device_id', 'device_os', 'browser', 'is_new_device']
     network:  3  ['ip_country', 'is_proxy_vpn', 'ip_reputation']
    velocity:  4  ['txn_count_5m', 'txn_count_30m', 'txn_count_60m', 'avg_amount_7d']
     profile:  3  ['account_age_days', 'token_age_days', 'avg_spend_user_30d']
         geo:  4  ['billing_country', 'shipping_country', 'geo_distance_km', 'country_mismatch']
     derived:  3  ['amount_zscore', 'night_txn', 'weekend_txn']


In [4]:
# Everything from here on needs the enriched frame; stop early if it is absent.
if df_enriched is None:
    raise RuntimeError("enriched.csv not found. Generate it first (scripts/enrich_synthetic.py).")

# Numeric columns to plot, restricted to those present in the enriched data.
key_num = [
    c
    for c in ("Amount", "ip_reputation", "txn_count_5m", "geo_distance_km", "account_age_days", "amount_zscore")
    if c in df_enriched.columns
]

# One 60-bin histogram per column, written to FIG_DIR as dist_<column>.png.
for col in key_num:
    fig, ax = plt.subplots()
    df_enriched[col].hist(bins=60, ax=ax)
    ax.set_title(f"Distribution: {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("count")

    out = FIG_DIR / f"dist_{col}.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    plt.close(fig)  # release the figure; only the saved file is kept
    print("saved:", out.name)


saved: dist_Amount.png
saved: dist_ip_reputation.png
saved: dist_txn_count_5m.png
saved: dist_geo_distance_km.png
saved: dist_account_age_days.png
saved: dist_amount_zscore.png


In [9]:
# Color standard for thesis
COLOR_FRAUD = "#d62728"   # red
COLOR_LEGIT = "#1f77b4"   # blue

# Columns to compare between classes, restricted to those present.
compare_cols = ["ip_reputation", "txn_count_5m", "geo_distance_km",
                "account_age_days", "amount_zscore"]
compare_cols = [c for c in compare_cols if c in df_enriched.columns]

fraud = df_enriched[df_enriched[target_col] == 1]
legit = df_enriched[df_enriched[target_col] == 0]

for col in compare_cols:
    # Legit rows are subsampled (at most 5000, fixed seed); fraud rows are all
    # used. The sample is drawn before NaNs are dropped so the same rows are
    # picked as a plain `.sample(...)` on the column would pick.
    legit_vals = legit[col].sample(min(5000, len(legit)), random_state=42).dropna()
    fraud_vals = fraud[col].dropna()

    pooled = pd.concat([legit_vals, fraud_vals])
    if pooled.empty:
        print("skipped (no data):", col)
        continue

    # Both classes must share ONE set of bin edges; binning each series
    # separately makes the overlaid bars cover different x-intervals.
    bin_edges = np.histogram_bin_edges(pooled.to_numpy(dtype=float), bins=60)

    fig, ax = plt.subplots(figsize=(6, 4))
    for values, color, label in (
        (legit_vals, COLOR_LEGIT, "Legitimate"),
        (fraud_vals, COLOR_FRAUD, "Fraud"),
    ):
        if values.empty:
            continue  # nothing to draw; also avoids normalizing an empty histogram
        # density=True: the two groups have different sizes (legit is a
        # subsample), so raw counts would not be comparable.
        values.hist(
            bins=bin_edges,
            density=True,
            alpha=0.7,
            color=color,
            label=label,
            ax=ax,
        )

    ax.set_title(f"Legitimate vs Fraud — {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Density")
    ax.legend()

    out = FIG_DIR / f"compare_legit_fraud_{col}.png"
    fig.savefig(out, dpi=200, bbox_inches="tight")
    plt.close(fig)

    print("saved:", out.name)


saved: compare_legit_fraud_ip_reputation.png
saved: compare_legit_fraud_txn_count_5m.png
saved: compare_legit_fraud_geo_distance_km.png
saved: compare_legit_fraud_account_age_days.png
saved: compare_legit_fraud_amount_zscore.png


In [10]:
# Processed feature names follow the pipeline's prefix convention:
# ColumnTransformer("num" -> scaler, "cat" -> onehot) typically yields names
# like num__Amount, cat__browser_Chrome, ...
proc_cols = proc_feature_cols

# Split the processed columns by prefix in a single pass (the two prefixes are
# mutually exclusive, so each column lands in at most one list).
num_like, cat_like = [], []
for name in proc_cols:
    if name.startswith("num__"):
        num_like.append(name)
    elif name.startswith("cat__"):
        cat_like.append(name)

print("Processed columns total:", len(proc_cols))
print("num__ columns:", len(num_like))
print("cat__ columns:", len(cat_like))

# Top 30 one-hot flags by column mean; for a 0/1 indicator the mean is the
# share of rows where the flag is active.
if cat_like:
    flag_means = df_proc[cat_like].mean(axis=0)
    means = flag_means.sort_values(ascending=False).iloc[:30]
    display(means)


Processed columns total: 102
num__ columns: 39
cat__ columns: 63


cat__is_new_device_False       0.949219
cat__country_mismatch_True     0.932907
cat__is_proxy_vpn_False        0.929892
cat__night_txn_False           0.849441
cat__weekend_txn_True          0.726239
cat__weekend_txn_False         0.273761
cat__device_os_Windows         0.251595
cat__browser_Chrome            0.251520
cat__browser_Firefox           0.250607
cat__browser_Edge              0.250080
cat__device_os_iOS             0.249759
cat__device_os_MacOS           0.249529
cat__device_os_Android         0.249117
cat__browser_Safari            0.247793
cat__night_txn_True            0.150559
cat__is_proxy_vpn_True         0.070108
cat__shipping_country_IN       0.067911
cat__billing_country_NL        0.067615
cat__shipping_country_US       0.067565
cat__ip_country_IT             0.067510
cat__ip_country_BR             0.067505
cat__billing_country_PT        0.067414
cat__ip_country_IN             0.067374
cat__shipping_country_IT       0.067184
cat__billing_country_DE        0.067108


In [11]:
# Correlation in a wide one-hot space can be heavy, so keep it small:
#   * candidates = all num__ columns + the (up to 200) most frequent cat__ flags;
#   * correlate each candidate with the target ONLY (corrwith), instead of
#     building the full candidate-by-candidate matrix and reading one column.

cand = list(num_like)
if cat_like:
    # take only the most frequent one-hot flags to keep it stable
    top_flags = (
        df_proc[cat_like]
        .mean(axis=0)
        .sort_values(ascending=False)
        .head(200)
        .index.tolist()
    )
    cand += top_flags

cand = [c for c in cand if c in df_proc.columns]

# |Pearson corr| of every candidate with the target. The result is named after
# the target so its display matches selecting that column from a full matrix.
corrs = (
    df_proc[cand]
    .corrwith(df_proc[target_col], numeric_only=True)
    .abs()
    .rename(target_col)
)
top_corr = corrs.sort_values(ascending=False).head(25)

fig, ax = plt.subplots()
top_corr[::-1].plot(kind="barh", ax=ax)  # reversed so the largest bar is on top
ax.set_title("Top |corr| with Class (processed feature space)")
ax.set_xlabel("|corr|")
out = FIG_DIR / "top_corr_processed.png"
fig.savefig(out, dpi=200, bbox_inches="tight")
plt.close(fig)

top_corr


num__V17                    0.383900
num__V14                    0.326204
num__V12                    0.276997
num__V10                    0.255859
num__V3                     0.244368
num__V7                     0.237799
num__V16                    0.234110
num__V11                    0.171057
num__V4                     0.147105
num__V18                    0.140459
num__V1                     0.130187
num__V5                     0.128628
num__V2                     0.114080
num__V9                     0.107261
cat__is_proxy_vpn_True      0.054250
cat__is_proxy_vpn_False     0.054250
num__account_age_days       0.053116
num__txn_count_60m          0.053069
num__V6                     0.048739
num__V21                    0.047855
num__ip_reputation          0.047067
num__V19                    0.042710
cat__is_new_device_False    0.039862
cat__is_new_device_True     0.039862
num__txn_count_30m          0.034166
Name: Class, dtype: float64

In [12]:
# One-row overview of dataset sizes before and after encoding.
# The enriched entries stay None when enriched.csv was not loaded.
summary = {"enriched_rows": None, "enriched_cols": None}
if df_enriched is not None:
    summary["enriched_rows"], summary["enriched_cols"] = (int(n) for n in df_enriched.shape)

summary.update(
    processed_train_rows=int(df_proc.shape[0]),
    processed_feature_cols=len(proc_feature_cols),
    processed_num_cols=len(num_like),
    processed_cat_cols=len(cat_like),
)

summary_df = pd.DataFrame([summary])
summary_df


Unnamed: 0,enriched_rows,enriched_cols,processed_train_rows,processed_feature_cols,processed_num_cols,processed_cat_cols
0,284807,52,199364,102,39,63
