In [1]:
from pathlib import Path
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ✅ set your repo root (folder that contains api/, data/, docs/, scripts/, src/)
REPO_ROOT = Path("/Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection").resolve()
os.chdir(REPO_ROOT)

print("CWD:", Path.cwd())

DATA_RAW = REPO_ROOT / "data/raw/creditcard.csv"
ENRICHED = REPO_ROOT / "data/processed/enriched.csv"

TRAIN_RAW = REPO_ROOT / "data/processed/train_raw.csv"
VAL_RAW   = REPO_ROOT / "data/processed/val_raw.csv"
TEST_RAW  = REPO_ROOT / "data/processed/test_raw.csv"

TRAIN = REPO_ROOT / "data/processed/train.csv"
VAL   = REPO_ROOT / "data/processed/val.csv"
TEST  = REPO_ROOT / "data/processed/test.csv"

TRAIN_NOSMOTE = REPO_ROOT / "data/processed/train_nosmote.csv"
VAL_NOSMOTE   = REPO_ROOT / "data/processed/val_nosmote.csv"
TEST_NOSMOTE  = REPO_ROOT / "data/processed/test_nosmote.csv"

PREPROCESSOR_PATH = REPO_ROOT / "artifacts/preprocess.joblib"
FEATURES_META_PATH = REPO_ROOT / "artifacts/features.json"

FIG_DIR = REPO_ROOT / "docs/figures/data/01_preprocessing_pipeline"
FIG_DIR.mkdir(parents=True, exist_ok=True)

for p in [DATA_RAW, ENRICHED]:
    print(p, "exists=", p.exists())
print("FIG_DIR:", FIG_DIR)


CWD: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection
/Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/data/raw/creditcard.csv exists= True
/Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/data/processed/enriched.csv exists= True
FIG_DIR: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/data/01_preprocessing_pipeline


In [2]:
if not ENRICHED.exists():
    raise FileNotFoundError(
        f"{ENRICHED} not found.\n"
        "Run: python scripts/enrich_synthetic.py --input data/raw/creditcard.csv --output data/processed/enriched.csv --seed 42"
    )

df = pd.read_csv(ENRICHED)
print("enriched shape:", df.shape)
df.head(3)


enriched shape: (284807, 52)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,account_age_days,token_age_days,avg_spend_user_30d,billing_country,shipping_country,geo_distance_km,country_mismatch,amount_zscore,night_txn,weekend_txn
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,112,352,17.88,SE,FR,1882.9,True,1.0,True,True
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,421,115,4.38,ES,CA,7059.7,True,-1.0,True,True
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,322,13,6.39,GB,PL,1529.7,True,1.0,True,True


In [3]:
target_col = "Class"
time_col = "Time"

print("Missing values total:", int(df.isna().sum().sum()))
print("Duplicates:", int(df.duplicated().sum()))

counts = df[target_col].value_counts()
fraud_ratio = counts.get(1, 0) / len(df)
print("Class counts:", counts.to_dict())
print("Fraud ratio:", fraud_ratio)


Missing values total: 0
Duplicates: 0
Class counts: {0: 284315, 1: 492}
Fraud ratio: 0.001727485630620034


In [4]:
df_sorted = df.sort_values(time_col).reset_index(drop=True)
n = len(df_sorted)

train_end = int(0.70 * n)
val_end = int(0.85 * n)

train_df_raw = df_sorted.iloc[:train_end].copy()
val_df_raw   = df_sorted.iloc[train_end:val_end].copy()
test_df_raw  = df_sorted.iloc[val_end:].copy()

print("raw split sizes:", len(train_df_raw), len(val_df_raw), len(test_df_raw))

train_df_raw.to_csv(TRAIN_RAW, index=False)
val_df_raw.to_csv(VAL_RAW, index=False)
test_df_raw.to_csv(TEST_RAW, index=False)

print("saved:", TRAIN_RAW.name, VAL_RAW.name, TEST_RAW.name)


raw split sizes: 199364 42721 42722
saved: train_raw.csv val_raw.csv test_raw.csv


In [5]:
# global monotonicity
is_sorted = (df_sorted[time_col].values[:-1] <= df_sorted[time_col].values[1:]).all()
print("Global time sorted:", bool(is_sorted))

# per device ordering violations (should be 0)
if "device_id" in df_sorted.columns:
    violations = 0
    for _, g in df_sorted.groupby("device_id"):
        t = g[time_col].values
        if len(t) > 1 and not (t[:-1] <= t[1:]).all():
            violations += 1
    print("Time-order violations per device:", violations)
else:
    print("device_id not present - skipping per-device check")


Global time sorted: True
Time-order violations per device: 0


In [6]:
exclude_cols = {target_col, time_col, "device_id"}
feature_cols = [c for c in train_df_raw.columns if c not in exclude_cols]

cat_cols = train_df_raw[feature_cols].select_dtypes(include=["object", "category", "bool"]).columns.tolist()
num_cols = [c for c in feature_cols if c not in cat_cols]

print("num cols:", len(num_cols))
print("cat cols:", len(cat_cols))
print("example cat cols:", cat_cols[:10])


num cols: 39
cat cols: 10
example cat cols: ['device_os', 'browser', 'is_new_device', 'ip_country', 'is_proxy_vpn', 'billing_country', 'shipping_country', 'country_mismatch', 'night_txn', 'weekend_txn']


In [7]:
import joblib
from src.preprocess.preprocess_pipeline import build_preprocess_pipeline

X_train_raw = train_df_raw[num_cols + cat_cols]
y_train = train_df_raw[target_col].astype(int)

X_val_raw = val_df_raw[num_cols + cat_cols]
y_val = val_df_raw[target_col].astype(int)

X_test_raw = test_df_raw[num_cols + cat_cols]
y_test = test_df_raw[target_col].astype(int)

preprocessor = build_preprocess_pipeline(
    categorical_features=cat_cols,
    numerical_features=num_cols,
)

X_train_proc = preprocessor.fit_transform(X_train_raw)
X_val_proc   = preprocessor.transform(X_val_raw)
X_test_proc  = preprocessor.transform(X_test_raw)

try:
    feature_names = preprocessor.get_feature_names_out().tolist()
except Exception:
    feature_names = [f"feature_{i}" for i in range(X_train_proc.shape[1])]

print("Processed shapes:", X_train_proc.shape, X_val_proc.shape, X_test_proc.shape)
print("feature_names:", len(feature_names))


Processed shapes: (199364, 102) (42721, 102) (42722, 102)
feature_names: 102


In [8]:
def df_from_transformed(X_transformed, y, feature_names):
    if hasattr(X_transformed, "toarray"):
        X_dense = X_transformed.toarray()
    else:
        X_dense = X_transformed
    out = pd.DataFrame(X_dense, columns=feature_names)
    out[target_col] = pd.Series(y).reset_index(drop=True).astype(int)
    return out

train_nosmote_df = df_from_transformed(X_train_proc, y_train, feature_names)
val_df_proc = df_from_transformed(X_val_proc, y_val, feature_names)
test_df_proc = df_from_transformed(X_test_proc, y_test, feature_names)

train_nosmote_df.to_csv(TRAIN_NOSMOTE, index=False)
val_df_proc.to_csv(VAL_NOSMOTE, index=False)
test_df_proc.to_csv(TEST_NOSMOTE, index=False)

print("saved:", TRAIN_NOSMOTE.name, VAL_NOSMOTE.name, TEST_NOSMOTE.name)


saved: train_nosmote.csv val_nosmote.csv test_nosmote.csv


In [9]:
from imblearn.over_sampling import SMOTE

use_smote = True

if use_smote:
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(
        train_nosmote_df.drop(columns=[target_col]),
        train_nosmote_df[target_col],
    )
    train_df_final = pd.DataFrame(X_res, columns=feature_names)
    train_df_final[target_col] = y_res.astype(int)
else:
    train_df_final = train_nosmote_df.copy()

train_df_final.to_csv(TRAIN, index=False)
val_df_proc.to_csv(VAL, index=False)
test_df_proc.to_csv(TEST, index=False)

print("saved processed:", TRAIN.name, VAL.name, TEST.name)
print("train class distribution (after smote):", train_df_final[target_col].value_counts().to_dict())


  train_df_final[target_col] = y_res.astype(int)


saved processed: train.csv val.csv test.csv
train class distribution (after smote): {0: 198980, 1: 198980}


In [10]:
joblib.dump(preprocessor, PREPROCESSOR_PATH)

meta = {
    "target_column": target_col,
    "time_column": time_col,
    "categorical_features": cat_cols,
    "numerical_features": num_cols,
    "feature_names_after_preprocessing": feature_names,
    "train_rows_before_smote": int(len(train_nosmote_df)),
    "train_rows_after_smote": int(len(train_df_final)),
    "val_rows": int(len(val_df_proc)),
    "test_rows": int(len(test_df_proc)),
    "use_smote": bool(use_smote),
    "paths": {
        "train_raw": str(TRAIN_RAW),
        "val_raw": str(VAL_RAW),
        "test_raw": str(TEST_RAW),
        "train_nosmote": str(TRAIN_NOSMOTE),
        "val_nosmote": str(VAL_NOSMOTE),
        "test_nosmote": str(TEST_NOSMOTE),
        "train": str(TRAIN),
        "val": str(VAL),
        "test": str(TEST),
        "preprocess_artifact": str(PREPROCESSOR_PATH),
    },
}

FEATURES_META_PATH.write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("saved:", PREPROCESSOR_PATH, FEATURES_META_PATH)


saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/artifacts/preprocess.joblib /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/artifacts/features.json


In [11]:
# split size bar
sizes = pd.Series({
    "train_raw": len(train_df_raw),
    "val_raw": len(val_df_raw),
    "test_raw": len(test_df_raw),
})
plt.figure()
sizes.plot(kind="bar")
plt.title("Time-based split sizes (70/15/15)")
plt.ylabel("Rows")
plt.tight_layout()
out1 = FIG_DIR / "split_sizes.png"
plt.savefig(out1, dpi=200)
plt.close()
print("saved:", out1)

# class distribution before/after SMOTE
before = train_nosmote_df[target_col].value_counts().sort_index()
after  = train_df_final[target_col].value_counts().sort_index()

dist = pd.DataFrame({"before_smote": before, "after_smote": after}).fillna(0).astype(int)
plt.figure()
dist.plot(kind="bar")
plt.title("Train class distribution (before vs after SMOTE)")
plt.ylabel("Rows")
plt.tight_layout()
out2 = FIG_DIR / "train_class_distribution_smote.png"
plt.savefig(out2, dpi=200)
plt.close()
print("saved:", out2)

dist


saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/data/01_preprocessing_pipeline/split_sizes.png
saved: /Users/lavanyasrinivas/Documents/AI-First-Preauth-Fraud-Detection/AI-First-Preauth-Fraud-Detection/docs/figures/data/01_preprocessing_pipeline/train_class_distribution_smote.png


Unnamed: 0_level_0,before_smote,after_smote
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,198980,198980
1,384,198980


<Figure size 640x480 with 0 Axes>

In [12]:
# This is a lightweight “no overlap” check by hashing a few stable columns.
# (We don't use Class for overlap test; we use Time+Amount+device_id if available.)

cols = ["Time", "Amount"]
if "device_id" in df.columns:
    cols.append("device_id")

def hash_rows(d):
    return pd.util.hash_pandas_object(d[cols], index=False)

h_train = set(hash_rows(train_df_raw).astype(int).tolist())
h_val   = set(hash_rows(val_df_raw).astype(int).tolist())
h_test  = set(hash_rows(test_df_raw).astype(int).tolist())

print("train∩val:", len(h_train & h_val))
print("train∩test:", len(h_train & h_test))
print("val∩test:", len(h_val & h_test))


train∩val: 0
train∩test: 0
val∩test: 0
