In [5]:
#############################################################
# FRAUDGEN v3 ‚Äî FINAL PIPELINE (CPU, CTGAN, VALIDATION, ANOMALY)
#############################################################

import os, warnings, json, random
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score
)
from sklearn.decomposition import PCA
from scipy.stats import ks_2samp, wasserstein_distance

import matplotlib
matplotlib.use("Agg")  # non-interactive backend, avoids GUI issues
import matplotlib.pyplot as plt
import seaborn as sns

import joblib

# ============================================================
# CONFIG
# ============================================================
CONFIG = {
    "RAW_TRANS": "IEEE Primary Data/train_transaction.csv",  # input
    "PROCESSED": "Processed/train_processed.csv",
    "SYNTHETIC": "Synthetic/synthetic_fraud_v3.csv",
    "MODELS_DIR": "Models",
    "REPORTS_DIR": "Reports",
    "PLOTS_DIR": "Distribution_Plots",
    "N_SYNTH": 200_000,
    "CTGAN_EPOCHS": 15,
    "CTGAN_BATCH": 128,
    "CTGAN_PAC": 1,
    "SEED": 42,
}

for d in ["Processed", "Synthetic", "Models", "Reports", "Distribution_Plots"]:
    os.makedirs(d, exist_ok=True)

SEED = CONFIG["SEED"]
random.seed(SEED)
np.random.seed(SEED)

# ============================================================
# 0) PATCH JOBLIB TO FORCE SERIAL (CPU / WINDOWS SAFE)
# ============================================================
from joblib import Parallel as JoblibParallel

class SerialParallel(JoblibParallel):
    def __init__(self, *args, **kwargs):
        kwargs["n_jobs"] = 1
        super().__init__(*args, **kwargs)

joblib.Parallel = SerialParallel  # override BEFORE importing ctgan

# Torch seed (if available)
try:
    import torch
    torch.manual_seed(SEED)
except Exception:
    torch = None

# CTGAN
from ctgan import CTGAN


# ============================================================
# 1) PREPROCESSING
# ============================================================
def preprocess_transactions():
    print("[INFO] Loading raw transaction data...")
    df = pd.read_csv(CONFIG["RAW_TRANS"])

    if "isFraud" not in df.columns:
        raise KeyError("Column 'isFraud' not found in dataset.")

    # Drop columns with >70% missing
    miss_ratio = df.isna().mean()
    drop_cols = miss_ratio[miss_ratio > 0.7].index.tolist()
    if drop_cols:
        print(f"[INFO] Dropping {len(drop_cols)} cols with >70% missing.")
        df = df.drop(columns=drop_cols)

    # Separate numeric vs categorical based on dtypes
    numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns.tolist()
    if "isFraud" in numeric_cols:
        numeric_cols.remove("isFraud")

    cat_cols = [c for c in df.columns if c not in numeric_cols + ["isFraud"]]

    print(f"[INFO] Numeric cols: {len(numeric_cols)}")
    print(f"[INFO] Categorical cols: {len(cat_cols)}")

    # Fill NaNs
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    for c in cat_cols:
        df[c] = df[c].astype(str).fillna("missing")

    # Scale numeric with QuantileTransformer
    qt = QuantileTransformer(
        n_quantiles=min(500, len(df)),
        output_distribution="normal",
        random_state=SEED
    )
    df[numeric_cols] = qt.fit_transform(df[numeric_cols])
    joblib.dump(qt, os.path.join(CONFIG["MODELS_DIR"], "quantile_transformer.pkl"))

    # Save processed
    df.to_csv(CONFIG["PROCESSED"], index=False)
    print("[OK] Processed data saved ->", CONFIG["PROCESSED"])

    return df, numeric_cols, cat_cols


df, numeric_cols, cat_cols = preprocess_transactions()

# Split fraud / non-fraud after preprocessing
fraud_df = df[df["isFraud"] == 1].reset_index(drop=True)
legit_df = df[df["isFraud"] == 0].reset_index(drop=True)
print(f"[INFO] Fraud rows: {len(fraud_df)}, Legit rows: {len(legit_df)}")

if len(fraud_df) < 500:
    print("[WARN] Very few fraud samples. CTGAN may be unstable.")


# ============================================================
# 2) PREPARE DATA FOR CTGAN
# ============================================================
# CTGAN expects a DataFrame (no target), plus discrete column list

# Discrete columns = categorical columns (non-numeric), with >1 unique
discrete_cols = []
for c in cat_cols:
    if fraud_df[c].nunique() > 1:
        discrete_cols.append(c)

# Ensure discrete cols not empty (CTGAN can handle all continuous, but we keep this for clarity)
print(f"[INFO] CTGAN discrete columns: {len(discrete_cols)}")

# Training data for CTGAN: ONLY fraud rows, NO label
ctgan_train = fraud_df.drop(columns=["isFraud"]).copy()


# ============================================================
# 3) TRAIN CTGAN
# ============================================================
def train_ctgan(train_df, discrete_columns):
    print("[INFO] Training CTGAN...")
    ctgan = CTGAN(
        epochs=CONFIG["CTGAN_EPOCHS"],
        batch_size=CONFIG["CTGAN_BATCH"],
        pac=CONFIG["CTGAN_PAC"],
        verbose=True
    )
    # Patch transformer to avoid leftover state
    ctgan._data_transformer = None
    ctgan.fit(train_df, discrete_columns=discrete_columns)
    model_path = os.path.join(CONFIG["MODELS_DIR"], "ctgan_fraudgen_v3.pkl")
    joblib.dump(ctgan, model_path)
    print("[OK] CTGAN model saved ->", model_path)
    return ctgan


ctgan = train_ctgan(ctgan_train, discrete_columns=discrete_cols)


# ============================================================
# 4) GENERATE SYNTHETIC FRAUD
# ============================================================
print("[INFO] Generating synthetic fraud samples...")
synthetic = ctgan.sample(CONFIG["N_SYNTH"])
synthetic["isFraud"] = 1

# Reorder columns to match df
cols_order = [c for c in df.columns if c in synthetic.columns]
synthetic = synthetic[cols_order]

synth_path = CONFIG["SYNTHETIC"]
synthetic.to_csv(synth_path, index=False)
print("[OK] Synthetic data saved ->", synth_path)


# ============================================================
# 5) SYNTHETIC VALIDATION SYSTEM
# ============================================================
reports = {}

# Use fraud_df (real fraud) vs synthetic (all fraud by design)
real_fraud = fraud_df.copy()
synthetic_fraud = synthetic.copy()

# ---------- KS & Wasserstein for numeric features ----------
rows = []
for c in numeric_cols:
    try:
        r = real_fraud[c].astype(float)
        s = synthetic_fraud[c].astype(float)
        ks = ks_2samp(r, s).statistic
        ws = wasserstein_distance(r, s)
        rows.append([c, ks, ws])
    except Exception:
        continue

ks_ws_df = pd.DataFrame(rows, columns=["Feature", "KS", "Wasserstein"])
ks_ws_df.to_csv(os.path.join(CONFIG["REPORTS_DIR"], "validation_ks_wasserstein.csv"), index=False)

reports["mean_KS"] = float(ks_ws_df["KS"].mean())
reports["mean_Wasserstein"] = float(ks_ws_df["Wasserstein"].mean())

# ---------- PCA distance (distribution overlap check) ----------
pca = PCA(n_components=2, random_state=SEED)
X_real = real_fraud[numeric_cols].values
X_synth = synthetic_fraud[numeric_cols].values
X_combined = np.vstack([X_real, X_synth])
X_pca = pca.fit_transform(X_combined)
real_pca = X_pca[: len(X_real)]
synth_pca = X_pca[len(X_real):]

pca_distance = float(np.linalg.norm(real_pca.mean(axis=0) - synth_pca.mean(axis=0)))
reports["PCA_distance"] = pca_distance

# ---------- Classifier Two-Sample Test (CTST) ----------
labels = np.concatenate([
    np.zeros(len(X_real), dtype=int),
    np.ones(len(X_synth), dtype=int)
])
X_train_ctst, X_test_ctst, y_train_ctst, y_test_ctst = train_test_split(
    X_combined, labels, test_size=0.3, random_state=SEED, stratify=labels
)

rf_ctst = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=SEED
)
rf_ctst.fit(X_train_ctst, y_train_ctst)
probs_ctst = rf_ctst.predict_proba(X_test_ctst)[:, 1]
auc_ctst = roc_auc_score(y_test_ctst, probs_ctst)
reports["CTST_AUC"] = float(auc_ctst)

# ---------- Correlation matrix Frobenius norm ----------
corr_real = real_fraud[numeric_cols].corr().values
corr_synth = synthetic_fraud[numeric_cols].corr().values
corr_diff = float(np.linalg.norm(corr_real - corr_synth, ord="fro"))
reports["Corr_Frobenius"] = corr_diff

# Save validation summary
with open(os.path.join(CONFIG["REPORTS_DIR"], "synthetic_validation_summary.json"), "w") as f:
    json.dump(reports, f, indent=2)

print("[OK] Synthetic validation metrics saved.")


# ============================================================
# 6) FRAUD DETECTION MODEL: REAL vs REAL+SYNTH
# ============================================================
print("[INFO] Training fraud classifiers (baseline vs synthetic-aug)...")

features = numeric_cols  # Use only numeric for RF

X = df[features]
y = df["isFraud"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# ---- Baseline model: Real only ----
rf_real = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    n_jobs=-1,
    random_state=SEED
)
rf_real.fit(X_train, y_train)
probs_real = rf_real.predict_proba(X_test)[:, 1]
pred_real = (probs_real >= 0.5).astype(int)

metrics_real = {
    "AUC": roc_auc_score(y_test, probs_real),
    "F1": f1_score(y_test, pred_real),
    "Precision": precision_score(y_test, pred_real),
    "Recall": recall_score(y_test, pred_real),
}
pd.DataFrame([metrics_real]).to_csv(
    os.path.join(CONFIG["REPORTS_DIR"], "rf_baseline_metrics.csv"),
    index=False
)

# ---- Synthetic-augmented model ----
X_synth_aug = synthetic_fraud[features]
y_synth_aug = np.ones(len(X_synth_aug), dtype=int)

X_train_aug = pd.concat([X_train, X_synth_aug], axis=0).reset_index(drop=True)
y_train_aug = pd.concat([y_train, pd.Series(y_synth_aug)], axis=0).reset_index(drop=True)

rf_aug = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    n_jobs=-1,
    random_state=SEED
)
rf_aug.fit(X_train_aug, y_train_aug)
probs_aug = rf_aug.predict_proba(X_test)[:, 1]
pred_aug = (probs_aug >= 0.5).astype(int)

metrics_aug = {
    "AUC": roc_auc_score(y_test, probs_aug),
    "F1": f1_score(y_test, pred_aug),
    "Precision": precision_score(y_test, pred_aug),
    "Recall": recall_score(y_test, pred_aug),
}
pd.DataFrame([metrics_aug]).to_csv(
    os.path.join(CONFIG["REPORTS_DIR"], "rf_synth_aug_metrics.csv"),
    index=False
)

print("[OK] Fraud detection metrics (baseline vs augmented) saved.")


# ============================================================
# 7) ANOMALY DETECTION WITH ISOLATION FOREST
# ============================================================
print("[INFO] Training IsolationForest anomaly detector...")

iso = IsolationForest(
    n_estimators=200,
    contamination=0.03,
    random_state=SEED,
    n_jobs=-1
)

X_legit = legit_df[features]
iso.fit(X_legit)

scores_test = -iso.score_samples(X_test)  # higher = more anomalous
auc_iso = roc_auc_score(y_test, scores_test)

anomaly_report = {
    "IsolationForest_AUC": float(auc_iso)
}
with open(os.path.join(CONFIG["REPORTS_DIR"], "anomaly_iforest_metrics.json"), "w") as f:
    json.dump(anomaly_report, f, indent=2)

print("[OK] Anomaly detection metrics saved.")


# ============================================================
# 8) DISTRIBUTION PLOTS FOR TOP FEATURES
# ============================================================
print("[INFO] Saving KDE distribution plots for top numeric features...")

# Take features with lowest KS (closest real vs synth)
top_for_plots = ks_ws_df.sort_values("KS").head(6)["Feature"].tolist()

for col in top_for_plots:
    plt.figure(figsize=(6, 3))
    sns.kdeplot(real_fraud[col], label="Real Fraud", fill=True, alpha=0.4)
    sns.kdeplot(synthetic_fraud[col], label="Synthetic Fraud", fill=True, alpha=0.4)
    plt.title(f"Real vs Synthetic ‚Äî {col}")
    plt.legend()
    out_path = os.path.join(CONFIG["PLOTS_DIR"], f"{col}_kde.png")
    plt.savefig(out_path, dpi=200, bbox_inches="tight")
    plt.close()

print("üéâ DONE. FRAUDGEN v3 PIPELINE FINISHED SUCCESSFULLY.")


[INFO] Loading raw transaction data...
[INFO] Dropping 168 cols with >70% missing.
[INFO] Numeric cols: 212
[INFO] Categorical cols: 13
[OK] Processed data saved -> Processed/train_processed.csv
[INFO] Fraud rows: 20663, Legit rows: 569877
[INFO] CTGAN discrete columns: 13
[INFO] Training CTGAN...
Epoch 1, Loss G:  0.7285,Loss D: -0.7536
Epoch 2, Loss G:  0.1313,Loss D: -1.2873
Epoch 3, Loss G: -0.8933,Loss D: -0.9818
Epoch 4, Loss G: -0.6472,Loss D: -0.8660
Epoch 5, Loss G:  0.1004,Loss D: -1.1312
Epoch 6, Loss G: -0.0793,Loss D: -0.7246
Epoch 7, Loss G: -0.3549,Loss D: -0.4958
Epoch 8, Loss G: -1.9143,Loss D: -0.3350
Epoch 9, Loss G: -1.0340,Loss D: -0.7519
Epoch 10, Loss G: -1.0568,Loss D: -0.5028
Epoch 11, Loss G: -0.7876,Loss D: -0.6470
Epoch 12, Loss G: -0.8281,Loss D: -0.7404
Epoch 13, Loss G: -0.6912,Loss D: -0.4781
Epoch 14, Loss G: -0.5377,Loss D: -0.6898
Epoch 15, Loss G: -0.8907,Loss D: -0.8770
[OK] CTGAN model saved -> Models\ctgan_fraudgen_v3.pkl
[INFO] Generating synthet

In [7]:
import pandas as pd
df = pd.read_csv("Processed/train_preprocessed.csv")
print("Fraud count =", df[df["isFraud"]==1].shape[0])


Fraud count = 20663


In [17]:
import pandas as pd
df = pd.read_csv("Synthetic/synthetic_fraud_v3.csv")
print("Fraud count =", df[df["isFraud"]==1].shape[0])

Fraud count = 200000


In [19]:
"""
FraudGen ‚Äì Performance & Synthetic Data Validation Report Generator
-------------------------------------------------------------------
This script:
  - Loads existing CSV/JSON metric files from your project
  - Creates modern-style matplotlib plots
  - Generates a multi-page PDF report:
      * Title & summary page
      * Synthetic data quality
      * Model performance (baseline vs synthetic-augmented)
      * Anomaly detection (Isolation Forest)
  - Output: FraudGen_Performance_Report.pdf

Run inside your `fraudgen` conda env:

    python generate_fraudgen_report.py

You can tweak paths in the CONFIG block if your files live elsewhere.
"""

import os
import json
import textwrap

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# =====================================================
# CONFIG ‚Äì EDIT PATHS HERE IF NEEDED
# =====================================================

CONFIG = {
    "BASE_DIR": ".",  # project root
    "OUTPUT_PDF": "FraudGen_Performance_Report.pdf",

    # metrics files
    "ANOMALY_JSON": "anomaly_iforest_metrics.json",
    "SYNTH_VAL_JSON": "synthetic_validation_summary.json",  # optional
    "SYNTH_VAL_CSV": "synthetic_data_validation_summary.csv",
    "KS_WS_CSV": "validation_ks_wasserstein.csv",
    "PEARSON_FEATS_CSV": "pearson_feature_ranking.csv",
    "RF_BASELINE_CSV": "rf_baseline_metrics.csv",
    "RF_SYNTH_CSV": "rf_synth_aug_metrics.csv",
    "CTGAN_METRICS_CSV": "performance_metrics_ctgan.csv",  # optional
}

# =====================================================
# HELPER ‚Äì MODERN MATPLOTLIB STYLE
# =====================================================

def set_modern_style():
    plt.style.use("default")
    plt.rcParams.update({
        "figure.facecolor": "white",
        "axes.facecolor": "#f7f7f7",
        "axes.grid": True,
        "grid.alpha": 0.3,
        "axes.edgecolor": "#cccccc",
        "axes.titleweight": "bold",
        "axes.titlesize": 12,
        "axes.labelsize": 10,
        "xtick.labelsize": 9,
        "ytick.labelsize": 9,
        "legend.fontsize": 9,
        "font.family": "sans-serif",
        "font.sans-serif": ["DejaVu Sans", "Arial", "Helvetica"],
    })


# nice palette for modern look (you asked for this)
COLORS = {
    "baseline": "#1f77b4",
    "synth": "#ff7f0e",
    "other": "#2ca02c",
}

# =====================================================
# SAFE LOADERS
# =====================================================

def load_csv_safe(path):
    if not os.path.exists(path):
        print(f"[WARN] CSV not found: {path} (skipping)")
        return None
    try:
        return pd.read_csv(path)
    except Exception as e:
        print(f"[ERROR] Failed to read CSV {path}: {e}")
        return None

def load_json_safe(path):
    if not os.path.exists(path):
        print(f"[WARN] JSON not found: {path} (skipping)")
        return None
    try:
        with open(path, "r") as f:
            return json.load(f)
    except Exception as e:
        print(f"[ERROR] Failed to read JSON {path}: {e}")
        return None

# =====================================================
# 1) TITLE & SUMMARY PAGE
# =====================================================

def add_title_page(pdf, synth_summary_df, ctgan_metrics_df):
    set_modern_style()
    fig, ax = plt.subplots(figsize=(8.27, 11.69))  # A4-ish

    ax.axis("off")

    title = "FraudGen: Synthetic Data and Anomaly Detection\nfor Banking Transactions"
    subtitle = "Performance, Synthetic Data Validation, and Anomaly Detection Report"

    y = 0.9
    ax.text(0.5, y, title, ha="center", va="top", fontsize=18, weight="bold")
    y -= 0.05
    ax.text(0.5, y, subtitle, ha="center", va="top", fontsize=11, color="#555555")

    y -= 0.08

    # Synthetic quality quick summary (if available)
    if synth_summary_df is not None and not synth_summary_df.empty:
        summary = synth_summary_df.iloc[0].to_dict()
        lines = []
        if "Mean_KS" in summary:
            lines.append(f"Mean KS Statistic (numeric features): {summary['Mean_KS']:.4f}")
        if "Mean_Wasserstein" in summary:
            lines.append(f"Mean Wasserstein Distance: {summary['Mean_Wasserstein']:.4f}")
        if "CTST_AUC" in summary:
            lines.append(f"Classifier Two-Sample Test AUC: {summary['CTST_AUC']:.3f}")
        if "TSTR_F1" in summary:
            lines.append(f"TSTR F1 (train synthetic, test real): {summary['TSTR_F1']:.3f}")

        if lines:
            ax.text(0.1, y, "Synthetic Data Quality (Summary):", fontsize=11, weight="bold")
            y -= 0.03
            for line in lines:
                ax.text(0.12, y, "‚Ä¢ " + line, fontsize=10)
                y -= 0.025

    # CTGAN training stats, if present
    if ctgan_metrics_df is not None and not ctgan_metrics_df.empty:
        ct = ctgan_metrics_df.iloc[0].to_dict()
        lines = []
        for k in ["epochs", "batch_size", "train_time_minutes"]:
            if k in ct:
                lines.append(f"{k.replace('_', ' ').title()}: {ct[k]}")
        if lines:
            y -= 0.03
            ax.text(0.1, y, "CTGAN Training Summary:", fontsize=11, weight="bold")
            y -= 0.03
            for line in lines:
                ax.text(0.12, y, "‚Ä¢ " + str(line), fontsize=10)
                y -= 0.025

    # High-level narrative block
    y -= 0.04
    paragraph = (
        "This report summarizes the performance of the FraudGen system, which generates "
        "synthetic fraudulent banking transactions using a CTGAN-based model and evaluates "
        "their impact on downstream fraud and anomaly detection. The analysis compares base "
        "models trained on imbalanced real data against models trained or augmented with "
        "synthetic fraud, and reports gains or trade-offs in F1, precision, recall, and "
        "anomaly detection performance."
    )
    wrapped = textwrap.fill(paragraph, width=90)
    ax.text(0.1, y, wrapped, fontsize=10, va="top")

    pdf.savefig(fig)
    plt.close(fig)


# =====================================================
# 2) SYNTHETIC DATA QUALITY PLOTS
# =====================================================

def add_ks_wasserstein_page(pdf, ks_df):
    if ks_df is None or ks_df.empty:
        return

    set_modern_style()
    fig, ax = plt.subplots(figsize=(8, 5))

    # Expect columns like: Feature, KS_Statistic, Wasserstein_Distance
    cols = ks_df.columns.str.lower()
    ks_col = None
    ws_col = None
    for c in ks_df.columns:
        cl = c.lower()
        if "ks" in cl and "stat" in cl:
            ks_col = c
        if "wasser" in cl:
            ws_col = c
    if ks_col is None and "KS" in ks_df.columns:
        ks_col = "KS"
    if ws_col is None and "Wasserstein" in ks_df.columns:
        ws_col = "Wasserstein"

    text_lines = []
    if ks_col is not None:
        ks_vals = ks_df[ks_col].astype(float)
        text_lines.append(f"Mean KS statistic: {ks_vals.mean():.4f}")
        text_lines.append(f"Median KS statistic: {ks_vals.median():.4f}")
        ax.hist(ks_vals, bins=30, color=COLORS["baseline"], alpha=0.8)
        ax.set_title("Distribution of KS Statistics Across Features")
        ax.set_xlabel("KS Statistic")
        ax.set_ylabel("Feature Count")
    else:
        ax.text(0.5, 0.5, "KS statistics column not found.", ha="center", va="center")

    fig2, ax2 = plt.subplots(figsize=(8, 5))
    if ws_col is not None:
        ws_vals = ks_df[ws_col].astype(float)
        text_lines.append(f"Mean Wasserstein distance: {ws_vals.mean():.4f}")
        text_lines.append(f"Median Wasserstein distance: {ws_vals.median():.4f}")
        ax2.hist(ws_vals, bins=30, color=COLORS["synth"], alpha=0.8)
        ax2.set_title("Distribution of Wasserstein Distances Across Features")
        ax2.set_xlabel("Wasserstein Distance")
        ax2.set_ylabel("Feature Count")
    else:
        ax2.text(0.5, 0.5, "Wasserstein column not found.", ha="center", va="center")

    # Add text summary at bottom of first fig
    if text_lines:
        fig.subplots_adjust(bottom=0.25)
        y0 = 0.02
        for line in text_lines:
            fig.text(0.01, y0, line, fontsize=9)
            y0 += 0.03

    pdf.savefig(fig)
    plt.close(fig)

    pdf.savefig(fig2)
    plt.close(fig2)


def add_top_feature_corr_page(pdf, pearson_df, top_k=10):
    if pearson_df is None or pearson_df.empty:
        return

    # Expect something like: Feature, Pearson_Corr
    cols = pearson_df.columns.str.lower()
    feat_col = pearson_df.columns[0]
    pearson_col = pearson_df.columns[1] if len(pearson_df.columns) > 1 else None

    for c in pearson_df.columns:
        cl = c.lower()
        if "feature" in cl or "name" in cl:
            feat_col = c
        if "pearson" in cl or "corr" in cl:
            pearson_col = c

    if pearson_col is None:
        return

    df = pearson_df.copy()
    df[pearson_col] = df[pearson_col].astype(float).abs()
    df_top = df.sort_values(pearson_col, ascending=False).head(top_k)

    set_modern_style()
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(df_top[feat_col], df_top[pearson_col], color=COLORS["other"])
    ax.invert_yaxis()
    ax.set_title(f"Top {top_k} Features by |Pearson Correlation| with isFraud")
    ax.set_xlabel("Absolute Pearson Correlation")
    pdf.savefig(fig)
    plt.close(fig)


# =====================================================
# 3) MODEL PERFORMANCE COMPARISON
# =====================================================

def add_model_performance_page(pdf, baseline_df, synth_df):
    if baseline_df is None or baseline_df.empty or synth_df is None or synth_df.empty:
        return

    # Assume single-row CSVs with metrics like F1, Precision, Recall, AUC
    base = baseline_df.iloc[0].to_dict()
    synth = synth_df.iloc[0].to_dict()

    # pick common numeric metrics
    metrics = []
    for m in ["F1", "F1_score", "f1", "Precision", "precision", "Recall", "recall", "AUC", "roc_auc"]:
        if m in base and m in synth:
            metrics.append(m)
    metrics = list(dict.fromkeys(metrics))  # unique & ordered

    if not metrics:
        print("[WARN] No common numeric metrics between baseline and synth metrics.")
        return

    set_modern_style()
    fig, ax = plt.subplots(figsize=(8, 5))

    x = np.arange(len(metrics))
    width = 0.35

    base_vals = [float(base[m]) for m in metrics]
    synth_vals = [float(synth[m]) for m in metrics]

    ax.bar(x - width/2, base_vals, width, label="Real-only Baseline", color=COLORS["baseline"], alpha=0.9)
    ax.bar(x + width/2, synth_vals, width, label="Real + Synthetic (FraudGen)", color=COLORS["synth"], alpha=0.9)

    ax.set_xticks(x)
    ax.set_xticklabels(metrics)
    ax.set_ylabel("Score")
    ax.set_ylim(0, max(base_vals + synth_vals) * 1.15)
    ax.set_title("Model Performance: Real vs Real+Synthetic (FraudGen)")
    ax.legend()

    # annotate bars
    for i, (bx, sx) in enumerate(zip(base_vals, synth_vals)):
        ax.text(x[i] - width/2, bx + 0.01, f"{bx:.3f}", ha="center", va="bottom", fontsize=8)
        ax.text(x[i] + width/2, sx + 0.01, f"{sx:.3f}", ha="center", va="bottom", fontsize=8)

    pdf.savefig(fig)
    plt.close(fig)


# =====================================================
# 4) ANOMALY DETECTION PAGE (ISOLATION FOREST)
# =====================================================

def add_anomaly_page(pdf, anomaly_json):
    if anomaly_json is None:
        return

    # Assume anomaly_json is dict of metric_name -> value
    metrics = {k: v for k, v in anomaly_json.items() if isinstance(v, (int, float))}
    if not metrics:
        return

    names = list(metrics.keys())
    vals = [float(metrics[k]) for k in names]

    set_modern_style()
    fig, ax = plt.subplots(figsize=(8, 5))
    x = np.arange(len(names))
    ax.bar(x, vals, color=COLORS["other"], alpha=0.9)
    ax.set_xticks(x)
    ax.set_xticklabels(names, rotation=30, ha="right")
    ax.set_ylabel("Score")
    ax.set_title("Anomaly Detection Performance (Isolation Forest)")

    for i, v in enumerate(vals):
        ax.text(i, v + 0.01, f"{v:.3f}", ha="center", va="bottom", fontsize=8)

    pdf.savefig(fig)
    plt.close(fig)


# =====================================================
# 5) MAIN
# =====================================================

def main():
    base = CONFIG["BASE_DIR"]

    # Load everything
    synth_summary_csv = load_csv_safe(os.path.join(base, CONFIG["SYNTH_VAL_CSV"]))
    synth_val_json = load_json_safe(os.path.join(base, CONFIG["SYNTH_VAL_JSON"]))
    ks_ws_df = load_csv_safe(os.path.join(base, CONFIG["KS_WS_CSV"]))
    pearson_df = load_csv_safe(os.path.join(base, CONFIG["PEARSON_FEATS_CSV"]))
    rf_base_df = load_csv_safe(os.path.join(base, CONFIG["RF_BASELINE_CSV"]))
    rf_synth_df = load_csv_safe(os.path.join(base, CONFIG["RF_SYNTH_CSV"]))
    ctgan_df = load_csv_safe(os.path.join(base, CONFIG["CTGAN_METRICS_CSV"]))
    anomaly_json = load_json_safe(os.path.join(base, CONFIG["ANOMALY_JSON"]))

    # If JSON version of synthetic summary exists but CSV doesn't have those columns,
    # you can enhance later. For now, we mainly use CSV.

    out_path = os.path.join(base, CONFIG["OUTPUT_PDF"])
    print(f"[INFO] Creating PDF report -> {out_path}")

    with PdfPages(out_path) as pdf:
        # 1) Title page
        add_title_page(pdf, synth_summary_csv, ctgan_df)

        # 2) Synthetic data quality pages: KS & Wasserstein + feature correlations
        add_ks_wasserstein_page(pdf, ks_ws_df)
        add_top_feature_corr_page(pdf, pearson_df, top_k=10)

        # 3) Model performance baseline vs synthetic
        add_model_performance_page(pdf, rf_base_df, rf_synth_df)

        # 4) Anomaly detection metrics
        add_anomaly_page(pdf, anomaly_json)

        # 5) Final short conclusions page
        set_modern_style()
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.axis("off")
        txt = """
        Conclusions

        ‚Ä¢ Synthetic fraud data generated by FraudGen was integrated into the model
          training pipeline and evaluated against a real-data-only baseline.
        ‚Ä¢ The validation metrics (KS, Wasserstein, and classifier two-sample tests)
          show the degree of alignment between real and synthetic distributions,
          while TSTR gives a task-level measure of utility.
        ‚Ä¢ The comparison between baseline and synthetic-augmented models shows how
          synthetic data affects F1, precision, recall, and AUC on real held-out data.
        ‚Ä¢ Anomaly detection metrics (Isolation Forest) indicate whether synthetic
          data helps the model learn more robust decision boundaries for rare fraud.
        
        This report can now be cited as the quantitative backbone of the FraudGen
        project, and the figures can be reused in your final course report, slides,
        or a future conference/journal submission.
        """
        ax.text(0.05, 0.95, "Summary & Conclusions", fontsize=14, weight="bold", va="top")
        ax.text(0.05, 0.9, textwrap.fill(txt, width=90), fontsize=10, va="top")
        pdf.savefig(fig)
        plt.close(fig)

    print(f"[OK] Report saved -> {out_path}")


if __name__ == "__main__":
    main()


[WARN] CSV not found: .\synthetic_data_validation_summary.csv (skipping)
[WARN] JSON not found: .\synthetic_validation_summary.json (skipping)
[WARN] CSV not found: .\validation_ks_wasserstein.csv (skipping)
[WARN] CSV not found: .\pearson_feature_ranking.csv (skipping)
[WARN] CSV not found: .\rf_baseline_metrics.csv (skipping)
[WARN] CSV not found: .\rf_synth_aug_metrics.csv (skipping)
[WARN] CSV not found: .\performance_metrics_ctgan.csv (skipping)
[WARN] JSON not found: .\anomaly_iforest_metrics.json (skipping)
[INFO] Creating PDF report -> .\FraudGen_Performance_Report.pdf
[OK] Report saved -> .\FraudGen_Performance_Report.pdf


In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ===============================
# LOAD VALIDATION + DATA
# ===============================
ks_path = "Reports/validation_ks_wasserstein.csv"
real_path = "Processed/train_preprocessed.csv"
synthetic_path = "Synthetic/synthetic_fraud_v3.csv"  # change if name differs

ks_df = pd.read_csv(ks_path)
real_df = pd.read_csv(real_path)
synthetic_df = pd.read_csv(synthetic_path)

# ===============================
# FIX COLUMN NAME FOR KS
# ===============================

# Try to find correct KS column even if mislabeled
possible_names = ["KS", "KS_Statistic", "Kolmogorov_Smirnov", "KS Value", "KS_value"]

ks_col = None
for col in ks_df.columns:
    if col in possible_names:
        ks_col = col
        break

# If no match, detect numeric column automatically
if ks_col is None:
    numeric_cols = ks_df.select_dtypes(include="number").columns
    if len(numeric_cols) >= 1:
        ks_col = numeric_cols[0]
        print(f"[WARN] Using numeric column '{ks_col}' as KS statistic")
    else:
        raise KeyError("‚ÄºÔ∏è ERROR: No numeric KS column found in validation file.")

print(f"[OK] Using KS column = '{ks_col}'")

# ===============================
# PICK TOP FEATURES (LOW KS = GOOD MATCH)
# ===============================
ks_df_sorted = ks_df.sort_values(by=ks_col)
top_features = ks_df_sorted["Feature"].head(8).tolist()

print("[INFO] BEST MATCHED FEATURES:", top_features)

# ===============================
# GENERATE KDE PLOTS
# ===============================

save_dir = "Plots"
os.makedirs(save_dir, exist_ok=True)

for col in top_features:
    if col not in real_df.columns or col not in synthetic_df.columns:
        print(f"[SKIP] Missing column: {col}")
        continue
    
    plt.figure(figsize=(6, 3))
    sns.kdeplot(real_df[col].dropna(), fill=True, label="Real", alpha=0.45)
    sns.kdeplot(synthetic_df[col].dropna(), fill=True, label="Synthetic", alpha=0.45)
    plt.title(f"KDE Comparison: {col}")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{save_dir}/kde_{col}.png", dpi=300)
    plt.close()

print(f"[DONE] Plots saved to folder: {save_dir}")


[OK] Using KS column = 'KS'
[INFO] BEST MATCHED FEATURES: ['TransactionDT', 'TransactionID', 'card2', 'card1', 'D4', 'addr1', 'TransactionAmt', 'C6']
[DONE] Plots saved to folder: Plots


In [33]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# =====================================================
# LOAD DATA (CHANGE PATHS IF NEEDED)
# =====================================================
real_df = pd.read_csv("Processed/train_preprocessed.csv")
synthetic_df = pd.read_csv("Synthetic/synthetic_fraud_v3.csv")

# Filter real fraud
real_fraud = real_df[real_df["isFraud"] == 1]

# Align common columns
common_cols = [c for c in real_fraud.columns if c in synthetic_df.columns and c != "isFraud"]
real_fraud = real_fraud[common_cols]
synthetic_df = synthetic_df[common_cols]

# =====================================================
# 1) HISTOGRAM OVERLAY COMPARISON
# =====================================================
numeric_cols = real_fraud.select_dtypes(include=[np.number]).columns[:8]  # top 8 for clarity

for col in numeric_cols:
    plt.figure(figsize=(7, 4))
    sns.histplot(real_fraud[col], color="blue", kde=True, stat="density", label="Real", alpha=0.5)
    sns.histplot(synthetic_df[col], color="orange", kde=True, stat="density", label="Synthetic", alpha=0.5)
    plt.title(f"Distribution Comparison: {col}", fontsize=14, weight='bold')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"Plots/hist_{col}.png", dpi=350)
    plt.close()

print("[OK] Histogram comparison plots saved in /Plots folder")

# =====================================================
# 2) CORRELATION HEATMAP COMPARISON
# =====================================================
plt.figure(figsize=(9, 6))
sns.heatmap(real_fraud.corr(), cmap="coolwarm", center=0, annot=False)
plt.title("Real Fraud Correlation Heatmap", fontsize=14, weight='bold')
plt.tight_layout()
plt.savefig("Plots/heatmap_real.png", dpi=350)
plt.close()

plt.figure(figsize=(9, 6))
sns.heatmap(synthetic_df.corr(), cmap="coolwarm", center=0, annot=False)
plt.title("Synthetic Fraud Correlation Heatmap", fontsize=14, weight='bold')
plt.tight_layout()
plt.savefig("Plots/heatmap_synth.png", dpi=350)
plt.close()

print("[OK] Correlation heatmaps saved")

# =====================================================
# 3) PCA DIMENSIONAL REDUCTION (SCATTER PLOT) [FIXED]
# =====================================================
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Select only numeric columns
numeric_real = real_fraud.select_dtypes(include=[np.number])
numeric_synth = synthetic_df.select_dtypes(include=[np.number])

# Standardize numeric features
scaler = StandardScaler()
scaled_real = scaler.fit_transform(numeric_real.fillna(0))
scaled_synth = scaler.transform(numeric_synth.fillna(0))

# PCA Projection
pca = PCA(n_components=2)
pca_real = pca.fit_transform(scaled_real)
pca_synth = pca.transform(scaled_synth)

# Plotting PCA comparison
plt.figure(figsize=(7, 5))
plt.scatter(pca_real[:, 0], pca_real[:, 1], alpha=0.4, s=10, label="Real Fraud", color="blue")
plt.scatter(pca_synth[:, 0], pca_synth[:, 1], alpha=0.4, s=10, label="Synthetic Fraud", color="orange")

plt.title("PCA Comparison: Real vs Synthetic Fraud", fontsize=14, weight='bold')
plt.legend()
plt.tight_layout()
plt.savefig("Plots/pca_comparison.png", dpi=350)
plt.close()

print("[OK] Fixed PCA plot generated successfully!")



[OK] Histogram comparison plots saved in /Plots folder
[OK] Correlation heatmaps saved
[OK] Fixed PCA plot generated successfully!
