# FraudGen — Optimized Notebook

This notebook is optimized for a local machine with **Ryzen 5980**, **4GB VRAM**, and **16GB RAM**. It includes reproducibility setup, preprocessing, feature selection, optimized CTGAN training, synthetic generation, fast validation, and an Optuna tuning module. Adjust the `CONFIG` cell as needed before running. Save models to the `Models/` folder to avoid retraining.

In [1]:

# CONFIG - adjust before running
CONFIG = {
    "BASE_PATH": "IEEE Primary Data",
    "PROCESSED_DIR": "Processed",
    "SYNTH_DIR": "Synthetic",
    "MODELS_DIR": "Models",
    "REPORTS_DIR": "Reports",
    "SEED": 42,
    "CTGAN_DEFAULT_BATCH": 256,
    "CTGAN_DEFAULT_EPOCHS": 60,
    "TOP_K_FEATURES": 160,
    "OPTUNA_TRIALS": 20,
    "OPTUNA_TIMEOUT": 60*60,
    "N_SYNTH_SAMPLES": 20000
}


In [3]:

# Reproducibility, device detection, and imports
import os, random, json, datetime, gc, math
import numpy as np, pandas as pd
SEED = CONFIG['SEED']
random.seed(SEED)
np.random.seed(SEED)

import torch
torch.manual_seed(SEED)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
if DEVICE == 'cuda':
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print("Device:", DEVICE)

import matplotlib.pyplot as plt, seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.neighbors import NearestNeighbors
from scipy.stats import ks_2samp, wasserstein_distance
import joblib


Device: cpu


In [4]:

BASE_PATH = CONFIG['BASE_PATH']
trans_path = os.path.join(BASE_PATH, "train_transaction.csv")
id_path = os.path.join(BASE_PATH, "train_identity.csv")
print("Loading from:", trans_path, id_path)
transaction = pd.read_csv(trans_path)
identity = pd.read_csv(id_path)
print("Transaction shape:", transaction.shape)
print("Identity shape:", identity.shape)

df = transaction.merge(identity, on='TransactionID', how='left')
print("Merged shape:", df.shape)

os.makedirs(CONFIG['PROCESSED_DIR'], exist_ok=True)
os.makedirs(CONFIG['SYNTH_DIR'], exist_ok=True)
os.makedirs(CONFIG['MODELS_DIR'], exist_ok=True)
os.makedirs(CONFIG['REPORTS_DIR'], exist_ok=True)


Loading from: IEEE Primary Data\train_transaction.csv IEEE Primary Data\train_identity.csv
Transaction shape: (590540, 394)
Identity shape: (144233, 41)
Merged shape: (590540, 434)


In [5]:

possible_categorical = [
    'ProductCD', 'card1','card2','card3','card4','card5','card6',
    'addr1','addr2','P_emaildomain','R_emaildomain',
    'M1','M2','M3','M4','M5','M6','M7','M8','M9',
    'DeviceType','DeviceInfo'
]
for i in range(12,39):
    col = f"id_{i}"
    if col in df.columns:
        possible_categorical.append(col)

categorical_cols = [c for c in possible_categorical if c in df.columns]
target = 'isFraud'
exclude = set(['TransactionID', target] + categorical_cols)
numeric_cols = [c for c in df.columns if c not in exclude and df[c].dtype in [np.float64, np.int64]]

print("Categorical (curated):", len(categorical_cols))
print("Numeric cols:", len(numeric_cols))


Categorical (curated): 49
Numeric cols: 383


In [6]:

missing = df.isnull().mean()
drop_high = list(missing[missing > 0.9].index)
df.drop(columns=drop_high, inplace=True, errors='ignore')
print("Dropped high-missing columns:", len(drop_high))

for c in numeric_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(df[c].median())
for c in categorical_cols:
    if c in df.columns:
        df[c] = df[c].astype(str).fillna('missing')

mi_sample = df.sample(n=min(100000, len(df)), random_state=SEED)
feat_pool = [c for c in (numeric_cols + categorical_cols) if c in mi_sample.columns]
X_mi = mi_sample[feat_pool].copy()
le_map = {}
for c in categorical_cols:
    if c in X_mi.columns:
        le = LabelEncoder()
        X_mi[c] = le.fit_transform(X_mi[c].astype(str))
        le_map[c] = le
y_mi = mi_sample[target].astype(int)
mi = mutual_info_classif(X_mi.fillna(0), y_mi, random_state=SEED)
mi_s = pd.Series(mi, index=X_mi.columns).sort_values(ascending=False)
K = CONFIG['TOP_K_FEATURES']
top_features = mi_s.index[:K].tolist()
print("Top features selected:", len(top_features))
with open(os.path.join(CONFIG['MODELS_DIR'],'top_features.json'),'w') as f:
    json.dump(top_features, f)


Dropped high-missing columns: 12
Top features selected: 160


In [7]:

top = top_features
ctgan_cols = [c for c in top if c in df.columns]
ctgan_df = df[ctgan_cols + [target]].copy()

num_in_ctgan = [c for c in ctgan_cols if c in numeric_cols]
scaler = MinMaxScaler()
if num_in_ctgan:
    ctgan_df[num_in_ctgan] = scaler.fit_transform(ctgan_df[num_in_ctgan])

discrete_columns = [c for c in categorical_cols if c in ctgan_cols]
print("CTGAN features:", len(ctgan_cols), "numeric:", len(num_in_ctgan), "discrete:", len(discrete_columns))

joblib.dump(scaler, os.path.join(CONFIG['MODELS_DIR'],'minmax_scaler.pkl'))
with open(os.path.join(CONFIG['MODELS_DIR'],'ctgan_cols.json'),'w') as f:
    json.dump({"ctgan_cols": ctgan_cols, "discrete_columns": discrete_columns}, f)


CTGAN features: 160 numeric: 129 discrete: 31


In [13]:

try:
    from sdv.tabular import CTGAN as CTGAN_impl
except Exception:
    try:
        from sdv.single_table.ctgan import CTGAN as CTGAN_impl
    except Exception:
        from ctgan import CTGAN as CTGAN_impl

ctgan_train_df = ctgan_df[ctgan_df[target]==1].drop(columns=[target]).reset_index(drop=True)
print("Fraud-only rows for CTGAN:", len(ctgan_train_df))

batch = CONFIG['CTGAN_DEFAULT_BATCH']
epochs = CONFIG['CTGAN_DEFAULT_EPOCHS']

ctgan = CTGAN_impl(
    epochs=epochs,
    batch_size= 128,
    generator_dim=(256,256),
    discriminator_dim=(256,128),
    generator_lr=2e-4,
    discriminator_lr=2e-4,
    pac=1,
    verbose=True
)
ctgan.random_state = SEED

ctgan.fit(ctgan_train_df, discrete_columns=discrete_columns)
joblib.dump(ctgan, os.path.join(CONFIG['MODELS_DIR'],'ctgan_optimized.pkl'))
print("CTGAN trained and saved.")


Fraud-only rows for CTGAN: 20663


Gen. (-0.26) | Discrim. (-0.26): 100%|███████████████████████████████████████████████| 60/60 [1:20:26<00:00, 80.44s/it]


CTGAN trained and saved.


In [15]:

import joblib
ctgan = joblib.load(os.path.join(CONFIG['MODELS_DIR'],'ctgan_optimized.pkl'))
n = CONFIG['N_SYNTH_SAMPLES']
synthetic = ctgan.sample(n)
synthetic[target] = 1

if num_in_ctgan:
    synthetic[num_in_ctgan] = scaler.inverse_transform(synthetic[num_in_ctgan])

synthetic.to_csv(os.path.join(CONFIG['SYNTH_DIR'],'synthetic_fraud_optimized.csv'), index=False)
print("Synthetic saved:", synthetic.shape)


  synthetic[target] = 1


Synthetic saved: (20000, 161)


In [16]:

real_all = df[df[target]==1][ctgan_cols].sample(n=min(5000, df[df[target]==1].shape[0]), random_state=SEED)
synth_sample = synthetic.sample(n=min(5000, len(synthetic)), random_state=SEED)

val_cols = [c for c in ctgan_cols if c in real_all.columns and c in synth_sample.columns]
real_val = real_all[val_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
synth_val = synth_sample[val_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

keep_cols = [c for c in val_cols if real_val[c].std()!=0 and synth_val[c].std()!=0]
real_val = real_val[keep_cols]; synth_val = synth_val[keep_cols]

pca = PCA(n_components=min(50, real_val.shape[1]), random_state=SEED)
real_p = pca.fit_transform(real_val)
synth_p = pca.transform(synth_val)

from sklearn.metrics.pairwise import rbf_kernel
def mmd_rbf(X, Y, gamma=None):
    if gamma is None:
        gamma = 1.0 / X.shape[1]
    Kxx = rbf_kernel(X, X, gamma=gamma); Kyy = rbf_kernel(Y, Y, gamma=gamma); Kxy = rbf_kernel(X, Y, gamma=gamma)
    return Kxx.mean() + Kyy.mean() - 2*Kxy.mean()

mmd_val = mmd_rbf(real_p, synth_p)
print("MMD (PCA):", mmd_val)

X = np.vstack([real_p, synth_p])
y = np.hstack([np.zeros(len(real_p)), np.ones(len(synth_p))])
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=SEED)
clf.fit(Xtr, ytr)
auc = roc_auc_score(yte, clf.predict_proba(Xte)[:,1])
print("CTST AUC:", auc)

corr_real = np.corrcoef(real_p, rowvar=False); corr_synth = np.corrcoef(synth_p, rowvar=False)
corr_diff = np.linalg.norm(corr_real - corr_synth, ord='fro')
print("Corr Frobenius (PCA):", corr_diff)


MMD (PCA): 0.0005426255357171088
CTST AUC: 0.9894873333333334
Corr Frobenius (PCA): 14.382459134441547


In [20]:
!pip install optuna
import optuna
def objective(trial):
    epochs = trial.suggest_int('epochs', 20, 120)
    batch = trial.suggest_categorical('batch_size', [128, 256, 512])
    gen_dim = trial.suggest_categorical('gen_dim', [(128,128),(256,256),(512,512)])
    disc_dim = trial.suggest_categorical('disc_dim', [(128,128),(256,128),(512,256)])
    pac = trial.suggest_int('pac', 1, 10)
    glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
    dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)

    try:
        ct = CTGAN_impl(epochs=epochs, batch_size=batch,
                        generator_dim=gen_dim, discriminator_dim=disc_dim,
                        generator_lr=glr, discriminator_lr=dlr, pac=pac, verbose=False)
        ct.random_state = SEED
        sample_df = ctgan_df.sample(n=min(20000, len(ctgan_df)), random_state=SEED)
        fraud_only = sample_df[sample_df[target]==1].drop(columns=[target]).reset_index(drop=True)
        ct.fit(fraud_only, discrete_columns=discrete_columns)
        synth_test = ct.sample(min(2000, len(fraud_only)))
        from sklearn.decomposition import PCA
        real_small = fraud_only.sample(n=min(2000, len(fraud_only)), random_state=SEED)
        real_arr = real_small.select_dtypes(include=[np.number]).fillna(0).values
        synth_arr = synth_test.select_dtypes(include=[np.number]).fillna(0).values
        min_dim = min(real_arr.shape[1], synth_arr.shape[1])
        pca = PCA(n_components=min(20, min_dim), random_state=SEED)
        Xr = pca.fit_transform(real_arr[:2000,:min_dim])
        Xs = pca.transform(synth_arr[:2000,:min_dim])
        X = np.vstack([Xr, Xs]); y = np.hstack([np.zeros(len(Xr)), np.ones(len(Xs))])
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=SEED)
        clf.fit(Xtr, ytr)
        auc = roc_auc_score(yte, clf.predict_proba(Xte)[:,1])
        return auc
    except Exception as e:
        print("Trial failed:", e)
        return 1.0

study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(objective, n_trials=CONFIG['OPTUNA_TRIALS'], timeout=CONFIG['OPTUNA_TIMEOUT'])
print("Best params:", study.best_params)
joblib.dump(study.best_params, os.path.join(CONFIG['MODELS_DIR'],'optuna_best_params.pkl'))


[I 2025-11-17 07:49:03,659] A new study created in memory with name: no-name-a1a28fa1-2f84-4789-b941-9a07c1c020e6


Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
Downloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:50:38,598] Trial 0 finished with value: 0.9645511845251032 and parameters: {'epochs': 57, 'batch_size': 128, 'gen_dim': (128, 128), 'disc_dim': (128, 128), 'pac': 1, 'g_lr': 0.0008706020878304854, 'd_lr': 0.000462258900102083}. Best is trial 0 with value: 0.9645511845251032.
  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:51:31,026] Trial 1 finished with value: 0.9982938491632254 and parameters: {'epochs': 41, 'batch_size': 512, 'gen_dim': (128, 128), 'disc_dim': (128, 128), 'pac': 4, 'g_lr': 8.168455894760161e-05, 'd_lr': 0.00037183641805732076}. Best is trial 0 with value: 0.9645511845251032.
  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:52:28,056] Trial 2 finished with value: 0.9999456639860901 and parameters: {'

Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:55:37,121] Trial 5 finished with value: 0.9807650510758531 and parameters: {'epochs': 74, 'batch_size': 256, 'gen_dim': (128, 128), 'disc_dim': (256, 128), 'pac': 8, 'g_lr': 0.000348771262454593, 'd_lr': 1.4063366777718176e-05}. Best is trial 3 with value: 0.9500869376222559.
  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:56:12,356] Trial 6 finished with value: 1.0 and parameters: {'epochs': 56, 'batch_size': 256, 'gen_dim': (128, 128), 'disc_dim': (256, 128), 'pac': 9, 'g_lr': 8.798929749689021e-05, 'd_lr': 1.7345566642360933e-05}. Best is trial 3 with value: 0.9500869376222559.


Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:56:46,971] Trial 7 finished with value: 1.0 and parameters: {'epochs': 92, 'batch_size': 512, 'gen_dim': (256, 256), 'disc_dim': (256, 128), 'pac': 7, 'g_lr': 4.253162363790868e-05, 'd_lr': 0.0001040258761588385}. Best is trial 3 with value: 0.9500869376222559.


Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:57:21,438] Trial 8 finished with value: 1.0 and parameters: {'epochs': 111, 'batch_size': 512, 'gen_dim': (512, 512), 'disc_dim': (256, 128), 'pac': 7, 'g_lr': 0.0005532496914298506, 'd_lr': 0.00040489662225846743}. Best is trial 3 with value: 0.9500869376222559.


Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:57:55,947] Trial 9 finished with value: 1.0 and parameters: {'epochs': 38, 'batch_size': 128, 'gen_dim': (128, 128), 'disc_dim': (512, 256), 'pac': 9, 'g_lr': 1.0325337616482036e-05, 'd_lr': 0.00010507384024181397}. Best is trial 3 with value: 0.9500869376222559.


Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 07:59:18,838] Trial 10 finished with value: 0.9381982177787438 and parameters: {'epochs': 80, 'batch_size': 256, 'gen_dim': (512, 512), 'disc_dim': (512, 256), 'pac': 1, 'g_lr': 0.00024015639024822428, 'd_lr': 0.0008554258700460189}. Best is trial 10 with value: 0.9381982177787438.
  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:00:43,881] Trial 11 finished with value: 0.9363616605085852 and parameters: {'epochs': 83, 'batch_size': 256, 'gen_dim': (512, 512), 'disc_dim': (512, 256), 'pac': 1, 'g_lr': 0.00029184859523967237, 'd_lr': 0.0009544737803573184}. Best is trial 11 with value: 0.9363616605085852.
  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:01:19,331] Trial 12 finished with value: 1.0 and parameters: {'epochs':

Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:02:50,145] Trial 13 finished with value: 0.955509671810476 and parameters: {'epochs': 92, 'batch_size': 256, 'gen_dim': (512, 512), 'disc_dim': (512, 256), 'pac': 1, 'g_lr': 0.00018233752877493294, 'd_lr': 0.00098182268781891}. Best is trial 11 with value: 0.9363616605085852.
  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:03:26,060] Trial 14 finished with value: 1.0 and parameters: {'epochs': 78, 'batch_size': 256, 'gen_dim': (512, 512), 'disc_dim': (512, 256), 'pac': 5, 'g_lr': 0.00018848131449384186, 'd_lr': 0.0001972329756850234}. Best is trial 11 with value: 0.9363616605085852.


Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:05:04,009] Trial 15 finished with value: 0.9545207563573136 and parameters: {'epochs': 98, 'batch_size': 256, 'gen_dim': (512, 512), 'disc_dim': (512, 256), 'pac': 2, 'g_lr': 0.0003827647122678082, 'd_lr': 0.0007149179025319618}. Best is trial 11 with value: 0.9363616605085852.
  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:05:39,751] Trial 16 finished with value: 1.0 and parameters: {'epochs': 104, 'batch_size': 128, 'gen_dim': (512, 512), 'disc_dim': (512, 256), 'pac': 3, 'g_lr': 0.00015190640610450326, 'd_lr': 5.597001347736213e-05}. Best is trial 11 with value: 0.9363616605085852.


Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:07:02,242] Trial 17 finished with value: 0.994533797000652 and parameters: {'epochs': 77, 'batch_size': 256, 'gen_dim': (512, 512), 'disc_dim': (512, 256), 'pac': 1, 'g_lr': 0.0005267106659654972, 'd_lr': 0.0001900802228822253}. Best is trial 11 with value: 0.9363616605085852.
  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:07:36,493] Trial 18 finished with value: 1.0 and parameters: {'epochs': 21, 'batch_size': 256, 'gen_dim': (512, 512), 'disc_dim': (128, 128), 'pac': 6, 'g_lr': 0.0001315926964812684, 'd_lr': 0.0006085111049720903}. Best is trial 11 with value: 0.9363616605085852.


Trial failed: 


  glr = trial.suggest_loguniform('g_lr', 1e-5, 1e-3)
  dlr = trial.suggest_loguniform('d_lr', 1e-5, 1e-3)
[I 2025-11-17 08:09:28,297] Trial 19 finished with value: 0.9533253640512932 and parameters: {'epochs': 66, 'batch_size': 128, 'gen_dim': (256, 256), 'disc_dim': (512, 256), 'pac': 2, 'g_lr': 4.09026357932468e-05, 'd_lr': 4.371993126708937e-05}. Best is trial 11 with value: 0.9363616605085852.


Best params: {'epochs': 83, 'batch_size': 256, 'gen_dim': (512, 512), 'disc_dim': (512, 256), 'pac': 1, 'g_lr': 0.00029184859523967237, 'd_lr': 0.0009544737803573184}


['Models\\optuna_best_params.pkl']

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp, wasserstein_distance
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# -----------------------------
# Assume your data:

real_df = pd.read_csv("IEEE Primary Data/test_transaction.csv")
synth_df = pd.read_csv("Synthetic/synthetic_fraud_optimized.csv")

# -----------------------------

# Automatically detect numeric columns
num_cols = real_df.select_dtypes(include=[np.number]).columns


# ============================================================
# 1. Distribution Comparison (Histogram + KDE)
# ============================================================

def plot_distributions(real_df, synth_df, num_cols, bins=40):
    for col in num_cols:
        plt.figure(figsize=(7, 4))
        sns.histplot(real_df[col], bins=bins, stat='density', label='Real', alpha=0.5)
        sns.histplot(synth_df[col], bins=bins, stat='density', label='Synthetic', alpha=0.5)
        sns.kdeplot(real_df[col], color='blue')
        sns.kdeplot(synth_df[col], color='orange')
        plt.title(f"Distribution Comparison for {col}")
        plt.legend()
        plt.show()

plot_distributions(real_df, synth_df, num_cols)
