In [6]:
# Imports
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [7]:
def cap_rare_inplace(df: pd.DataFrame, cols, min_count=20, min_frac=0.005, other_token="_OTHER_"):
    """Collapse infrequent category labels into one bucket, mutating ``df``.

    Every column named in ``cols`` is rewritten as string labels. A label that
    occurs fewer than ``thr`` times is replaced by ``other_token``, where
    ``thr`` is ``min_count`` when ``min_frac`` is None and
    ``max(min_count, int(len(df) * min_frac))`` otherwise.

    Missing entries are left as NaN rather than being stringified into a
    literal "nan"/"None" label, so a downstream imputer can still see them.
    Missing entries are not counted as a category.

    Args:
        df: frame to modify in place.
        cols: iterable of column names to process.
        min_count: absolute frequency floor.
        min_frac: optional fraction-of-rows floor; None disables it.
        other_token: replacement label for rare values.

    Returns:
        None. ``df`` is modified in place.
    """
    n_rows = len(df)
    thr = min_count if min_frac is None else max(min_count, int(n_rows * min_frac))
    for c in cols:
        labels = df[c].astype(object)
        present = labels.notna()
        # Stringify only observed values; normalise every missing marker
        # (None, NaN, pd.NA, ...) to NaN.
        labels[present] = labels[present].astype(str)
        labels[~present] = np.nan
        counts = labels[present].value_counts()
        rare = counts.index[counts < thr]
        labels[present & labels.isin(rare)] = other_token
        # Replace the whole column instead of writing through .loc, so the
        # result does not have to fit the column's previous dtype
        # (bool, categorical, ...).
        df[c] = labels

def _make_onehot():
    """Build a dense-output OneHotEncoder that ignores unknown categories.

    The keyword that requests dense output differs between scikit-learn
    releases, so ``sparse_output`` is tried first and ``sparse`` is used when
    the constructor rejects it with a TypeError.
    """
    shared = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=False, **shared)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **shared)
    return encoder

In [8]:
class AutoTabularDataset(Dataset):
    """
    Unsupervised tabular dataset for GANs:
      - Drops ID-like columns to avoid huge one-hots
      - Collapses rare categorical labels into "_OTHER_"
      - Numeric pipeline: median imputation, then min-max scaling to
        ``feature_range`` (default [-1, 1])
      - Categorical pipeline: most-frequent imputation, then one-hot encoding
        (sklearn <=/>=1.2)
      - Stores the fitted preprocessor for reuse on new data

    Attributes set by ``__init__``:
      X                  float32 tensor holding the encoded training rows
      preprocessor       the fitted ColumnTransformer
      num_cols, cat_cols column names routed to the numeric / categorical pipe
      kept_categories_   per categorical column, the set of labels that remain
                         after rare-label capping (used by ``transform_df``)
      feature_names_     encoded feature names, or None when unavailable
    """

    # Bucket label shared by fit-time capping and transform-time remapping.
    _OTHER_TOKEN = "_OTHER_"

    @staticmethod
    def _is_id_like(name) -> bool:
        """Return True when a column label looks like an identifier.

        A label matches when its lower-cased text ends in "id", which covers
        "id", "user_id" and "LoanID". Labels are passed through ``str`` first
        so non-string labels (e.g. integer column names) do not raise.

        NOTE(review): the suffix test is broad; names such as "paid" or
        "valid" match as well and are dropped.
        """
        return str(name).lower().endswith("id")

    def __init__(self, df: pd.DataFrame, min_count=20, min_frac=0.005, feature_range=(-1, 1)):
        """Fit the preprocessing pipeline on ``df`` and encode it.

        Args:
            df: raw training frame; it is copied, never modified.
            min_count, min_frac: rare-label thresholds forwarded to
                ``cap_rare_inplace``.
            feature_range: target range for the numeric MinMaxScaler.
        """
        df = df.copy()
        id_like = [c for c in df.columns if self._is_id_like(c)]
        if id_like:
            df = df.drop(columns=id_like)

        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        cat_cols = [c for c in df.columns if c not in num_cols]

        if cat_cols:
            cap_rare_inplace(df, cat_cols, min_count=min_count, min_frac=min_frac,
                             other_token=self._OTHER_TOKEN)

        # Remember which labels survived capping so that new data can be
        # mapped onto exactly the same label set in transform_df.
        self.kept_categories_ = {c: set(df[c].dropna().unique()) for c in cat_cols}

        onehot = _make_onehot()
        num_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler(feature_range=feature_range)),
        ])
        cat_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", onehot),
        ])
        self.preprocessor = ColumnTransformer(
            transformers=[
                ("num", num_pipe, num_cols),
                ("cat", cat_pipe, cat_cols),
            ],
            remainder="drop",
            verbose_feature_names_out=False,
        )

        Xp = self.preprocessor.fit_transform(df)
        self.X = torch.as_tensor(Xp, dtype=torch.float32)

        self.num_cols = num_cols
        self.cat_cols = cat_cols
        try:
            self.feature_names_ = self.preprocessor.get_feature_names_out().tolist()
        except Exception:
            # Best effort: feature names are optional metadata.
            self.feature_names_ = None

    def __len__(self):
        """Number of encoded rows."""
        return self.X.shape[0]

    def __getitem__(self, i):
        """Return row ``i`` wrapped as ``{"input": tensor}``."""
        return {"input": self.X[i]}

    def transform_df(self, df_new: pd.DataFrame) -> torch.Tensor:
        """Encode new rows with the preprocessor fitted in ``__init__``.

        ID-like columns are dropped with the same name test used at fit time.
        Categorical values that are not among the labels kept at fit time
        (labels that were rare in training, or never seen) are mapped to the
        "_OTHER_" bucket, so new data is encoded the same way the training
        data was. Missing values are left for the imputer.

        Args:
            df_new: frame containing at least the fitted numeric and
                categorical columns; it is copied, never modified.

        Returns:
            float32 tensor of encoded rows.
        """
        df_new = df_new.copy()
        id_like = [c for c in df_new.columns if self._is_id_like(c)]
        if id_like:
            df_new = df_new.drop(columns=id_like)
        if self.cat_cols:
            # With a threshold of 1 no label is capped by frequency; the call
            # only puts values into the same string form used at fit time.
            cap_rare_inplace(df_new, self.cat_cols, min_count=1, min_frac=None) # type: ignore
            for c in self.cat_cols:
                col = df_new[c]
                outside = col.notna() & ~col.isin(self.kept_categories_[c])
                if outside.any():
                    df_new[c] = col.mask(outside, self._OTHER_TOKEN)
        Xp = self.preprocessor.transform(df_new)
        return torch.as_tensor(Xp, dtype=torch.float32)

In [9]:
# ===== 1) Reproducibility
# Seed every RNG once, before any data or model code runs.
# torch.manual_seed seeds the CPU generator and all CUDA devices, so a
# separate torch.cuda.manual_seed_all call is not needed.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# ===== 2) Data
FILE_PATH = "../data/Loan_default.csv"
raw_df = pd.read_csv(FILE_PATH)

dataset = AutoTabularDataset(raw_df, min_count=20, min_frac=0.005)
print("Processed dim:", dataset.X.shape)

batch_size = 256
# drop_last=True discards a trailing partial batch, so every batch has
# exactly batch_size rows.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# ===== 3) Models (simple MLPs)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
latent_dim = 20                # size of the generator's noise input
in_dim = dataset.X.shape[1]    # number of encoded features per row

Processed dim: torch.Size([255347, 32])


In [10]:
class Generator(nn.Module):
    """MLP that maps latent noise vectors to synthetic feature rows.

    Two 256-unit hidden layers with LeakyReLU(0.2) activations, followed by a
    linear projection to ``d`` outputs squashed by Tanh, so every generated
    value lies in [-1, 1].

    Args:
        z: size of the latent input vector.
        d: number of output features.
    """

    def __init__(self, z, d):
        super().__init__()
        width = 256
        stack = [
            nn.Linear(z, width),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(width, width),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(width, d),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, z):
        """Run a batch of latent vectors through the network."""
        return self.net(z)
    
# Generator instance moved to the training device, plus its Adam optimiser.
G = Generator(latent_dim, in_dim)
G = G.to(DEVICE)
optG = optim.Adam(params=G.parameters(), lr=2e-4, betas=(0.5, 0.999))

In [11]:

class Discriminator(nn.Module):
    """MLP critic that maps a feature row to a single unbounded score.

    Two 256-unit hidden layers, each followed by LeakyReLU(0.2) and
    Dropout(0.2), then a linear layer with one output and no final activation.

    Args:
        d: number of input features.
    """

    def __init__(self, d):
        super().__init__()
        width = 256
        stack = [
            nn.Linear(d, width),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.2),
            nn.Linear(width, width),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.2),
            nn.Linear(width, 1),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Score a batch of feature rows."""
        return self.net(x)

# Discriminator instance moved to the training device, plus its Adam optimiser.
D = Discriminator(in_dim)
D = D.to(DEVICE)
optD = optim.Adam(params=D.parameters(), lr=1e-4, betas=(0.5, 0.999))

In [12]:
# Training-loop hyperparameters.
num_epochs: int = 50       # number of passes over the loader
n_critic: int = 2          # discriminator updates per generator update
noise_std: float = 0.05    # noise standard deviation; same value as noise_schedule()'s default `start`

In [13]:
def noise_schedule(epoch, start=0.05, end=0.0, decay_epochs=30):
    """Linearly anneal a noise level from ``start`` to ``end``.

    Returns ``start`` for ``epoch <= 1``, ``end`` for
    ``epoch >= decay_epochs``, and the straight-line interpolation between the
    two for epochs in between (epochs are counted from 1).
    """
    if epoch <= 1:
        level = start
    elif epoch >= decay_epochs:
        level = end
    else:
        progress = (epoch - 1) / (decay_epochs - 1)
        level = start + progress * (end - start)
    return level

In [14]:
# Hinge-loss GAN training: n_critic discriminator updates per batch, then one
# generator update. Gaussian noise of std `sigma` is added to D's inputs during
# the D step, and an R1 gradient penalty on real rows is applied lazily.
R1_EVERY = 16    # apply the R1 penalty on batches whose index is a multiple of this
R1_GAMMA = 1.0   # multiplier on the R1 penalty term

for epoch in range(1, num_epochs + 1):
    G.train()
    D.train()
    D_real_epoch = D_fake_epoch = G_epoch = 0.0
    steps = 0
    # noise_std is the starting level of the schedule for this run.
    sigma = noise_schedule(epoch, start=noise_std)

    for i, batch in enumerate(loader):
        real = batch["input"].to(DEVICE)
        b = real.size(0)

        # --- D step(s)
        for _ in range(n_critic):
            z = torch.randn(b, latent_dim, device=DEVICE)
            # The generator is not updated here, so skip building its
            # autograd graph.
            with torch.no_grad():
                fake = G(z)

            optD.zero_grad(set_to_none=True)

            real_noisy = real + sigma * torch.randn_like(real)
            fake_noisy = fake + sigma * torch.randn_like(fake)

            d_real = D(real_noisy)
            d_fake = D(fake_noisy)

            hinge_real = F.relu(1.0 - d_real).mean()
            hinge_fake = F.relu(1.0 + d_fake).mean()
            loss_D_total = hinge_real + hinge_fake

            # Lazy R1 (γ=1.0) on real every 16 steps: squared norm of
            # dD/dx at the un-noised real rows.
            if i % R1_EVERY == 0:
                x_real = real.detach().requires_grad_(True)
                d_sum = D(x_real).sum()
                # create_graph=True so the penalty itself can be
                # back-propagated into D's weights.
                grads = torch.autograd.grad(d_sum, x_real, create_graph=True)[0]
                r1 = grads.pow(2).sum(dim=1).mean() * R1_GAMMA
                loss_D_total = loss_D_total + r1

            loss_D_total.backward()
            optD.step()

        # --- G step
        # NOTE(review): D scores the clean fake batch here, whereas the D step
        # above scores noise-perturbed samples — confirm this is intended.
        z = torch.randn(b, latent_dim, device=DEVICE)
        fake = G(z)
        optG.zero_grad(set_to_none=True)
        loss_G = -D(fake).mean()
        loss_G.backward()
        optG.step()

        # Accumulate epoch metrics (hinge penalties; lower is better).
        # The hinge terms are those of the last D update for this batch.
        D_real_epoch += hinge_real.item()
        D_fake_epoch += hinge_fake.item()
        G_epoch      += loss_G.item()
        steps += 1

    if steps == 0:
        # Without this, the averages below divide by zero.
        raise RuntimeError(
            "DataLoader yielded no batches; check that the dataset holds at "
            "least one full batch."
        )

    # epoch averages
    D_real_avg = D_real_epoch / steps
    D_fake_avg = D_fake_epoch / steps
    G_avg      = G_epoch / steps

    # Optional raw score peek on the *last* batch
    D_real_score = d_real.mean().item()
    D_fake_score = d_fake.mean().item()
    G_score = -loss_G.item()

    print(
        f"Epoch {epoch:03d}: "
        f"D_real={D_real_avg:.4f}  D_fake={D_fake_avg:.4f}  G_loss={G_avg:.4f}  "
        f"(noise={sigma:.3f})  "
        f"[scores: D(real)={D_real_score:.3f}, D(fake)={D_fake_score:.3f}, G_score={G_score:.3f}]"
    )

Epoch 001: D_real=0.8856  D_fake=0.7368  G_loss=0.3445  (noise=0.050)  [scores: D(real)=0.147, D(fake)=-0.401, G_score=-0.396]
Epoch 002: D_real=0.9885  D_fake=0.7629  G_loss=0.2871  (noise=0.048)  [scores: D(real)=-0.103, D(fake)=-0.289, G_score=-0.349]
Epoch 003: D_real=0.9810  D_fake=0.7563  G_loss=0.2843  (noise=0.047)  [scores: D(real)=0.034, D(fake)=-0.386, G_score=-0.411]
Epoch 004: D_real=0.9034  D_fake=0.7696  G_loss=0.2625  (noise=0.045)  [scores: D(real)=0.167, D(fake)=-0.167, G_score=-0.163]
Epoch 005: D_real=0.8398  D_fake=0.7073  G_loss=0.3252  (noise=0.043)  [scores: D(real)=0.281, D(fake)=-0.228, G_score=-0.224]
Epoch 006: D_real=0.8570  D_fake=0.7918  G_loss=0.2313  (noise=0.041)  [scores: D(real)=0.190, D(fake)=-0.331, G_score=-0.357]
Epoch 007: D_real=0.9089  D_fake=0.8604  G_loss=0.1609  (noise=0.040)  [scores: D(real)=0.035, D(fake)=-0.004, G_score=-0.028]
Epoch 008: D_real=0.9143  D_fake=0.8593  G_loss=0.1624  (noise=0.038)  [scores: D(real)=0.155, D(fake)=-0.201,