In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
# ---------- helpers
def cap_rare_inplace(df: pd.DataFrame, cols, min_count=20, min_frac=0.005, other_token="_OTHER_"):
    """Replace infrequent categories with 'other_token' (in-place)."""
    n = len(df)
    thr = min_count if min_frac is None else max(min_count, int(n * min_frac))
    for c in cols:
        s = df[c].astype(str)
        vc = s.value_counts(dropna=False)
        rare = vc[vc < thr].index
        df.loc[:, c] = s.where(~s.isin(rare), other_token)

def _make_onehot():
    """Create OneHotEncoder compatible with both old/new sklearn."""
    try:
        # sklearn >= 1.2
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # sklearn < 1.2
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

In [3]:
# ---------- dataset
class AutoTabularDataset(Dataset):
    """
    Unsupervised tabular dataset for GANs:
      - Drops ID-like columns to avoid huge one-hots
      - Imputes numeric (median) and categorical (most_frequent)
      - Scales numeric to [-1, 1]
      - One-hot encodes categorically (compat with sklearn <1.2 and >=1.2)
      - Stores the fitted preprocessor for reuse on new data
    """
    def __init__(self, df: pd.DataFrame, min_count=20, min_frac=0.005, feature_range=(-1, 1)):
        df = df.copy()

        # 1) drop obvious ID-like columns
        id_like = [c for c in df.columns
                   if c.lower().endswith("id") or c.lower().endswith("_id") or c.lower() == "id"]
        if id_like:
            df.drop(columns=id_like, inplace=True)

        # 2) split by dtype
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        cat_cols = [c for c in df.columns if c not in num_cols]

        # 3) reduce categorical cardinality before encoding
        if cat_cols:
            cap_rare_inplace(df, cat_cols, min_count=min_count, min_frac=min_frac)

        # 4) build preprocessor
        onehot = _make_onehot()
        num_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler(feature_range=feature_range)),
        ])
        cat_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", onehot),
        ])
        self.preprocessor = ColumnTransformer(
            transformers=[
                ("num", num_pipe, num_cols),
                ("cat", cat_pipe, cat_cols),
            ],
            remainder="drop",
            verbose_feature_names_out=False,
        )

        # 5) fit + transform
        Xp = self.preprocessor.fit_transform(df)
        self.X = torch.as_tensor(Xp, dtype=torch.float32)

        # keep some metadata (optional but handy)
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        try:
            self.feature_names_ = self.preprocessor.get_feature_names_out().tolist()
        except Exception:
            self.feature_names_ = None

    # ---- torch Dataset protocol
    def __len__(self):  return self.X.shape[0]
    def __getitem__(self, i): return {"input": self.X[i]}

    # ---- transform new data later with the SAME preprocessor
    def transform_df(self, df_new: pd.DataFrame) -> torch.Tensor:
        df_new = df_new.copy()
        # drop the same id-like columns if present
        for c in list(df_new.columns):
            if c.lower().endswith("id") or c.lower().endswith("_id") or c.lower() == "id":
                df_new.drop(columns=[c], inplace=True, errors="ignore")
        # cap rare on same categorical columns (prevent unseen blowups)
        if self.cat_cols:
            cap_rare_inplace(df_new, self.cat_cols, min_count=1, min_frac=None) # type: ignore
        Xp = self.preprocessor.transform(df_new)
        return torch.as_tensor(Xp, dtype=torch.float32)

In [4]:
torch.cuda.manual_seed_all(42)
   
FILE_PATH = "Loan_default.csv"
raw_df = pd.read_csv(FILE_PATH)

dataset = AutoTabularDataset(raw_df, min_count=20, min_frac=0.005)
print("Processed dim:", dataset.X.shape)

batch_size = 256
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# ===== 3) Models (simple MLPs)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(42)
latent_dim = 20
in_dim = dataset.X.shape[1]

Processed dim: torch.Size([255347, 32])


In [5]:
class Generator(nn.Module):
    def __init__(self, z, d):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z, 256), nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 256), nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, d), nn.Tanh(),
        )
    def forward(self, z): return self.net(z)
    
G = Generator(latent_dim, in_dim).to(DEVICE)
optG = optim.Adam(G.parameters(), lr=2e-4, betas=(0.5, 0.999))

In [6]:
class Discriminator(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d, 256), nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.2),
            nn.Linear(256, 256), nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.2),
            nn.Linear(256, 1),  # scores (not probabilities)
        )
    def forward(self, x): return self.net(x)
    
D = Discriminator(in_dim).to(DEVICE)
optD = optim.Adam(D.parameters(), lr=1e-4, betas=(0.5, 0.999))

In [7]:
num_epochs = 50
n_critic = 2  # D steps per G step (typical for hinge)
noise_std = 0.05

In [8]:
for epoch in range(1, num_epochs + 1):
    D_real_epoch, D_fake_epoch, G_epoch, steps = 0.0, 0.0, 0.0, 0

    for i, batch in enumerate(loader):
        real = batch["input"].to(DEVICE)
        b = real.size(0)

        # --- D step(s)
        for _ in range(n_critic):
            z = torch.randn(b, latent_dim, device=DEVICE)
            fake = G(z)

            optD.zero_grad(set_to_none=True)
            
            real_noisy = real + noise_std * torch.randn_like(real)
            fake_noisy = fake.detach() + noise_std * torch.randn_like(fake)

            d_real = D(real_noisy)
            d_fake = D(fake_noisy)

            # Hinge loss for D: E[max(0, 1 - D(real))] + E[max(0, 1 + D(fake))]
            loss_D = F.relu(1.0 - d_real).mean() + F.relu(1.0 + d_fake).mean()
            
            if i % 16 == 0:
                x_real = real.detach().requires_grad_(True)
                d_real_full = D(x_real).sum()
                grads = torch.autograd.grad(d_real_full, x_real, create_graph=True)[0]
                r1_penalty = grads.pow(2).sum(dim=1).mean() * 1.0  
                r1_penalty.backward()
            
            loss_D.backward()
            optD.step()

        # --- G step
        z = torch.randn(b, latent_dim, device=DEVICE)
        fake = G(z)

        optG.zero_grad(set_to_none=True)
        # Hinge loss for G: -E[D(fake)]
        loss_G = -D(fake).mean()
        loss_G.backward()
        optG.step()

        # track averages
        D_real_epoch += (1.0 - d_real.detach()).clamp_min(0).mean().item()
        D_fake_epoch += (1.0 + d_fake.detach()).clamp_min(0).mean().item()
        G_epoch      += loss_G.detach().item()
        steps += 1
        
        with torch.no_grad():
            D_real_term = torch.relu(torch.ones_like(d_real) - d_real).mean().item()
            D_fake_term = torch.relu(torch.ones_like(d_fake) + d_fake).mean().item()

            # Raw scores (for intuition)
            D_real_score = d_real.mean().item()
            D_fake_score = d_fake.mean().item()

            # Generator objectives
            G_loss_val = loss_G.detach().item()   # you already computed loss_G above
            G_score    = -G_loss_val              # because loss_G = -mean(D(fake))

    print(
        f"Epoch {epoch:03d}: "
        f"D_real={D_real_term:.4f}  D_fake={D_fake_term:.4f}  G_loss={G_loss_val:.4f}  "
        f"[scores: D(real)={D_real_score:.3f}, D(fake)={D_fake_score:.3f}, G_score={G_score:.3f}]"
    )

Epoch 001: D_real=0.8727  D_fake=0.6213  G_loss=0.3962  [scores: D(real)=0.147, D(fake)=-0.401, G_score=-0.396]
Epoch 002: D_real=1.1091  D_fake=0.7336  G_loss=0.3368  [scores: D(real)=-0.104, D(fake)=-0.275, G_score=-0.337]
Epoch 003: D_real=0.9746  D_fake=0.7120  G_loss=0.3251  [scores: D(real)=0.042, D(fake)=-0.297, G_score=-0.325]
Epoch 004: D_real=0.8818  D_fake=0.7036  G_loss=0.3424  [scores: D(real)=0.125, D(fake)=-0.307, G_score=-0.342]
Epoch 005: D_real=0.8138  D_fake=0.6941  G_loss=0.3558  [scores: D(real)=0.207, D(fake)=-0.316, G_score=-0.356]
Epoch 006: D_real=0.9090  D_fake=1.0186  G_loss=-0.0373  [scores: D(real)=0.093, D(fake)=0.017, G_score=0.037]
Epoch 007: D_real=0.9147  D_fake=0.7694  G_loss=0.2656  [scores: D(real)=0.088, D(fake)=-0.235, G_score=-0.266]
Epoch 008: D_real=0.9337  D_fake=0.8629  G_loss=0.1231  [scores: D(real)=0.067, D(fake)=-0.137, G_score=-0.123]
Epoch 009: D_real=0.9470  D_fake=0.7833  G_loss=0.1953  [scores: D(real)=0.057, D(fake)=-0.224, G_score=