In [7]:
# ---- 1) Imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# ---- 2) Safer rare-cap and dataset
def cap_rare_inplace(X: pd.DataFrame, cols, min_count=20, min_frac=None):
    """Replace infrequent categories with '_OTHER_' using .loc to avoid chained assignment."""
    n = len(X)
    for col in cols:
        s = X[col].astype(str)
        # choose threshold
        t = min_count
        if min_frac is not None:
            t = max(t, int(min_frac * n))
        vc = s.value_counts(dropna=False)
        rare = vc[vc < t].index
        X.loc[:, col] = s.where(~s.isin(rare), "_OTHER_")

class AutoTabularDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, target: str | None = None, device=None,
                 min_count=20, min_frac=0.005):
        df = dataframe.copy()

        # Drop obvious ID-like columns (prevents 250k-wide one-hots)
        id_like = [c for c in df.columns
                   if c.lower().endswith("id") or c.lower().endswith("_id") or c.lower() == "id"]
        if id_like:
            df = df.drop(columns=id_like)

        # Split X/y
        if target is not None and target in df.columns:
            y_raw = df[target]
            X_raw = df.drop(columns=[target]).copy()  # copy() avoids SettingWithCopyWarning
        else:
            y_raw = None
            X_raw = df.copy()

        # Column types
        num_cols = X_raw.select_dtypes(include=[np.number]).columns.tolist()
        cat_cols = [c for c in X_raw.columns if c not in num_cols]

        # Reduce cardinality before encoding
        if cat_cols:
            cap_rare_inplace(X_raw, cat_cols, min_count=min_count, min_frac=min_frac)

        # Pipelines
        num_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler(feature_range=(-1, 1))),
        ])
        try:
            cat_pipe = Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
            ])
        except TypeError:  # older sklearn
            cat_pipe = Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False)),
            ])

        pre = ColumnTransformer(
            transformers=[
                ("num", num_pipe, num_cols),
                ("cat", cat_pipe, cat_cols),
            ],
            remainder="drop",
            verbose_feature_names_out=False,
        )

        Xp = pre.fit_transform(X_raw)

        self.X = torch.as_tensor(Xp, dtype=torch.float32)
        if y_raw is None:
            self.y = self.X
        else:
            self.y = torch.as_tensor(y_raw.to_numpy(), dtype=torch.float32).unsqueeze(-1)

        self.device = device
        if device is not None:
            self.X = self.X.to(device)
            self.y = self.y.to(device)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return {"input": self.X[idx], "label": self.y[idx]}

# ---- 3) Load data and build loader (use .copy() when slicing)
FILE_PATH = "Loan_default.csv"
df = pd.read_csv(FILE_PATH)

# If you want a small test first:
# df = df.head(5000).copy()

dataset = AutoTabularDataset(df, target=None, device=None,
                             min_count=20, min_frac=0.005)

print("Processed dim:", dataset.X.shape)

batch_size = 32  # start small; raise after stable
dataloader = DataLoader(
    dataset, batch_size=batch_size, shuffle=True, drop_last=True,
    num_workers=0, pin_memory=False, persistent_workers=False
)

# ---- 4) Minimal models + training loop (CPU first)
DEVICE = "cuda"
latent_dim = 20
in_dim = dataset.X.shape[1]

class Generator(nn.Module):
    def __init__(self, z, d):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(z, 256), nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 256), nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, d), nn.Tanh(),
        )
    def forward(self, z): return self.net(z)

class Discriminator(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d, 256), nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.2),
            nn.Linear(256, 256), nn.LeakyReLU(0.2, inplace=True),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
        )
    def forward(self, x): return self.net(x)

G = Generator(latent_dim, in_dim).to(DEVICE)
D = Discriminator(in_dim).to(DEVICE)

criterion = nn.BCEWithLogitsLoss()
optD = optim.Adam(D.parameters(), lr=2e-4, betas=(0.5, 0.999))
optG = optim.Adam(G.parameters(), lr=2e-4, betas=(0.5, 0.999))

num_epochs = 3
for epoch in range(num_epochs):
    for batch in dataloader:
        real = batch["input"].to(DEVICE)
        bsz  = real.size(0)

        # ----- Train D
        real_lab = torch.empty(bsz, 1, device=DEVICE).uniform_(0.9, 1.0)  # label smoothing
        fake_lab = torch.empty(bsz, 1, device=DEVICE).uniform_(0.0, 0.1)

        optD.zero_grad()
        d_real = D(real)
        loss_real = criterion(d_real, real_lab)

        z = torch.randn(bsz, latent_dim, device=DEVICE)
        fake = G(z)
        d_fake = D(fake.detach())
        loss_fake = criterion(d_fake, fake_lab)

        (loss_real + loss_fake).backward()
        torch.nn.utils.clip_grad_norm_(D.parameters(), 1.0)  # optional but helpful
        optD.step()

        # ----- Train G
        optG.zero_grad()
        valid = torch.empty(bsz, 1, device=DEVICE).uniform_(0.9, 1.0)
        g_loss = criterion(D(fake), valid)
        g_loss.backward()
        torch.nn.utils.clip_grad_norm_(G.parameters(), 1.0)
        optG.step()

    print(f"Epoch {epoch+1}: D_real={loss_real.item():.4f}  D_fake={loss_fake.item():.4f}  G={g_loss.item():.4f}")

Processed dim: torch.Size([255347, 32])
Epoch 1: D_real=0.6777  D_fake=0.6207  G=1.0526
Epoch 2: D_real=0.2924  D_fake=0.3845  G=1.9531
Epoch 3: D_real=0.2740  D_fake=0.2293  G=2.6575
