Data: https://www.kaggle.com/datasets/architsharma01/loan-approval-prediction-dataset

In [None]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# -----------------------------
# Config
# -----------------------------
# NOTE(review): BATCH_SIZE and LATENT_DIM are declared here, but the model/training
# cells define their own lower-case batch_size / latent_dim with different values.
FILE_PATH   = "Loan_default.csv"
BATCH_SIZE  = 32
LATENT_DIM  = 64          # noise vector size for the generator
TARGET_COL  = None        # e.g., "defaulted" for supervised; None for GAN-style (label == input)
DEVICE      = "cpu"
PRETRAINED  = False       # set True to load from PRETRAIN_PATH
PRETRAIN_PATH = "pretrained_model.pth"
SEED        = 42          # fixed seed so Restart & Run All reproduces the same run

# Seed NumPy and PyTorch up front so the stochastic steps in this notebook
# (weight initialisation, shuffling, noise sampling) are repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# defining a single generation block function
def FC_Layer_blockGen(input_dim, output_dim):
    """Build one generator building block: a Linear layer followed by ReLU.

    Args:
        input_dim: number of input features of the Linear layer.
        output_dim: number of output features of the Linear layer.

    Returns:
        An ``nn.Sequential`` holding ``[Linear(input_dim, output_dim), ReLU()]``.
    """
    layers = [nn.Linear(input_dim, output_dim), nn.ReLU()]
    return nn.Sequential(*layers)

# DEFINING THE GENERATOR
class Generator(nn.Module):
    """Fully connected generator that maps a noise vector to one synthetic row.

    Layer widths are latent_dim -> 256 -> 512 -> 512 -> output_dim, with ReLU
    after every hidden Linear layer and a final Tanh, so each output value lies
    in [-1, 1].
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        widths = (latent_dim, 256, 512, 512)
        layers = []
        # Hidden stack: one Linear + ReLU pair per consecutive pair of widths.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output head squashed by Tanh.
        layers.append(nn.Linear(widths[-1], output_dim))
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run the noise batch ``x`` through the network and return the result."""
        synthetic = self.model(x)
        return synthetic

# defining a single discriminator block       
def FC_Layer_BlockDisc(input_dim, output_dim):
    """Build one discriminator building block: Linear -> ReLU -> Dropout(0.4).

    Args:
        input_dim: number of input features of the Linear layer.
        output_dim: number of output features of the Linear layer.

    Returns:
        An ``nn.Sequential`` with the three layers in that order.
    """
    layers = [
        nn.Linear(input_dim, output_dim),
        nn.ReLU(),
        nn.Dropout(0.4),
    ]
    return nn.Sequential(*layers)

# Defining the discriminator

class Discriminator(nn.Module):
    """Fully connected discriminator that scores a feature row as real or fake.

    Layer widths are input_dim -> 512 -> 512 -> 256 -> 1. Every hidden Linear
    layer is followed by ReLU and Dropout(0.4); the final Sigmoid yields a
    probability-like score in [0, 1] per row.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, 512, 512, 256)
        layers = []
        # Hidden stack: Linear + ReLU + Dropout per consecutive pair of widths.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.extend((nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.4)))
        # Single-unit head squashed by Sigmoid.
        layers.extend((nn.Linear(widths[-1], 1), nn.Sigmoid()))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the discriminator score for each row of ``x``."""
        score = self.model(x)
        return score

# Training hyperparameters.
# NOTE(review): the config cell also declares BATCH_SIZE = 32 and LATENT_DIM = 64;
# those are not used here - the values below are the ones that take effect.
# batch_size: rows per mini-batch handed to the DataLoader.
batch_size = 128
# num_epochs: number of full passes over the dataloader in the training loop.
num_epochs = 500
# lr: learning rate shared by both Adam optimizers below.
lr = 0.0002
# NOTE(review): num_features is hard-coded. It sizes the generator output and the
# discriminator input, so it must equal the width of the preprocessed rows
# (dataset.X.shape[1], computed in a later cell); otherwise the discriminator's
# first Linear layer will reject real batches. TODO: derive it from the data.
num_features = 6
# latent_dim: length of the noise vector fed to the generator.
latent_dim = 20

# MODEL INITIALIZATION
# Both networks are created on the default device (no .to(DEVICE) call here).
generator = Generator(latent_dim, num_features)
discriminator = Discriminator(num_features)

# LOSS FUNCTION AND OPTIMIZERS
# BCELoss expects probabilities, which matches the Sigmoid at the end of Discriminator.
criterion = nn.BCELoss()
# One Adam optimizer per network so each can be stepped independently.
gen_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)
disc_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)

In [None]:
# Load the raw table. The path comes from FILE_PATH in the config cell so the
# file name is defined in exactly one place.
file_path = FILE_PATH
data = pd.read_csv(file_path)
# Scaling to [-1, 1] is handled by AutoTabularDataset below (MinMaxScaler on the
# numeric columns), so no manual normalisation is done here.

#Creating a dataset

class AutoTabularDataset(Dataset):
    """
    Automatically preprocesses tabular data with mixed numeric + categorical features.
      - Numeric: median impute -> MinMaxScaler(feature_range=(-1, 1))
      - Categorical: most-frequent impute -> OneHotEncoder(handle_unknown='ignore'), dense (0/1)
    Returns tensors suitable for PyTorch.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Source table. It is copied, so the caller's frame is never modified.
    target : str | None
        Column to split off as the label. ``None`` means GAN-style usage where the
        label of each row is the processed row itself.
    preprocessor : ColumnTransformer | None
        Transformer to use instead of the default one built from the column dtypes.
    fit_preprocessor : bool
        ``True`` calls ``fit_transform`` (this also refits a supplied preprocessor);
        ``False`` calls ``transform`` only and therefore needs a fitted preprocessor.
    device : torch.device | str | None
        If given, the stored tensors are moved to this device.

    Attributes
    ----------
    X : float32 tensor of processed features, one row per dataframe row.
    y : label tensor; the same tensor object as ``X`` when ``target`` is None,
        a float32 ``[N, 1]`` tensor for numeric targets, or an int64 ``[N]`` tensor
        of class indices for non-numeric targets.
    preprocessor : the ColumnTransformer that produced ``X``.
    target_type_ : "numeric", "categorical" or None.
    target_classes_ : dict mapping class string -> index (categorical targets only).
    feature_names_out_, numeric_cols_, categorical_cols_ : column metadata.

    Raises
    ------
    ValueError
        If ``target`` is given but is not a column of ``dataframe``.
    """
    def __init__(
        self,
        dataframe: pd.DataFrame,
        target: str | None = None,
        preprocessor: ColumnTransformer | None = None,
        fit_preprocessor: bool = True,
        device: torch.device | str | None = None,
    ):
        df = dataframe.copy()

        # Separate X / y
        if target is not None:
            if target not in df.columns:
                raise ValueError(f"target='{target}' not found in dataframe columns.")
            y_raw = df[target]
            X_raw = df.drop(columns=[target])
        else:
            # For GAN-like usage: label == input
            X_raw = df
            y_raw = None

        # Identify column types
        numeric_cols = X_raw.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = X_raw.select_dtypes(exclude=[np.number]).columns.tolist()

        # Build the default transformer only when the caller did not supply one.
        if preprocessor is None:
            preprocessor = self._build_default_preprocessor(numeric_cols, categorical_cols)
        self.preprocessor = preprocessor

        # Fit or just transform
        if fit_preprocessor:
            X_processed = self.preprocessor.fit_transform(X_raw)
        else:
            X_processed = self.preprocessor.transform(X_raw)

        # The default pipeline is dense, but a caller-supplied preprocessor may emit
        # a SciPy sparse matrix, which torch.as_tensor cannot consume directly.
        if hasattr(X_processed, "toarray"):
            X_processed = X_processed.toarray()

        # Targets
        if target is not None:
            if pd.api.types.is_numeric_dtype(y_raw):
                y_processed = y_raw.to_numpy().astype(np.float32) # type: ignore
                y_tensor = torch.from_numpy(y_processed).unsqueeze(-1)  # [N, 1]
                self.target_type_ = "numeric"
                self.target_classes_ = None
            else:
                # Simple label encoding (no leakage of encoder across splits by default)
                labels_as_str = y_raw.astype(str) # type: ignore
                classes = sorted(labels_as_str.unique())
                self.target_classes_ = {cls: i for i, cls in enumerate(classes)}
                y_indices = labels_as_str.map(self.target_classes_).to_numpy().astype(np.int64)
                y_tensor = torch.from_numpy(y_indices)
                self.target_type_ = "categorical"
        else:
            y_tensor = None
            self.target_type_ = None
            self.target_classes_ = None

        # Store tensors
        self.X = torch.as_tensor(np.asarray(X_processed), dtype=torch.float32)
        self.y = self.X if y_tensor is None else y_tensor

        # Metadata
        self.feature_names_out_ = (
            self.preprocessor.get_feature_names_out()
            if hasattr(self.preprocessor, "get_feature_names_out")
            else None
        )
        self.numeric_cols_ = numeric_cols
        self.categorical_cols_ = categorical_cols

        # Optional device move
        self.device = device
        if self.device is not None:
            # When y aliases X (GAN-style), move once and keep the alias so the
            # device does not end up holding two identical copies.
            label_is_input = self.y is self.X
            self.X = self.X.to(self.device)
            self.y = self.X if label_is_input else self.y.to(self.device)

    @staticmethod
    def _build_default_preprocessor(numeric_cols, categorical_cols):
        """Return the default ColumnTransformer for the given numeric/categorical columns."""
        numeric_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler(feature_range=(-1, 1))),
        ])

        # Use dense (non-sparse) output for easy torch conversion.
        try:
            onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            # For older scikit-learn versions that use 'sparse' instead of 'sparse_output'
            onehot = OneHotEncoder(handle_unknown="ignore", sparse=False)
        categorical_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", onehot),
        ])

        return ColumnTransformer(
            transformers=[
                ("num", numeric_pipeline, numeric_cols),
                ("cat", categorical_pipeline, categorical_cols),
            ],
            remainder="drop",
            verbose_feature_names_out=False,
        )

    def __len__(self):
        """Number of rows in the dataset."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return one sample as a dict with keys "input" and "label"."""
        return {
            "input": self.X[idx],
            "label": self.y[idx],
        }
        
# `data` was already read from the CSV at the top of this cell, so it is reused
# here instead of reading the file a second time.
dataset = AutoTabularDataset(
    dataframe=data,
    target=TARGET_COL,        # None => GAN-style (label == input)
    preprocessor=None,        # pass a fitted preprocessor here for val/test
    fit_preprocessor=True,
    device=None,              # keep on CPU for DataLoader; move to DEVICE later
)

input_dim = dataset.X.shape[1]
print(f"Processed feature dimension: {input_dim}")

# The networks were created before preprocessing with a hard-coded feature count.
# If that count does not match the processed rows, the discriminator cannot accept
# real batches, so rebuild both networks (and the optimizers bound to their
# parameters) at the correct width.
if discriminator.model[0].in_features != input_dim:
    num_features = input_dim
    generator = Generator(latent_dim, num_features)
    discriminator = Discriminator(num_features)
    gen_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)
    disc_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True,
                        num_workers=0, pin_memory=False, persistent_workers=False)

: 

In [None]:
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True,
                        num_workers=0, pin_memory=False, persistent_workers=False)

# NOTE(review): model_save_freq is not read anywhere in this cell; periodic
# checkpointing looks unimplemented.
model_save_freq = 100

# The noise width must match the generator's first Linear layer, so read it from
# the model instead of repeating a literal that could drift.
latent_dim = generator.model[0].in_features

# drop_last=True discards a short final batch, so a dataset smaller than batch_size
# yields no batches at all. Fail early with a clear message rather than hitting a
# NameError on the undefined loss variables in the progress print.
if len(dataloader) == 0:
    raise ValueError(
        f"Dataset has {len(dataset)} rows, fewer than batch_size={batch_size}; "
        "no batches to train on."
    )

for epoch in range(num_epochs):
    for batch in dataloader:
        real_data_batch = batch['input']
        # Size labels and noise from the batch itself, not the global batch_size,
        # so the loop stays correct if the loader settings change.
        n_rows = real_data_batch.size(0)

        # Train discriminator on real data (targets smoothed into [0.9, 1.0))
        real_labels = torch.empty(n_rows, 1).uniform_(0.9, 1.0)
        disc_optimizer.zero_grad()
        output_real = discriminator(real_data_batch)
        loss_real = criterion(output_real, real_labels)
        loss_real.backward()

        # Train discriminator on generated data (targets smoothed into [0.0, 0.1))
        fake_labels = torch.empty(n_rows, 1).uniform_(0.0, 0.1)
        noise = torch.randn(n_rows, latent_dim)
        generated_data = generator(noise)
        # detach(): the discriminator update must not backpropagate into the generator.
        output_fake = discriminator(generated_data.detach())
        loss_fake = criterion(output_fake, fake_labels)
        loss_fake.backward()

        disc_optimizer.step()

        # Train generator: it is rewarded when the discriminator scores fakes as real.
        valid_labels = torch.empty(n_rows, 1).uniform_(0.9, 1.0)
        gen_optimizer.zero_grad()
        output_g = discriminator(generated_data)
        loss_g = criterion(output_g, valid_labels)
        loss_g.backward()
        gen_optimizer.step()

    # Print progress (the losses shown are those of the last batch of the epoch)
    print(f"Epoch {epoch}, D Loss Real: {loss_real.item()}, D Loss Fake: {loss_fake.item()}, G Loss: {loss_g.item()}")


In [None]:
import seaborn as sns

# The real rows are the preprocessed feature tensor held by the dataset.
real_data = dataset.X.detach().cpu()

# Generate as many synthetic rows as there are real rows. no_grad() keeps autograd
# from tracking the forward pass, so the result converts straight to NumPy.
with torch.no_grad():
    synthetic_data = generator(torch.randn(real_data.shape[0], latent_dim)).cpu()

# The figures are 2x3 grids; plot at most that many columns and never index past
# the last available feature.
GRID_ROWS, GRID_COLS = 2, 3
n_params = min(GRID_ROWS * GRID_COLS, real_data.shape[1], synthetic_data.shape[1])

# Define parameter names
param_names = [f'Parameter {k + 1}' for k in range(GRID_ROWS * GRID_COLS)]

# Plot the results: overlaid histograms, one feature per panel
fig, axs = plt.subplots(GRID_ROWS, GRID_COLS, figsize=(12, 8))
fig.suptitle('Real and Synthetic Data Distributions', fontsize=16)

for param_index, ax in enumerate(axs.flat):
    if param_index >= n_params:
        ax.set_visible(False)  # fewer features than panels
        continue
    sns.histplot(synthetic_data[:, param_index].numpy(), bins=50, alpha=0.5, label='Synthetic Data', ax=ax, color='blue')
    sns.histplot(real_data[:, param_index].numpy(), bins=50, alpha=0.5, label='Real Data', ax=ax, color='orange')
    ax.set_title(param_names[param_index], fontsize=12)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Create a 2x3 grid of subplots
fig, axs = plt.subplots(GRID_ROWS, GRID_COLS, figsize=(15, 10))
fig.suptitle('Comparison of Real and Synthetic Data', fontsize=16)

# Scatter plots: the first feature on x against each feature on y.
# x/y are passed by keyword because recent seaborn accepts only `data` positionally.
for param_index, ax in enumerate(axs.flat):
    if param_index >= n_params:
        ax.set_visible(False)
        continue
    sns.scatterplot(x=real_data[:, 0].numpy(), y=real_data[:, param_index].numpy(), label='Real Data', alpha=0.5, ax=ax)
    sns.scatterplot(x=synthetic_data[:, 0].numpy(), y=synthetic_data[:, param_index].numpy(), label='Generated Data', alpha=0.5, ax=ax)
    ax.set_title(param_names[param_index], fontsize=12)
    ax.set_xlabel(param_names[0])
    ax.set_ylabel(param_names[param_index])
    ax.legend()

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [1]:
# Environment report: interpreter and OS first, then the key library versions.
import sys, platform
print(sys.version, platform.platform(), sep="\n")

import numpy, pandas, sklearn, torch
print(
    *(
        f"{label} {module.__version__}"
        for label, module in (
            ("numpy", numpy),
            ("pandas", pandas),
            ("sklearn", sklearn),
            ("torch", torch),
        )
    ),
    sep="\n",
)
print("cuda available:", torch.cuda.is_available())


3.12.3 (main, Jun 18 2025, 17:59:45) [GCC 13.3.0]
Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.39
numpy 2.3.2
pandas 2.3.1
sklearn 1.7.1
torch 2.8.0+cu128
cuda available: True


In [2]:
import torch
# First line: CUDA version reported by torch (must match the wheel, e.g. '12.1').
# Second line: whether torch reports CUDA as available.
print(torch.version.cuda, torch.cuda.is_available(), sep="\n")

12.8
True


In [3]:
# Smoke test: the core libraries import and a dense matmul runs.
import torch
import pandas as pd
import numpy as np
import sklearn

print("OK:", *(lib.__version__ for lib in (torch, pd, np, sklearn)))
x = torch.matmul(torch.randn(1024, 1024), torch.randn(1024, 1024))
print("matmul done:", x.shape)

OK: 2.8.0+cu128 2.3.1 2.3.2 1.7.1
matmul done: torch.Size([1024, 1024])


In [1]:
# Environment report: interpreter and OS first, then the key library versions.
import sys, platform
print(sys.version, platform.platform(), sep="\n")

import numpy, pandas, sklearn, torch
print(
    *(
        f"{label} {module.__version__}"
        for label, module in (
            ("numpy", numpy),
            ("pandas", pandas),
            ("sklearn", sklearn),
            ("torch", torch),
        )
    ),
    sep="\n",
)
print("cuda available:", torch.cuda.is_available())

# tiny matmul to confirm native BLAS is happy
_ = torch.matmul(torch.randn(512, 512), torch.randn(512, 512)).sum().item()
print("matmul ok")

3.12.3 (main, Jun 18 2025, 17:59:45) [GCC 13.3.0]
Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.39
numpy 2.3.2
pandas 2.3.1
sklearn 1.7.1
torch 2.8.0+cu128
cuda available: True
matmul ok
