# Imports

In [None]:
!pip install --quiet torch torchvision pandas numpy scikit-learn matplotlib tqdm sdv

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.19.0 requires botocore<1.36.4,>=1.36.0, but you have botocore 1.42.54 which is incompatible.


In [None]:
import time
import torch, torch.nn as nn, torch.optim as optim

from sdv.single_table import CTGANSynthesizer, TVAESynthesizer, CopulaGANSynthesizer
from sdv.metadata import SingleTableMetadata

In [None]:
import random
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

#PREPROCESSING
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

#GENERATIVE MODELS
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer, CopulaGANSynthesizer
import torch.nn as nn
import torch.optim as optim


# Generative Models

A generative model is a machine learning model designed to create new data that is similar to its training data. Generative artificial intelligence (AI) models learn the patterns and distributions of the training data, then apply those understandings to generate novel content in response to new input data. (https://www.ibm.com/think/topics/generative-model)

# The 3 Big Categories of Generative Models
## Generative Adversarial Networks (GANs)

A generative adversarial network, or GAN, is a machine learning model designed to generate realistic data by learning patterns from existing training datasets. It operates within an unsupervised learning framework by using deep learning techniques, where two neural networks work in opposition—one generates data, while the other evaluates whether the data is real or generated. (https://www.ibm.com/think/topics/generative-adversarial-networks)


## Variational Autoencoders (VAEs)

Variational autoencoders (VAEs) are generative models used in machine learning (ML) to generate new data in the form of variations of the input data they’re trained on. In addition to this, they also perform tasks common to other autoencoders, such as denoising. (https://www.ibm.com/think/topics/variational-autoencoder)

## Diffusion Models

Diffusion models are generative models used primarily for image generation and other computer vision tasks. Diffusion-based neural networks are trained through deep learning to progressively “diffuse” samples with random noise, then reverse that diffusion process. (https://www.ibm.com/think/topics/diffusion-models)


# Model Definition

## DDPM
Denoising Diffusion Probabilistic Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

class DiffusionNetwork(nn.Module):
    """Residual MLP conditioned on a scalar timestep.

    The input row and the timestep are embedded separately into
    ``hidden_dim`` features, summed, passed through three residual
    Linear+ReLU blocks and projected back to ``input_dim`` features.

    Args:
        input_dim: Number of features per input row (also the output width).
        hidden_dim: Width of every hidden layer.
    """

    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()

        # Scalar timestep -> hidden_dim embedding.
        self.time_embed = nn.Sequential(
            nn.Linear(1, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
        )

        # Data row -> hidden_dim embedding.
        self.input_embed = nn.Linear(input_dim, hidden_dim)

        self.block1 = self._hidden_block(hidden_dim)
        self.block2 = self._hidden_block(hidden_dim)
        self.block3 = self._hidden_block(hidden_dim)

        self.final = nn.Linear(hidden_dim, input_dim)
        self.activation = nn.ReLU()

    @staticmethod
    def _hidden_block(width):
        """Build one Linear+ReLU block that keeps the feature width unchanged."""
        return nn.Sequential(nn.Linear(width, width), nn.ReLU())

    def forward(self, x, t):
        """Return a tensor shaped like ``x`` given rows ``x`` and one timestep per row.

        ``t`` is cast to float and reshaped to a column, so any tensor with
        one element per row of ``x`` is accepted.
        """
        timestep_column = t.float().view(-1, 1)
        h = self.activation(self.input_embed(x) + self.time_embed(timestep_column))

        # Three skip connections around the hidden blocks.
        for residual_block in (self.block1, self.block2, self.block3):
            h = h + residual_block(h)
        return self.final(h)

class DiffusionModel:
    """Training and sampling wrapper around a ``DiffusionNetwork``.

    Holds the network, an Adam optimizer (lr=1e-3), an MSE criterion and a
    linear beta schedule going from 1e-4 to 0.02 over ``n_steps`` steps.

    Args:
        input_dim: Number of features per data row.
        n_steps: Number of diffusion steps in the schedule.
        device: Device on which the network and the schedule tensors live.
    """

    def __init__(self, input_dim, n_steps=100, device='cpu'):
        self.n_steps = n_steps
        self.device = device

        self.model = DiffusionNetwork(input_dim).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.criterion = nn.MSELoss()

        # beta_t, alpha_t = 1 - beta_t, and alpha_hat_t = prod_{s<=t} alpha_s.
        self.beta = torch.linspace(1e-4, 0.02, n_steps).to(device)
        self.alpha = 1. - self.beta
        self.alpha_hat = torch.cumprod(self.alpha, dim=0)

    def train_one_batch(self, x0):
        """Run one optimizer step on the batch ``x0`` (rows of scaled data).

        A random step is drawn per row, the row is noised accordingly, and
        the network is trained with MSE to recover the noise that was added.
        """
        self.model.train()

        n_rows = x0.shape[0]
        steps = torch.randint(0, self.n_steps, (n_rows,), device=self.device).long()
        noise = torch.randn_like(x0)

        alpha_hat_at_step = self.alpha_hat[steps]
        signal_scale = torch.sqrt(alpha_hat_at_step).view(-1, 1)
        noise_scale = torch.sqrt(1 - alpha_hat_at_step).view(-1, 1)
        noised = signal_scale * x0 + noise_scale * noise

        # The network is conditioned on the step normalised by n_steps.
        predicted_noise = self.model(noised, steps.float() / self.n_steps)
        loss = self.criterion(predicted_noise, noise)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    @torch.no_grad()
    def sample(self, n_samples):
        """Generate ``n_samples`` rows by walking the schedule from the last step to step 0."""
        self.model.eval()

        n_features = self.model.final.out_features
        x = torch.randn((n_samples, n_features)).to(self.device)

        for step in range(self.n_steps - 1, -1, -1):
            step_ids = torch.full((n_samples,), step, dtype=torch.long, device=self.device)
            predicted_noise = self.model(x, step_ids.float() / self.n_steps)

            alpha_t = self.alpha[step]
            alpha_hat_t = self.alpha_hat[step]
            beta_t = self.beta[step]

            # No fresh noise is injected on the final (step 0) update.
            if step > 0:
                fresh_noise = torch.randn_like(x)
            else:
                fresh_noise = torch.zeros_like(x)

            inv_sqrt_alpha = 1 / torch.sqrt(alpha_t)
            noise_weight = (1 - alpha_t) / torch.sqrt(1 - alpha_hat_t)
            x = inv_sqrt_alpha * (x - noise_weight * predicted_noise) + torch.sqrt(beta_t) * fresh_noise
        return x

def get_synthetic_data_ddpm(X_in, y_in, n_samples_ratio=0.5, epochs=100, n_steps=50, batch_size=256):
    """Train a DiffusionModel on the joint (X, y) table and sample synthetic rows.

    Features and target are stacked into one matrix, standardised with a
    StandardScaler fitted on that matrix, used to train the diffusion model,
    and the generated rows are mapped back with the scaler's inverse
    transform. The target column is generated like any other column, so
    ``y_syn`` is real-valued.

    Args:
        X_in: Feature matrix; a sparse matrix (``toarray``), a pandas object
            (``values``) or anything ``np.asarray`` can turn into a 2-D array.
        y_in: Target values, one per row of ``X_in`` (pandas object or array-like).
        n_samples_ratio: Number of generated rows as a fraction of the input rows.
        epochs: Number of passes over the data.
        n_steps: Number of diffusion steps (previously hard-coded to 50).
        batch_size: Upper bound on the mini-batch size (previously hard-coded to 256).

    Returns:
        Tuple ``(X_syn, y_syn)``: generated features (2-D) and target (1-D).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if hasattr(X_in, "toarray"):
        X_in = X_in.toarray()
    if hasattr(X_in, "values"):
        X_in = X_in.values
    if hasattr(y_in, "values"):
        y_in = y_in.values

    # np.asarray lets plain Python lists through, which have no .reshape.
    features = np.asarray(X_in)
    target = np.asarray(y_in).reshape(-1, 1)

    data_combined = np.hstack([features, target])

    internal_scaler = StandardScaler()
    data_scaled = internal_scaler.fit_transform(data_combined)

    dataset = torch.tensor(data_scaled, dtype=torch.float32).to(device)
    ddpm = DiffusionModel(input_dim=dataset.shape[1], n_steps=n_steps, device=device)

    effective_batch_size = min(batch_size, len(dataset))
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=effective_batch_size, shuffle=True)

    for _ in range(epochs):
        for batch in dataloader:
            ddpm.train_one_batch(batch)

    n_samples = int(len(features) * n_samples_ratio)
    generated_scaled = ddpm.sample(n_samples).cpu().numpy()

    # Undo the standardisation so the output is in the original units.
    generated_data = internal_scaler.inverse_transform(generated_scaled)

    # The last column is the target; everything before it is a feature.
    X_syn = generated_data[:, :-1]
    y_syn = generated_data[:, -1]

    return X_syn, y_syn

## GAN
Generative Adversarial Network

In [None]:
import math, random, time
import torch, torch.nn as nn, torch.optim as optim
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler

class Generator(nn.Module):
    """MLP generator: latent vector -> one synthetic data row.

    Two hidden stages of Linear -> LeakyReLU(0.2) -> BatchNorm1d followed by
    a linear projection to ``output_dim``.

    Args:
        input_dim: Size of the latent vector.
        output_dim: Number of features in a generated row.
        hidden_dim: Width of the hidden layers.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256):
        super().__init__()

        layers = []
        in_features = input_dim
        for _ in range(2):
            layers.extend([
                nn.Linear(in_features, hidden_dim),
                nn.LeakyReLU(0.2),
                nn.BatchNorm1d(hidden_dim),
            ])
            in_features = hidden_dim
        layers.append(nn.Linear(hidden_dim, output_dim))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of latent vectors to a batch of generated rows."""
        return self.net(x)

class Discriminator(nn.Module):
    """MLP discriminator: data row -> value in [0, 1] (sigmoid output).

    Two hidden stages of Linear -> LeakyReLU(0.2) -> Dropout(0.3) followed by
    a single-unit linear layer and a sigmoid.

    Args:
        input_dim: Number of features in a data row.
        hidden_dim: Width of the hidden layers.
    """

    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()

        layers = []
        in_features = input_dim
        for _ in range(2):
            layers.extend([
                nn.Linear(in_features, hidden_dim),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ])
            in_features = hidden_dim
        layers.extend([nn.Linear(hidden_dim, 1), nn.Sigmoid()])

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Score a batch of rows; the result has shape (batch, 1)."""
        return self.net(x)

def get_synthetic_data_gan(X_in, y_in, n_samples_ratio=0.5, epochs=100, latent_dim=64):
    """Train a GAN on the joint (X, y) table and return synthetic X_syn, y_syn.

    Features and target are stacked into one matrix, standardised with a
    StandardScaler fitted on that matrix, and used as the "real" data for a
    Generator/Discriminator pair trained with binary cross-entropy. Generated
    rows are mapped back with the scaler's inverse transform; the target
    column is generated like any other column, so ``y_syn`` is real-valued.

    Args:
        X_in: Feature matrix; a sparse matrix (``toarray``), a pandas object
            (``values``) or anything ``np.asarray`` can turn into a 2-D array.
        y_in: Target values, one per row of ``X_in`` (pandas object or array-like).
        n_samples_ratio: Number of generated rows as a fraction of the input rows.
        epochs: Number of passes over the data.
        latent_dim: Size of the generator's latent vector.

    Returns:
        Tuple ``(X_syn, y_syn)``: generated features (2-D) and target (1-D).

    Raises:
        ValueError: If fewer than two rows are provided (the generator's
            BatchNorm1d layers cannot be trained on a single row).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if hasattr(X_in, "toarray"):
        X_in = X_in.toarray()
    if hasattr(X_in, "values"):
        X_in = X_in.values
    if hasattr(y_in, "values"):
        y_in = y_in.values

    # np.asarray lets plain Python lists through, which have no .reshape.
    features = np.asarray(X_in)
    target = np.asarray(y_in).reshape(-1, 1)
    data_combined = np.hstack([features, target])

    if len(data_combined) < 2:
        raise ValueError(
            f"GAN training needs at least 2 rows, got {len(data_combined)}"
        )

    internal_scaler = StandardScaler()
    data_scaled = internal_scaler.fit_transform(data_combined)

    real_data = torch.tensor(data_scaled, dtype=torch.float32).to(device)
    data_dim = real_data.shape[1]

    generator = Generator(latent_dim, data_dim).to(device)
    discriminator = Discriminator(data_dim).to(device)

    lr = 0.0002
    opt_g = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))
    opt_d = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))
    criterion = nn.BCELoss()

    batch_size = min(128, len(real_data))
    n_batches = int(np.ceil(len(real_data) / batch_size))

    generator.train()
    discriminator.train()

    for epoch in range(epochs):
        indices = np.random.permutation(len(real_data))

        for i in range(n_batches):
            idx = indices[i * batch_size:(i + 1) * batch_size]

            # A trailing batch of one row would make the generator's
            # BatchNorm1d raise in training mode (it needs more than one
            # value per channel), so that leftover row is skipped this epoch.
            if len(idx) < 2:
                continue

            real_batch = real_data[idx]
            curr_batch_size = real_batch.size(0)

            real_labels = torch.ones(curr_batch_size, 1).to(device)
            fake_labels = torch.zeros(curr_batch_size, 1).to(device)

            # --- Discriminator step: real rows -> 1, generated rows -> 0.
            opt_d.zero_grad()

            outputs = discriminator(real_batch)
            d_loss_real = criterion(outputs, real_labels)

            z = torch.randn(curr_batch_size, latent_dim).to(device)
            fake_batch = generator(z)
            # detach() keeps the discriminator loss from back-propagating
            # into the generator.
            outputs = discriminator(fake_batch.detach())
            d_loss_fake = criterion(outputs, fake_labels)

            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            opt_d.step()

            # --- Generator step: make the discriminator output 1 on fakes.
            opt_g.zero_grad()

            outputs = discriminator(fake_batch)
            g_loss = criterion(outputs, real_labels)

            g_loss.backward()
            opt_g.step()

    generator.eval()
    n_samples = int(len(features) * n_samples_ratio)

    with torch.no_grad():
        z = torch.randn(n_samples, latent_dim).to(device)
        generated_scaled = generator(z).cpu().numpy()

    # Undo the standardisation so the output is in the original units.
    generated_data = internal_scaler.inverse_transform(generated_scaled)

    # The last column is the target; everything before it is a feature.
    X_syn = generated_data[:, :-1]
    y_syn = generated_data[:, -1]

    return X_syn, y_syn

## VAE
Variational Autoencoder

In [None]:
class VAE(nn.Module):
    """Single-hidden-layer variational autoencoder for tabular rows.

    Encoder: ``input_dim -> hidden_dim -> (mu, logvar)`` of size ``latent_dim``.
    Decoder: ``latent_dim -> hidden_dim -> input_dim`` with a linear output.

    Args:
        input_dim: Number of features per row.
        latent_dim: Size of the latent code.
        hidden_dim: Width of the hidden layer in encoder and decoder.
    """

    def __init__(self, input_dim, latent_dim=16, hidden_dim=128):
        super().__init__()
        self.act = nn.ReLU()

        # Encoder layers.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder layers.
        self.fc2 = nn.Linear(latent_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the latent distribution for each row of ``x``."""
        hidden = self.act(self.fc1(x))
        return self.fc_mu(hidden), self.fc_logvar(hidden)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        std = torch.exp(0.5 * logvar)
        return mu + torch.randn_like(std) * std

    def decode(self, z):
        """Map latent codes ``z`` back to rows of ``input_dim`` features."""
        return self.fc3(self.act(self.fc2(z)))

    def forward(self, x):
        """Encode, sample a latent code, decode; return ``(x_hat, mu, logvar)``."""
        mu, logvar = self.encode(x)
        x_hat = self.decode(self.reparameterize(mu, logvar))
        return x_hat, mu, logvar

def vae_loss(x, x_hat, mu, logvar):
    """Return reconstruction MSE plus the KL term of a diagonal-Gaussian VAE.

    Both terms are averaged over all elements (batch and feature/latent
    dimensions) and added with equal weight.
    """
    reconstruction = nn.functional.mse_loss(x_hat, x)
    kl_per_element = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * torch.mean(kl_per_element)
    return reconstruction + kl_divergence

def get_synthetic_data_vae(
    X_in, y_in,
    n_samples_ratio=0.5,
    epochs=80,
    latent_dim=16,
    hidden_dim=128,
):
    """Fit a VAE to the stacked (X, y) table and decode fresh latent samples.

    The stacked table is standardised with a StandardScaler, a ``VAE`` is
    trained on it with ``vae_loss`` and Adam (lr=1e-3), then latent vectors
    drawn from a standard normal are decoded and mapped back through the
    scaler's inverse transform.

    Args:
        X_in: Feature matrix (sparse matrix, pandas object or NumPy array).
        y_in: Target values, one per row (pandas object or NumPy array).
        n_samples_ratio: Number of generated rows as a fraction of the input rows.
        epochs: Number of passes over the data.
        latent_dim: Size of the VAE latent code.
        hidden_dim: Width of the VAE hidden layers.

    Returns:
        Tuple ``(X_syn, y_syn)``: generated features (2-D) and target (1-D).
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Unwrap sparse matrices and pandas containers into plain arrays.
    if hasattr(X_in, "toarray"):
        X_in = X_in.toarray()
    if hasattr(X_in, "values"):
        X_in = X_in.values
    if hasattr(y_in, "values"):
        y_in = y_in.values

    y_in = y_in.reshape(-1, 1)
    joint_table = np.hstack([X_in, y_in])

    scaler = StandardScaler()
    joint_scaled = scaler.fit_transform(joint_table)

    train_tensor = torch.tensor(joint_scaled, dtype=torch.float32).to(device)
    n_columns = train_tensor.shape[1]

    model = VAE(n_columns, latent_dim, hidden_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    loader = torch.utils.data.DataLoader(
        train_tensor,
        batch_size=min(256, len(train_tensor)),
        shuffle=True,
    )

    model.train()
    for _ in range(epochs):
        for rows in loader:
            reconstruction, mu, logvar = model(rows)
            loss = vae_loss(rows, reconstruction, mu, logvar)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    n_samples = int(len(X_in) * n_samples_ratio)

    with torch.no_grad():
        latent = torch.randn(n_samples, latent_dim).to(device)
        decoded_scaled = model.decode(latent).cpu().numpy()

    # Back to the original units; the target is the last column.
    decoded = scaler.inverse_transform(decoded_scaled)
    X_syn, y_syn = decoded[:, :-1], decoded[:, -1]

    return X_syn, y_syn


## SDV (Synthetic Data Vault - library)
- CTGAN
- TVAE
- CopulaGAN

In [None]:
import torch

def get_synthetic_data_sdv(X_in, y_in, model_name="ctgan", n_samples_ratio=0.5):
    """Fit an SDV single-table synthesizer on (X, y) and sample synthetic rows.

    Wraps the SDV models so they work with NumPy arrays, pandas objects and
    sparse matrices: the inputs are stacked into a temporary DataFrame with
    columns ``col_0 .. col_{n-1}`` plus ``target``, the metadata is detected
    from that frame, and the sampled frame is split back into features and
    target.

    Args:
        X_in: Feature matrix (sparse matrix, pandas object or NumPy array).
        y_in: Target values, one per row (pandas object or NumPy array).
        model_name: One of ``'ctgan'``, ``'tvae'``, ``'copulagan'``
            (case-insensitive).
        n_samples_ratio: Number of generated rows as a fraction of the input rows.

    Returns:
        Tuple ``(X_syn, y_syn)``: generated features (2-D) and target (1-D).

    Raises:
        ValueError: If ``model_name`` is not one of the supported options.
    """
    # Validate before doing any work: previously an unknown name fell through
    # every branch and failed later with an unbound ``model`` variable.
    model_key = model_name.lower()
    if model_key not in ("ctgan", "tvae", "copulagan"):
        raise ValueError(
            f"Unknown SDV model {model_name!r}; "
            "expected one of 'ctgan', 'tvae', 'copulagan'"
        )

    if hasattr(X_in, "toarray"):
        X_in = X_in.toarray()

    X_arr = X_in.values if hasattr(X_in, "values") else X_in
    y_arr = y_in.values if hasattr(y_in, "values") else y_in
    y_arr = y_arr.reshape(-1, 1)

    n_features = X_arr.shape[1]
    col_names = [f"col_{i}" for i in range(n_features)] + ["target"]

    data_combined = np.hstack([X_arr, y_arr])
    df_temp = pd.DataFrame(data_combined, columns=col_names)

    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=df_temp)

    is_using_gpu = torch.cuda.is_available()

    if model_key == "tvae":
        synthesizer_cls = TVAESynthesizer
    elif model_key == "copulagan":
        synthesizer_cls = CopulaGANSynthesizer
    else:
        synthesizer_cls = CTGANSynthesizer

    model = synthesizer_cls(
        metadata,
        enable_gpu=is_using_gpu,
        verbose=True
    )

    model.fit(df_temp)

    n_samples = int(len(X_arr) * n_samples_ratio)
    synthetic_df = model.sample(num_rows=n_samples)

    data_syn = synthetic_df.values

    # "target" was appended last, so it is the final column.
    X_syn = data_syn[:, :-1]
    y_syn = data_syn[:, -1]

    return X_syn, y_syn

def get_synthetic_data_ctgan(X_in, y_in, n_samples_ratio=0.5):
    """Shortcut for ``get_synthetic_data_sdv`` with ``model_name="ctgan"``."""
    sdv_options = {"model_name": "ctgan", "n_samples_ratio": n_samples_ratio}
    return get_synthetic_data_sdv(X_in, y_in, **sdv_options)

def get_synthetic_data_tvae(X_in, y_in, n_samples_ratio=0.5):
    """Shortcut for ``get_synthetic_data_sdv`` with ``model_name="tvae"``."""
    sdv_options = {"model_name": "tvae", "n_samples_ratio": n_samples_ratio}
    return get_synthetic_data_sdv(X_in, y_in, **sdv_options)

def get_synthetic_data_copulagan(X_in, y_in, n_samples_ratio=0.5):
    """Shortcut for ``get_synthetic_data_sdv`` with ``model_name="copulagan"``."""
    sdv_options = {"model_name": "copulagan", "n_samples_ratio": n_samples_ratio}
    return get_synthetic_data_sdv(X_in, y_in, **sdv_options)

# augment_data()

In [None]:
# Fraction of the training-set size to generate; shared by every model.
n_samples_ratio = 1.00

# Keyword arguments per generative model, keyed by model name.
# Every entry starts with the shared ratio followed by the model-specific
# settings listed below (the SDV-based models take no extra settings here).
gen_model_params = {
    name: {"n_samples_ratio": n_samples_ratio, **extra_settings}
    for name, extra_settings in (
        ("DDPM", {"epochs": 100}),
        ("GAN", {"epochs": 100, "latent_dim": 64}),
        ("VAE", {"epochs": 80, "latent_dim": 16, "hidden_dim": 128}),
        ("CTGAN", {}),
        ("TVAE", {}),
        ("COPULAGAN", {}),
    )
}

In [None]:
def augment_data(model_name, X_train, y_train, params=None):
    """Generate synthetic (X, y) with the generative model selected by name.

    Args:
        model_name: One of DDPM, GAN, VAE, CTGAN, TVAE, COPULAGAN
            (case-insensitive). Any other value falls back to CTGAN.
        X_train: Feature matrix passed through to the selected generator.
        y_train: Target values passed through to the selected generator.
        params: Optional dict of keyword arguments forwarded to the selected
            ``get_synthetic_data_*`` function. ``None`` means no extra
            arguments.

    Returns:
        Whatever the selected generator returns: a tuple ``(X_syn, y_syn)``.
    """
    # ``None`` instead of a ``{}`` default avoids sharing one dict object
    # across every call of the function.
    kwargs = {} if params is None else params
    key = model_name.upper()

    if key == "DDPM":
        return get_synthetic_data_ddpm(X_train, y_train, **kwargs)
    if key == "GAN":
        return get_synthetic_data_gan(X_train, y_train, **kwargs)
    if key == "VAE":
        return get_synthetic_data_vae(X_train, y_train, **kwargs)
    if key == "COPULAGAN":
        return get_synthetic_data_copulagan(X_train, y_train, **kwargs)
    if key == "TVAE":
        return get_synthetic_data_tvae(X_train, y_train, **kwargs)
    # Defaults to CTGAN
    return get_synthetic_data_ctgan(X_train, y_train, **kwargs)


# Testing the models

In [None]:
# Name of the dataset to load; selects one of the branches below.
loaded_dataset = "california"

if loaded_dataset == "california":
    # All-numeric regression dataset; X and y are used as returned.
    from sklearn.datasets import fetch_california_housing
    data = fetch_california_housing()
    X = data.data
    y = data.target
elif loaded_dataset == "custo":
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

    # Placeholders: fill in the Kaggle dataset handle and the file inside it.
    file_path = "dataset.csv"
    dataset_path = "username/dataset"

    df = kagglehub.load_dataset(
      KaggleDatasetAdapter.PANDAS,
      dataset_path,
      file_path,
    )
    df = df.rename(columns={'custo': 'Target'})

    X = df.drop(columns=["Target"])
    y = df["Target"]

    # Split feature columns by dtype: object/category -> categorical.
    cat_cols = [c for c in X.columns if X[c].dtype == 'object' or str(X[c].dtype).startswith('category')]
    num_cols = [c for c in X.columns if c not in cat_cols]

    ord_cols = []

    # "class_etaria" is ordinal-encoded instead of one-hot encoded.
    if "class_etaria" in cat_cols:
        ord_cols.append("class_etaria")
        cat_cols.remove("class_etaria")

    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    ordinal_encoder = OrdinalEncoder().set_output(transform="pandas")

    # Columns not listed in a transformer are passed through unchanged.
    preprocess_transformer = ColumnTransformer(
       transformers=[("onehot_encoder", onehot_encoder, cat_cols),
                     ("ordinal_encoder", ordinal_encoder, ord_cols)],
       remainder="passthrough"
    )

    X = preprocess_transformer.fit_transform(X)
    y = y.to_numpy()
else:
    # Without this, a typo would leave X / y undefined (or silently reuse the
    # values from a previous run of this cell).
    raise ValueError(
        f"Unknown dataset {loaded_dataset!r}; expected 'california' or 'custo'"
    )

In [None]:
# Synthetic output per model: {model name: {"x": features, "y": target}}.
synth_data = {}

# Train and sample each generator in turn, printing its name as progress.
for generator_name in ("GAN", "CTGAN", "COPULAGAN", "VAE", "TVAE", "DDPM"):
    print(generator_name)
    X_aug, y_aug = augment_data(generator_name, X, y, gen_model_params[generator_name])
    synth_data[generator_name] = {"x": X_aug, "y": y_aug}

In [None]:
import matplotlib.pyplot as plt

n_features = X.shape[1]

# Draw order of the synthetic sources on top of the original data.
synthetic_labels = ("DDPM", "GAN", "VAE", "COPULAGAN", "TVAE", "CTGAN")
hist_style = {"bins": 30, "alpha": 0.4, "density": True}

# One figure per feature: original distribution overlaid with every generator.
for i in range(n_features):
    plt.hist(X[:, i], label="Original", **hist_style)
    for source in synthetic_labels:
        plt.hist(synth_data[source]["x"][:, i], label=source, **hist_style)
    plt.title(f"Feature {i}")
    plt.legend()
    plt.show()

In [None]:
import pickle

# Output file name encodes the dataset and the sampling ratio of this run.
synth_data_path = f"/kaggle/working/synth data {loaded_dataset} {n_samples_ratio}.pkl"

with open(synth_data_path, "wb") as pickle_file:
    # Show which models are being saved before writing them.
    print(synth_data.keys())
    pickle.dump(synth_data, pickle_file)

#with open("/kaggle/working/synthetic_data_dict.pkl", "rb") as f:
#    synth_data = pickle.load(f)
#    print(synth_data.keys())

In [None]:
# Print the shape of the synthetic feature matrix produced by each model.
# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on Python versions before 3.12.
for model, generated in synth_data.items():
    print(f"{model} - {generated['x'].shape}")