<a href="https://colab.research.google.com/github/MonishSoundarRaj/data_check_generator/blob/main/main_findiff.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install required libraries.

In [None]:
# install required libraries
!pip install sdv # install the synthetic data vault library



Import required libraries.

In [None]:
# import data science libraries
import pandas as pd
import numpy as np
import math

# import scikit-learn preprocessing
from sklearn.preprocessing import LabelEncoder, QuantileTransformer

# import pytorch libraries
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

# import synthetic data vault libraries
from sdv.metadata import SingleTableMetadata
import sdv.evaluation.single_table as sdv_st

# import utility libraries
from tqdm import tqdm
import xlrd
import requests
from io import BytesIO
from zipfile import ZipFile
from datetime import datetime

# import visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns

Init and set experiment parameters.

In [None]:
# --- reproducibility ---
seed = 1234  # value used to seed the random number generators

# --- model architecture ---
cat_emb_dim = 2  # dimension of categorical embeddings
mlp_layers = [1024] * 4  # number of neurons per hidden layer (four layers)
activation = 'lrelu'  # non-linear activation function

# --- diffusion process ---
diffusion_steps = 500  # number of diffusion steps
diffusion_beta_start = 1e-4  # beta at the start of the noise schedule
diffusion_beta_end = 0.02  # beta at the end of the noise schedule
scheduler = 'linear'  # type of the noise schedule

# --- optimisation ---
epochs = 30  # number of training epochs
batch_size = 512  # training batch size
learning_rate = 1e-4  # training learning rate

Set random seed values.

In [None]:
# seed every random number generator used in this notebook with the same
# value, in this order: numpy, pytorch, pytorch cuda
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)

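## Load, pre-process, and init the dataset
The original FinDiff example uses the UCI Credit Card dataset, which is available under https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients. This notebook instead reads its training data from the local file `philly_data_training.csv`.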

In [None]:
# NOTE: the download of the UCI credit card archive below is disabled; the
# training data is read from a local CSV file instead.
# # set data path
# data_url = 'https://archive.ics.uci.edu/static/public/350/default+of+credit+card+clients.zip'

# # download the file
# response = requests.get(data_url)

# # determine the zip file
# zip_file = ZipFile(BytesIO(response.content))

# # extract the zip file
# zip_file.extractall('data')

# read the training data from the local file `philly_data_training.csv`
train_raw = pd.read_csv("philly_data_training.csv")

Inspect the top 10 rows and the attribute names of the retrieved dataset.

In [None]:
# display the first 10 rows of the raw training data
train_raw.head(10)

Unnamed: 0,u_id,job,user,project,gpu_num,cpu_num,node_num,interval,run_time,wall_time,node_hour,core_hour,new_status
0,0,1,1,1,1,0,1,0.0,1389243.0,0,1389243.0,1389243.0,Killed
1,1,2,2,2,8,0,4,395146.0,14405.0,0,57620.0,115240.0,Failed
2,2,3,3,1,1,0,1,257893.0,66336.0,0,66336.0,66336.0,Killed
3,3,3,3,1,1,0,1,0.0,1042044.0,0,1042044.0,1042044.0,Killed
4,4,3,3,1,1,0,1,0.0,2504635.0,0,2504635.0,2504635.0,Killed
5,5,4,2,2,8,0,1,195071.0,100547.0,0,100547.0,804376.0,Failed
6,7,5,1,1,1,0,1,7949.0,816099.0,0,816099.0,816099.0,Failed
7,8,5,1,1,1,0,1,0.0,120018.0,0,120018.0,120018.0,Failed
8,6,5,1,1,1,0,1,0.0,478477.0,0,478477.0,478477.0,Failed
9,9,6,1,1,1,0,1,63543.0,1064989.0,0,1064989.0,1064989.0,Killed


In [None]:
# display the attribute (column) names of the raw training data
train_raw.columns

Index(['u_id', 'job', 'user', 'project', 'gpu_num', 'cpu_num', 'node_num',
       'interval', 'run_time', 'wall_time', 'node_hour', 'core_hour',
       'new_status'],
      dtype='object')

In [None]:
# drop the attributes that are not modelled: job, project, node_hour,
# core_hour and new_status; `errors='ignore'` keeps the cell re-runnable,
# because on a second execution the columns are already gone and `drop`
# would otherwise raise a KeyError
train_raw = train_raw.drop(
    columns=['job', 'project', 'node_hour', 'core_hour', 'new_status'],
    errors='ignore'
)


In [None]:
# display the attribute names that remain after dropping the unused columns
train_raw.columns

Index(['u_id', 'user', 'gpu_num', 'cpu_num', 'node_num', 'interval',
       'run_time', 'wall_time'],
      dtype='object')

Set numerical and categorical dataset attributes.

Pre-process dataset attributes.

In [None]:
# strip the underscores from every column name (needed for correct inverse decoding)
renamed_columns = []
for column_name in train_raw.columns:
    renamed_columns.append(column_name.replace('_', ''))
train_raw.columns = renamed_columns

# the categorical pre-processing below is disabled (no categorical attributes are used):
# # convert categorical attributes to string
# train_raw[cat_attrs] = train_raw[cat_attrs].astype(str)

# # iterate over categorical attributes
# for cat_attr in cat_attrs:

#     # add col name to every categorical entry to make them distinguishable for embedding
#     train_raw[cat_attr] = cat_attr + '_' + train_raw[cat_attr].astype('str')

In [None]:
# categorical attributes are not used for this dataset (list kept for reference)
# cat_attrs = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY0', 'PAY2', 'PAY3', 'PAY4', 'PAY5', 'PAY6']

# numerical attributes to model (column names with the underscores already removed)
num_attrs = [
    'uid',
    'user',
    'gpunum',
    'cpunum',
    'nodenum',
    'interval',
    'runtime',
    'walltime',
]

Set dataset label.

In [None]:
# # extract label
# label = train_raw['default payment next month']

Determine the training data.

In [None]:
# select the numerical attributes as training data (no categorical attributes are used)
train = train_raw[num_attrs]

Transform the numerical attributes.

In [None]:
# numerical attributes the transformation is fitted on and applied to
numeric_columns = train[num_attrs]

# init the quantile transformation (normal output distribution) and fit it
# to the numerical attributes; `fit` returns the fitted transformer itself
num_scaler = QuantileTransformer(random_state=seed, output_distribution='normal').fit(numeric_columns)

# transform the numerical attributes with the fitted transformation
train_num_scaled = num_scaler.transform(numeric_columns)

Transform the categorical attributes.

In [None]:
# # get vocabulary of categorical attributes
# vocabulary_classes = np.unique(train[cat_attrs])

# # init categorical attribute encoder
# label_encoder = LabelEncoder()

# # fit encoder to categorical attributes
# label_encoder.fit(vocabulary_classes)

# # transform categorical attributes
# train_cat_scaled = train[cat_attrs].apply(label_encoder.transform)

# # collect unique values of each categorical attribute
# vocab_per_attr = {cat_attr: set(train_cat_scaled[cat_attr]) for cat_attr in cat_attrs}

Convert the numerical attributes to tensors (categorical attributes and labels are not used in this notebook).

In [None]:
# convert the scaled numerical attributes to a 32-bit float tensor
train_num_torch = torch.FloatTensor(train_num_scaled)

# categorical attributes and labels are not used, so their conversion stays disabled:
# # convert categorical attributes
# train_cat_torch = torch.LongTensor(train_cat_scaled.values)

# # convert label
# label_torch = torch.LongTensor(label)

Convert dataset to tensor dataset.

In [None]:
# init tensor dataset; it wraps a single tensor, so every item (and every
# batch drawn from it) is a one-element tuple
train_set = TensorDataset(
    train_num_torch, # numerical attributes
)

Init the data loader.

In [None]:
# wrap the training set in a mini-batch loader that reshuffles the data
# every epoch and loads it in the main process (no worker processes)
dataloader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)

## Implement the FinDiff model

Implement the FinDiff backbone model.

In [None]:
# define base feedforward network
class BaseNetwork(nn.Module):
    """Plain feedforward stack: a chain of linear layers, each followed by the same activation."""

    def __init__(self, hidden_size, activation='lrelu'):
        """Build the linear layers and select the activation.

        :param hidden_size: sequence of layer widths; every pair of consecutive
            entries defines one linear layer.
        :param activation: 'lrelu', 'relu' or 'tanh'; any other value selects a sigmoid.
        """
        super().__init__()

        # the linear layers are created (and registered) before the activation
        self.layers = self.init_layers(hidden_size)

        # activation constructors keyed by name; only the selected one is instantiated
        activation_factories = {
            'lrelu': lambda: nn.LeakyReLU(negative_slope=0.4, inplace=True),
            'relu': lambda: nn.ReLU(inplace=True),
            'tanh': nn.Tanh,
        }

        # unknown names fall back to the sigmoid activation
        self.activation = activation_factories.get(activation, nn.Sigmoid)()

    def init_layers(self, layer_dimensions):
        """Create one linear layer per pair of consecutive dimensions.

        Weights are Xavier-uniform initialised, biases are set to zero. Every
        layer is registered on the module as 'linear_<index>' so that its
        parameters are tracked, because the returned plain list is not.

        :param layer_dimensions: sequence of layer widths.
        :return: list of the created linear layers, in order.
        """
        created = []
        dimension_pairs = zip(layer_dimensions[:-1], layer_dimensions[1:])

        for index, (dim_in, dim_out) in enumerate(dimension_pairs):
            linear = nn.Linear(dim_in, dim_out, bias=True)
            nn.init.xavier_uniform_(linear.weight)
            nn.init.constant_(linear.bias, 0.0)

            # register the layer parameters under a stable name
            self.add_module('linear_' + str(index), linear)
            created.append(linear)

        return created

    def forward(self, x):
        """Pass `x` through every linear layer, applying the activation after each one."""
        for linear in self.layers:
            x = self.activation(linear(x))
        return x

Implement the FinDiff model synthesizer.

In [None]:
class MLPSynthesizer(nn.Module):
    """Timestep-conditioned MLP mapping a batch of `d_in` features to an output of the same width."""

    def __init__(
            self,
            d_in: int,
            hidden_layers: list,
            activation: str='lrelu', # layer activation
            dim_t: int=64
        ):
        """Create the sub-modules.

        :param d_in: number of input (and output) features.
        :param hidden_layers: widths of the backbone layers.
        :param activation: activation name forwarded to the backbone.
        :param dim_t: width of the timestep embedding and of the projected input.
        """
        super().__init__()

        self.dim_t = dim_t

        # feedforward backbone operating on the dim_t wide embedding
        self.backbone = BaseNetwork([dim_t, *hidden_layers], activation=activation)

        # lifts the input features to the embedding width
        self.projection = nn.Sequential(
            nn.Linear(d_in, dim_t),
            nn.SiLU(),
            nn.Linear(dim_t, dim_t),
        )

        # refines the sinusoidal timestep embedding
        self.time_embed = nn.Sequential(
            nn.Linear(dim_t, dim_t),
            nn.SiLU(),
            nn.Linear(dim_t, dim_t),
        )

        # maps the last backbone layer back to the input width
        self.head = nn.Linear(hidden_layers[-1], d_in)

    def embed_time(self, timesteps, dim_out, max_period=10000):
        """Compute a sinusoidal embedding of `timesteps`.

        The first half of the embedding holds cosines and the second half sines
        of the timesteps multiplied by geometrically decaying frequencies; for
        an odd `dim_out` a trailing zero column is appended.

        :param timesteps: 1-D tensor of timesteps.
        :param dim_out: width of the returned embedding.
        :param max_period: controls the smallest frequency.
        :return: tensor of shape (len(timesteps), dim_out).
        """
        n_freqs = dim_out // 2
        positions = torch.arange(start=0, end=n_freqs, dtype=torch.float32)
        freqs = torch.exp(-math.log(max_period) * positions / n_freqs)
        freqs = freqs.to(device=timesteps.device)

        # outer product: one row per timestep, one column per frequency
        angles = timesteps[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)

        # odd widths are filled up with a single zero column
        if dim_out % 2 != 0:
            zero_column = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat([embedding, zero_column], dim=-1)

        return embedding

    def forward(self, x, timesteps):
        """Add the timestep embedding to the projected input and run backbone and head."""
        t_emb = self.time_embed(self.embed_time(timesteps, self.dim_t))
        hidden = self.backbone(self.projection(x) + t_emb)
        return self.head(hidden)


Implement the FinDiff model base diffuser.

In [None]:
# define BaseDiffuser network
class BaseDiffuser(object):
    """Gaussian diffusion process: noise schedule, forward noising and one reverse sampling step.

    Timesteps are zero-based indices into the schedule tensors `alphas`,
    `betas` and `alphas_hat` (the cumulative product of `alphas`).
    """

    # define base diffuser network constructor
    def __init__(
            self,
            total_steps=1000,
            beta_start=1e-4,
            beta_end=0.02,
            device='cpu',
            scheduler='linear'
        ):
        """Store the configuration and pre-compute the noise schedule.

        :param total_steps: number of diffusion timesteps.
        :param beta_start: beta at the start of the schedule.
        :param beta_end: beta at the end of the schedule.
        :param device: device the schedule tensors are moved to.
        :param scheduler: 'linear' or 'quad'.
        :raises ValueError: if `scheduler` is not a supported name.
        """

        # set diffusion steps
        self.total_steps = total_steps

        # set diffusion start beta
        self.beta_start = beta_start

        # set diffusion end beta
        self.beta_end = beta_end

        # set compute device
        self.device = device

        # set noise schedule alphas and betas
        self.alphas, self.betas = self.prepare_noise_schedule(scheduler=scheduler)

        # set noise schedule alpha hats (cumulative product of the alphas)
        self.alphas_hat = torch.cumprod(self.alphas, dim=0)

    # define noise schedule
    def prepare_noise_schedule(self, scheduler: str):
        """Compute the per-timestep alphas and betas.

        :param scheduler: 'linear' or 'quad'.
        :return: tuple (alphas, betas), both of length `total_steps`, on `self.device`.
        :raises ValueError: if `scheduler` is not a supported name.
        """

        # rescale the betas relative to a 1000-step schedule
        scale = 1000 / self.total_steps
        beta_start = scale * self.beta_start
        beta_end = scale * self.beta_end

        # case: linear noise scheduler
        if scheduler == 'linear':

            # betas grow linearly between the scaled start and end beta
            betas = torch.linspace(beta_start, beta_end, self.total_steps)

        # case: quadratic noise scheduler
        elif scheduler == 'quad':

            # betas grow quadratically
            # NOTE(review): unlike the linear case this uses the unscaled
            # start / end betas; kept as is to preserve existing results
            betas = torch.linspace(self.beta_start ** 0.5, self.beta_end ** 0.5, self.total_steps) ** 2

        # case: unsupported noise scheduler
        else:

            # fail with a clear message instead of an unbound local variable error
            raise ValueError(
                "unknown noise scheduler '{}', expected 'linear' or 'quad'".format(scheduler)
            )

        # determine noise schedule alphas
        alphas = 1.0 - betas

        # return noise scheduler alphas and betas
        return alphas.to(self.device), betas.to(self.device)

    # define random timesteps sampler
    def sample_random_timesteps(self, n: int):
        """Draw `n` random integer timesteps on `self.device`.

        :param n: number of timesteps to draw.
        :return: 1-D integer tensor of length `n`.
        """

        # sample random timesteps from [1, total_steps - 1]
        # NOTE(review): the lower bound of 1 means timestep 0 is never drawn
        t = torch.randint(low=1, high=self.total_steps, size=(n,), device=self.device)

        # return random timesteps
        return t

    # define gaussian noise addition
    def add_gauss_noise(self, x_num, t):
        """Noise `x_num` according to the schedule at the timesteps `t`.

        Computes sqrt(alpha_hat_t) * x_num + sqrt(1 - alpha_hat_t) * noise with
        freshly drawn standard normal noise.

        :param x_num: 2-D tensor of numeric attributes, one row per sample.
        :param t: 1-D tensor with one timestep index per row of `x_num`.
        :return: tuple (noised input, noise that was added).
        """

        # per-row signal and noise scales, broadcast over the feature dimension
        sqrt_alpha_hat = torch.sqrt(self.alphas_hat[t])[:, None]
        sqrt_one_minus_alpha_hat = torch.sqrt(1 - self.alphas_hat[t])[:, None]

        # determine numeric noise
        noise_num = torch.randn_like(x_num)

        # determine x numeric noise
        x_noise_num = sqrt_alpha_hat * x_num + sqrt_one_minus_alpha_hat * noise_num

        # return x numeric noise and numeric noise
        return x_noise_num, noise_num

    # define gaussian noise sampling
    def p_sample_gauss(self, model_out, z_norm, timesteps):
        """Perform one reverse diffusion step.

        :param model_out: model output for `z_norm` at `timesteps`, used as the noise estimate.
        :param z_norm: current samples, one row per sample.
        :param timesteps: 1-D tensor with one timestep index per row.
        :return: samples after the reverse step; rows at timestep 0 receive no fresh noise.
        """

        # per-row schedule values, broadcast over the feature dimension
        sqrt_alpha_t = torch.sqrt(self.alphas[timesteps])[:, None]
        betas_t = self.betas[timesteps][:, None]
        sqrt_one_minus_alpha_hat_t = torch.sqrt(1 - self.alphas_hat[timesteps])[:, None]

        # standard deviation of the noise added in this step
        epsilon_t = torch.sqrt(betas_t)

        # determine random noise; no noise is added at the final timestep 0
        random_noise = torch.randn_like(z_norm)
        random_noise[timesteps == 0] = 0.0

        # determine model mean
        model_mean = ((1 / sqrt_alpha_t) * (z_norm - (betas_t * model_out / sqrt_one_minus_alpha_hat_t)))

        # determine z norm
        z_norm = model_mean + (epsilon_t * random_noise)

        # return z norm
        return z_norm

## Initialize and train the FinDiff model

In [None]:
# no categorical attributes are modelled, so the categorical token count and
# the categorical embedding dimension stay disabled:
# n_cat_tokens = len(np.unique(train[cat_attrs]))
# cat_dim = cat_emb_dim * len(cat_attrs)

# the encoded input consists of the numerical attributes only (one dimension
# per attribute), so both dimensions are identical
encoded_dim = num_dim = len(num_attrs)

In [None]:
# initialize the FinDiff synthesizer model; its input and output width is the
# encoded dimension, the time-embedding width `dim_t` is left at its default
synthesizer_model = MLPSynthesizer(
    d_in=encoded_dim,
    hidden_layers=mlp_layers,
    activation=activation
)


In [None]:
# initialize the FinDiff base diffuser model
diffuser_model = BaseDiffuser(
    total_steps=diffusion_steps,
    beta_start=diffusion_beta_start,
    beta_end=diffusion_beta_end,
    scheduler=scheduler
)

Init optimizer, scheduler and loss function.

In [None]:
# determine the trainable synthesizer model parameters
parameters = (param for param in synthesizer_model.parameters() if param.requires_grad)

# init Adam optimizer
optimizer = optim.Adam(parameters, lr=learning_rate)

# init cosine learning rate scheduler spanning all training epochs; the
# `verbose` argument is no longer passed: it is deprecated in recent PyTorch
# releases and False was its default, so the behaviour is unchanged
lr_scheduler = CosineAnnealingLR(optimizer, T_max=epochs)

# init mean-squared-error loss
loss_fnc = nn.MSELoss()

In [None]:
# Initialize collection of training epoch losses
train_epoch_losses = []

# Set the model in training mode
synthesizer_model.train()

# Initialize the training progress bar
pbar = tqdm(iterable=range(epochs), position=0, leave=True)

# Iterate over training epochs
for epoch in pbar:
    # Initialize epoch training batch losses
    batch_losses = []

    # Iterate over epoch batches; the tensor dataset wraps a single tensor,
    # so every batch is a one-element sequence holding the numerical attributes
    for (batch_num,) in dataloader:

        # Sample one diffusion timestep per batch row
        timesteps = diffuser_model.sample_random_timesteps(n=batch_num.shape[0])

        # Add diffuser Gaussian noise
        batch_noise_t, noise_t = diffuser_model.add_gauss_noise(x_num=batch_num, t=timesteps)

        # Conduct synthesizer model forward pass (no labels are used)
        predicted_noise = synthesizer_model(x=batch_noise_t, timesteps=timesteps)

        # Compute training batch loss: the prediction is the `input`, the
        # noise that was actually added is the (gradient-free) `target`
        batch_loss = loss_fnc(input=predicted_noise, target=noise_t)

        # Reset model gradients
        optimizer.zero_grad()

        # Run model backward pass
        batch_loss.backward()

        # Optimize model parameters
        optimizer.step()

        # Collect training batch losses
        batch_losses.append(batch_loss.detach().cpu().numpy())

    # Determine mean training epoch loss
    batch_losses_mean = np.mean(np.array(batch_losses))

    # Update learning rate scheduler (once per epoch)
    lr_scheduler.step()

    # Collect mean training epoch loss
    train_epoch_losses.append(batch_losses_mean)

    # Prepare and set training epoch progress bar update
    now = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    pbar.set_description('[LOG {}] epoch: {}, train-loss: {}'.format(str(now), str(epoch).zfill(4), str(batch_losses_mean)))


[LOG 2024-03-13 13:54:07] epoch: 0029, train-loss: 0.2029375: 100%|██████████| 30/30 [03:12<00:00,  6.42s/it]


## Generate Data using the FinDiff model

Init and set sampling parameters.

In [None]:
# set number of generated samples (None: no explicit sample count is configured)
n_samples = None

# set number of diffusion steps
# NOTE(review): this rebinds `diffusion_steps`, replacing the value (500) the
# diffuser was built with; the schedule length of the existing diffuser
# (`diffuser_model.total_steps`) is not affected, but re-running an earlier
# cell that reads `diffusion_steps` after this one would pick up 10
diffusion_steps = 10

Use FinDiff to generate new data samples.

Decode generated FinDiff samples.

In [None]:
# Use FinDiff to generate new data samples.

# number of samples to generate; falls back to 15000 when `n_samples` is not set
num_generated = n_samples if n_samples is not None else 15000

# the reverse process starts from pure Gaussian noise, i.e. from the last
# timestep of the noise schedule, so it has to walk through the complete
# schedule of the diffuser instead of only its last few timesteps
sampling_steps = diffuser_model.total_steps

# start from pure Gaussian noise
samples = torch.randn((num_generated, encoded_dim))

# `reversed(...)` has no length, so the total is passed explicitly
pbar = tqdm(iterable=reversed(range(0, sampling_steps)), total=sampling_steps, position=0, leave=True)

# sampling needs no gradients: without `no_grad` the autograd graph would grow
# with every diffusion step and keep all intermediate activations in memory
synthesizer_model.eval()
with torch.no_grad():

    # iterate over the diffusion steps, from the noisiest to step 0
    for diffusion_step in pbar:
        now = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
        pbar.set_description('[LOG {}] Diffusion Step: {}'.format(str(now), str(diffusion_step).zfill(4)))

        # all samples are at the same timestep
        timesteps = torch.full((samples.shape[0],), diffusion_step, dtype=torch.long)

        # predict the noise and take one reverse diffusion step
        model_out = synthesizer_model(x=samples.float(), timesteps=timesteps)
        samples = diffuser_model.p_sample_gauss(model_out, samples, timesteps)

samples = samples.detach().numpy()

# Denormalize numeric attributes (invert the quantile transformation)
z_norm_upscaled = num_scaler.inverse_transform(samples)
z_norm_df = pd.DataFrame(z_norm_upscaled, columns=num_attrs)

samples_decoded = z_norm_df


[LOG 2024-03-13 13:54:39] Diffusion Step: 0000: : 10it [00:02,  3.44it/s]


In [None]:
# display the first 10 decoded synthetic samples
samples_decoded.head(10)

Unnamed: 0,uid,user,gpunum,cpunum,nodenum,interval,runtime,walltime
0,7398.025879,45.0,1.0,0.0,1.0,4.0,964.961426,0.0
1,13111.475586,51.0,1.0,0.0,1.0,4.0,956.678406,0.0
2,12048.351562,128.043137,1.0,0.0,1.0,5.0,122791.960938,0.0
3,523.095337,47.0,1.0,0.0,1.0,0.0,2711.081299,0.0
4,5903.635742,10.0,1.0,0.0,1.0,31.0,13354.5,0.0
5,2102.328369,10.0,2.0,0.0,1.0,0.0,99.241859,0.0
6,5567.489746,21.881939,8.0,0.0,1.0,15.866567,73.63401,0.0
7,11151.073242,77.059204,1.0,0.0,1.0,11.327175,899.12616,0.0
8,10095.381836,64.367233,1.0,0.0,1.0,27.0,77.0,0.0
9,10309.771484,47.0,1.0,0.0,1.0,45.468571,48.0,0.0


In [None]:
# write the decoded synthetic samples to a CSV file (without the index column)
samples_decoded.to_csv("philly_syn_findiff.csv", index=False)

In [None]:
# download the generated .csv file when running on Google Colab; the
# `google.colab` package only exists there, so outside of Colab the download
# is skipped instead of aborting the notebook before the evaluation
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("philly_syn_findiff.csv")
else:
    print("Not running on Google Colab: skipping the download of 'philly_syn_findiff.csv'.")


## Evaluate the Generated Data

In [None]:
# describe the real training table for SDV by detecting its column types
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=train)

# compare the synthetic samples against the real training data and
# generate the SDV quality report
quality_report = sdv_st.evaluate_quality(
    metadata=metadata,
    synthetic_data=samples_decoded,
    real_data=train,
)

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 8/8 [00:00<00:00, 308.38it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 28/28 [00:01<00:00, 27.55it/s]

Overall Score: 55.23%

Properties:
- Column Shapes: 97.24%
- Column Pair Trends: 13.22%
