In [36]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sdmetrics.reports.utils import get_column_plot, get_column_pair_plot
from sdmetrics.reports.single_table import QualityReport
from sdv import Metadata
import seaborn as sns
import matplotlib.pyplot as plt

In [37]:
import warnings
warnings.filterwarnings('ignore')

# Data pre-process

## Read cleaned data

In [38]:
# df = pd.read_csv('../OlympicHistory/CleanedData.csv')
df = pd.read_csv('../OlympicHistory/CleanedData.csv').iloc[0:10000, :]

In [39]:
df

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Year,Season,City,Sport,Event,Medal,AOS,AOE,YOB
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,Thanks,1,1,1968
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,Thanks,1,1,1989
2,3,Gunnar Nielsen Aaby,M,24.0,175.0,71.0,Denmark,DEN,1920,Summer,Antwerpen,Football,Football Men's Football,Thanks,1,1,1896
3,4,Edgar Lindenau Aabye,M,34.0,182.0,95.0,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,1,1,1866
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,Thanks,1,2,1967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,7390,Serhiy Pavlovych Baltacha,M,22.0,181.0,75.0,Soviet Union,URS,1980,Summer,Moskva,Football,Football Men's Football,Bronze,1,1,1958
9996,7391,Eduardo Baltar,M,19.0,160.0,48.0,Philippines,PHI,1976,Summer,Montreal,Boxing,Boxing Men's Light-Flyweight,Thanks,1,1,1957
9997,7392,Dimitrios Baltas,M,34.0,175.0,70.0,Greece,GRE,1992,Summer,Barcelona,Shooting,"Shooting Men's Air Pistol, 10 metres",Thanks,1,3,1958
9998,7393,Nikolaos Baltatzis-Mavrokordatos,M,25.0,185.0,84.0,Greece,GRE,1920,Summer,Antwerpen,Water Polo,Water Polo Men's Water Polo,Thanks,1,1,1895


## Find continuous data

In [40]:
df_conti = df[['Age','Height', 'Weight']]
df_conti

Unnamed: 0,Age,Height,Weight
0,24.0,180.0,80.0
1,23.0,170.0,60.0
2,24.0,175.0,71.0
3,34.0,182.0,95.0
4,21.0,185.0,82.0
...,...,...,...
9995,22.0,181.0,75.0
9996,19.0,160.0,48.0
9997,34.0,175.0,70.0
9998,25.0,185.0,84.0


## Normalization

In [41]:
def max_abs_norm(data: pd.DataFrame, column: str):
    max_val = data[column].abs().max()
    data[column] = data[column] / max_val
    return data, max_val


def min_max_norm(data: pd.DataFrame, column: str):
    min_val = data[column].min()
    max_val = data[column].max()
    data[column] = (data[column] - min_val) / (max_val - min_val)
    return data, min_val, max_val


def standardization(data: pd.DataFrame, column: str):
    mean_val = data[column].mean()
    std_val = data[column].std()
    data[column] = (data[column] - mean_val) / std_val
    return data, mean_val, std_val


def max_abs_denorm(data: pd.DataFrame, column: str, norm_dict: dict):
    max_val = norm_dict[column]
    data[column] = data[column] * max_val
    return data


def min_max_denorm(data: pd.DataFrame, column: str, norm_dict: dict):
    min_val = norm_dict[column][0]
    max_val = norm_dict[column][1]
    data[column] = data[column] * (max_val - min_val) + min_val
    return data


def destandardization(data: pd.DataFrame, column: str, norm_dict: dict):
    mean_val = norm_dict[column][0]
    std_val = norm_dict[column][1]
    data[column] = data[column] * std_val + mean_val
    return data

In [42]:
def norm(data: pd.DataFrame, columns: [], norm_types: []):
    norm_dict = {}
    for i in range(len(columns)):
        if norm_types[i] == 'max_abs':
            data, max_val = max_abs_norm(data, columns[i])
            norm_dict.update({columns[i]: max_val})
        
        if norm_types[i] == 'min_max':
            data, min_val, max_val = min_max_norm(data, columns[i])
            norm_dict.update({columns[i]: [min_val, max_val]})
            
        if norm_types[i] == 'standard':
            data, mean_val, std_val = standardization(data, columns[i])
            norm_dict.update({columns[i]: [mean_val, std_val]})

    return data, norm_dict

def denorm(data: pd.DataFrame, columns: [], norm_types: [], norm_dict: dict):
    for i in range(len(columns)):
        if norm_types[i] == 'max_abs':
            data = max_abs_denorm(data, columns[i], norm_dict)
        
        if norm_types[i] == 'min_max':
            data = min_max_denorm(data, columns[i], norm_dict)
        
        if norm_types[i] == 'standard':
            data = destandardization(data, columns[i], norm_dict)
    
    return data

In [43]:
norm_list = ['Age', 'Height', 'Weight']
norm_types = ['standard', 'standard', 'standard']
df_conti_norm, dict_conti = norm(df_conti, norm_list, norm_types)
df_conti_norm

Unnamed: 0,Age,Height,Weight
0,-0.268326,0.425031,0.591794
1,-0.438175,-0.635134,-0.889469
2,-0.268326,-0.105052,-0.074774
3,1.430157,0.637064,1.702742
4,-0.777871,0.955113,0.739921
...,...,...,...
9995,-0.608023,0.531047,0.221478
9996,-1.117568,-1.695300,-1.778227
9997,1.430157,-0.105052,-0.148837
9998,-0.098478,0.955113,0.888047


## Reshape data to 4d array

In [44]:
df_length = len(df_conti_norm.columns)
input_data = df_conti_norm.to_numpy().reshape(-1, 1, 1, df_length)
input_data.shape

(10000, 1, 1, 3)

# Build PyTorch Dataset

## Dataset class

In [45]:
class OlympicDataset(Dataset):
    
    def __init__(self, data: pd.DataFrame, transform=None):
        self.data = torch.from_numpy(data).float()
#         self.data = data
#         self.transform = transforms.Compose([transforms.ToTensor()]) 
        
    
    def __len__(self):
        return len(self.data)
    
    
    def __getitem__(self, idx):
        data_content = self.data[idx]
#         data_label = int(1)
#         return (data_content, data_label)
#         return self.transform((data_content, data_label))
#         return self.transform(data_content)
#         return (data_content, data_label)[0]
        return data_content

## Pytorch Normalization

In [46]:
# loader_all = DataLoader(input_data, batch_size=len(input_data), num_workers=1)
# data = next(iter(loader_all))
# mean = float(data.mean().detach().numpy())
# std = float(data.std().detach().numpy())

In [47]:
# transforms = transforms.Compose(
#     [transforms.ToTensor(), transforms.Normalize(mean, std)]
# )

transforms = transforms.Compose(
    [transforms.ToTensor()]
)

# Build GAN

## Hyperparameters

In [48]:
# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 3e-4
z_dim = 64 # 128, 256
# image_dim = 5 * 5 * 1  # 25
# image_dim = 1 * 5 * 1  # 5
image_dim = 1 * df_length * 1


batch_size = 32
num_epochs = 200

fixed_noise = torch.randn((batch_size, z_dim)).to(device)

## Dataset and DataLoader

In [49]:
dataset = OlympicDataset(input_data, transform=transforms)
# dataset = OlympicDataset(input_data, transform=None)

loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

## Discriminator

In [50]:
class Discriminator(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.disc = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.LeakyReLU(0.01),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.disc(x)

disc = Discriminator(image_dim).to(device)

## Generator

In [51]:
class Generator(nn.Module):
    def __init__(self, z_dim, img_dim):
        super().__init__()
        self.gen = nn.Sequential(
            nn.Linear(z_dim, 256),
            nn.LeakyReLU(0.01),
            nn.Linear(256, img_dim),
#             nn.Tanh(),  # normalize inputs to [-1, 1] so make outputs [-1, 1]
        )

    def forward(self, x):
        return self.gen(x)
    
gen = Generator(z_dim, image_dim).to(device)

## Optimizer

In [52]:
opt_disc = optim.Adam(disc.parameters(), lr=lr)
opt_gen = optim.Adam(gen.parameters(), lr=lr)
criterion = nn.BCELoss()

## Tensorboard

In [53]:
# writer_fake = SummaryWriter(f"logs/fake")
# writer_real = SummaryWriter(f"logs/real")

# Train/Test

In [54]:
step = 0
for epoch in range(num_epochs):
#     for batch_idx, (real, _) in enumerate(loader):
    for batch_idx, real in enumerate(loader):

#         real = real.view(-1, 25).to(device)
#         real = real.view(-1, 5).to(device)
        real = real.view(-1, 1*df_length).to(device)

        batch_size = real.shape[0]

        ### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
        noise = torch.randn(batch_size, z_dim).to(device)
        fake = gen(noise)
        disc_real = disc(real).view(-1)
        lossD_real = criterion(disc_real, torch.ones_like(disc_real))
        disc_fake = disc(fake).view(-1)
        lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        lossD = (lossD_real + lossD_fake) / 2
        disc.zero_grad()
        lossD.backward(retain_graph=True)
        opt_disc.step()

        ### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
        # where the second option of maximizing doesn't suffer from
        # saturating gradients
        output = disc(fake).view(-1)
        lossG = criterion(output, torch.ones_like(output))
        gen.zero_grad()
        lossG.backward()
        opt_gen.step()

        if batch_idx == 0:
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
                      Loss D: {lossD:.4f}, loss G: {lossG:.4f}"
            )

            with torch.no_grad():
#                 fake = gen(fixed_noise).reshape(-1, 1, 5, 5)
#                 data = real.reshape(-1, 1, 5, 5)
                
#                 fake = gen(fixed_noise).reshape(-1, 1, 1, 5)
#                 data = real.reshape(-1, 1, 1, 5)

                fake = gen(fixed_noise).reshape(-1, 1, 1, df_length)
                data = real.reshape(-1, 1, 1, df_length)
    
                img_grid_fake = torchvision.utils.make_grid(fake, normalize=True)
                img_grid_real = torchvision.utils.make_grid(data, normalize=True)

#                 writer_fake.add_image(
#                     "Fake Data", img_grid_fake, global_step=step
#                 )
#                 writer_real.add_image(
#                     "Real Data", img_grid_real, global_step=step
#                 )
                step += 1

Epoch [0/200] Batch 0/313                       Loss D: 0.6982, loss G: 0.6406
Epoch [1/200] Batch 0/313                       Loss D: 0.5377, loss G: 1.1310
Epoch [2/200] Batch 0/313                       Loss D: 0.6778, loss G: 0.7434
Epoch [3/200] Batch 0/313                       Loss D: 0.7791, loss G: 0.5496
Epoch [4/200] Batch 0/313                       Loss D: 0.7047, loss G: 0.6244
Epoch [5/200] Batch 0/313                       Loss D: 0.6841, loss G: 0.7741
Epoch [6/200] Batch 0/313                       Loss D: 0.6905, loss G: 0.6626
Epoch [7/200] Batch 0/313                       Loss D: 0.7651, loss G: 0.6261
Epoch [8/200] Batch 0/313                       Loss D: 0.7599, loss G: 0.7585
Epoch [9/200] Batch 0/313                       Loss D: 0.5835, loss G: 0.8490
Epoch [10/200] Batch 0/313                       Loss D: 0.6742, loss G: 0.7630
Epoch [11/200] Batch 0/313                       Loss D: 0.6257, loss G: 0.7867
Epoch [12/200] Batch 0/313                       L

Epoch [103/200] Batch 0/313                       Loss D: 0.7088, loss G: 0.6493
Epoch [104/200] Batch 0/313                       Loss D: 0.6923, loss G: 0.6923
Epoch [105/200] Batch 0/313                       Loss D: 0.6827, loss G: 0.7234
Epoch [106/200] Batch 0/313                       Loss D: 0.6830, loss G: 0.6796
Epoch [107/200] Batch 0/313                       Loss D: 0.7119, loss G: 0.6783
Epoch [108/200] Batch 0/313                       Loss D: 0.6861, loss G: 0.7224
Epoch [109/200] Batch 0/313                       Loss D: 0.6942, loss G: 0.7107
Epoch [110/200] Batch 0/313                       Loss D: 0.6920, loss G: 0.6915
Epoch [111/200] Batch 0/313                       Loss D: 0.6719, loss G: 0.7239
Epoch [112/200] Batch 0/313                       Loss D: 0.6857, loss G: 0.7362
Epoch [113/200] Batch 0/313                       Loss D: 0.7154, loss G: 0.7214
Epoch [114/200] Batch 0/313                       Loss D: 0.6789, loss G: 0.6895
Epoch [115/200] Batch 0/313 

# Generate synthetic output

In [55]:
fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
fake_df = fake_df.rename(columns = {i: df_conti.columns[i] for i in range(df_conti.columns.shape[0])})
fake_df

Unnamed: 0,Age,Height,Weight
0,-0.221446,1.193498,0.67183
1,0.031339,0.090261,-0.205356
2,-0.984401,-0.51796,-0.91657
3,0.227838,1.87151,1.168838
4,-0.97856,-1.269928,-1.075238
5,-1.1718,-0.59214,-1.178437
6,0.632883,0.390625,0.030425
7,0.121439,0.510471,-0.202529
8,-0.354101,0.06095,-0.349296
9,-0.356742,2.129166,1.675217


In [56]:
final = denorm(fake_df, norm_list, norm_types, dict_conti).apply(np.ceil).astype('int64')

In [71]:
final

Unnamed: 0,Age,Height,Weight
0,25,188,82
1,26,177,70
2,20,172,60
3,27,194,88
4,20,165,58
...,...,...,...
10027,20,192,84
10028,26,179,72
10029,23,172,65
10030,26,171,60


In [58]:
df_real = df[['ID', 'Age','Height', 'Weight']].astype('int64')
df_real

Unnamed: 0,ID,Age,Height,Weight
0,1,24,180,80
1,2,23,170,60
2,3,24,175,71
3,4,34,182,95
4,5,21,185,82
...,...,...,...,...
9995,7390,22,181,75
9996,7391,19,160,48
9997,7392,34,175,70
9998,7393,25,185,84


In [59]:
for i in tqdm(range(len(df_real)//len(final) + 1)):
    noise = torch.randn(batch_size, z_dim).to(device)
    fake = gen(noise)
    fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
#     fake_df = fake_df.rename(columns = {0: df_conti.columns[0], 1: df_conti.columns[1],
#                              2: df_conti.columns[2], 3: df_conti.columns[3],
#                              4: df_conti.columns[4]})
    fake_df = fake_df.rename(columns = {i: df_conti.columns[i] for i in range(df_conti.columns.shape[0])})
    demo = denorm(fake_df, norm_list, norm_types, dict_conti).apply(np.ceil).astype('int64')
    final = pd.concat([final, demo]).reset_index(drop=True)

  0%|          | 0/626 [00:00<?, ?it/s]

In [72]:
# df_fake = final.reset_index().rename(columns={'index': 'ID'}).iloc[0:len(df_real), :].astype('int64')
df_fake = final.reset_index().rename(columns={'index': 'ID'}).astype('int64')

In [73]:
df_fake

Unnamed: 0,ID,Age,Height,Weight
0,0,25,188,82
1,1,26,177,70
2,2,20,172,60
3,3,27,194,88
4,4,20,165,58
...,...,...,...,...
10027,10027,20,192,84
10028,10028,26,179,72
10029,10029,23,172,65
10030,10030,26,171,60


# Visualization

## Draw columns

In [74]:
metadata = Metadata()

In [75]:
metadata.add_table(name='olympic',
                  data=df_real,
                  primary_key = 'ID')

In [76]:
metadata = metadata.get_table_meta('olympic')

In [77]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Age'
)

In [78]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Height'
)

In [79]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Weight'
)

## Draw correlation

### Height vs Weight

In [80]:
get_column_pair_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    columns=['Height', 'Weight'],
    metadata=metadata 
)

### Height vs Age

In [81]:
get_column_pair_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    columns=['Age', 'Height'],
    metadata=metadata 
)

### Weight vs Age

In [82]:
get_column_pair_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    columns=['Age', 'Weight'],
    metadata=metadata 
)