In [49]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sdmetrics.reports.utils import get_column_plot
from sdmetrics.reports.single_table import QualityReport
from sdv import Metadata
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sdmetrics.single_column import BoundaryAdherence

In [50]:
import warnings
warnings.filterwarnings('ignore')

# Data pre-process

## Read cleaned data

In [51]:
# Please ensure that there is no duplicate keyword among column names, E.g., 'Sport' and 'AmountOfSport'
# df = pd.read_csv('../OlympicHistory/CleanedData.csv')
df = pd.read_csv('../OlympicHistory/CleanedData.csv').iloc[0:5000, :]

In [52]:
df

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Year,Season,City,Sport,Event,Medal,AOS,AOE,YOB
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,Thanks,1,1,1968
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,Thanks,1,1,1989
2,3,Gunnar Nielsen Aaby,M,24.0,175.0,71.0,Denmark,DEN,1920,Summer,Antwerpen,Football,Football Men's Football,Thanks,1,1,1896
3,4,Edgar Lindenau Aabye,M,34.0,182.0,95.0,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,1,1,1866
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,Thanks,1,2,1967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3729,Henrik Andersen,M,25.0,168.0,75.0,Denmark,DEN,1992,Summer,Barcelona,Weightlifting,Weightlifting Men's Middleweight,Thanks,1,1,1967
4996,3730,Henrik Steen Andersen,M,23.0,193.0,95.0,Denmark,DEN,2000,Summer,Sydney,Swimming,Swimming Men's 4 x 100 metres Freestyle Relay,Thanks,1,2,1977
4997,3731,Henry Anders Peter Brask Andersen,M,23.0,176.0,70.0,Denmark,DEN,1920,Summer,Antwerpen,Cycling,Cycling Men's Sprint,Thanks,1,2,1897
4998,3732,Herluf Juhl Andersen,M,40.0,177.0,69.0,Denmark,DEN,1972,Summer,Munich,Archery,Archery Men's Individual,Thanks,1,1,1932


In [53]:
# number of unique:
# Sex:            2
# Team:           1047
# NOC:            230
# Year:           35
# Season:         2
# City:           42
# Sport:          66
# Event:          710
# Medal:          4
# AOS:  4
# AOE:  23
# YearOfBirth:    169

## Find continuous data

## Find categorical data

In [54]:
# df_category = df[['Sex', 'Team', 'NOC', 'Year', 'Season', 'City', 'Sport', 'Event', 'Medal', 'AOS', 'AOE', 'YearOfBirth']]
# df_category = df[['Sex', 'Team', 'Year', 'City', 'Sport','AOE']]
df_category = df[['Sex', 'Year', 'Season', 'City', 'Sport', 'Medal', 'AOS', 'AOE']]

df_category

Unnamed: 0,Sex,Year,Season,City,Sport,Medal,AOS,AOE
0,M,1992,Summer,Barcelona,Basketball,Thanks,1,1
1,M,2012,Summer,London,Judo,Thanks,1,1
2,M,1920,Summer,Antwerpen,Football,Thanks,1,1
3,M,1900,Summer,Paris,Tug-Of-War,Gold,1,1
4,F,1988,Winter,Calgary,Speed Skating,Thanks,1,2
...,...,...,...,...,...,...,...,...
4995,M,1992,Summer,Barcelona,Weightlifting,Thanks,1,1
4996,M,2000,Summer,Sydney,Swimming,Thanks,1,2
4997,M,1920,Summer,Antwerpen,Cycling,Thanks,1,2
4998,M,1972,Summer,Munich,Archery,Thanks,1,1


## Normalization

## One hot encoding/decoding

In [55]:
def one_hot_encoding(df: pd.DataFrame):
    cate_name = df.columns.to_numpy()
    cate_class_number = []
    cate_class = []
    for i in range(df.columns.shape[0]):
        cate_class.append(df.iloc[:, i].unique())
        cate_class_number.append(df.iloc[:, i].nunique())
    
    for i in tqdm(range(df.columns.shape[0])):
        df = pd.concat([df,pd.get_dummies(df[cate_name[i]], prefix=cate_name[i])],axis=1)
        df = df.drop(columns=cate_name[i])
    
    return cate_name, cate_class_number, cate_class, df

def one_hot_decoding(df:pd.DataFrame, prefix_sep="_"):
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            undummified = (
                df.filter(like=col)
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [56]:
cate_name, cate_class_number, cate_class, df_category_ohe = one_hot_encoding(df_category)
df_category_ohe

  0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,Sex_F,Sex_M,Year_1896,Year_1900,Year_1904,Year_1906,Year_1908,Year_1912,Year_1920,Year_1924,...,AOS_3,AOE_1,AOE_2,AOE_3,AOE_4,AOE_5,AOE_6,AOE_7,AOE_8,AOE_9
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4996,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4997,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4998,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


## Reshape data to 4d array

In [57]:
df_length = len(df_category_ohe.columns)
input_data = df_category_ohe.to_numpy().flatten().reshape(-1, 1, 1, df_length)
input_data.shape

(5000, 1, 1, 153)

# Build PyTorch Dataset

## Dataset class

In [58]:
class OlympicDataset(Dataset):
    
    def __init__(self, data: pd.DataFrame, transform=None):
        self.data = torch.from_numpy(data).float()
#         self.data = data
#         self.transform = transforms.Compose([transforms.ToTensor()]) 
        
    
    def __len__(self):
        return len(self.data)
    
    
    def __getitem__(self, idx):
        data_content = self.data[idx]
#         data_label = int(1)
#         return (data_content, data_label)
#         return self.transform((data_content, data_label))
#         return self.transform(data_content)
#         return (data_content, data_label)[0]
        return data_content

## Pytorch Normalization

In [59]:
# loader_all = DataLoader(input_data, batch_size=len(input_data), num_workers=1)
# data = next(iter(loader_all))
# mean = float(data.mean().detach().numpy())
# std = float(data.std().detach().numpy())

In [60]:
# transforms = transforms.Compose(
#     [transforms.ToTensor(), transforms.Normalize(mean, std)]
# )

transforms = transforms.Compose(
    [transforms.ToTensor()]
)

# Build GAN

## Hyperparameters

In [61]:
# # Hyperparameters etc.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# lr = 3e-4
# z_dim = 64 # 128, 256
# # image_dim = 5 * 5 * 1  # 25
# # image_dim = square_length * square_length * 1
# image_dim = 1 * df_length * 1

# batch_size = 32
# num_epochs = 50

# fixed_noise = torch.randn((batch_size, z_dim)).to(device)

In [62]:
# Hyperparameters etc.
device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 2.5e-4
z_dim = 64 # 128, 256
# image_dim = 5 * 5 * 1  # 25
# image_dim = square_length * square_length * 1
image_dim = 1 * df_length * 1

batch_size = 32
num_epochs = 50

fixed_noise = torch.randn((batch_size, z_dim)).to(device)

## Dataset and DataLoader

In [63]:
dataset = OlympicDataset(input_data, transform=transforms)
# dataset = OlympicDataset(input_data, transform=None)

loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

## Discriminator

In [64]:
# stored parameter

# nn.Linear(in_features, 128),
# nn.LeakyReLU(0.2),
# nn.Dropout(0.5), 
# nn.Linear(128, 1),
# nn.Sigmoid(),

In [65]:
class Discriminator(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.disc = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5), 
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.disc(x)

disc = Discriminator(image_dim).to(device)

In [66]:
# class Discriminator(nn.Module):
#     def __init__(self, in_features):
#         super().__init__()
#         # Simple CNN
#         self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
#         self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
#         self.conv2_drop = nn.Dropout2d()
#         self.fc1 = nn.Linear(320, 50)
#         self.fc2 = nn.Linear(50, 1)

#     def forward(self, x):
#         x = F.relu(F.max_pool2d(self.conv1(x), 2))
#         x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
#         # Flatten the tensor so it can be fed into the FC layers
#         x = x.view(-1, 320)
#         x = F.relu(self.fc1(x))
#         x = F.dropout(x, training=self.training)
#         x = self.fc2(x)
#         return F.sigmoid(x)
#         # return torch.sigmoid(x)

# disc = Discriminator(image_dim).to(device)

## Generator

In [67]:
# stored parameter

# nn.Linear(z_dim, 256),
# nn.LeakyReLU(0.3), 
# nn.Linear(256, img_dim),

In [68]:
class Generator(nn.Module):
    def __init__(self, z_dim, img_dim):
        super().__init__()
        self.gen = nn.Sequential(
            nn.Linear(z_dim, 256),
#             nn.BatchNorm1d(256), 
            nn.LeakyReLU(0.5), 
            nn.Linear(256, img_dim),
#             nn.Tanh(),  # normalize inputs to [-1, 1] so make outputs [-1, 1]
        )

    def forward(self, x):
        return self.gen(x)
    
gen = Generator(z_dim, image_dim).to(device)

In [69]:
# class Generator(nn.Module):
#     def __init__(self, latent_dim, img_dim):
#         super().__init__()
#         self.lin1 = nn.Linear(latent_dim, 7*7*64)   # [n, 256, 7, 7]
#         self.ct1 = nn.ConvTranspose2d(64, 32, 4, stride=2)  # [n, 64, 16, 16]
#         self.ct2 = nn.ConvTranspose2d(32, 16, 4, stride=2)  # [n, 16, 34, 34]
#         self.conv = nn.Conv2d(16, 1, kernel_size=7)     # [n, 1, 28, 28]

#     def forward(self, x):
#         # Pass latent space input into linear layer and reshape
#         x = self.lin1(x)
#         x = F.relu(x)
#         x = x.view(-1, 64, 7, 7)

#         # Upsample (transposed conv) 16x16 (64 feature maps)
#         # Transposed convolution to 16x16 (64 feature maps)
#         x = self.ct1(x)
#         x = F.relu(x)
        
#         # Upsample to 34x34 (16 feature maps)
#         # Transposed convolution to 34x34 (16 feature maps)
#         x = self.ct2(x)
#         x = F.relu(x)
        
#         # Convolution to 28x28 (1 feature map)
#         return self.conv(x)
    
# gen = Generator(z_dim, image_dim).to(device)

## Optimizer

In [70]:
opt_disc = optim.Adam(disc.parameters(), lr=lr)
opt_gen = optim.Adam(gen.parameters(), lr=lr)
criterion = nn.BCELoss()

## Tensorboard

In [71]:
# writer_fake = SummaryWriter(f"logs/fake")
# writer_real = SummaryWriter(f"logs/real")

# Train/Test

In [72]:
step = 0
for epoch in range(num_epochs):
#     for batch_idx, (real, _) in enumerate(loader):
    for batch_idx, real in enumerate(loader):

#         real = real.view(-1, 25).to(device)
#         real = real.view(-1, square_length*square_length).to(device)
        real = real.view(-1, 1*df_length).to(device)
        batch_size = real.shape[0]

        ### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
        noise = torch.randn(batch_size, z_dim).to(device)
        fake = gen(noise)
        disc_real = disc(real).view(-1)
        lossD_real = criterion(disc_real, torch.ones_like(disc_real))
        disc_fake = disc(fake).view(-1)
        lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        lossD = (lossD_real + lossD_fake) / 2
        disc.zero_grad()
        lossD.backward(retain_graph=True)
        opt_disc.step()

        ### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
        # where the second option of maximizing doesn't suffer from
        # saturating gradients
        output = disc(fake).view(-1)
        lossG = criterion(output, torch.ones_like(output))
        gen.zero_grad()
        lossG.backward()
        opt_gen.step()

        if batch_idx == 0:
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
                      Loss D: {lossD:.4f}, loss G: {lossG:.4f}"
            )

            with torch.no_grad():
#                 fake = gen(fixed_noise).reshape(-1, 1, 5, 5)
#                 data = real.reshape(-1, 1, 5, 5)
                
#                 fake = gen(fixed_noise).reshape(-1, 1, square_length, square_length)
#                 data = real.reshape(-1, 1, square_length, square_length)
                
                fake = gen(fixed_noise).reshape(-1, 1, 1, df_length)
                data = real.reshape(-1, 1, 1, df_length)
            
                img_grid_fake = torchvision.utils.make_grid(fake, normalize=True)
                img_grid_real = torchvision.utils.make_grid(data, normalize=True)

#                 writer_fake.add_image(
#                     "Fake Data", img_grid_fake, global_step=step
#                 )
#                 writer_real.add_image(
#                     "Real Data", img_grid_real, global_step=step
#                 )
                step += 1

Epoch [0/50] Batch 0/157                       Loss D: 0.6935, loss G: 0.7009
Epoch [1/50] Batch 0/157                       Loss D: 0.7369, loss G: 0.6170
Epoch [2/50] Batch 0/157                       Loss D: 0.5956, loss G: 1.0414
Epoch [3/50] Batch 0/157                       Loss D: 0.6467, loss G: 0.7848
Epoch [4/50] Batch 0/157                       Loss D: 0.6771, loss G: 0.7265
Epoch [5/50] Batch 0/157                       Loss D: 0.7059, loss G: 0.6652
Epoch [6/50] Batch 0/157                       Loss D: 0.6899, loss G: 0.6667
Epoch [7/50] Batch 0/157                       Loss D: 0.6930, loss G: 0.7051
Epoch [8/50] Batch 0/157                       Loss D: 0.7021, loss G: 0.6729
Epoch [9/50] Batch 0/157                       Loss D: 0.6924, loss G: 0.6891
Epoch [10/50] Batch 0/157                       Loss D: 0.6993, loss G: 0.6778
Epoch [11/50] Batch 0/157                       Loss D: 0.6861, loss G: 0.6830
Epoch [12/50] Batch 0/157                       Loss D: 0.6963

# Generate synthetic output

## Continuous data

## Categorical data

In [73]:
fake.shape

torch.Size([8, 153])

In [74]:
fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
fake_df = fake_df.rename(columns = {i: df_category_ohe.columns[i] for i in range(df_category_ohe.columns.shape[0])})
final = fake_df.astype('category')
final

Unnamed: 0,Sex_F,Sex_M,Year_1896,Year_1900,Year_1904,Year_1906,Year_1908,Year_1912,Year_1920,Year_1924,...,AOS_3,AOE_1,AOE_2,AOE_3,AOE_4,AOE_5,AOE_6,AOE_7,AOE_8,AOE_9
0,0.111508,0.596181,-0.151121,0.04043,-0.050931,-0.177696,0.129544,-0.107013,0.18563,-0.031484,...,0.054076,-0.031153,0.826482,0.16249,-0.086493,-0.101923,-0.015557,0.015228,-0.170146,-0.156073
1,0.098309,0.454199,-0.021766,0.051484,-0.160897,-0.031407,-0.03564,-0.060809,-0.078812,-0.128772,...,0.060182,0.283964,0.308584,0.061927,-0.078722,0.017013,0.080899,0.00431,0.06264,0.040669
2,0.474018,0.368968,0.044821,0.043679,-0.058498,-0.035114,0.012867,-0.003727,0.121601,0.060154,...,-0.035481,-0.422592,0.857592,0.079594,0.002461,0.036233,-0.045727,0.034999,-0.021777,-0.155712
3,0.162572,0.584657,-0.079477,-0.036793,-0.018556,-0.017737,0.161692,-0.046884,0.046869,0.006456,...,-0.035936,1.05568,-0.165796,0.002102,0.000224,-0.100094,-0.012865,0.025813,-0.030258,-0.049562
4,-0.16111,1.072152,0.007187,-0.003877,0.013826,-0.1093,0.11667,0.067105,-0.053081,-0.040128,...,-0.038455,-0.607562,1.156622,0.004169,-0.060371,-0.07939,-0.083796,0.027416,-0.249721,-0.126681
5,-0.165521,0.73135,-0.094185,-0.011095,-0.131392,-0.149758,0.074113,-0.002645,0.13055,-0.065352,...,0.034411,0.466983,-0.033124,0.064742,0.212828,-0.000284,0.116074,-0.055754,0.044334,-0.054215
6,-0.147666,0.674244,0.103198,-0.047655,-0.039015,-0.122276,-0.087767,0.071341,0.03678,0.173893,...,0.066083,-0.230637,0.503837,0.088536,-0.04578,-0.02979,0.265578,-0.040744,-0.096533,-0.075069
7,0.591792,0.716295,0.037772,-0.117462,0.026315,-0.202705,0.202986,-0.04035,0.008308,0.00215,...,-0.164022,0.065552,1.063258,0.072712,-0.133844,-0.011877,-0.272681,0.023811,-0.037115,-0.131393


In [75]:
df_real = df[np.append('ID', df_category.columns.to_numpy())].astype('category')
df_real

Unnamed: 0,ID,Sex,Year,Season,City,Sport,Medal,AOS,AOE
0,1,M,1992,Summer,Barcelona,Basketball,Thanks,1,1
1,2,M,2012,Summer,London,Judo,Thanks,1,1
2,3,M,1920,Summer,Antwerpen,Football,Thanks,1,1
3,4,M,1900,Summer,Paris,Tug-Of-War,Gold,1,1
4,5,F,1988,Winter,Calgary,Speed Skating,Thanks,1,2
...,...,...,...,...,...,...,...,...,...
4995,3729,M,1992,Summer,Barcelona,Weightlifting,Thanks,1,1
4996,3730,M,2000,Summer,Sydney,Swimming,Thanks,1,2
4997,3731,M,1920,Summer,Antwerpen,Cycling,Thanks,1,2
4998,3732,M,1972,Summer,Munich,Archery,Thanks,1,1


In [76]:
for i in tqdm(range(len(df_real)//len(final) + 1)):
    noise = torch.randn(batch_size, z_dim).to(device)
    fake = gen(noise)
    fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
    fake_df = fake_df.rename(columns = {i: df_category_ohe.columns[i] for i in range(df_category_ohe.columns.shape[0])})
    demo = fake_df
    final = pd.concat([final, demo]).reset_index(drop=True)

df_fake = final.reset_index().rename(columns={'index': 'ID'}).iloc[0:len(df_real), :].astype('category')
df_fake

  0%|          | 0/626 [00:00<?, ?it/s]

Unnamed: 0,ID,Sex_F,Sex_M,Year_1896,Year_1900,Year_1904,Year_1906,Year_1908,Year_1912,Year_1920,...,AOS_3,AOE_1,AOE_2,AOE_3,AOE_4,AOE_5,AOE_6,AOE_7,AOE_8,AOE_9
0,0,0.111508,0.596181,-0.151121,0.040430,-0.050931,-0.177696,0.129544,-0.107013,0.185630,...,0.054076,-0.031153,0.826482,0.162490,-0.086493,-0.101923,-0.015557,0.015228,-0.170146,-0.156073
1,1,0.098309,0.454199,-0.021766,0.051484,-0.160897,-0.031407,-0.035640,-0.060809,-0.078812,...,0.060182,0.283964,0.308584,0.061927,-0.078722,0.017013,0.080899,0.004310,0.062640,0.040669
2,2,0.474018,0.368968,0.044821,0.043679,-0.058498,-0.035114,0.012867,-0.003727,0.121601,...,-0.035481,-0.422592,0.857592,0.079594,0.002461,0.036233,-0.045727,0.034999,-0.021777,-0.155712
3,3,0.162572,0.584657,-0.079477,-0.036793,-0.018556,-0.017737,0.161692,-0.046884,0.046869,...,-0.035936,1.055680,-0.165796,0.002102,0.000224,-0.100094,-0.012865,0.025813,-0.030258,-0.049562
4,4,-0.161110,1.072152,0.007187,-0.003877,0.013826,-0.109300,0.116670,0.067105,-0.053081,...,-0.038455,-0.607562,1.156622,0.004169,-0.060371,-0.079390,-0.083796,0.027416,-0.249721,-0.126681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,0.089316,0.911071,-0.142522,-0.063588,-0.017747,-0.096252,0.102376,-0.046433,0.150929,...,-0.016849,0.760885,-0.099401,0.102652,-0.002588,0.111498,0.099028,-0.027020,-0.049127,-0.144461
4996,4996,-0.028103,1.048884,-0.182147,-0.018192,-0.042553,-0.188540,0.185137,-0.135928,-0.082793,...,-0.006090,0.474296,0.624997,0.080766,0.076393,-0.122972,-0.146188,0.026630,-0.117189,0.001760
4997,4997,0.330999,0.349233,-0.007036,0.000585,-0.067067,-0.051638,0.114949,-0.000721,0.195123,...,-0.009454,0.037748,0.629775,0.011080,0.163922,-0.127604,0.069132,0.022782,0.150013,-0.062998
4998,4998,0.323919,0.273212,-0.043905,0.002611,-0.020632,-0.035969,0.128896,0.069915,0.038755,...,0.031633,0.223162,0.492151,0.142299,-0.067627,0.057264,-0.046073,0.037418,-0.018890,-0.053937


In [77]:
df_fake = one_hot_decoding(df_fake)
df_fake

Unnamed: 0,ID,Sex,Year,Season,City,Sport,Medal,AOS,AOE
0,0,M,2000,Summer,London,Shooting,Thanks,1,2
1,1,M,2012,Winter,London,Badminton,Thanks,1,2
2,2,F,2004,Summer,London,Swimming,Thanks,1,2
3,3,M,1984,Summer,London,Athletics,Thanks,1,1
4,4,M,2000,Summer,London,Athletics,Thanks,1,2
...,...,...,...,...,...,...,...,...,...
4995,4995,M,1992,Summer,London,Swimming,Thanks,1,1
4996,4996,M,1992,Summer,Barcelona,Athletics,Thanks,1,2
4997,4997,M,2016,Summer,Rio de Janeiro,Volleyball,Thanks,1,2
4998,4998,F,2004,Summer,Athina,Athletics,Thanks,1,2


# Visualization

## Draw columns

In [78]:
metadata = Metadata()
metadata.add_table(name='olympic',
                  data=df_real,
                  primary_key = 'ID')
metadata = metadata.get_table_meta('olympic')

In [79]:
# my_report = QualityReport()
# my_report.generate(df_real, df_fake, metadata)

In [80]:
# my_report.get_score()

In [81]:
# my_report.get_details(property_name='Column Shapes')

In [82]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Age'
# )

In [83]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Height'
# )

In [84]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Weight'
# )

In [85]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Sex'
)

In [86]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Year'
)

In [87]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Season'
)

In [88]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='City'
)

In [89]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Sport'
)

In [90]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Medal'
)

In [91]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='AOS'
)

In [92]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='AOE'
)

In [93]:
# fig = my_report.get_visualization(property_name='Column Shapes')
# fig.show()

## Draw relationship

In [94]:
# sns.set_theme(style="dark")

In [95]:
# x = df_real.Height.to_numpy()
# y = df_real.Weight.to_numpy()

# # Draw a combo histogram and scatterplot with density contours
# f, ax = plt.subplots(figsize=(6, 6))
# ax.set(ylim=(25, 200))
# ax.set(xlim=(120, 240))
# ax.set_title('Height vs Weight',
#              fontweight ="bold")
# ax.set_xlabel('Height')
# ax.set_ylabel('Weight')


# sns.scatterplot(x=x, y=y, s=5, color=".15")
# sns.histplot(x=x, y=y, bins=50, pthresh=.1, cmap="mako")
# sns.kdeplot(x=x, y=y, levels=5, color="w", linewidths=1)


In [96]:
# xp = df_fake.Height.to_numpy()
# yp = df_fake.Weight.to_numpy()

# # Draw a combo histogram and scatterplot with density contours
# f, ax = plt.subplots(figsize=(6, 6))
# ax.set(ylim=(25, 200))
# ax.set(xlim=(120, 240))
# ax.set_title('Height vs Weight',
#              fontweight ="bold")
# ax.set_xlabel('Height')
# ax.set_ylabel('Weight')

# sns.scatterplot(x=xp, y=yp, s=5, color=".15")
# sns.histplot(x=xp, y=yp, bins=50, pthresh=.1, cmap="mako")
# sns.kdeplot(x=xp, y=yp, levels=5, color="w", linewidths=1)
