In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sdmetrics.reports.utils import get_column_plot
from sdmetrics.reports.single_table import QualityReport
from sdv import Metadata
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sdmetrics.single_column import BoundaryAdherence

In [40]:
import warnings
warnings.filterwarnings('ignore')

# Data pre-process

## Read cleaned data

In [41]:
# Please ensure that there is no duplicate keyword among column names, E.g., 'Sport' and 'AmountOfSport'
# df = pd.read_csv('../OlympicHistory/CleanedData.csv')
# Load the cleaned data and keep only its first 5000 rows.
df = pd.read_csv('../OlympicHistory/CleanedData.csv').head(5000)

In [42]:
# Display the loaded subset (the rendered output elides the middle rows).
df

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Year,Season,City,Sport,Event,Medal,AOS,AOE,YOB
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,Thanks,1,1,1968
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,Thanks,1,1,1989
2,3,Gunnar Nielsen Aaby,M,24.0,175.0,71.0,Denmark,DEN,1920,Summer,Antwerpen,Football,Football Men's Football,Thanks,1,1,1896
3,4,Edgar Lindenau Aabye,M,34.0,182.0,95.0,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,1,1,1866
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,Thanks,1,2,1967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3729,Henrik Andersen,M,25.0,168.0,75.0,Denmark,DEN,1992,Summer,Barcelona,Weightlifting,Weightlifting Men's Middleweight,Thanks,1,1,1967
4996,3730,Henrik Steen Andersen,M,23.0,193.0,95.0,Denmark,DEN,2000,Summer,Sydney,Swimming,Swimming Men's 4 x 100 metres Freestyle Relay,Thanks,1,2,1977
4997,3731,Henry Anders Peter Brask Andersen,M,23.0,176.0,70.0,Denmark,DEN,1920,Summer,Antwerpen,Cycling,Cycling Men's Sprint,Thanks,1,2,1897
4998,3732,Herluf Juhl Andersen,M,40.0,177.0,69.0,Denmark,DEN,1972,Summer,Munich,Archery,Archery Men's Individual,Thanks,1,1,1932


In [43]:
# number of unique:
# Sex:            2
# Team:           1047
# NOC:            230
# Year:           35
# Season:         2
# City:           42
# Sport:          66
# Event:          710
# Medal:          4
# AOS:  4
# AOE:  23
# YearOfBirth:    169

## Find continuous data

## Find categorical data

In [44]:
# df_category = df[['Sex', 'Team', 'NOC', 'Year', 'Season', 'City', 'Sport', 'Event', 'Medal', 'AOS', 'AOE', 'YearOfBirth']]
# df_category = df[['Sex', 'Team', 'Year', 'City', 'Sport','AOE']]
# Columns that are one-hot encoded and fed to the GAN.
categorical_columns = ['Sex', 'Year', 'Season', 'City', 'Sport', 'Medal', 'AOS', 'AOE']
df_category = df.loc[:, categorical_columns]

df_category

Unnamed: 0,Sex,Year,Season,City,Sport,Medal,AOS,AOE
0,M,1992,Summer,Barcelona,Basketball,Thanks,1,1
1,M,2012,Summer,London,Judo,Thanks,1,1
2,M,1920,Summer,Antwerpen,Football,Thanks,1,1
3,M,1900,Summer,Paris,Tug-Of-War,Gold,1,1
4,F,1988,Winter,Calgary,Speed Skating,Thanks,1,2
...,...,...,...,...,...,...,...,...
4995,M,1992,Summer,Barcelona,Weightlifting,Thanks,1,1
4996,M,2000,Summer,Sydney,Swimming,Thanks,1,2
4997,M,1920,Summer,Antwerpen,Cycling,Thanks,1,2
4998,M,1972,Summer,Munich,Archery,Thanks,1,1


## Normalization

## One hot encoding/decoding

In [45]:
def one_hot_encoding(df: pd.DataFrame):
    """One-hot encode every column of ``df``.

    Each column ``c`` is replaced by one indicator column per distinct value,
    named ``f"{c}_{value}"`` (the default ``pd.get_dummies`` naming). The
    indicator groups appear in the original column order. The input frame is
    not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are all treated as categorical.

    Returns
    -------
    cate_name : np.ndarray
        The original column names.
    cate_class_number : list[int]
        ``Series.nunique()`` of each column.
    cate_class : list
        ``Series.unique()`` of each column (values in order of appearance).
    df_ohe : pd.DataFrame
        The encoded frame.
    """
    cate_name = df.columns.to_numpy()
    n_columns = df.shape[1]
    cate_class = [df.iloc[:, i].unique() for i in range(n_columns)]
    cate_class_number = [df.iloc[:, i].nunique() for i in range(n_columns)]

    if n_columns == 0:
        # Nothing to encode; get_dummies has no columns to concatenate here.
        return cate_name, cate_class_number, cate_class, df.copy()

    # One vectorised call instead of a concat + drop per column. Passing
    # `columns=` forces encoding of numeric columns (e.g. Year) as well.
    df_ohe = pd.get_dummies(df, columns=list(df.columns))
    return cate_name, cate_class_number, cate_class, df_ohe

def one_hot_decoding(df: pd.DataFrame, prefix_sep="_"):
    """Collapse one-hot (or score) columns back into one column per variable.

    Columns are grouped by the text before the first ``prefix_sep`` in their
    name. For every group, the decoded value of a row is the suffix (the text
    after the first ``prefix_sep``) of the column holding the row-wise
    maximum, so real-valued scores are decoded by arg-max. Columns whose name
    contains no ``prefix_sep`` are passed through unchanged.

    Grouping uses the exact prefix, so a variable whose name is a substring of
    another one (e.g. ``Sport`` and ``AmountOfSport``) is decoded only from
    its own columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with string column names such as ``Sex_F``, ``Sex_M``.
    prefix_sep : str
        Separator between variable name and category in the column names.

    Returns
    -------
    pd.DataFrame
        One column per variable, in order of first appearance. Decoded values
        are the string suffixes of the column names.
    """
    # Variable name -> its columns, keeping first-seen order.
    columns_by_prefix = {}
    for column in df.columns:
        columns_by_prefix.setdefault(column.split(prefix_sep)[0], []).append(column)

    series_list = []
    for prefix, members in columns_by_prefix.items():
        dummy_columns = [name for name in members if prefix_sep in name]
        if dummy_columns:
            decoded = (
                df[dummy_columns]
                .idxmax(axis=1)
                .apply(lambda name: name.split(prefix_sep, maxsplit=1)[1])
                .rename(prefix)
            )
            series_list.append(decoded)
        else:
            series_list.append(df[prefix])
    return pd.concat(series_list, axis=1)

In [46]:
# Encode the categorical columns; the first three results keep the original
# column names, the per-column class counts and the per-column class values.
cate_name, cate_class_number, cate_class, df_category_ohe = one_hot_encoding(df_category)
df_category_ohe

  0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,Sex_F,Sex_M,Year_1896,Year_1900,Year_1904,Year_1906,Year_1908,Year_1912,Year_1920,Year_1924,...,AOS_3,AOE_1,AOE_2,AOE_3,AOE_4,AOE_5,AOE_6,AOE_7,AOE_8,AOE_9
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4996,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4997,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4998,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


## Reshape data to 4d array

In [47]:
# Width of one encoded sample (number of one-hot columns).
df_length = df_category_ohe.shape[1]
# One (1, 1, df_length) entry per row; .copy() yields a fresh C-contiguous array.
input_data = df_category_ohe.to_numpy().reshape(-1, 1, 1, df_length).copy()
input_data.shape

(5000, 1, 1, 153)

# Build PyTorch Dataset

## Dataset class

In [48]:
class OlympicDataset(Dataset):
    """In-memory dataset that serves samples as ``float32`` tensors.

    Parameters
    ----------
    data : np.ndarray or pd.DataFrame
        Numeric array-like whose first axis indexes the samples. A DataFrame
        is converted to its underlying values.
    transform : optional
        Accepted so existing call sites keep working; it is not applied to
        the samples.
    """

    def __init__(self, data: "np.ndarray | pd.DataFrame", transform=None):
        # np.asarray leaves an ndarray untouched and turns a DataFrame into
        # its values; torch.from_numpy itself only accepts ndarrays.
        self.data = torch.from_numpy(np.asarray(data)).float()

    def __len__(self):
        """Number of samples (length of the first axis)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample(s) at ``idx`` as a float tensor."""
        return self.data[idx]

## PyTorch Normalization

In [49]:
# loader_all = DataLoader(input_data, batch_size=len(input_data), num_workers=1)
# data = next(iter(loader_all))
# mean = float(data.mean().detach().numpy())
# std = float(data.std().detach().numpy())

In [50]:
# transforms = transforms.Compose(
#     [transforms.ToTensor(), transforms.Normalize(mean, std)]
# )

# Build the pipeline through the `torchvision` package instead of the
# `transforms` module alias: this cell rebinds the name `transforms`, so
# `transforms.Compose` would raise AttributeError when the cell is run a
# second time in the same kernel.
transforms = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor()]
)

# Train by sampling

In [51]:
# # Hyperparameters etc.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# lr = 3e-4
# z_dim = 64 # 128, 256
# # image_dim = 5 * 5 * 1  # 25
# # image_dim = 1 * 5 * 1  # 5
# image_dim = 1 * df_length * 1


# batch_size = 32
# num_epochs = 200

# fixed_noise = torch.randn((batch_size, z_dim)).to(device)

In [52]:
# Hyperparameters etc.
# Seed torch's global RNG so noise draws, weight initialisation and the
# DataLoader shuffling order are repeatable across kernel restarts.
seed = 0
torch.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
z_dim = 64 # 128, 256
image_dim = 1 * df_length * 1  # width of one one-hot encoded sample
batch_size = 32
num_epochs = 200

lr = 0.0001
# negative_slope_G = 1.0
# negative_slope_D = 0.3
# dropout_probability = 0.0

In [53]:
# Noise batch drawn once so it can be reused unchanged later on.
fixed_noise = torch.randn(batch_size, z_dim).to(device)
# Wrap the encoded array and iterate over it in shuffled mini-batches.
dataset = OlympicDataset(data=input_data, transform=transforms)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)

In [54]:
# class Discriminator(nn.Module):
#     def __init__(self, in_features):
#         super().__init__()
#         self.disc = nn.Sequential(
#             nn.Linear(in_features, 128),
#             nn.LeakyReLU(0.01),
#             nn.Linear(128, 1),
#             nn.Sigmoid(),
#         )

#     def forward(self, x):
#         return self.disc(x)

# disc = Discriminator(image_dim).to(device)


# class Generator(nn.Module):
#     def __init__(self, z_dim, img_dim):
#         super().__init__()
#         self.gen = nn.Sequential(
#             nn.Linear(z_dim, 256),
#             nn.LeakyReLU(0.01),
#             nn.Linear(256, img_dim),
# #             nn.Tanh(),  # normalize inputs to [-1, 1] so make outputs [-1, 1]
#         )

#     def forward(self, x):
#         return self.gen(x)

# gen = Generator(z_dim, image_dim).to(device)

# opt_disc = optim.Adam(disc.parameters(), lr=lr)
# opt_gen = optim.Adam(gen.parameters(), lr=lr)
# criterion = nn.BCELoss()

In [55]:
class Discriminator(nn.Module):
    """Two-layer MLP that maps a sample to a probability in [0, 1].

    Parameters
    ----------
    in_features : int
        Width of one input sample.
    hidden_features : int, default 128
        Width of the hidden layer.
    negative_slope : float, default 0.3
        Negative slope of the hidden ``LeakyReLU``.
    """

    def __init__(self, in_features, hidden_features=128, negative_slope=0.3):
        super().__init__()
        self.disc = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.LeakyReLU(negative_slope),
            nn.Linear(hidden_features, 1),
            nn.Sigmoid(),  # output is consumed by nn.BCELoss as a probability
        )

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of scores for ``x`` of shape ``(batch, in_features)``."""
        return self.disc(x)

# Discriminator sized to one encoded sample, placed on the training device.
disc = Discriminator(in_features=image_dim).to(device)


class Generator(nn.Module):
    """Two-layer MLP that maps a noise vector to one unbounded output per one-hot column.

    Parameters
    ----------
    z_dim : int
        Width of the input noise vector.
    img_dim : int
        Width of the generated sample.
    hidden_features : int, default 256
        Width of the hidden layer.
    negative_slope : float, default 1.0
        Negative slope of the hidden ``LeakyReLU``. With the default of 1.0
        the activation is the identity, so the whole network is an affine
        map of the noise; pass a value below 1 to make it non-linear.
    """

    def __init__(self, z_dim, img_dim, hidden_features=256, negative_slope=1.0):
        super().__init__()
        self.gen = nn.Sequential(
            nn.Linear(z_dim, hidden_features),
            nn.LeakyReLU(negative_slope),
            nn.Linear(hidden_features, img_dim),
        )

    def forward(self, x):
        """Return a ``(batch, img_dim)`` tensor for noise ``x`` of shape ``(batch, z_dim)``."""
        return self.gen(x)

# Generator from z_dim noise to one encoded sample, on the training device.
gen = Generator(z_dim=z_dim, img_dim=image_dim).to(device)

# One Adam optimiser per network, both at the shared learning rate.
opt_disc = optim.Adam(params=disc.parameters(), lr=lr)
opt_gen = optim.Adam(params=gen.parameters(), lr=lr)
# Binary cross-entropy on the discriminator's sigmoid output.
criterion = nn.BCELoss()

In [56]:
# Real reference table: the ID column plus the categorical columns, all cast to category.
df_real = df[['ID', *df_category.columns]].astype('category')

In [57]:
# Sampling schedule: after epoch k, roughly len(df_real) * a * r**k synthetic
# rows are drawn from the current generator, so later generator states
# contribute more rows to the final table.
# NOTE(review): each (a, r) pair below looks tuned so that these fractions sum
# to about 1 over num_epochs = 200 - confirm before changing num_epochs.
a = 0.4/100
r = 1.00217

# a = 0.1/100
# r = 1.01344

# a = 0.01/100
# r = 1.02872

# a = 0.001/100
# r = 1.04268

# a = 1e-10/100
# r = 1.13679


def _to_ohe_frame(samples):
    """Return generator outputs as a DataFrame labelled with the one-hot column names."""
    # .cpu() is required before .numpy() whenever `device` is "cuda".
    values = samples.detach().reshape(-1, df_length).cpu().numpy()
    return pd.DataFrame(values, columns=df_category_ohe.columns)


sample_fraction = a   # evolves per epoch; `a` itself is left untouched
sampled_frames = []   # collected here and concatenated once after training
print('Process:')
for epoch in tqdm(range(num_epochs)):
    for real in loader:
        real = real.view(-1, 1*df_length).to(device)
        # Local name on purpose: the last batch of an epoch can be smaller
        # than the configured `batch_size`, which must not be overwritten.
        cur_batch_size = real.shape[0]

        ### Train Discriminator: max log(D(x)) + log(1 - D(G(z)))
        noise = torch.randn(cur_batch_size, z_dim).to(device)
        fake = gen(noise)
        disc_real = disc(real).view(-1)
        lossD_real = criterion(disc_real, torch.ones_like(disc_real))
        # detach(): the discriminator step needs no gradient through the
        # generator, and the generator graph stays intact for the step below.
        disc_fake = disc(fake.detach()).view(-1)
        lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        lossD = (lossD_real + lossD_fake) / 2
        disc.zero_grad()
        lossD.backward()
        opt_disc.step()

        ### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
        # where the second option of maximizing doesn't suffer from
        # saturating gradients
        output = disc(fake).view(-1)
        lossG = criterion(output, torch.ones_like(output))
        gen.zero_grad()
        lossG.backward()
        opt_gen.step()

    # --- Draw this epoch's share of synthetic rows -------------------------
    # Every draw has as many rows as the last training batch of the epoch.
    rows_per_draw = cur_batch_size
    loop_num = int(len(df_real) * sample_fraction // rows_per_draw)
    if epoch == 0:
        # The table is seeded with the last training batch's generator
        # output (cast to category), followed by at least one fresh draw.
        sampled_frames.append(_to_ohe_frame(fake).astype('category'))
        n_draws = loop_num if loop_num else 1
    else:
        n_draws = loop_num + 1

    with torch.no_grad():  # sampling only, no gradients needed
        for _ in range(n_draws):
            noise = torch.randn(rows_per_draw, z_dim).to(device)
            sampled_frames.append(_to_ohe_frame(gen(noise)))

    sample_fraction = sample_fraction * r

final = pd.concat(sampled_frames).reset_index(drop=True)

Process:


  0%|          | 0/200 [00:00<?, ?it/s]

In [58]:
# step = 0
# print('Process:')
# for epoch in tqdm(range(num_epochs)):
# # for epoch in range(num_epochs):
#     for batch_idx, real in enumerate(loader):

#         real = real.view(-1, 1*df_length).to(device)
#         batch_size = real.shape[0]

#         noise = torch.randn(batch_size, z_dim).to(device)
#         fake = gen(noise)
#         disc_real = disc(real).view(-1)
#         lossD_real = criterion(disc_real, torch.ones_like(disc_real))
#         disc_fake = disc(fake).view(-1)
#         lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
#         lossD = (lossD_real + lossD_fake) / 2
#         disc.zero_grad()
#         lossD.backward(retain_graph=True)
#         opt_disc.step()

#         ### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
#         # where the second option of maximizing doesn't suffer from
#         # saturating gradients
#         output = disc(fake).view(-1)
#         lossG = criterion(output, torch.ones_like(output))
#         gen.zero_grad()
#         lossG.backward()
#         opt_gen.step()

#         if batch_idx == 0:
#     #                 print(
#     #                     f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
#     #                           Loss D: {lossD:.4f}, loss G: {lossG:.4f}"
#     #                 )

#             with torch.no_grad():
#                 fake = gen(fixed_noise).reshape(-1, 1, 1, df_length)
#                 data = real.reshape(-1, 1, 1, df_length)

#                 img_grid_fake = torchvision.utils.make_grid(fake, normalize=True)
#                 img_grid_real = torchvision.utils.make_grid(data, normalize=True)

#                 step += 1
            
#     if epoch == 0:
#         fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
#         fake_df = fake_df.rename(columns = {i: df_category_ohe.columns[i] for i in range(df_category_ohe.columns.shape[0])})
#         final = fake_df.astype('category')
#         for i in range(len(df_real)//len(fake_df)//num_epochs):
#             noise = torch.randn(batch_size, z_dim).to(device)
#             fake = gen(noise)
#             fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
#             fake_df = fake_df.rename(columns = {i: df_category_ohe.columns[i] for i in range(df_category_ohe.columns.shape[0])})
#             demo = fake_df
#             final = pd.concat([final, demo]).reset_index(drop=True)
#     else:
#         for i in range(len(df_real)//len(fake_df)//num_epochs + 1):
#             noise = torch.randn(batch_size, z_dim).to(device)
#             fake = gen(noise)
#             fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
#             fake_df = fake_df.rename(columns = {i: df_category_ohe.columns[i] for i in range(df_category_ohe.columns.shape[0])})
#             demo = fake_df
#             final = pd.concat([final, demo]).reset_index(drop=True)

In [59]:
# df_fake = final.reset_index().rename(columns={'index': 'ID'}).iloc[0:len(df_real), :].astype('category')
# Expose the row index as an 'ID' column and cast every column to category,
# then collapse the one-hot score columns into one column per variable.
# NOTE(review): decoded values are the string suffixes of the column names
# (e.g. Year comes back as text) - confirm this matches df_real's categories.
df_fake_ohe = (
    final
    .reset_index()
    .rename(columns={'index': 'ID'})
    .astype('category')
)
df_fake = one_hot_decoding(df_fake_ohe)

In [60]:
# Number of synthetic rows produced by the sampling schedule.
len(df_fake)

5720

# Visualization

## Metadata

In [61]:
# Register the real table (keyed by 'ID') and keep that table's metadata
# under the name the plotting and report cells expect.
sdv_metadata = Metadata()
sdv_metadata.add_table(name='olympic', data=df_real, primary_key='ID')
metadata = sdv_metadata.get_table_meta('olympic')

## Draw columns

In [62]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Age'
# )

In [63]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Height'
# )

In [64]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Weight'
# )

In [65]:
# Compare the 'Sex' column of the real and synthetic tables.
get_column_plot(real_data=df_real, synthetic_data=df_fake, metadata=metadata, column_name='Sex')

In [66]:
# Compare the 'Year' column of the real and synthetic tables.
get_column_plot(real_data=df_real, synthetic_data=df_fake, metadata=metadata, column_name='Year')

In [67]:
# Compare the 'Season' column of the real and synthetic tables.
get_column_plot(real_data=df_real, synthetic_data=df_fake, metadata=metadata, column_name='Season')

In [68]:
# Compare the 'City' column of the real and synthetic tables.
get_column_plot(real_data=df_real, synthetic_data=df_fake, metadata=metadata, column_name='City')

In [69]:
# Compare the 'Sport' column of the real and synthetic tables.
get_column_plot(real_data=df_real, synthetic_data=df_fake, metadata=metadata, column_name='Sport')

In [70]:
# Compare the 'Medal' column of the real and synthetic tables.
get_column_plot(real_data=df_real, synthetic_data=df_fake, metadata=metadata, column_name='Medal')

In [71]:
# Compare the 'AOS' column of the real and synthetic tables.
get_column_plot(real_data=df_real, synthetic_data=df_fake, metadata=metadata, column_name='AOS')

In [72]:
# Compare the 'AOE' column of the real and synthetic tables.
get_column_plot(real_data=df_real, synthetic_data=df_fake, metadata=metadata, column_name='AOE')

In [73]:
# Score the synthetic table against the real one.
my_report = QualityReport()
my_report.generate(df_real, df_fake, metadata)
score = my_report.get_score()

property_df = my_report.get_properties()
# Look each score up by property name; .iloc[0] takes the single matching row
# whatever its index label is, so the lookup does not depend on row order.
shape_score = property_df.loc[property_df.Property == 'Column Shapes', 'Score'].iloc[0]
trend_score = property_df.loc[property_df.Property == 'Column Pair Trends', 'Score'].iloc[0]

Creating report: 100%|█████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.60it/s]


Overall Quality Score: 70.51%

Properties:
Column Shapes: 57.6%
Column Pair Trends: 83.42%





## Draw relationship

In [74]:
# sns.set_theme(style="dark")

In [75]:
# x = df_real.Height.to_numpy()
# y = df_real.Weight.to_numpy()

# # Draw a combo histogram and scatterplot with density contours
# f, ax = plt.subplots(figsize=(6, 6))
# ax.set(ylim=(25, 200))
# ax.set(xlim=(120, 240))
# ax.set_title('Height vs Weight',
#              fontweight ="bold")
# ax.set_xlabel('Height')
# ax.set_ylabel('Weight')


# sns.scatterplot(x=x, y=y, s=5, color=".15")
# sns.histplot(x=x, y=y, bins=50, pthresh=.1, cmap="mako")
# sns.kdeplot(x=x, y=y, levels=5, color="w", linewidths=1)


In [76]:
# xp = df_fake.Height.to_numpy()
# yp = df_fake.Weight.to_numpy()

# # Draw a combo histogram and scatterplot with density contours
# f, ax = plt.subplots(figsize=(6, 6))
# ax.set(ylim=(25, 200))
# ax.set(xlim=(120, 240))
# ax.set_title('Height vs Weight',
#              fontweight ="bold")
# ax.set_xlabel('Height')
# ax.set_ylabel('Weight')

# sns.scatterplot(x=xp, y=yp, s=5, color=".15")
# sns.histplot(x=xp, y=yp, bins=50, pthresh=.1, cmap="mako")
# sns.kdeplot(x=xp, y=yp, levels=5, color="w", linewidths=1)
