In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sdmetrics.reports.utils import get_column_plot
from sdmetrics.reports.single_table import QualityReport
from sdv import Metadata
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sdmetrics.single_column import BoundaryAdherence

In [4]:
import warnings
warnings.filterwarnings('ignore')

# Data pre-process

## Read cleaned data

In [5]:
# Please ensure that there is no duplicate keyword among column names, E.g., 'Sport' and 'AmountOfSport'
# df = pd.read_csv('../OlympicHistory/CleanedData.csv')
df = pd.read_csv('../OlympicHistory/CleanedData.csv').iloc[0:5000, :]

In [6]:
df

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Year,Season,City,Sport,Event,Medal,AOS,AOE,YOB
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,Thanks,1,1,1968
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,Thanks,1,1,1989
2,3,Gunnar Nielsen Aaby,M,24.0,175.0,71.0,Denmark,DEN,1920,Summer,Antwerpen,Football,Football Men's Football,Thanks,1,1,1896
3,4,Edgar Lindenau Aabye,M,34.0,182.0,95.0,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,1,1,1866
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,Thanks,1,2,1967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3729,Henrik Andersen,M,25.0,168.0,75.0,Denmark,DEN,1992,Summer,Barcelona,Weightlifting,Weightlifting Men's Middleweight,Thanks,1,1,1967
4996,3730,Henrik Steen Andersen,M,23.0,193.0,95.0,Denmark,DEN,2000,Summer,Sydney,Swimming,Swimming Men's 4 x 100 metres Freestyle Relay,Thanks,1,2,1977
4997,3731,Henry Anders Peter Brask Andersen,M,23.0,176.0,70.0,Denmark,DEN,1920,Summer,Antwerpen,Cycling,Cycling Men's Sprint,Thanks,1,2,1897
4998,3732,Herluf Juhl Andersen,M,40.0,177.0,69.0,Denmark,DEN,1972,Summer,Munich,Archery,Archery Men's Individual,Thanks,1,1,1932


In [7]:
# number of unique:
# Sex:            2
# Team:           1047
# NOC:            230
# Year:           35
# Season:         2
# City:           42
# Sport:          66
# Event:          710
# Medal:          4
# AOS:  4
# AOE:  23
# YearOfBirth:    169

## Find continuous data

## Find categorical data

In [8]:
# df_category = df[['Sex', 'Team', 'NOC', 'Year', 'Season', 'City', 'Sport', 'Event', 'Medal', 'AOS', 'AOE', 'YearOfBirth']]
# df_category = df[['Sex', 'Team', 'Year', 'City', 'Sport','AOE']]
df_category = df[['Sex', 'Year', 'Season', 'City', 'Sport', 'Medal', 'AOS', 'AOE']]

df_category

Unnamed: 0,Sex,Year,Season,City,Sport,Medal,AOS,AOE
0,M,1992,Summer,Barcelona,Basketball,Thanks,1,1
1,M,2012,Summer,London,Judo,Thanks,1,1
2,M,1920,Summer,Antwerpen,Football,Thanks,1,1
3,M,1900,Summer,Paris,Tug-Of-War,Gold,1,1
4,F,1988,Winter,Calgary,Speed Skating,Thanks,1,2
...,...,...,...,...,...,...,...,...
4995,M,1992,Summer,Barcelona,Weightlifting,Thanks,1,1
4996,M,2000,Summer,Sydney,Swimming,Thanks,1,2
4997,M,1920,Summer,Antwerpen,Cycling,Thanks,1,2
4998,M,1972,Summer,Munich,Archery,Thanks,1,1


## Normalization

## One hot encoding/decoding

In [9]:
def one_hot_encoding(df: pd.DataFrame):
    cate_name = df.columns.to_numpy()
    cate_class_number = []
    cate_class = []
    for i in range(df.columns.shape[0]):
        cate_class.append(df.iloc[:, i].unique())
        cate_class_number.append(df.iloc[:, i].nunique())
    
    for i in tqdm(range(df.columns.shape[0])):
        df = pd.concat([df,pd.get_dummies(df[cate_name[i]], prefix=cate_name[i])],axis=1)
        df = df.drop(columns=cate_name[i])
    
    return cate_name, cate_class_number, cate_class, df

def one_hot_decoding(df:pd.DataFrame, prefix_sep="_"):
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            undummified = (
                df.filter(like=col)
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [10]:
cate_name, cate_class_number, cate_class, df_category_ohe = one_hot_encoding(df_category)
df_category_ohe

  0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,Sex_F,Sex_M,Year_1896,Year_1900,Year_1904,Year_1906,Year_1908,Year_1912,Year_1920,Year_1924,...,AOS_3,AOE_1,AOE_2,AOE_3,AOE_4,AOE_5,AOE_6,AOE_7,AOE_8,AOE_9
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4996,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4997,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4998,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


## Reshape data to 4d array

In [11]:
df_length = len(df_category_ohe.columns)
input_data = df_category_ohe.to_numpy().flatten().reshape(-1, 1, 1, df_length)
input_data.shape

(5000, 1, 1, 153)

# Build PyTorch Dataset

## Dataset class

In [12]:
class OlympicDataset(Dataset):
    
    def __init__(self, data: pd.DataFrame, transform=None):
        self.data = torch.from_numpy(data).float()
        
    def __len__(self):
        return len(self.data)
    
    
    def __getitem__(self, idx):
        data_content = self.data[idx]
        return data_content

## Pytorch Normalization

In [13]:
# loader_all = DataLoader(input_data, batch_size=len(input_data), num_workers=1)
# data = next(iter(loader_all))
# mean = float(data.mean().detach().numpy())
# std = float(data.std().detach().numpy())

In [14]:
# transforms = transforms.Compose(
#     [transforms.ToTensor(), transforms.Normalize(mean, std)]
# )

transforms = transforms.Compose(
    [transforms.ToTensor()]
)

# Build GAN

## Hyperparameters

In [15]:
# # Hyperparameters etc.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# lr = 3e-4
# z_dim = 64 # 128, 256
# # image_dim = 5 * 5 * 1  # 25
# # image_dim = square_length * square_length * 1
# image_dim = 1 * df_length * 1

# batch_size = 32
# num_epochs = 50

# fixed_noise = torch.randn((batch_size, z_dim)).to(device)

## Dataset and DataLoader

## Discriminator

In [16]:
# stored parameter

# nn.Linear(in_features, 128),
# nn.LeakyReLU(0.2),
# nn.Dropout(0.5), 
# nn.Linear(128, 1),
# nn.Sigmoid(),

In [17]:
# class Discriminator(nn.Module):
#     def __init__(self, in_features):
#         super().__init__()
#         # Simple CNN
#         self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
#         self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
#         self.conv2_drop = nn.Dropout2d()
#         self.fc1 = nn.Linear(320, 50)
#         self.fc2 = nn.Linear(50, 1)

#     def forward(self, x):
#         x = F.relu(F.max_pool2d(self.conv1(x), 2))
#         x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
#         # Flatten the tensor so it can be fed into the FC layers
#         x = x.view(-1, 320)
#         x = F.relu(self.fc1(x))
#         x = F.dropout(x, training=self.training)
#         x = self.fc2(x)
#         return F.sigmoid(x)
#         # return torch.sigmoid(x)

# disc = Discriminator(image_dim).to(device)

## Generator

In [18]:
# stored parameter

# nn.Linear(z_dim, 256),
# nn.LeakyReLU(0.3), 
# nn.Linear(256, img_dim),

In [19]:
# class Generator(nn.Module):
#     def __init__(self, latent_dim, img_dim):
#         super().__init__()
#         self.lin1 = nn.Linear(latent_dim, 7*7*64)   # [n, 256, 7, 7]
#         self.ct1 = nn.ConvTranspose2d(64, 32, 4, stride=2)  # [n, 64, 16, 16]
#         self.ct2 = nn.ConvTranspose2d(32, 16, 4, stride=2)  # [n, 16, 34, 34]
#         self.conv = nn.Conv2d(16, 1, kernel_size=7)     # [n, 1, 28, 28]

#     def forward(self, x):
#         # Pass latent space input into linear layer and reshape
#         x = self.lin1(x)
#         x = F.relu(x)
#         x = x.view(-1, 64, 7, 7)

#         # Upsample (transposed conv) 16x16 (64 feature maps)
#         # Transposed convolution to 16x16 (64 feature maps)
#         x = self.ct1(x)
#         x = F.relu(x)
        
#         # Upsample to 34x34 (16 feature maps)
#         # Transposed convolution to 34x34 (16 feature maps)
#         x = self.ct2(x)
#         x = F.relu(x)
        
#         # Convolution to 28x28 (1 feature map)
#         return self.conv(x)
    
# gen = Generator(z_dim, image_dim).to(device)

## Optimizer

## Tensorboard

In [20]:
# writer_fake = SummaryWriter(f"logs/fake")
# writer_real = SummaryWriter(f"logs/real")

# Train/Test

# Generate synthetic output

## Continuous data

## Categorical data

# Visualization

## Draw columns

In [21]:
# my_report.get_details(property_name='Column Shapes')

In [22]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Age'
# )

In [23]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Height'
# )

In [24]:
# get_column_plot(
#     real_data=df_real,
#     synthetic_data=df_fake,
#     metadata=metadata,
#     column_name='Weight'
# )

In [None]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Sex'
)

In [None]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Year'
)

In [None]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Season'
)

In [None]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='City'
)

In [None]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Sport'
)

In [None]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='Medal'
)

In [None]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='AOS'
)

In [None]:
get_column_plot(
    real_data=df_real,
    synthetic_data=df_fake,
    metadata=metadata,
    column_name='AOE'
)

In [None]:
# fig = my_report.get_visualization(property_name='Column Shapes')
# fig.show()

## Draw relationship

In [None]:
# sns.set_theme(style="dark")

In [None]:
# x = df_real.Height.to_numpy()
# y = df_real.Weight.to_numpy()

# # Draw a combo histogram and scatterplot with density contours
# f, ax = plt.subplots(figsize=(6, 6))
# ax.set(ylim=(25, 200))
# ax.set(xlim=(120, 240))
# ax.set_title('Height vs Weight',
#              fontweight ="bold")
# ax.set_xlabel('Height')
# ax.set_ylabel('Weight')


# sns.scatterplot(x=x, y=y, s=5, color=".15")
# sns.histplot(x=x, y=y, bins=50, pthresh=.1, cmap="mako")
# sns.kdeplot(x=x, y=y, levels=5, color="w", linewidths=1)


In [None]:
# xp = df_fake.Height.to_numpy()
# yp = df_fake.Weight.to_numpy()

# # Draw a combo histogram and scatterplot with density contours
# f, ax = plt.subplots(figsize=(6, 6))
# ax.set(ylim=(25, 200))
# ax.set(xlim=(120, 240))
# ax.set_title('Height vs Weight',
#              fontweight ="bold")
# ax.set_xlabel('Height')
# ax.set_ylabel('Weight')

# sns.scatterplot(x=xp, y=yp, s=5, color=".15")
# sns.histplot(x=xp, y=yp, bins=50, pthresh=.1, cmap="mako")
# sns.kdeplot(x=xp, y=yp, levels=5, color="w", linewidths=1)


# Grid Search

In [77]:
def train_test_evaluate(lr, negative_slope_G, negative_slope_D, dropout_probability):
    # Hyperparameters etc.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    z_dim = 64 # 128, 256
    image_dim = 1 * df_length * 1
    batch_size = 32
    num_epochs = 50


    fixed_noise = torch.randn((batch_size, z_dim)).to(device)


    dataset = OlympicDataset(input_data, transform=transforms)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


    class Discriminator(nn.Module):
        def __init__(self, in_features):
            super().__init__()
            self.disc = nn.Sequential(
                nn.Linear(in_features, 128),
                nn.LeakyReLU(negative_slope_D),
                nn.Dropout(dropout_probability), 
                nn.Linear(128, 1),
                nn.Sigmoid(),
            )

        def forward(self, x):
            return self.disc(x)

    disc = Discriminator(image_dim).to(device)


    class Generator(nn.Module):
        def __init__(self, z_dim, img_dim):
            super().__init__()
            self.gen = nn.Sequential(
                nn.Linear(z_dim, 256),
                nn.LeakyReLU(negative_slope_G), 
                nn.Linear(256, img_dim),
            )

        def forward(self, x):
            return self.gen(x)

    gen = Generator(z_dim, image_dim).to(device)


    opt_disc = optim.Adam(disc.parameters(), lr=lr)
    opt_gen = optim.Adam(gen.parameters(), lr=lr)
    criterion = nn.BCELoss()


    step = 0
#     print('Training process:')
#     for epoch in tqdm(range(num_epochs)):
    for epoch in range(num_epochs):
        for batch_idx, real in enumerate(loader):

            real = real.view(-1, 1*df_length).to(device)
            batch_size = real.shape[0]

            noise = torch.randn(batch_size, z_dim).to(device)
            fake = gen(noise)
            disc_real = disc(real).view(-1)
            lossD_real = criterion(disc_real, torch.ones_like(disc_real))
            disc_fake = disc(fake).view(-1)
            lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
            lossD = (lossD_real + lossD_fake) / 2
            disc.zero_grad()
            lossD.backward(retain_graph=True)
            opt_disc.step()

            ### Train Generator: min log(1 - D(G(z))) <-> max log(D(G(z))
            # where the second option of maximizing doesn't suffer from
            # saturating gradients
            output = disc(fake).view(-1)
            lossG = criterion(output, torch.ones_like(output))
            gen.zero_grad()
            lossG.backward()
            opt_gen.step()

            if batch_idx == 0:
#                 print(
#                     f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
#                           Loss D: {lossD:.4f}, loss G: {lossG:.4f}"
#                 )

                with torch.no_grad():
                    fake = gen(fixed_noise).reshape(-1, 1, 1, df_length)
                    data = real.reshape(-1, 1, 1, df_length)

                    img_grid_fake = torchvision.utils.make_grid(fake, normalize=True)
                    img_grid_real = torchvision.utils.make_grid(data, normalize=True)

                    step += 1


    fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
    fake_df = fake_df.rename(columns = {i: df_category_ohe.columns[i] for i in range(df_category_ohe.columns.shape[0])})
    final = fake_df.astype('category')


    df_real = df[np.append('ID', df_category.columns.to_numpy())].astype('category')

    
#     print('Generating process:')
#     for i in tqdm(range(len(df_real)//len(final) + 1)):
    for i in range(len(df_real)//len(final) + 1):
        noise = torch.randn(batch_size, z_dim).to(device)
        fake = gen(noise)
        fake_df = pd.DataFrame(fake.flatten().reshape(-1, df_length).detach().numpy())
        fake_df = fake_df.rename(columns = {i: df_category_ohe.columns[i] for i in range(df_category_ohe.columns.shape[0])})
        demo = fake_df
        final = pd.concat([final, demo]).reset_index(drop=True)
    df_fake = final.reset_index().rename(columns={'index': 'ID'}).iloc[0:len(df_real), :].astype('category')
    df_fake = one_hot_decoding(df_fake)


    metadata = Metadata()
    metadata.add_table(name='olympic',
                      data=df_real,
                      primary_key = 'ID')
    metadata = metadata.get_table_meta('olympic')
    my_report = QualityReport()
    my_report.generate(df_real, df_fake, metadata)
    score = my_report.get_score()
    
    property_df = my_report.get_properties()
    shape_score = property_df[property_df.Property == 'Column Shapes'].Score[0]
    trend_score = property_df[property_df.Property == 'Column Pair Trends'].Score[1]

    return score, shape_score, trend_score

In [78]:
# lr = 3e-4
# negative_slope_G = 0.5
# negative_slope_D = 0.2
# dropout_probability = 0.8

# score = train_test_evaluate(lr, negative_slope_G, negative_slope_D, dropout_probability)

In [79]:
# lr_list = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
# negative_slope_list = [0.01, 0.1, 0.3, 0.6, 1]
# dropout_probability_list = [0, 0.01, 0.1, 0.3, 0.6, 0.8]

In [80]:
# scores = []
# lrs = []
# negative_slopes_G = []
# negative_slopes_D = []
# dropout_probabilities = []

# for lr in tqdm(lr_list):
#     for negative_slope_G in tqdm(negative_slope_list):
#         for negative_slope_D in tqdm(negative_slope_list):
#             for dropout_probability in tqdm(dropout_probability_list):
#                 score = train_test_evaluate(lr, negative_slope_G, negative_slope_D, dropout_probability)
#                 scores.append(score)
#                 lrs.append(lr)
#                 negative_slopes_G.append(negative_slope_G)
#                 negative_slopes_D.append(negative_slope_D)
#                 dropout_probabilities.append(dropout_probability)

In [81]:
lr_list = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
negative_slope_list = [0.01, 0.1, 0.3, 0.6, 1]
dropout_probability_list = [0, 0.01, 0.1, 0.3, 0.6, 0.8]

In [82]:
scores = []
shape_scores = []
trend_scores = []

lrs = []
negative_slopes_G = []
negative_slopes_D = []
dropout_probabilities = []

negative_slope_G = 1
dropout_probability = 0

for lr in tqdm(lr_list):
        for negative_slope_D in tqdm(negative_slope_list):
                score, shape_score, trend_score = train_test_evaluate(lr, negative_slope_G, negative_slope_D, dropout_probability)
                scores.append(score)
                shape_scores.append(shape_score)
                trend_scores.append(trend_score)
                
                lrs.append(lr)
                negative_slopes_G.append(negative_slope_G)
                negative_slopes_D.append(negative_slope_D)
                dropout_probabilities.append(dropout_probability)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]



Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.89it/s][A[A



Overall Quality Score: 32.79%

Properties:
Column Shapes: 35.63%
Column Pair Trends: 29.95%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.02it/s][A[A



Overall Quality Score: 24.14%

Properties:
Column Shapes: 27.98%
Column Pair Trends: 20.3%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.72it/s][A[A



Overall Quality Score: 25.64%

Properties:
Column Shapes: 31.95%
Column Pair Trends: 19.33%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.97it/s][A[A



Overall Quality Score: 22.41%

Properties:
Column Shapes: 31.01%
Column Pair Trends: 13.8%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.07it/s][A[A


Overall Quality Score: 27.53%

Properties:
Column Shapes: 37.37%
Column Pair Trends: 17.7%





  0%|          | 0/5 [00:00<?, ?it/s]



Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.30it/s][A[A



Overall Quality Score: 42.22%

Properties:
Column Shapes: 39.42%
Column Pair Trends: 45.03%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.24it/s][A[A



Overall Quality Score: 42.09%

Properties:
Column Shapes: 40.78%
Column Pair Trends: 43.4%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:01<00:00,  3.89it/s][A[A



Overall Quality Score: 36.42%

Properties:
Column Shapes: 38.14%
Column Pair Trends: 34.71%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.28it/s][A[A



Overall Quality Score: 41.47%

Properties:
Column Shapes: 40.11%
Column Pair Trends: 42.84%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.50it/s][A[A


Overall Quality Score: 45.95%

Properties:
Column Shapes: 44.89%
Column Pair Trends: 47.01%





  0%|          | 0/5 [00:00<?, ?it/s]



Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.93it/s][A[A



Overall Quality Score: 59.17%

Properties:
Column Shapes: 50.72%
Column Pair Trends: 67.61%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  4.97it/s][A[A



Overall Quality Score: 62.74%

Properties:
Column Shapes: 52.77%
Column Pair Trends: 72.72%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.35it/s][A[A



Overall Quality Score: 56.92%

Properties:
Column Shapes: 48.97%
Column Pair Trends: 64.88%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.11it/s][A[A



Overall Quality Score: 56.9%

Properties:
Column Shapes: 49.82%
Column Pair Trends: 63.98%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.14it/s][A[A


Overall Quality Score: 42.02%

Properties:
Column Shapes: 41.7%
Column Pair Trends: 42.35%





  0%|          | 0/5 [00:00<?, ?it/s]



Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.16it/s][A[A



Overall Quality Score: 59.66%

Properties:
Column Shapes: 51.16%
Column Pair Trends: 68.15%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:01<00:00,  3.99it/s][A[A



Overall Quality Score: 59.65%

Properties:
Column Shapes: 50.64%
Column Pair Trends: 68.66%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.47it/s][A[A



Overall Quality Score: 58.74%

Properties:
Column Shapes: 51.07%
Column Pair Trends: 66.42%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.39it/s][A[A



Overall Quality Score: 54.33%

Properties:
Column Shapes: 47.84%
Column Pair Trends: 60.82%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.37it/s][A[A


Overall Quality Score: 56.46%

Properties:
Column Shapes: 48.81%
Column Pair Trends: 64.12%





  0%|          | 0/5 [00:00<?, ?it/s]



Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.78it/s][A[A



Overall Quality Score: 32.76%

Properties:
Column Shapes: 36.3%
Column Pair Trends: 29.22%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.74it/s][A[A



Overall Quality Score: 33.81%

Properties:
Column Shapes: 35.98%
Column Pair Trends: 31.64%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.56it/s][A[A



Overall Quality Score: 32.72%

Properties:
Column Shapes: 32.81%
Column Pair Trends: 32.64%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.48it/s][A[A



Overall Quality Score: 34.43%

Properties:
Column Shapes: 35.38%
Column Pair Trends: 33.48%




Creating report:   0%|                                    | 0/4 [00:00<?, ?it/s][A[A

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.73it/s][A[A


Overall Quality Score: 32.46%

Properties:
Column Shapes: 34.41%
Column Pair Trends: 30.51%





# Score

## Save scores

In [83]:
score_df = pd.DataFrame(list(zip(lrs, negative_slopes_G, negative_slopes_D, 
                                 dropout_probabilities, scores, shape_scores, 
                                 trend_scores)), 
                        columns=['LR', 'NSG', 'NSD', 'DP', 'Score', 
                                 'Shape_score', 'Trend_score'])
score_df

Unnamed: 0,LR,NSG,NSD,DP,Score,Shape_score,Trend_score
0,0.01,1,0.01,0,0.327893,0.3563,0.299486
1,0.01,1,0.1,0,0.241379,0.27975,0.203007
2,0.01,1,0.3,0,0.256386,0.31945,0.193321
3,0.01,1,0.6,0,0.22408,0.310125,0.138036
4,0.01,1,1.0,0,0.275338,0.373675,0.177
5,0.001,1,0.01,0,0.422223,0.394175,0.450271
6,0.001,1,0.1,0,0.420875,0.40775,0.434
7,0.001,1,0.3,0,0.364245,0.381425,0.347064
8,0.001,1,0.6,0,0.414725,0.4011,0.42835
9,0.001,1,1.0,0,0.459534,0.448925,0.470143


In [84]:
score_df.to_csv('scores2.csv', index=False)

## First Score

In [88]:
score_df = pd.read_csv('scores1.csv')
score_df

Unnamed: 0,LR,NSG,NSD,DP,Score
0,1.000000e-02,0.01,0.01,0.00,0.378573
1,1.000000e-02,0.01,0.01,0.01,0.187936
2,1.000000e-02,0.01,0.01,0.10,0.358505
3,1.000000e-02,0.01,0.01,0.30,0.385188
4,1.000000e-02,0.01,0.01,0.60,0.374029
...,...,...,...,...,...
895,1.000000e-07,1.00,1.00,0.01,0.317602
896,1.000000e-07,1.00,1.00,0.10,0.305477
897,1.000000e-07,1.00,1.00,0.30,0.282261
898,1.000000e-07,1.00,1.00,0.60,0.313414


In [62]:
score_df[score_df.Score == score_df.Score.max()]

Unnamed: 0,LR,NSG,NSD,DP,Score
432,0.0001,1.0,0.3,0.0,0.647505


In [87]:
score, shape_score, trend_score = train_test_evaluate(0.0001, 1, 0.3, 0)
(score, shape_score, trend_score)

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.45it/s]


Overall Quality Score: 63.21%

Properties:
Column Shapes: 53.11%
Column Pair Trends: 73.31%





(0.6320750005750002, 0.5311000011500002, 0.7330500000000001)

## Second score

In [89]:
score_df = pd.read_csv('scores2.csv')
score_df

Unnamed: 0,LR,NSG,NSD,DP,Score,Shape_score,Trend_score
0,0.01,1,0.01,0,0.327893,0.3563,0.299486
1,0.01,1,0.1,0,0.241379,0.27975,0.203007
2,0.01,1,0.3,0,0.256386,0.31945,0.193321
3,0.01,1,0.6,0,0.22408,0.310125,0.138036
4,0.01,1,1.0,0,0.275338,0.373675,0.177
5,0.001,1,0.01,0,0.422223,0.394175,0.450271
6,0.001,1,0.1,0,0.420875,0.40775,0.434
7,0.001,1,0.3,0,0.364245,0.381425,0.347064
8,0.001,1,0.6,0,0.414725,0.4011,0.42835
9,0.001,1,1.0,0,0.459534,0.448925,0.470143


In [91]:
score_df[score_df.Shape_score == score_df.Shape_score.max()]

Unnamed: 0,LR,NSG,NSD,DP,Score,Shape_score,Trend_score
11,0.0001,1,0.1,0,0.627443,0.52765,0.727236


In [92]:
score, shape_score, trend_score = train_test_evaluate(0.0001, 1, 0.1, 0)
(score, shape_score, trend_score)

Creating report: 100%|████████████████████████████| 4/4 [00:00<00:00,  5.21it/s]


Overall Quality Score: 62.42%

Properties:
Column Shapes: 52.56%
Column Pair Trends: 72.28%





(0.6242017862892857, 0.5255750011500002, 0.7228285714285713)