# Import libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sdmetrics.reports.utils import get_column_plot
from sdmetrics.reports.single_table import QualityReport
from sdv import Metadata
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sdmetrics.single_column import BoundaryAdherence

import warnings
warnings.filterwarnings('ignore')

# Pre-defined functions

## Normalization/denormalization

In [9]:
def max_abs_norm(data: pd.DataFrame, column: str):
    """Scale ``data[column]`` in place by its maximum absolute value.

    Args:
        data: Frame holding the column; the column is overwritten in place.
        column: Name of the column to scale.

    Returns:
        A ``(data, max_val)`` tuple: the same frame object that was passed
        in, and the maximum absolute value the column had before scaling.
    """
    max_val = data[column].abs().max()
    # An all-zero column has max_val == 0 and dividing by it would turn the
    # whole column into NaN. Divide by 1 in that case so the zeros survive;
    # the true max_val is still returned, so multiplying back by it later
    # reproduces the original zeros.
    divisor = max_val if max_val != 0 else 1
    data[column] = data[column] / divisor
    return data, max_val


def min_max_norm(data: pd.DataFrame, column: str):
    """Rescale ``data[column]`` in place to ``(x - min) / (max - min)``.

    Args:
        data: Frame holding the column; the column is overwritten in place.
        column: Name of the column to rescale.

    Returns:
        A ``(data, min_val, max_val)`` tuple: the same frame object that was
        passed in, plus the column's minimum and maximum before rescaling.
    """
    min_val = data[column].min()
    max_val = data[column].max()
    span = max_val - min_val
    # A constant column has span == 0, which would make every entry 0/0 = NaN.
    # Divide by 1 instead so the column becomes all zeros; min_val/max_val are
    # returned unchanged, so the inverse transform still yields min_val.
    divisor = span if span != 0 else 1
    data[column] = (data[column] - min_val) / divisor
    return data, min_val, max_val


def standardization(data: pd.DataFrame, column: str):
    """Standardise ``data[column]`` in place to ``(x - mean) / std``.

    The mean and standard deviation are taken from ``Series.mean`` and
    ``Series.std`` with their default arguments.

    Args:
        data: Frame holding the column; the column is overwritten in place.
        column: Name of the column to standardise.

    Returns:
        A ``(data, mean_val, std_val)`` tuple: the same frame object that was
        passed in, plus the column's mean and standard deviation before
        standardising.
    """
    mean_val = data[column].mean()
    std_val = data[column].std()
    # A constant column has std_val == 0, which would make every entry
    # 0/0 = NaN. Divide by 1 instead so the column becomes all zeros; the
    # true std_val is still returned, so the inverse transform gives back
    # mean_val for every row.
    divisor = std_val if std_val != 0 else 1
    data[column] = (data[column] - mean_val) / divisor
    return data, mean_val, std_val


def max_abs_denorm(data: pd.DataFrame, column: str, norm_dict: dict):
    """Undo max-abs scaling on ``data[column]`` in place.

    Args:
        data: Frame holding the scaled column; the column is overwritten.
        column: Name of the column to restore.
        norm_dict: Mapping whose ``column`` entry is the factor the column
            is multiplied by.

    Returns:
        The same frame object that was passed in.
    """
    scale = norm_dict[column]
    restored = data[column] * scale
    data[column] = restored
    return data


def min_max_denorm(data: pd.DataFrame, column: str, norm_dict: dict):
    """Undo min-max scaling on ``data[column]`` in place.

    Args:
        data: Frame holding the scaled column; the column is overwritten.
        column: Name of the column to restore.
        norm_dict: Mapping whose ``column`` entry is a sequence with the
            minimum at index 0 and the maximum at index 1.

    Returns:
        The same frame object that was passed in.
    """
    bounds = norm_dict[column]
    low, high = bounds[0], bounds[1]
    # x * (max - min) + min
    data[column] = data[column] * (high - low) + low
    return data


def destandardization(data: pd.DataFrame, column: str, norm_dict: dict):
    """Undo standardisation on ``data[column]`` in place.

    Args:
        data: Frame holding the standardised column; the column is
            overwritten.
        column: Name of the column to restore.
        norm_dict: Mapping whose ``column`` entry is a sequence with the
            mean at index 0 and the standard deviation at index 1.

    Returns:
        The same frame object that was passed in.
    """
    stats = norm_dict[column]
    center, spread = stats[0], stats[1]
    # x * std + mean
    data[column] = data[column] * spread + center
    return data


def norm(data: pd.DataFrame, columns: list, norm_types: list):
    """Normalise several columns of ``data``, one scheme per column.

    ``columns[i]`` is processed with the scheme named by ``norm_types[i]``:

    * ``'max_abs'``  -> ``max_abs_norm``; stores ``max_val``
    * ``'min_max'``  -> ``min_max_norm``; stores ``[min_val, max_val]``
    * ``'standard'`` -> ``standardization``; stores ``[mean_val, std_val]``

    Any other scheme name leaves that column untouched and adds no entry to
    the returned dictionary. Surplus entries in ``norm_types`` are ignored.

    Args:
        data: Frame to normalise; passed through the per-column helpers.
        columns: Names of the columns to normalise.
        norm_types: Scheme name for each entry of ``columns``.

    Returns:
        A ``(data, norm_dict)`` tuple where ``norm_dict`` maps each
        normalised column name to the statistics listed above.

    Raises:
        IndexError: If ``norm_types`` has fewer entries than ``columns``.
    """
    # Check the lengths before touching any column: the helpers assign into
    # the frame they are given, so failing midway would leave ``data``
    # half-normalised.
    if len(norm_types) < len(columns):
        raise IndexError(
            f"norm_types has {len(norm_types)} entries "
            f"but columns has {len(columns)}"
        )

    norm_dict = {}
    for column, norm_type in zip(columns, norm_types):
        if norm_type == 'max_abs':
            data, max_val = max_abs_norm(data, column)
            norm_dict[column] = max_val
        elif norm_type == 'min_max':
            data, min_val, max_val = min_max_norm(data, column)
            norm_dict[column] = [min_val, max_val]
        elif norm_type == 'standard':
            data, mean_val, std_val = standardization(data, column)
            norm_dict[column] = [mean_val, std_val]

    return data, norm_dict


def denorm(data: pd.DataFrame, columns: list, norm_types: list, norm_dict: dict):
    """Reverse the per-column normalisation on several columns of ``data``.

    ``columns[i]`` is restored according to ``norm_types[i]``:

    * ``'max_abs'``  -> ``max_abs_denorm``
    * ``'min_max'``  -> ``min_max_denorm``
    * ``'standard'`` -> ``destandardization``

    Any other scheme name leaves that column untouched. Surplus entries in
    ``norm_types`` are ignored.

    Args:
        data: Frame to restore; passed through the per-column helpers.
        columns: Names of the columns to restore.
        norm_types: Scheme name for each entry of ``columns``.
        norm_dict: Per-column statistics, keyed by column name, in the layout
            each helper reads.

    Returns:
        The frame returned by the last helper call (``data`` itself when no
        column is processed).

    Raises:
        IndexError: If ``norm_types`` has fewer entries than ``columns``.
    """
    # Check the lengths before touching any column: the helpers assign into
    # the frame they are given, so failing midway would leave ``data``
    # half-restored.
    if len(norm_types) < len(columns):
        raise IndexError(
            f"norm_types has {len(norm_types)} entries "
            f"but columns has {len(columns)}"
        )

    for column, norm_type in zip(columns, norm_types):
        if norm_type == 'max_abs':
            data = max_abs_denorm(data, column, norm_dict)
        elif norm_type == 'min_max':
            data = min_max_denorm(data, column, norm_dict)
        elif norm_type == 'standard':
            data = destandardization(data, column, norm_dict)

    return data

## One hot encoding/decoding

In [10]:
def one_hot_encoding(df: pd.DataFrame):
    """One-hot encode every column of ``df``.

    Each column is replaced by the dummy columns ``pd.get_dummies`` produces
    for it, using the column name as prefix. The dummy groups are appended
    in the original column order. The caller's frame is not modified; a new
    frame is returned.

    Args:
        df: Frame whose columns are all treated as categorical.

    Returns:
        A ``(cate_name, cate_class_number, cate_class, df)`` tuple:

        * ``cate_name``: array of the original column names.
        * ``cate_class_number``: per column, the result of ``nunique()``.
        * ``cate_class``: per column, the result of ``unique()``.
        * ``df``: the one-hot encoded frame.

    NOTE(review): ``unique()`` and ``nunique()`` may disagree on columns that
    contain missing values -- confirm callers do not rely on matching lengths.
    """
    cate_name = df.columns.to_numpy()
    n_cols = df.columns.shape[0]

    # Record the categories of every column before the frame is rewritten.
    cate_class = [df.iloc[:, pos].unique() for pos in range(n_cols)]
    cate_class_number = [df.iloc[:, pos].nunique() for pos in range(n_cols)]

    for pos in tqdm(range(n_cols)):
        name = cate_name[pos]
        dummies = pd.get_dummies(df[name], prefix=name)
        # Append the dummy group, then remove the source column.
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(columns=name)

    return cate_name, cate_class_number, cate_class, df

def one_hot_decoding(df: pd.DataFrame, prefix_sep="_"):
    """Collapse one-hot (dummy) columns back into one column per category.

    The part of each column name before the first ``prefix_sep`` is taken as
    the original column name. Columns whose name contains ``prefix_sep`` are
    treated as dummies: for every row, the dummy column holding the largest
    value wins, and the text after the first ``prefix_sep`` in its name
    becomes the decoded value. Columns without ``prefix_sep`` are copied
    through unchanged.

    Args:
        df: Frame with dummy columns named ``<name><prefix_sep><category>``.
        prefix_sep: Separator between the original name and the category.

    Returns:
        A new frame with one column per original name, in order of first
        appearance in ``df``.
    """
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            # Select this group's dummies by exact "<col><sep>" prefix. A
            # substring match (DataFrame.filter(like=col)) would also pull in
            # unrelated columns whose name merely contains ``col`` and let
            # them win the idxmax below.
            group_prefix = col + prefix_sep
            group_cols = [
                name for name in df.columns if name.startswith(group_prefix)
            ]
            undummified = (
                df[group_cols]
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

# Prepare data

In [4]:
# Load CleanedData.csv and keep only its first 5000 rows.
df = pd.read_csv('../OlympicHistory/CleanedData.csv').head(5000)
df

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Year,Season,City,Sport,Event,Medal,AOS,AOE,YOB
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,Thanks,1,1,1968
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,Thanks,1,1,1989
2,3,Gunnar Nielsen Aaby,M,24.0,175.0,71.0,Denmark,DEN,1920,Summer,Antwerpen,Football,Football Men's Football,Thanks,1,1,1896
3,4,Edgar Lindenau Aabye,M,34.0,182.0,95.0,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,1,1,1866
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,Thanks,1,2,1967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3729,Henrik Andersen,M,25.0,168.0,75.0,Denmark,DEN,1992,Summer,Barcelona,Weightlifting,Weightlifting Men's Middleweight,Thanks,1,1,1967
4996,3730,Henrik Steen Andersen,M,23.0,193.0,95.0,Denmark,DEN,2000,Summer,Sydney,Swimming,Swimming Men's 4 x 100 metres Freestyle Relay,Thanks,1,2,1977
4997,3731,Henry Anders Peter Brask Andersen,M,23.0,176.0,70.0,Denmark,DEN,1920,Summer,Antwerpen,Cycling,Cycling Men's Sprint,Thanks,1,2,1897
4998,3732,Herluf Juhl Andersen,M,40.0,177.0,69.0,Denmark,DEN,1972,Summer,Munich,Archery,Archery Men's Individual,Thanks,1,1,1932


In [11]:
# Columns not tried yet: ['ID', 'Name', 'Team', 'NOC', 'Event', 'YOB']
continuous_columns = ['Age','Height', 'Weight']
categorical_columns = ['Sex', 'Year', 'Season', 'City', 'Sport', 'Medal', 'AOS', 'AOE']
# Take explicit copies: the normalisation helpers in this file assign into
# the frame they receive, and assigning into a selection of ``df`` can
# trigger pandas' SettingWithCopyWarning (hidden here by the global
# warnings filter).
df_conti = df[continuous_columns].copy()
df_category = df[categorical_columns].copy()

In [16]:
# Apply the 'standard' scheme to every column listed in norm_list.
norm_list = continuous_columns
norm_types = ['standard'] * len(norm_list)
df_conti_norm, dict_conti = norm(df_conti, norm_list, norm_types)
df_conti_norm

Unnamed: 0,Age,Height,Weight
0,-0.246543,0.440417,0.599714
1,-0.417611,-0.620829,-0.883717
2,-0.246543,-0.090206,-0.067830
3,1.464137,0.652666,1.712287
4,-0.759747,0.971040,0.748057
...,...,...,...
4995,-0.075475,-0.833078,0.228856
4996,-0.417611,1.820036,1.712287
4997,-0.417611,0.015919,-0.142001
4998,2.490545,0.122043,-0.216173
