In [14]:
import pandas as pd
import numpy as np
import json 

from dython.nominal import compute_associations
from scipy.spatial import distance
from scipy.stats import wasserstein_distance
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [30]:
# Experiment selection: which synthetic data generator (SDG) and which dataset
# are compared. Both values are only used to build the file paths below.
SDG = "ctgan"
DATASET = "adult"

real_path = f"../data/processed/{DATASET}/train_min.csv"
syn_path = f"../data/synthetic/{DATASET}/{SDG}.csv"

# Read the real frame first, then the synthetic one.
df_real, df_syn = (pd.read_csv(csv_path) for csv_path in (real_path, syn_path))

# The TabSyn info JSON is not loaded in this section.

In [3]:
# Preview the real data as loaded from CSV.
df_real

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
1,31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
2,42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K
3,37.0,Private,280464.0,Some-college,10.0,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0.0,0.0,80.0,United-States,>50K
4,30.0,State-gov,141297.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0.0,0.0,40.0,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7836,71.0,,287372.0,Doctorate,16.0,Married-civ-spouse,,Husband,White,Male,0.0,0.0,10.0,United-States,>50K
7837,39.0,Local-gov,111499.0,Assoc-acdm,12.0,Married-civ-spouse,Adm-clerical,Wife,White,Female,0.0,0.0,20.0,United-States,>50K
7838,53.0,Private,321865.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
7839,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K


In [4]:
# Preview the synthetic data as loaded from CSV.
df_syn

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,54.0,Private,199474.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
1,58.0,Private,376262.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,Black,Male,96732.0,0.0,40.0,England,>50K
2,37.0,Private,187185.0,Some-college,10.0,Married-civ-spouse,Adm-clerical,Wife,White,Female,99999.0,0.0,40.0,United-States,>50K
3,30.0,State-gov,39542.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,Black,Male,7191.0,1.0,60.0,United-States,>50K
4,44.0,Self-emp-inc,154930.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,1891.0,40.0,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16874,36.0,Private,152396.0,Some-college,10.0,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0.0,0.0,40.0,Mexico,>50K
16875,45.0,Private,284146.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0.0,0.0,40.0,Ecuador,>50K
16876,59.0,Private,227666.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5334.0,0.0,40.0,United-States,>50K
16877,57.0,Private,187203.0,Assoc-voc,15.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,44.0,United-States,>50K


In [5]:
# Number of rows per distinct value of the `income` column in the real data.
df_real["income"].value_counts()

income
>50K    7841
Name: count, dtype: int64

In [6]:
# Number of rows per distinct value of the `income` column in the synthetic data.
df_syn["income"].value_counts()

income
>50K    16879
Name: count, dtype: int64

In [31]:
# Missing categorical values are read by pandas as float NaN, which mixes
# floats into otherwise string-valued columns. Map them to an explicit
# 'missing' category in every categorical column of both frames.
cat_cols = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income']

for col in cat_cols:
    # `replace(0.0, ...)` on its own never matches NaN (NaN != 0.0), so the
    # NaNs are filled explicitly. The 0.0 replacement is kept as well --
    # presumably some inputs encode a missing category as 0.0 (TODO confirm).
    df_real[col] = df_real[col].replace(0.0, 'missing').fillna('missing')
    df_syn[col] = df_syn[col].replace(0.0, 'missing').fillna('missing')

## Statistical similarity

In [32]:
# Integer-encode the categorical columns in place. One encoder per column is
# fitted on the real data and then applied unchanged to the synthetic data, so
# both frames share the same label -> integer mapping. The fitted encoders stay
# available in `LabelEncoders`, keyed by column name.
LabelEncoders = {col: LabelEncoder() for col in cat_cols}

for col, le in LabelEncoders.items():
    df_real[col] = le.fit_transform(df_real[col])
    df_syn[col] = le.transform(df_syn[col])


In [33]:
# Real data after the categorical columns were label-encoded in place.
df_real

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,52.0,4,209642.0,11,9.0,2,3,0,4,1,0.0,0.0,45.0,36,0
1,31.0,2,45781.0,12,14.0,4,9,1,4,0,14084.0,0.0,50.0,36,0
2,42.0,2,159449.0,9,13.0,2,3,0,4,1,5178.0,0.0,40.0,36,0
3,37.0,2,280464.0,14,10.0,2,3,0,2,1,0.0,0.0,80.0,36,0
4,30.0,5,141297.0,9,13.0,2,9,0,1,1,0.0,0.0,40.0,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7836,71.0,6,287372.0,10,16.0,2,14,0,4,1,0.0,0.0,10.0,36,0
7837,39.0,1,111499.0,7,12.0,2,0,5,4,0,0.0,0.0,20.0,36,0
7838,53.0,2,321865.0,12,14.0,2,3,0,4,1,0.0,0.0,40.0,36,0
7839,40.0,2,154374.0,11,9.0,2,6,0,4,1,0.0,0.0,40.0,36,0


In [34]:
# Synthetic data after the categorical columns were label-encoded in place.
df_syn

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,54.0,2,199474.0,9,13.0,2,9,0,4,1,0.0,0.0,45.0,36,0
1,58.0,2,376262.0,11,9.0,2,2,0,2,1,96732.0,0.0,40.0,8,0
2,37.0,2,187185.0,14,10.0,2,0,5,4,0,99999.0,0.0,40.0,36,0
3,30.0,5,39542.0,11,9.0,2,2,0,2,1,7191.0,1.0,60.0,36,0
4,44.0,3,154930.0,9,13.0,2,3,0,4,1,0.0,1891.0,40.0,36,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16874,36.0,2,152396.0,14,10.0,2,3,0,1,1,0.0,0.0,40.0,24,0
16875,45.0,2,284146.0,9,13.0,2,3,0,1,1,0.0,0.0,40.0,6,0
16876,59.0,2,227666.0,11,9.0,2,3,0,4,1,5334.0,0.0,40.0,36,0
16877,57.0,2,187203.0,8,15.0,2,2,0,4,1,0.0,0.0,44.0,36,0


In [35]:
# Approach taken from the CTAB-GAN+ repository.
# Keep independent copies of both frames; the per-column statistics further
# down read the categorical columns from these copies.
really, fakey = df_real.copy(), df_syn.copy()

# Pairwise association (correlation) matrix of each frame, with the columns in
# `cat_cols` declared as nominal. Real data is processed first.
real_corr, syn_corr = (
    compute_associations(frame, nominal_columns=cat_cols)
    for frame in (df_real, df_syn)
)


In [36]:
# Association matrix of the real data, computed on the label-encoded frame.
real_corr

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
age,1.0,0.238632,-0.068277,0.164399,-0.022411,0.211244,0.191638,0.145113,0.049869,0.084962,0.054473,0.016824,-0.127006,0.096465,0.0
workclass,0.238632,1.0,0.027158,0.126173,0.151132,0.02426,0.465895,0.052316,0.05561,0.119456,0.113022,0.041374,0.191554,0.026637,0.0
fnlwgt,-0.068277,0.027158,1.0,0.053365,-0.002573,0.045676,0.077863,0.022306,0.118996,0.017714,0.005543,-0.00333,-0.001055,0.109944,0.0
education,0.164399,0.126173,0.053365,1.0,1.0,0.04547,0.186821,0.058713,0.053708,0.06372,0.178113,0.081554,0.10095,0.133809,0.0
education.num,-0.022411,0.151132,-0.002573,1.0,1.0,0.111599,0.584477,0.101441,0.080187,0.030948,0.106002,0.061825,0.060773,0.159457,0.0
marital.status,0.211244,0.02426,0.045676,0.04547,0.111599,1.0,0.060582,0.46822,0.027924,0.261834,0.056619,0.028579,0.064145,0.035434,0.0
occupation,0.191638,0.465895,0.077863,0.186821,0.584477,0.060582,1.0,0.13384,0.063765,0.284914,0.100181,0.065301,0.246443,0.040323,0.0
relationship,0.145113,0.052316,0.022306,0.058713,0.101441,0.46822,0.13384,1.0,0.055348,0.854583,0.055484,0.032718,0.224465,0.061708,0.0
race,0.049869,0.05561,0.118996,0.053708,0.080187,0.027924,0.063765,0.055348,1.0,0.061328,0.02689,0.015774,0.026139,0.402828,0.0
sex,0.084962,0.119456,0.017714,0.06372,0.030948,0.261834,0.284914,0.854583,0.061328,1.0,0.005609,0.015086,0.192779,0.03026,0.0


In [37]:
# Originally labelled "without labelEncoding".
# NOTE(review): on a top-to-bottom run this shows the same `real_corr` as the
# previous cell; a run without label encoding presumably requires skipping the
# encoding cell and recomputing the matrices -- confirm.
real_corr

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
age,1.0,0.238632,-0.068277,0.164399,-0.022411,0.211244,0.191638,0.145113,0.049869,0.084962,0.054473,0.016824,-0.127006,0.096465,0.0
workclass,0.238632,1.0,0.027158,0.126173,0.151132,0.02426,0.465895,0.052316,0.05561,0.119456,0.113022,0.041374,0.191554,0.026637,0.0
fnlwgt,-0.068277,0.027158,1.0,0.053365,-0.002573,0.045676,0.077863,0.022306,0.118996,0.017714,0.005543,-0.00333,-0.001055,0.109944,0.0
education,0.164399,0.126173,0.053365,1.0,1.0,0.04547,0.186821,0.058713,0.053708,0.06372,0.178113,0.081554,0.10095,0.133809,0.0
education.num,-0.022411,0.151132,-0.002573,1.0,1.0,0.111599,0.584477,0.101441,0.080187,0.030948,0.106002,0.061825,0.060773,0.159457,0.0
marital.status,0.211244,0.02426,0.045676,0.04547,0.111599,1.0,0.060582,0.46822,0.027924,0.261834,0.056619,0.028579,0.064145,0.035434,0.0
occupation,0.191638,0.465895,0.077863,0.186821,0.584477,0.060582,1.0,0.13384,0.063765,0.284914,0.100181,0.065301,0.246443,0.040323,0.0
relationship,0.145113,0.052316,0.022306,0.058713,0.101441,0.46822,0.13384,1.0,0.055348,0.854583,0.055484,0.032718,0.224465,0.061708,0.0
race,0.049869,0.05561,0.118996,0.053708,0.080187,0.027924,0.063765,0.055348,1.0,0.061328,0.02689,0.015774,0.026139,0.402828,0.0
sex,0.084962,0.119456,0.017714,0.06372,0.030948,0.261834,0.284914,0.854583,0.061328,1.0,0.005609,0.015086,0.192779,0.03026,0.0


In [38]:
# Distance between the two association matrices: the matrix norm (NumPy's
# default for 2-D input is the Frobenius norm) of their element-wise difference.
corr_diff = real_corr - syn_corr
corr_dist = np.linalg.norm(corr_diff)

In [39]:
# Show the correlation distance between the real and synthetic matrices.
corr_dist

1.1505219523361918

In [10]:
# Originally labelled "without label encoding".
# NOTE(review): this cell only re-displays the current `corr_dist`; the value
# shown depends on which cells were executed before it -- confirm on a fresh run.
corr_dist

1.150521952336192

In [75]:
# Per-column distribution distance between real and synthetic data:
#   * columns in `cat_cols` -> Jensen-Shannon distance (base 2) between the
#     relative category frequencies of `really` and `fakey`
#   * all other columns     -> Wasserstein distance between the min-max scaled
#     values of `df_real` and `df_syn` (scaler fitted on the real column only)
Stat_dict = {}  # column name -> distance
cat_stat = []   # JSD values of the categorical columns, in column order
num_stat = []   # WD values of the remaining columns, in column order

for column in df_real.columns:

    if column in cat_cols:
        real_pdf = really[column].value_counts(normalize=True)
        fake_pdf = fakey[column].value_counts(normalize=True)

        # Align both distributions on the union of observed categories and give
        # a category that is absent on one side probability 0. This covers
        # categories missing from the synthetic data as well as categories that
        # occur only in the synthetic data (previously a KeyError), even when
        # both sides happen to have the same number of categories.
        categories = real_pdf.index.union(fake_pdf.index)
        real_pdf_values = real_pdf.reindex(categories, fill_value=0.0).to_numpy()
        fake_pdf_values = fake_pdf.reindex(categories, fill_value=0.0).to_numpy()

        Stat_dict[column] = distance.jensenshannon(real_pdf_values, fake_pdf_values, 2.0)
        cat_stat.append(Stat_dict[column])
        print("column: ", column, "JSD: ", Stat_dict[column])
    else:
        real_values = df_real[column].values.reshape(-1, 1)
        syn_values = df_syn[column].values.reshape(-1, 1)

        # Scale with the range of the real data only, so synthetic values
        # outside that range fall outside [0, 1] instead of being hidden.
        scaler = MinMaxScaler()
        scaler.fit(real_values)
        l1 = scaler.transform(real_values).flatten()
        l2 = scaler.transform(syn_values).flatten()

        Stat_dict[column] = wasserstein_distance(l1, l2)
        print("column: ", column, "WD: ", Stat_dict[column])
        num_stat.append(Stat_dict[column])

column:  age WD:  0.0034392463923820717
workclass
column:  workclass JSD:  0.03122529547046864
column:  fnlwgt WD:  0.0008187794132309643
education
column:  education JSD:  0.03820892557659078
column:  education.num WD:  0.004402104551438246
marital.status
column:  marital.status JSD:  0.022092910072399337
occupation
column:  occupation JSD:  0.04724614053342541
relationship
column:  relationship JSD:  0.009482097279248458
race
column:  race JSD:  0.02237077981301528
sex
column:  sex JSD:  0.0064724424880911735
column:  capital.gain WD:  0.004125651937904421
column:  capital.loss WD:  0.00208954179202446
column:  hours.per.week WD:  0.005029731035585274
native.country
column:  native.country JSD:  0.05790220350823997
income
column:  income JSD:  0.009561174950196984


In [76]:
# Average WD over the non-categorical columns, average JSD over the categorical
# columns, and the correlation distance.
avg_wd, avg_jsd = np.mean(num_stat), np.mean(cat_stat)
print(avg_wd, avg_jsd, corr_dist)

0.0033175091870942396 0.027173552187964006 0.411851167316875


In [77]:
# One-row summary table of the three statistical-similarity scores.
# NOTE(review): the first label lacks its closing parenthesis; it is kept
# unchanged here in case other code selects the column by this exact name.
stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]

# A list of rows (currently a single one) that is averaged column-wise.
stat_res_avg = [[np.mean(num_stat), np.mean(cat_stat), corr_dist]]
summary_row = np.array(stat_res_avg).mean(axis=0).reshape(1, 3)

stat_results = pd.DataFrame(summary_row, columns=stat_columns)
stat_results

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.003318,0.027174,0.411851


### Interpretation

-> lower values indicates a better result

## alpha-Precision & beta-Recall

- alpha-Precision: evaluates the fidelity of synthetic data -> whether each synthetic example comes from the real-data distribution
- beta-Recall: evaluates the coverge of the synthetic data -> whether the synthetic data can cover the entire distribution of the real data (in other words: whether a reald data sample is close to the syhnthetic data)

-> higher values indicate superior performance


Code from TabSyn

In [1]:
import json
import pandas as pd
import numpy as np
from synthcity.metrics import eval_detection, eval_performance, eval_statistical
from synthcity.plugins.core.dataloader import GenericDataLoader
from sklearn.preprocessing import OneHotEncoder
import os

  from .autonotebook import tqdm as notebook_tqdm


                  variable OMP_PATH to the location of the header before importing keopscore or pykeops,
                  e.g. using os.environ: import os; os.environ['OMP_PATH'] = '/path/to/omp/header'


In [2]:
# Real and synthetic CSV files located under the TabSyn model directory.
real_path = "../sdg-models/tabsyn/synthetic/adult/real.csv"
syn_path = "../sdg-models/tabsyn/synthetic/adult/tabsyn.csv"

# Read the real frame first, then the synthetic one.
real_data, syn_data = (pd.read_csv(csv_path) for csv_path in (real_path, syn_path))

# Dataset info JSON; the next cell reads its `num_col_idx`, `cat_col_idx` and
# `target_col_idx` entries.
with open("../sdg-models/tabsyn/data/info/adult.json", "r") as info_file:
    info = json.load(info_file)

In [3]:
dataname = "adult"
model = "tabsyn"

# Replace the column labels by their positions so the index lists from the
# info JSON can be used directly for column selection.
real_data.columns = range(len(real_data.columns))
syn_data.columns = range(len(syn_data.columns))

num_col_idx = info['num_col_idx']
target_col_idx = info['target_col_idx']
# The target column(s) are handled together with the categorical columns.
# Build a NEW list here: `cat_col_idx += target_col_idx` would extend
# info['cat_col_idx'] in place and append the target index again on every
# re-run of this cell.
cat_col_idx = list(info['cat_col_idx']) + list(target_col_idx)

num_real_data = real_data[num_col_idx]
cat_real_data = real_data[cat_col_idx]

num_real_data_np = num_real_data.to_numpy()
cat_real_data_np = cat_real_data.to_numpy().astype('str')

num_syn_data = syn_data[num_col_idx]
# Explicit copy: some branches below clip values of this frame through `.loc`.
cat_syn_data = syn_data[cat_col_idx].copy()

num_syn_data_np = num_syn_data.to_numpy()

# Default representation: every categorical value cast to a string. The
# branches below only override it for specific model/dataset combinations.
cat_syn_data_np = cat_syn_data.to_numpy().astype('str')

if (dataname == 'default' or dataname == 'news') and model[:4] == 'codi':
    # Cast to int before the string conversion.
    cat_syn_data_np = cat_syn_data.astype('int').to_numpy().astype('str')

elif model[:5] == 'great':
    if dataname == 'shoppers':
        # Columns 11-13 go to positions 1-3 of the string array as integers.
        for pos, col in ((1, 11), (2, 12), (3, 13)):
            cat_syn_data_np[:, pos] = cat_syn_data[col].astype('int').to_numpy().astype('str')

        # Cap column 14 at the largest value of the real data, then store it
        # at position 4 (the original performed this assignment twice).
        max_data = cat_real_data[14].max()
        cat_syn_data.loc[cat_syn_data[14] > max_data, 14] = max_data
        cat_syn_data_np[:, 4] = cat_syn_data[14].astype('int').to_numpy().astype('str')

    elif dataname in ['default', 'faults', 'beijing']:

        columns = cat_real_data.columns
        for i, col in enumerate(columns):
            if (cat_real_data[col].dtype == 'int'):

                # Clip synthetic values to the value range of the real column.
                max_data = cat_real_data[col].max()
                min_data = cat_real_data[col].min()

                cat_syn_data.loc[cat_syn_data[col] > max_data, col] = max_data
                cat_syn_data.loc[cat_syn_data[col] < min_data, col] = min_data

                cat_syn_data_np[:, i] = cat_syn_data[col].astype('int').to_numpy().astype('str')

# Every other model/dataset combination keeps the default string cast above.

# One-hot encode the categorical part. The encoder learns its categories from
# the real data only and is then applied to both the real and synthetic arrays.
encoder = OneHotEncoder().fit(cat_real_data_np)

cat_real_data_oh, cat_syn_data_oh = (
    encoder.transform(cat_block).toarray()
    for cat_block in (cat_real_data_np, cat_syn_data_np)
)

# Float frames for the evaluation: numeric part, one-hot part, and both
# concatenated column-wise (numeric columns first).
le_real_num = pd.DataFrame(num_real_data_np).astype(float)
le_real_cat = pd.DataFrame(cat_real_data_oh).astype(float)
le_real_data = pd.DataFrame(
    np.concatenate([num_real_data_np, cat_real_data_oh], axis=1)
).astype(float)

le_syn_num = pd.DataFrame(num_syn_data_np).astype(float)
le_syn_cat = pd.DataFrame(cat_syn_data_oh).astype(float)
le_syn_data = pd.DataFrame(
    np.concatenate([num_syn_data_np, cat_syn_data_oh], axis=1)
).astype(float)

np.set_printoptions(precision=4)

result = []

print('=========== All Features ===========')
print('Data shape: ', le_syn_data.shape)

# Wrap both frames in synthcity data loaders (synthetic first, then real).
X_syn_loader = GenericDataLoader(le_syn_data)
X_real_loader = GenericDataLoader(le_real_data)

quality_evaluator = eval_statistical.AlphaPrecision()
all_metrics = quality_evaluator.evaluate(X_real_loader, X_syn_loader)

# Keep only the entries of the naive AlphaPrecision implementation.
qual_res = {name: value for name, value in all_metrics.items() if "naive" in name}
qual_score = np.mean([*qual_res.values()])

Alpha_Precision_all = qual_res['delta_precision_alpha_naive']
Beta_Recall_all = qual_res['delta_coverage_beta_naive']

print('alpha precision: {:.6f}, beta recall: {:.6f}'.format(Alpha_Precision_all, Beta_Recall_all))

: 