In [146]:
import sys
import pandas as pd

from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.metadata import Metadata

In [151]:
sdg_list = ["smote", "ctgan", "tabsyn", "ctab-gan-plus", "vae-bgm", "tvae-all", "tvae-top-2"]
dataset_list = ["adult", "yeast", "cc-fraud-1", "cc-fraud-5"]

# Target column and minority-class label used to filter each synthetic data set.
minority_spec = {
    "adult": ("income", ">50K"),
    "yeast": ("localization.site", "ME2"),
    "cc-fraud-1": ("Class", "fraud"),
    "cc-fraud-5": ("Class", "fraud"),
}

# For every generator / data set pair, count the synthetic minority rows that
# contain a category unseen in the real data or a numeric value outside the
# real data's [min, max] range.
for sdg in sdg_list:
    for dataset in dataset_list:
        # no data for yeast with ctab-gan-plus model
        if sdg == "ctab-gan-plus" and dataset == "yeast":
            continue

        # read dataset; both cc-fraud variants are compared against the same real file
        syn_path = f"../data/synthetic/{dataset}/{sdg}.csv"
        if dataset in ["cc-fraud-1", "cc-fraud-5"]:
            real_path = "../data/processed/cc-fraud/train_min.csv"
        else:
            real_path = f"../data/processed/{dataset}/train_min.csv"

        df_syn = pd.read_csv(syn_path)
        df_real = pd.read_csv(real_path)

        # filter df_syn for minority class
        target, minority_class = minority_spec[dataset]
        df_syn = df_syn[df_syn[target] == minority_class]

        # get categorical columns
        categorical_columns = df_real.select_dtypes(include=['object', 'category']).columns

        # get numerical columns
        numerical_columns = df_real.select_dtypes(include=['number']).columns.to_list()

        # no education.num column for vae-bgm model
        if sdg == "vae-bgm" and dataset == "adult":
            numerical_columns.remove("education.num")

        # One flag per synthetic row. OR-ing the per-column checks counts every
        # offending row exactly once, even when several columns are invalid or
        # when two different rows happen to hold identical values
        # (drop_duplicates() would merge the latter and undercount).
        invalid_mask = pd.Series(False, index=df_syn.index)

        # categorical columns: value never seen in the real data
        for column in categorical_columns:
            original_values = set(df_real[column].unique())
            invalid_mask = invalid_mask | ~df_syn[column].isin(original_values)

        # numerical columns: value outside the real data's min-max range
        for col in numerical_columns:
            min_val = df_real[col].min()
            max_val = df_real[col].max()
            invalid_mask = invalid_mask | (df_syn[col] < min_val) | (df_syn[col] > max_val)

        # kept for interactive inspection of the offending rows
        mismatched_rows = df_syn[invalid_mask]
        num_mismatches = int(invalid_mask.sum())

        if num_mismatches != 0:
            invalid_ratio = num_mismatches * 100 / len(df_syn)
            print(f"{sdg} {dataset}: {num_mismatches} from {len(df_syn)} generated data | {round(invalid_ratio, 2)}")
        else:
            print(f"{sdg} {dataset}: {num_mismatches} from {len(df_syn)} generated data")

smote adult: 0 from 16879 generated data
smote yeast: 0 from 329 generated data
smote cc-fraud-1: 0 from 38612 generated data
smote cc-fraud-5: 0 from 7092 generated data
ctgan adult: 0 from 16879 generated data
ctgan yeast: 0 from 329 generated data
ctgan cc-fraud-1: 0 from 38612 generated data
ctgan cc-fraud-5: 0 from 7092 generated data
tabsyn adult: 22 from 16819 generated data | 0.13
tabsyn yeast: 61 from 345 generated data | 17.68
tabsyn cc-fraud-1: 1211 from 36762 generated data | 3.29
tabsyn cc-fraud-5: 208 from 6758 generated data | 3.08
ctab-gan-plus adult: 40 from 17170 generated data | 0.23
ctab-gan-plus cc-fraud-1: 2590 from 43353 generated data | 5.97
ctab-gan-plus cc-fraud-5: 470 from 8079 generated data | 5.82
vae-bgm adult: 10318 from 17244 generated data | 59.84
vae-bgm yeast: 57 from 314 generated data | 18.15
vae-bgm cc-fraud-1: 1094 from 46397 generated data | 2.36
vae-bgm cc-fraud-5: 1539 from 8238 generated data | 18.68
tvae-all adult: 2996 from 16879 generated d

## Check individual models and datasets

In [79]:
# Select one generator / data set pair and load its real and synthetic tables.
SDG, DATASET = "tabsyn", "adult"

df_real = pd.read_csv(f"../data/processed/{DATASET}/train_min.csv")
df_syn = pd.read_csv(f"../data/synthetic/{DATASET}/{SDG}.csv")

In [80]:

# Let SDV infer the table metadata from the real data, then run its
# diagnostic report on the synthetic data.
metadata = Metadata.detect_from_dataframe(table_name=DATASET, data=df_real)
diagnostic_report = run_diagnostic(
    metadata=metadata,
    real_data=df_real,
    synthetic_data=df_syn,
    verbose=True,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 441.35it/s]|
Data Validity Score: 96.55%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 545.57it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 98.27%



In [136]:
def invalid_values(sdg, dataset):
    """Print, per column, which synthetic minority-class values fall outside the real data.

    Reads ``../data/synthetic/{dataset}/{sdg}.csv`` and the matching real
    ``train_min.csv``, keeps only the synthetic rows of the minority class, and
    then reports:

    * for every categorical column of the real data, the categories that occur
      only in the synthetic data;
    * for every numerical column of the real data, the synthetic values below
      the real minimum or above the real maximum.

    Args:
        sdg: Name of the synthetic data generator (file stem of the CSV).
        dataset: One of "adult", "yeast", "cc-fraud-1", "cc-fraud-5".

    Returns:
        None. All findings are printed.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    # Target column and minority-class label per supported data set.
    minority_spec = {
        "adult": ("income", ">50K"),
        "yeast": ("localization.site", "ME2"),
        "cc-fraud-1": ("Class", "fraud"),
        "cc-fraud-5": ("Class", "fraud"),
    }
    # Fail early with a clear message; otherwise an unsupported name would leave
    # `target` / `minority_class` unbound after the files were already read.
    if dataset not in minority_spec:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of {sorted(minority_spec)}"
        )
    target, minority_class = minority_spec[dataset]

    # Both cc-fraud variants are compared against the same real file.
    syn_path = f"../data/synthetic/{dataset}/{sdg}.csv"
    if dataset in ["cc-fraud-1", "cc-fraud-5"]:
        real_path = "../data/processed/cc-fraud/train_min.csv"
    else:
        real_path = f"../data/processed/{dataset}/train_min.csv"

    df_syn = pd.read_csv(syn_path)
    df_real = pd.read_csv(real_path)

    # filter df_syn for minority class
    df_syn = df_syn[df_syn[target] == minority_class]

    # get categorical columns
    categorical_columns = df_real.select_dtypes(include=['object', 'category']).columns
    # get numerical columns
    numerical_columns = df_real.select_dtypes(include=['number']).columns.to_list()
    # education.num is excluded from the range check for vae-bgm on adult
    if sdg == "vae-bgm" and dataset == "adult":
        numerical_columns.remove("education.num")

    for column in categorical_columns:
        original_values = set(df_real[column].unique())
        synthetic_values = set(df_syn[column].unique())

        # values that occur only in the synthetic data set
        new_values = synthetic_values - original_values
        if new_values:
            print(f"In Spalte '{column}' gibt es folgende neue Werte im synthetischen Datensatz: {new_values}")
        else:
            print(f"Keine neuen Werte in Spalte '{column}'")

    for col in numerical_columns:
        min_val = df_real[col].min()
        max_val = df_real[col].max()

        # synthetic values outside the real data's min-max range
        out_of_range = df_syn[
            (df_syn[col] < min_val) | (df_syn[col] > max_val)
        ][col]

        if not out_of_range.empty:
            print(f"In Spalte '{col}' gibt es folgende Werte außerhalb des Min-Max-Bereichs ({min_val}, {max_val}): {len(out_of_range)}")
            print(out_of_range.tolist())
        else:
            print(f"Alle Werte in Spalte '{col}' liegen im Min-Max-Bereich.")

    print(f"Amount synthetic data: {len(df_syn)}")

In [145]:
# Per-column report for the tabsyn samples of the cc-fraud-1 data set.
invalid_values(sdg="tabsyn", dataset="cc-fraud-1")

Keine neuen Werte in Spalte 'Class'
In Spalte 'V1' gibt es folgende Werte außerhalb des Min-Max-Bereichs (-30.552380043581, 2.13238602134104): 25
[2.1428978, 2.1825771, 2.2955782, 2.2625062, 2.1658733, 2.216045, 2.2171648, 2.243649, 2.2535584, 2.2558942, 2.2017796, 2.2566469, 2.1744096, 2.2993112, 2.264163, 2.229014, 2.1935012, 2.319629, 2.1559467, 2.1548102, 2.196531, 2.1785328, 2.191529, 2.1326075, 2.2036765]
Alle Werte in Spalte 'V2' liegen im Min-Max-Bereich.
In Spalte 'V3' gibt es folgende Werte außerhalb des Min-Max-Bereichs (-31.1036848245812, 2.25020963478583): 39
[2.3513207, 2.9592009, 2.8882103, 2.3742452, 2.6884816, 2.593805, 2.9256122, 2.3448648, 2.8834937, 2.5743427, 2.826286, 2.41137, 2.628173, 3.0667958, 2.3721397, 2.8026743, 2.4944565, 2.5475204, 2.7773263, 2.5754876, 2.748709, 2.6293192, 2.873497, 2.5154173, 2.7687445, 2.421261, 2.276785, 2.6143773, 2.7304811, 2.2632518, 2.4913046, 2.9618025, 2.296538, 2.3330657, 2.8881545, 3.0181773, 2.4455729, 2.981502, 2.5103228]
In

In [129]:
# Load the adult data for ctab-gan-plus and keep only the synthetic rows
# labelled with the minority class.
SDG, DATASET = "ctab-gan-plus", "adult"
target, minority_class = "income", ">50K"

df_real = pd.read_csv(f"../data/processed/{DATASET}/train_min.csv")
df_syn = pd.read_csv(f"../data/synthetic/{DATASET}/{SDG}.csv").loc[
    lambda frame: frame[target] == minority_class
]

In [130]:
# Distinct `education` values in the real data and in the filtered synthetic data.
original_values, synthetic_values = (
    set(frame["education"].unique()) for frame in (df_real, df_syn)
)

In [131]:
# Synthetic rows whose `education` value is "Preschool".
df_syn.loc[df_syn["education"].eq("Preschool")]

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
2968,41,Private,90762,Preschool,7,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
4135,60,Private,121081,Preschool,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,>50K
8136,46,Private,356756,Preschool,4,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,50,United-States,>50K
9613,49,Private,98857,Preschool,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,99075,0,67,United-States,>50K
10128,39,Private,112677,Preschool,4,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,,>50K
10264,48,Private,91192,Preschool,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,59,United-States,>50K
12404,56,Private,158127,Preschool,4,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
12522,36,Private,126283,Preschool,12,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1892,60,United-States,>50K
13859,50,Private,132717,Preschool,4,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,40,United-States,>50K
20404,48,Private,187052,Preschool,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,58,United-States,>50K


In [124]:
# Unique `education` values collected from df_real.
original_values

{'10th',
 '11th',
 '12th',
 '1st-4th',
 '5th-6th',
 '7th-8th',
 '9th',
 'Assoc-acdm',
 'Assoc-voc',
 'Bachelors',
 'Doctorate',
 'HS-grad',
 'Masters',
 'Prof-school',
 'Some-college'}

In [125]:
# Unique `education` values collected from the filtered df_syn.
synthetic_values

{'10th',
 '11th',
 '12th',
 '1st-4th',
 '5th-6th',
 '7th-8th',
 '9th',
 'Assoc-acdm',
 'Assoc-voc',
 'Bachelors',
 'Doctorate',
 'HS-grad',
 'Masters',
 'Preschool',
 'Prof-school',
 'Some-college'}

In [126]:
# Categories present in the real data but absent from the synthetic data.
original_values.difference(synthetic_values)

set()

In [127]:
# Categories present in the synthetic data but absent from the real data.
synthetic_values.difference(original_values)

{'Preschool'}