In [8]:
import pandas as pd
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.metadata import Metadata

In [9]:
# Experiment selectors. DATASET picks the data sub-directory; SDG names the
# synthetic CSV to load (presumably the generator that produced it — confirm).
DATASET = "adult"
SDG = "ctgan"

# Relative locations of the real and the synthetic CSV files.
real_path = f"../data/processed/{DATASET}/train_min.csv"
syn_path = f"../data/synthetic/{DATASET}/{SDG}.csv"

# Read both tables (real first, then synthetic) into DataFrames.
df_real, df_syn = (pd.read_csv(csv_path) for csv_path in (real_path, syn_path))

# Metadata is auto-detected from the real table only and registered under the
# dataset name; the report cells below pass it to the SDV evaluation functions.
metadata = Metadata.detect_from_dataframe(data=df_real, table_name=DATASET)

In [14]:
# Run the SDV diagnostic report for the synthetic table against the real one.
diagnostic_report = run_diagnostic(real_data=df_real, synthetic_data=df_syn, metadata=metadata)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 432.25it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 348.94it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [17]:
# Detailed breakdown of the "Data Validity" property of the diagnostic report.
data_validity_details = diagnostic_report.get_details(property_name="Data Validity")
data_validity_details

Unnamed: 0,Column,Metric,Score
0,age,BoundaryAdherence,1.0
1,workclass,CategoryAdherence,1.0
2,fnlwgt,BoundaryAdherence,1.0
3,education,CategoryAdherence,1.0
4,education.num,BoundaryAdherence,1.0
5,marital.status,CategoryAdherence,1.0
6,occupation,CategoryAdherence,1.0
7,relationship,CategoryAdherence,1.0
8,race,CategoryAdherence,1.0
9,sex,CategoryAdherence,1.0


In [29]:
# Collapse each diagnostic property into a single number: the mean of the
# "Score" column of its details table.
data_validity_score, data_structure_score = (
    diagnostic_report.get_details(property_name=property_name)["Score"].mean()
    for property_name in ("Data Validity", "Data Structure")
)

# One score per line: validity first, then structure.
print(data_validity_score, data_structure_score, sep="\n")

1.0
1.0


In [33]:
# Build the SDV quality report for the synthetic table against the real one,
# with progress output enabled.
quality_report = evaluate_quality(
    real_data=df_real, synthetic_data=df_syn, metadata=metadata, verbose=True
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 118.52it/s]|
Column Shapes Score: 87.38%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:00<00:00, 245.57it/s]|
Column Pair Trends Score: 79.09%

Overall Score (Average): 83.23%



In [34]:
# Detailed breakdown of the "Column Shapes" property of the quality report.
column_shapes_details = quality_report.get_details(property_name="Column Shapes")
column_shapes_details

Unnamed: 0,Column,Metric,Score
0,age,KSComplement,0.967478
1,workclass,TVComplement,0.831915
2,fnlwgt,KSComplement,0.884837
3,education,TVComplement,0.899332
4,education.num,KSComplement,0.963695
5,marital.status,TVComplement,0.904521
6,occupation,TVComplement,0.90764
7,relationship,TVComplement,0.858793
8,race,TVComplement,0.83527
9,sex,TVComplement,0.930208


In [35]:
# Detailed breakdown of the "Column Pair Trends" property of the quality report.
column_pair_trends_details = quality_report.get_details(property_name="Column Pair Trends")
column_pair_trends_details

Unnamed: 0,Column 1,Column 2,Metric,Score,Real Correlation,Synthetic Correlation
0,age,workclass,ContingencySimilarity,0.812003,,
1,age,fnlwgt,CorrelationSimilarity,0.998724,-0.068277,-0.070830
2,age,education,ContingencySimilarity,0.848132,,
3,age,education.num,CorrelationSimilarity,0.986516,-0.022411,0.004557
4,age,marital.status,ContingencySimilarity,0.873540,,
...,...,...,...,...,...,...
100,capital.loss,native.country,ContingencySimilarity,0.818338,,
101,capital.loss,income,ContingencySimilarity,0.904190,,
102,hours.per.week,native.country,ContingencySimilarity,0.576648,,
103,hours.per.week,income,ContingencySimilarity,0.606506,,


In [36]:
# Collapse each quality property into a single number: the mean of the
# "Score" column of its details table.
column_shape_score, column_pair_trends_score = (
    quality_report.get_details(property_name=property_name)["Score"].mean()
    for property_name in ("Column Shapes", "Column Pair Trends")
)

# One score per line: column shapes first, then column pair trends.
print(column_shape_score, column_pair_trends_score, sep="\n")

0.873755961094417
0.7908911526502476
