In [1]:
import pandas as pd
import random
import os
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.metrics.eval_sanity import CommonRowsProportion
from synthcity.metrics.eval_sanity import NearestSyntheticNeighborDistance
from synthcity.metrics.eval_statistical import ChiSquaredTest

# Load the real training split and fit the categorical encoders on it.
# `encode()` is expected to return (encoded loader, fitted encoders); the
# encoders are kept so other splits can be mapped onto the same integer codes.
real_train = pd.read_csv("Raw Data/real_train.csv")
data_loader_real_train = GenericDataLoader(real_train)
data_loader_real_train_encoded, real_train_encoders = data_loader_real_train.encode()

# Load the real holdout split and encode it with the *training* encoders.
# Encoding it on its own would fit fresh encoders, so a category that is
# missing from (or only present in) the holdout would shift the integer codes
# and make row-equality / distance metrics compare mismatched values.
# NOTE(review): a category unseen in the training split is expected to raise
# here instead of being silently mis-coded -- confirm this is acceptable.
real_holdout = pd.read_csv("Raw Data/real_holdout.csv")
data_loader_real_holdout = GenericDataLoader(real_holdout)
data_loader_real_holdout_encoded = data_loader_real_holdout.encode(real_train_encoders)[0]

    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    


In [14]:
# Sanity check: print the encoded holdout split to eyeball the encoded values.
print(data_loader_real_holdout_encoded)

     Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0     40    1              1        140          289          0           1   
1     37    1              1        130          283          0           2   
2     54    1              1        110          208          0           1   
3     43    0              1        150          186          0           1   
4     41    0              1        110          250          0           2   
..   ...  ...            ...        ...          ...        ...         ...   
179   49    1              1        130          266          0           1   
180   67    1              0        160          286          0           0   
181   35    1              1        122          192          0           1   
182   67    1              2        152          212          0           0   
183   63    1              0        140          187          0           0   

     MaxHR  ExerciseAngina  Oldpeak  ST_Slope  Hear

In [2]:
# Baseline: CommonRowsProportion between the encoded real train split and the
# encoded real holdout split (real-vs-real reference for the synthetic scores).
metric_ims_holdout = CommonRowsProportion()
ims_holdout = metric_ims_holdout.evaluate(
    data_loader_real_train_encoded,
    data_loader_real_holdout_encoded,
)
print(ims_holdout)

{'score': 0.0}


In [8]:
# Baseline: NearestSyntheticNeighborDistance between the encoded real train
# split and the encoded real holdout split (real-vs-real reference value).
metric_dcr_holdout = NearestSyntheticNeighborDistance()
dcr_holdout = metric_dcr_holdout.evaluate(
    data_loader_real_train_encoded,
    data_loader_real_holdout_encoded,
)
# Printed once; the former second "DCR raw output:" line repeated the same dict.
print(dcr_holdout)


{'mean': 0.12695340937910018}
DCR raw output: {'mean': 0.12695340937910018}


In [10]:
# Generators to evaluate and the number of synthetic datasets per generator.
methods = ["synthpop", "arf", "privbayes", "ctgan", "tvae", "tabsyn"]
num_datasets = 5

# Encoders fitted on the real training split. Every synthetic dataset is
# encoded with these so that categorical values map to the same integer codes
# as in `data_loader_real_train_encoded`; fitting fresh encoders per file would
# shift the codes whenever a synthetic file lacks (or adds) a category.
# NOTE(review): a synthetic category unseen in the training split is expected
# to raise during encoding rather than be silently mis-coded -- confirm.
train_encoders = data_loader_real_train.encode()[1]

# The metric objects do not depend on the dataset, so build them once.
metric_ims = CommonRowsProportion()
metric_dcr = NearestSyntheticNeighborDistance()

# Store results: method name -> list of per-dataset score dicts
results = {}

for method in methods:
    method_results = []

    for i in range(1, num_datasets + 1):
        syn_path = f"Data/train_data/{method}/syn_real_train_{method}_{i}.csv"

        # Missing files are reported and skipped; the method then simply has
        # fewer entries in `results`.
        if not os.path.exists(syn_path):
            print(f"File not found: {syn_path}")
            continue

        syn_df = pd.read_csv(syn_path)
        data_loader_syn = GenericDataLoader(syn_df)
        data_loader_syn_encoded = data_loader_syn.encode(train_encoders)[0]

        # Compute CommonRowsProportion (reported as IMS)
        ims_score = metric_ims.evaluate(data_loader_real_train_encoded, data_loader_syn_encoded)

        # Compute NearestSyntheticNeighborDistance (reported as DCR)
        dcr_score = metric_dcr.evaluate(data_loader_real_train_encoded, data_loader_syn_encoded)

        # Append results
        method_results.append({
            "dataset_id": i,
            "ims_score": ims_score["score"],
            "dcr_score": dcr_score["mean"],
        })

        print(f"{method} - Dataset {i} => IMS: {ims_score['score']:.4f}, DCR: {dcr_score['mean']:.4f}")

    results[method] = method_results

# Flatten to one row per (method, dataset) for easier analysis
results_df = pd.DataFrame([
    {"method": method, **entry}
    for method, entries in results.items()
    for entry in entries
])


synthpop - Dataset 1 => IMS: 0.0041, DCR: 0.1023
synthpop - Dataset 2 => IMS: 0.0027, DCR: 0.1060
synthpop - Dataset 3 => IMS: 0.0000, DCR: 0.1002
synthpop - Dataset 4 => IMS: 0.0000, DCR: 0.1628
synthpop - Dataset 5 => IMS: 0.0027, DCR: 0.1036
arf - Dataset 1 => IMS: 0.0000, DCR: 0.1127
arf - Dataset 2 => IMS: 0.0000, DCR: 0.1025
arf - Dataset 3 => IMS: 0.0000, DCR: 0.1085
arf - Dataset 4 => IMS: 0.0000, DCR: 0.1406
arf - Dataset 5 => IMS: 0.0000, DCR: 0.1851
privbayes - Dataset 1 => IMS: 0.0000, DCR: 0.0993
privbayes - Dataset 2 => IMS: 0.0000, DCR: 0.1292
privbayes - Dataset 3 => IMS: 0.0000, DCR: 0.1285
privbayes - Dataset 4 => IMS: 0.0000, DCR: 0.1031
privbayes - Dataset 5 => IMS: 0.0000, DCR: 0.1045
ctgan - Dataset 1 => IMS: 0.0000, DCR: 0.0752
ctgan - Dataset 2 => IMS: 0.0000, DCR: 0.1202
ctgan - Dataset 3 => IMS: 0.0000, DCR: 0.0847
ctgan - Dataset 4 => IMS: 0.0000, DCR: 0.1593
ctgan - Dataset 5 => IMS: 0.0000, DCR: 0.1237
tvae - Dataset 1 => IMS: 0.0000, DCR: 0.0536
tvae - Dat

In [11]:
# Show the per-dataset scores (method, dataset_id, ims_score, dcr_score).
print(results_df)

       method  dataset_id  ims_score  dcr_score
0    synthpop           1   0.004087   0.102337
1    synthpop           2   0.002725   0.106001
2    synthpop           3   0.000000   0.100250
3    synthpop           4   0.000000   0.162765
4    synthpop           5   0.002725   0.103626
5         arf           1   0.000000   0.112679
6         arf           2   0.000000   0.102508
7         arf           3   0.000000   0.108523
8         arf           4   0.000000   0.140644
9         arf           5   0.000000   0.185055
10  privbayes           1   0.000000   0.099260
11  privbayes           2   0.000000   0.129176
12  privbayes           3   0.000000   0.128507
13  privbayes           4   0.000000   0.103081
14  privbayes           5   0.000000   0.104507
15      ctgan           1   0.000000   0.075188
16      ctgan           2   0.000000   0.120171
17      ctgan           3   0.000000   0.084695
18      ctgan           4   0.000000   0.159319
19      ctgan           5   0.000000   0

In [12]:
# Step 1: flatten the nested `results` mapping into one record per
# (method, dataset) pair and build the per-dataset table from it.
flat_records = []
for method_name, method_entries in results.items():
    for record in method_entries:
        flat_records.append({"method": method_name, **record})
results_df = pd.DataFrame(flat_records)

# Persist the dataset-level scores.
results_df.to_csv("sanity_metric_results_per_dataset.csv", index=False)
print("Saved: sanity_metric_results_per_dataset.csv")

# Step 2: mean of each score column within every method.
score_columns = ["ims_score", "dcr_score"]
scores_by_method = results_df.groupby("method")[score_columns]
average_scores_df = scores_by_method.mean().reset_index()

# Persist the per-method averages.
average_scores_df.to_csv("sanity_metric_results_averaged.csv", index=False)
print("Saved: sanity_metric_results_averaged.csv")

Saved: sanity_metric_results_per_dataset.csv
Saved: sanity_metric_results_averaged.csv
