# 1. Datengrundlage
## 1.1 Import

In [2]:
import pandas as pd
from pathlib import Path
from datetime import datetime

current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
data_path = Path("../../data/")

file_real = data_path / "20250301_data_20250421_133906_final_10_train.csv"
df_real = pd.read_csv(file_real, low_memory=False) # Use low_memory=False to prevent dtype issues with large files
path_real = Path(file_real)
# print(df_real.head())

file_synth = data_path / "20250301_data_20250421_133906_final_synth_10_epochs.csv"
df_synth = pd.read_csv(file_synth, low_memory=False) # Use low_memory=False to prevent dtype issues with large files
path_synth = Path(file_synth)
# print(df_synth.head())

file_holdout = data_path / "20250301_data_20250421_133906_final_10_holdout.csv"
df_holdout = pd.read_csv(file_holdout, low_memory=False) # Use low_memory=False to prevent dtype issues with large files
path_holdout = Path(file_holdout)


## 1.2 Col Description & Target

In [3]:
target_col = 'icu_admission_24h'

numerical_cols = ['age', 'temperature', 'heart_rate', 'respiratory_rate', 'oxygen_saturation', 'systolic_bp', 'diastolic_bp']
categorical_cols = ['icu_admission_24h', 'gender', 'ethnicity', 'consciousness_level','news_score', 'night_arrival', 'weekend_arrival','chief_complaint', 'icd_block']

In [4]:
use_subset = True
subset_size = 1000

In [5]:
if use_subset:
    df_real     = df_real.sample(n=subset_size, random_state=42)
    df_synth    = df_synth.sample(n=subset_size, random_state=42)
    df_holdout  = df_holdout.sample(n=subset_size, random_state=42)

# NNDR

In [6]:
from syntheval import SynthEval

# S = SynthEval(df_real_small, holdout_dataframe=df_holdout_small, cat_cols=categorical_columns)
S = SynthEval(df_real, holdout_dataframe=df_holdout, cat_cols=categorical_cols)
S.evaluate(df_synth, analysis_target_var="icu_admission_24h",  nndr={})

SynthEval: synthetic data read successfully


Syntheval: nndr: 100%|██████████| 1/1 [00:00<00:00,  9.71it/s]


SynthEval results

Privacy metric description                    value   error                                 
+---------------------------------------------------------------+
| Nearest neighbour distance ratio         :   0.8991  0.0034   |
| Privacy loss (diff. in NNDR)             :   -0.0039  0.0048   |
+---------------------------------------------------------------+
    





Unnamed: 0,metric,dim,val,err,n_val,n_err
0,avg_nndr,p,0.899082,0.00335,0.899082,0.00335
1,priv_loss_nndr,p,-0.003892,0.004829,0.996108,0.00335


# NNAA

In [7]:
from syntheval import SynthEval

# S = SynthEval(df_real_small, holdout_dataframe=df_holdout_small, cat_cols=categorical_columns)
S = SynthEval(df_real, holdout_dataframe=df_holdout, cat_cols=categorical_cols)
S.evaluate(df_synth, analysis_target_var="icu_admission_24h",  nnaa={})

SynthEval: synthetic data read successfully


Syntheval: nnaa: 100%|██████████| 1/1 [00:00<00:00,  2.43it/s]


SynthEval results

Utility metric description                    value   error                                 
+---------------------------------------------------------------+
| Nearest neighbour adversarial accuracy   :   0.6605  0.0000   |
+---------------------------------------------------------------+
    
Privacy metric description                    value   error                                 
+---------------------------------------------------------------+
| Privacy loss (diff. in NNAA)             :   -0.0030  0.0000   |
+---------------------------------------------------------------+
    





Unnamed: 0,metric,dim,val,err,n_val,n_err
0,nnaa,u,0.6605,0.0,0.3395,0.0
1,priv_loss_nnaa,p,-0.003,0.0,0.997,0.0


# DCR

In [11]:
from syntheval import SynthEval

# S = SynthEval(df_real_small, holdout_dataframe=df_holdout_small, cat_cols=categorical_columns)
S = SynthEval(df_real, holdout_dataframe=df_holdout, cat_cols=categorical_cols)
S.evaluate(df_synth, dcr={})



SynthEval: synthetic data read successfully


Syntheval: dcr: 100%|██████████| 1/1 [00:00<00:00,  1.41it/s]


SynthEval results

Privacy metric description                    value   error                                 
+---------------------------------------------------------------+
| Median distance to closest record        :   1.1575           |
+---------------------------------------------------------------+
    





Unnamed: 0,metric,dim,val,err,n_val,n_err
0,median_DCR,p,1.157522,,0.820231,


# Hitting-Rate

In [14]:
from syntheval import SynthEval

# S = SynthEval(df_real_small, holdout_dataframe=df_holdout_small, cat_cols=categorical_columns)
S = SynthEval(df_real, holdout_dataframe=df_holdout, cat_cols=categorical_cols)
S.evaluate(df_synth, analysis_target_var="icu_admission_24h",  hit_rate={})

SynthEval: synthetic data read successfully


Syntheval: hit_rate: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]


SynthEval results

Privacy metric description                    value   error                                 
+---------------------------------------------------------------+
| Hitting rate (0.03 x range(att))         :   0.0000           |
+---------------------------------------------------------------+
    





Unnamed: 0,metric,dim,val,err,n_val,n_err
0,hit_rate,p,0.0,,1.0,


# EPS Risk

In [15]:
from syntheval import SynthEval

# S = SynthEval(df_real_small, holdout_dataframe=df_holdout_small, cat_cols=categorical_columns)
S = SynthEval(df_real, holdout_dataframe=df_holdout, cat_cols=categorical_cols)
S.evaluate(df_synth, analysis_target_var="icu_admission_24h",  eps_risk={})

SynthEval: synthetic data read successfully


Syntheval: eps_risk: 100%|██████████| 1/1 [00:00<00:00,  5.17it/s]


SynthEval results

Privacy metric description                    value   error                                 
+---------------------------------------------------------------+
| Epsilon identifiability risk             :   0.3270           |       
| Privacy loss (diff. in eps. risk)        :   -0.0100           |
+---------------------------------------------------------------+
    





Unnamed: 0,metric,dim,val,err,n_val,n_err
0,eps_identif_risk,p,0.327,,0.673,
1,priv_loss_eps,p,-0.01,,0.99,


# MIA

In [16]:
from syntheval import SynthEval

# S = SynthEval(df_real_small, holdout_dataframe=df_holdout_small, cat_cols=categorical_columns)
S = SynthEval(df_real, holdout_dataframe=df_holdout, cat_cols=categorical_cols)
S.evaluate(df_synth, analysis_target_var="icu_admission_24h",  mia_risk={})

SynthEval: synthetic data read successfully


Syntheval: mia_risk: 100%|██████████| 1/1 [00:00<00:00, 1000.07it/s]

Unrecognised keyword: mia_risk

SynthEval results






# AIA

In [17]:
from syntheval import SynthEval

# S = SynthEval(df_real_small, holdout_dataframe=df_holdout_small, cat_cols=categorical_columns)
S = SynthEval(df_real, holdout_dataframe=df_holdout, cat_cols=categorical_cols)
S.evaluate(df_synth, analysis_target_var="icu_admission_24h",  att_discl={})

SynthEval: synthetic data read successfully


Syntheval: att_discl: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]


SynthEval results

Privacy metric description                    value   error                                 
+---------------------------------------------------------------+
| Attr. disclosure risk (acc. with holdout):   0.4274  0.0714   |
+---------------------------------------------------------------+
    





Unnamed: 0,metric,dim,val,err,n_val,n_err
0,att_discl_risk,p,0.427375,0.071382,0.572625,0.071382
