# Code
...
# Imports

In [1]:
import pandas as pd
from pathlib import Path
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional, Tuple, Union
import seaborn as sns
import matplotlib.pyplot as plt
from sdmetrics.single_column import StatisticSimilarity

# User Configuration

In [2]:
SAVE_CSV = True  # When True, the statistics table is exported as a CSV file into OUTPUT_DIR_CSV

# Input CSV files, resolved relative to DATA_DIR: real ("_train"), synthetic and holdout split.
REAL_FILE = "20250301_data_20250510_122405_final_100_train.csv"
SYNTH_FILE = "20250301_data_20250510_122405_final_100_synth.csv"
HOLDOUT_FILE = "20250301_data_20250510_122405_final_100_holdout.csv"

DATA_DIR = Path("../../data")
OUTPUT_DIR_CSV = Path("results")
# The output folder is created as soon as this cell runs, independent of SAVE_CSV.
OUTPUT_DIR_CSV.mkdir(parents=True, exist_ok=True)  # ensure output folder exists

# Global seaborn theme applied to every figure drawn after this cell
sns.set_theme(
    style="white",
    context="talk",
    palette="colorblind"
)

# Utility Functions

## Load Data & Assign Correct Datatypes

In [3]:
def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with the column dtypes this notebook expects.

    - gender, ethnicity, chief_complaint and icd_block become ``category``
    - the integer columns listed in the body become the pandas nullable
      integer dtype ``Int64``
    - consciousness_level and news_score become ordered categoricals with a
      fixed list of levels; values outside that list turn into missing values
      (standard ``pd.Categorical`` behaviour)

    The input frame is left unmodified. Every column named in the body must
    exist in ``df``; a missing column raises ``KeyError``.
    """
    converted = df.copy()  # work on a copy so the caller's frame stays untouched

    # Plain dtype casts, applied column by column: the nominal columns first,
    # then the nullable-integer columns.
    simple_casts = (
        ('category', ('gender', 'ethnicity', 'chief_complaint', 'icd_block')),
        ('Int64', ('age', 'systolic_bp', 'diastolic_bp',
                   'heart_rate', 'respiratory_rate', 'oxygen_saturation')),
    )
    for target_dtype, column_names in simple_casts:
        for name in column_names:
            converted[name] = converted[name].astype(target_dtype)

    # Ordered categoricals with an explicit level order.
    ordered_levels = {
        'consciousness_level': ['A', 'C', 'V', 'P', 'U'],
        'news_score': list(range(19)),  # integer levels 0..18
    }
    for name, levels in ordered_levels.items():
        converted[name] = pd.Categorical(
            converted[name], categories=levels, ordered=True
        )

    return converted

def load_data(
    real_filename: Union[str, Path],
    synth_filename: Optional[Union[str, Path]] = None,
    holdout_filename: Optional[Union[str, Path]] = None,
    data_dir: Path = DATA_DIR
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """
    Read the real, synthetic and holdout CSV files from ``data_dir`` and run
    each loaded frame through ``convert_dtypes``.

    The real file is always read. The synthetic and holdout files are only
    read when a non-empty filename is given; otherwise ``None`` is returned
    in their place.

    Returns:
        df_real: pd.DataFrame
        df_synth: Optional[pd.DataFrame]
        df_holdout: Optional[pd.DataFrame]
    """
    def _load(csv_name: Union[str, Path]) -> pd.DataFrame:
        # low_memory=False makes pandas parse the file in one pass
        raw = pd.read_csv(data_dir / csv_name, low_memory=False)
        return convert_dtypes(raw)

    # Files are read in the order real -> synthetic -> holdout.
    real_frame = _load(real_filename)
    synth_frame, holdout_frame = [
        _load(csv_name) if csv_name else None
        for csv_name in (synth_filename, holdout_filename)
    ]

    return real_frame, synth_frame, holdout_frame

In [None]:
def compute_statistics_for_attributes(df_real, df_synth, num_attrs, statistic='mean'):
    """
    Compute SDMetrics ``StatisticSimilarity`` for every attribute/statistic pair.

    Args:
        df_real: DataFrame holding the real data.
        df_synth: DataFrame holding the synthetic data.
        num_attrs: Iterable of column names that exist in both frames.
        statistic: A single statistic name (e.g. 'mean') or an iterable of
            names. Each name is passed unchanged to
            ``StatisticSimilarity.compute``.

    Returns:
        pd.DataFrame with one row per (attribute, statistic) pair and the
        columns 'feature', 'statistic' and 'result'. The columns are present
        even when there is nothing to compute (e.g. ``num_attrs`` is empty).
    """
    result_columns = ['feature', 'statistic', 'result']

    # Accept a single name or any iterable of names. The iterable is
    # materialised so that a one-shot iterator/generator is not exhausted
    # after the first attribute.
    statistics = [statistic] if isinstance(statistic, str) else list(statistic)

    results = [
        {
            'feature': attr,
            'statistic': stat,
            'result': StatisticSimilarity.compute(
                real_data=df_real[attr],
                synthetic_data=df_synth[attr],
                statistic=stat
            ),
        }
        for attr in num_attrs
        for stat in statistics
    ]

    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=result_columns)

# Main Routine
## Load Data

In [5]:
# Load all three splits from DATA_DIR; each frame comes back with converted dtypes.
df_real, df_synth, df_holdout = load_data(
    real_filename=REAL_FILE,
    synth_filename=SYNTH_FILE,
    holdout_filename=HOLDOUT_FILE,
    data_dir=DATA_DIR,
)

In [6]:
# Split the column names of the real data by dtype: numeric columns on one
# side, every other column (including the categoricals) on the other.
num_attrs = df_real.select_dtypes(include='number').columns
cat_attrs = df_real.select_dtypes(exclude='number').columns

In [None]:
# Spot check on a single column: similarity of the 'age' mean between real and synthetic data.
stats_age = StatisticSimilarity.compute(
    real_data=df_real['age'], synthetic_data=df_synth['age'], statistic='mean'
)
print(stats_age)

0.9786451743978586


In [9]:
# Similarity of mean, standard deviation and median for every numeric column.
result_df = compute_statistics_for_attributes(
    df_real,
    df_synth,
    num_attrs,
    statistic=['mean', 'std', 'median'],
)
print(result_df)

if SAVE_CSV:
    # Semicolon-separated with a decimal comma.
    result_df.to_csv(
        OUTPUT_DIR_CSV / "statistics.csv",
        sep=';',
        decimal=',',
        encoding='utf-8',
        index=False,
    )

              feature statistic    result
0                 age      mean  0.978645
1                 age       std  0.988089
2                 age    median  1.000000
3         temperature      mean  0.999872
4         temperature       std  0.999429
5         temperature    median  1.000000
6          heart_rate      mean  0.996679
7          heart_rate       std  0.992629
8          heart_rate    median  0.980676
9    respiratory_rate      mean  0.996877
10   respiratory_rate       std  0.999291
11   respiratory_rate    median  1.000000
12  oxygen_saturation      mean  0.992750
13  oxygen_saturation       std  0.999507
14  oxygen_saturation    median  0.980000
15        systolic_bp      mean  0.997299
16        systolic_bp       std  0.996995
17        systolic_bp    median  0.977679
18       diastolic_bp      mean  0.995563
19       diastolic_bp       std  0.997600
20       diastolic_bp    median  1.000000
