# Code
...
# Imports

In [1]:
from ctgan import CTGAN
import pandas as pd
from pathlib import Path
from typing import Optional, Tuple, Union
from sklearn.model_selection import train_test_split
from ctgan.data_transformer import DataTransformer
import json
# Class-level monkey-patch: every DataTransformer now runs its synchronous
# transform wherever CTGAN would call the parallel one.
# NOTE(review): presumably a workaround for problems with the parallel code
# path in this environment — confirm. Both names are private, so re-check this
# line after any CTGAN upgrade.
DataTransformer._parallel_transform = DataTransformer._synchronous_transform

# User Configuration

In [2]:
# Raw data file (the main routine reads it from DATA_DIR / "raw")
RAW_FILE = "20250301_data_20250510_122405_final.csv"
DATA_DIR = Path("data")

# Holdout parameters
RANDOM_STATE = 42  # Seed for the train/holdout split
HOLDOUT_SIZE = 0.3  # Proportion of data to hold out for testing

# CTGAN parameters. The recorded 100-epoch run took about 64 s per epoch
# (roughly 1 h 46 min in total), so plan training time accordingly.
NUM_EPOCHS = 100
SAVE_MODEL = True
USE_PRETRAINED_MODEL = False
# Both values are derived so they cannot drift apart from DATA_DIR / NUM_EPOCHS:
# a model trained with a different epoch count gets its own file name instead
# of silently overwriting (or being loaded as) the 100-epoch model.
MODEL_DIR = DATA_DIR / "models"
MODEL_NAME = f"20250301_data_20250510_{NUM_EPOCHS}_epochs.pkl"

# Utility Functions

## Convert Dtypes

In [3]:
def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with the project's column dtypes applied.

    - ``gender``, ``ethnicity``, ``chief_complaint`` and ``icd_block`` become
      ``category``.
    - ``age``, ``systolic_bp``, ``diastolic_bp``, ``heart_rate``,
      ``respiratory_rate`` and ``oxygen_saturation`` become pandas nullable
      integers (``Int64``).
    - ``consciousness_level`` and ``news_score`` become ordered categoricals
      with fixed category lists; values not in the list become missing.

    Args:
        df: Frame containing at least the columns listed above.

    Returns:
        A new DataFrame; the input frame is left unchanged.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    nominal_cols = ('gender', 'ethnicity', 'chief_complaint', 'icd_block')
    integer_cols = ('age', 'systolic_bp', 'diastolic_bp',
                    'heart_rate', 'respiratory_rate', 'oxygen_saturation')
    ordered_levels = {
        'consciousness_level': ['A', 'C', 'V', 'P', 'U'],
        'news_score': list(range(19)),
    }

    # One astype call handles both simple conversions and returns a new frame,
    # so the caller's DataFrame is never modified.
    target_dtypes = {name: 'category' for name in nominal_cols}
    target_dtypes.update({name: 'Int64' for name in integer_cols})
    converted = df.astype(target_dtypes)

    # Ordered categoricals need their explicit category order.
    for name, levels in ordered_levels.items():
        converted[name] = pd.Categorical(
            converted[name], categories=levels, ordered=True
        )

    return converted

## Load Data

In [4]:
def load_data(
    real_filename: Union[str, Path],
    synth_filename: Optional[Union[str, Path]] = None,
    holdout_filename: Optional[Union[str, Path]] = None,
    data_dir: Path = DATA_DIR
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """
    Read the real CSV and, if given, the synthetic and holdout CSVs from
    ``data_dir`` and run each through ``convert_dtypes``.

    Args:
        real_filename: File name of the real dataset (always loaded).
        synth_filename: File name of the synthetic dataset; skipped when falsy.
        holdout_filename: File name of the holdout dataset; skipped when falsy.
        data_dir: Directory the file names are resolved against.

    Returns:
        ``(df_real, df_synth, df_holdout)``; the last two are ``None`` for
        files that were not requested.
    """
    def _load(name: Union[str, Path]) -> pd.DataFrame:
        frame = pd.read_csv(data_dir / name, low_memory=False)
        return convert_dtypes(frame)

    df_real = _load(real_filename)
    # The optional files keep their slot in the result even when absent.
    df_synth, df_holdout = [
        _load(name) if name else None
        for name in (synth_filename, holdout_filename)
    ]

    return df_real, df_synth, df_holdout

## CTGAN Model

In [5]:
def load_or_compute(
    train_data: pd.DataFrame,
    cat_cols: Union[list, pd.Series],
    epochs: int,
    use_pretrained_model: bool,
    model_dir: Path,
    model_name: str,
    save_model: bool,
) -> CTGAN:
    """
    Load a previously saved CTGAN model or train a new one.

    A saved model is loaded only when ``use_pretrained_model`` is true AND the
    file ``model_dir / model_name`` exists; otherwise a new model is trained.
    After training, the loss values are always written next to the model file
    as ``<model_name>_losses.csv`` (``;`` separated, ``,`` as decimal mark).

    Args:
        train_data: Training frame passed to ``CTGAN.fit``.
        cat_cols: Names of the discrete columns, passed to ``CTGAN.fit``.
        epochs: Number of training epochs for a newly created model.
        use_pretrained_model: Prefer an existing model file over training.
        model_dir: Directory holding the model file and the loss CSV.
        model_name: File name of the model inside ``model_dir``.
        save_model: Whether a newly trained model is written to disk.

    Returns:
        The loaded or newly trained CTGAN instance.
    """
    model_path = model_dir / model_name

    if use_pretrained_model and model_path.exists():
        # NOTE(review): saved models are presumably pickle-based — only load
        # files from a trusted source.
        with model_path.open("rb") as f:
            ctgan = CTGAN.load(f)
        print(f"Vortrainiertes Modell geladen: {model_path}")
        return ctgan

    # Create the output directory BEFORE training. The loss CSV is written
    # there regardless of save_model, so a missing directory would otherwise
    # crash only after the (potentially hours-long) fit has finished.
    model_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Erstelle neues Modell: {model_path}")
    ctgan = CTGAN(epochs=epochs, verbose=True)
    ctgan.fit(train_data, cat_cols)
    print(f"Neues CTGAN-Modell erstellt mit {epochs} Epochen")

    loss_csv_path = model_path.with_name(model_name + "_losses.csv")
    ctgan.loss_values.to_csv(loss_csv_path, index=False, sep=";", decimal=",", encoding="utf-8")
    print(f"Loss-Werte gespeichert: {loss_csv_path}")

    if save_model:
        with model_path.open("wb") as f:
            ctgan.save(f)
        print(f"Modell gespeichert: {model_path}")
    return ctgan

# Main Routine

In [6]:
# Only the real dataset is loaded here, so just the first element of the
# returned tuple is kept.
raw_dir = DATA_DIR / "raw"
df = load_data(RAW_FILE, data_dir=raw_dir)[0]

## Abtrennen des Holdout-Datensatzes

In [7]:
# Split off the holdout set with a fixed seed so the split is reproducible.
train_df, holdout_df = train_test_split(df, test_size=HOLDOUT_SIZE, random_state=RANDOM_STATE)
p = Path(RAW_FILE)
holdout_path = DATA_DIR / p.with_name(p.stem + "_" + str(NUM_EPOCHS) + "_holdout.csv")
holdout_df.to_csv(holdout_path, index=False)
# Report the path that was actually written; the previous message printed a
# file name pattern that did not match the file on disk.
print(f"Holdout ⟶ {holdout_path} ({len(holdout_df)} rows)")

Holdout ⟶ _holdout_100_epochs.csv (42439 rows)


## Ordinal Encoding of consciousness_level and news_score

In [8]:
# Replace both ordered categoricals by their integer codes.
# pandas encodes a missing category as code -1. That sentinel is turned into
# pd.NA for BOTH columns so the NA-dropping step removes those rows instead of
# letting CTGAN learn -1 as if it were a real level.
for ordinal_col in ('consciousness_level', 'news_score'):
    if not isinstance(train_df[ordinal_col].dtype, pd.CategoricalDtype):
        continue  # already encoded: makes re-running this cell a no-op
    codes = train_df[ordinal_col].cat.codes
    train_df[ordinal_col] = codes.astype('Int64').mask(codes == -1)

## Define Discrete Cols for CTGAN & Remove NAs
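CTGAN rejects missing values with the error *"CTGAN does not support null values in the continuous training data."* Rows containing missing values are therefore dropped before training.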

In [9]:
# Discrete columns for CTGAN: every column still typed as category or bool.
discrete_dtypes = ['category', 'bool']
cat_cols = list(train_df.select_dtypes(include=discrete_dtypes).columns)

In [10]:
# Remember the size before filtering so the number of removed rows can be reported.
length_of_df = len(train_df)
print(f"Length of df before dropping NAs: {length_of_df}")

# Keep only complete rows and renumber the index from 0.
train_df = train_df.dropna().reset_index(drop=True)
rows_dropped = abs(len(train_df) - length_of_df)
print(f"Dropped {rows_dropped} rows; length of df after dropping NAs: {len(train_df)}")

Length of df before dropping NAs: 99022
Dropped 783 rows; length of df after dropping NAs: 98239


## Train the Model

In [11]:
# Train a new CTGAN, or load the saved one when USE_PRETRAINED_MODEL is set
# and the model file exists.
ctgan = load_or_compute(
    # data
    train_data=train_df,
    cat_cols=cat_cols,
    # training
    epochs=NUM_EPOCHS,
    # persistence
    model_dir=MODEL_DIR,
    model_name=MODEL_NAME,
    use_pretrained_model=USE_PRETRAINED_MODEL,
    save_model=SAVE_MODEL,
)

Erstelle neues Modell: data\models\20250301_data_20250510_100_epochs.pkl


Gen. (-0.61) | Discrim. (-0.11): 100%|██████████| 100/100 [1:45:54<00:00, 63.54s/it]

Neues CTGAN-Modell erstellt mit 100 Epochen
Loss-Werte gespeichert: data\models\20250301_data_20250510_100_epochs.pkl_losses.csv
Modell gespeichert: data\models\20250301_data_20250510_100_epochs.pkl





## Generate Synthetic Dataset

In [12]:
# Seed the synthesizer right before sampling so that re-running this cell (or
# the whole notebook with a loaded model) yields the same synthetic rows.
ctgan.set_random_state(RANDOM_STATE)
# Draw as many synthetic rows as there are training rows.
synthetic_data = ctgan.sample(len(train_df))

## Value Adaptation

In [13]:
# Round the generated temperatures to one decimal place.
temperature_rounded = synthetic_data["temperature"].round(1)
synthetic_data["temperature"] = temperature_rounded
synthetic_data.head()

Unnamed: 0,icu_admission_24h,age,gender,ethnicity,consciousness_level,temperature,heart_rate,respiratory_rate,oxygen_saturation,systolic_bp,diastolic_bp,news_score,night_arrival,weekend_arrival,chief_complaint,icd_block
0,False,56,M,Other,0,36.3,87,22,98,148,96,2,False,True,Other,E08-E13
1,False,54,F,Black,0,36.6,99,18,100,132,77,0,False,True,chest pain,I10-I1A
2,False,68,M,Asian,0,36.7,72,16,100,109,68,0,False,False,Other,I70-I79
3,False,66,F,Black,0,37.5,76,15,98,91,58,0,False,True,agitation,J40-J4A
4,False,40,M,White,0,36.4,97,18,100,137,89,1,True,False,chest pain,E08-E13


## Reverse Ordinal Encoding

In [14]:
# Integer code -> consciousness level, in the same order used for encoding.
levels = ['A', 'C', 'V', 'P', 'U']
mapping = dict(enumerate(levels))

# Apply the same decoding to the training and the synthetic frame.
# Codes without an entry in `mapping` become missing values.
for frame in (train_df, synthetic_data):
    frame['consciousness_level'] = frame['consciousness_level'].map(mapping)
    frame['news_score'] = frame['news_score'].astype('Int64')

## Export Datasets

In [15]:
# Write the training and synthetic frames next to the holdout file and report
# the paths that were actually written (the previous messages printed file
# name patterns that did not match the files on disk).
train_path = DATA_DIR / p.with_name(p.stem + "_" + str(NUM_EPOCHS) + "_train.csv")
train_df.to_csv(train_path, index=False)
print(f"Train     ⟶ {train_path} ({len(train_df)} rows)")

synth_path = DATA_DIR / p.with_name(p.stem + "_" + str(NUM_EPOCHS) + "_synth.csv")
synthetic_data.to_csv(synth_path, index=False)
print(f"Synthetic ⟶ {synth_path} ({len(synthetic_data)} rows)")

Train    ⟶ _train_100_epochs.csv    (98239    rows)
Synthetic    ⟶ _synth_100_epochs.csv    (98239 rows)


## Export Metadata

In [16]:
# Display the column dtypes of the synthetic frame.
# NOTE(review): the metadata cell that follows derives its types from `df`,
# not from `synthetic_data` — confirm that this is intended.
synthetic_data.dtypes

icu_admission_24h          bool
age                       Int64
gender                 category
ethnicity              category
consciousness_level      object
temperature             float64
heart_rate                Int64
respiratory_rate          Int64
oxygen_saturation         Int64
systolic_bp               Int64
diastolic_bp              Int64
news_score                Int64
night_arrival              bool
weekend_arrival            bool
chief_complaint        category
icd_block              category
dtype: object

In [17]:
# Translation table: pandas dtype name -> SDV sdtype
dtype_map = {
    **dict.fromkeys(("int64", "Int64", "float64"), "numerical"),
    **dict.fromkeys(("object", "category"), "categorical"),
    "bool": "boolean",
}

# One entry per column of `df`; dtypes missing from the table fall back to
# "categorical".
column_sdtypes = {
    name: {"sdtype": dtype_map.get(str(kind), "categorical")}
    for name, kind in df.dtypes.items()
}

# Assemble the single-table metadata document.
metadata = {
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": column_sdtypes,
    "primary_key": None,
}

# Store it as JSON next to the exported datasets.
file_path_meta = DATA_DIR / p.with_name(p.stem + "_metadata.json")

with file_path_meta.open("w") as handle:
    json.dump(metadata, handle, indent=4)

print(f"Metadata ⟶ {file_path_meta}")


Metadata ⟶ data\20250301_data_20250510_122405_final_metadata.json


In [18]:
def _feature_kind(dtype) -> str:
    """Classify a dtype as 'numeric' or 'categorical'; booleans count as categorical."""
    if pd.api.types.is_bool_dtype(dtype):
        return 'categorical'
    if pd.api.types.is_numeric_dtype(dtype):
        return 'numeric'
    return 'categorical'


# One row per holdout column: its name, its dtype and the derived feature kind.
d = holdout_df.dtypes.rename_axis('Feature').reset_index(name='Dtype')
d['Type'] = d['Dtype'].apply(_feature_kind)

# Only the name/kind pair is exported.
features_csv = DATA_DIR / 'features_types_synthcheck.csv'
d[['Feature', 'Type']].to_csv(features_csv, index=False)