# Objectives
...
# Imports

In [1]:
from pathlib import Path
import pandas as pd
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_float_dtype 

# User Configuration

In [2]:
# --- Output switch ---
# When True, generated figures and CSVs are written to disk.
SAVE_DATA = True

# --- Directories ---
# Input folder holding the raw CSV files (relative to the working directory).
DATA_DIR = Path("../../data/raw")
# Output folders: one for exported tables, one for figures.
OUTPUT_DIR_CSV = Path("results")
OUTPUT_DIR_FIG = Path("figures")

# --- Input file ---
# File name of the raw dataset, resolved against DATA_DIR when loading.
RAW_FILE = "20250301_data_20250510_122405_final.csv"

# Data Loading

In [3]:
def load_data(real_filename: str, synth_filename: str | None = None, holdout_filename: str | None = None) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None]:
    """Read real, synthetic, and optional holdout CSVs into DataFrames."""
    df_real = pd.read_csv(DATA_DIR / real_filename, low_memory=False)
    df_synth = pd.read_csv(DATA_DIR / synth_filename, low_memory=False) if synth_filename else None
    df_holdout = pd.read_csv(DATA_DIR / holdout_filename, low_memory=False) if holdout_filename else None

    return df_real, df_synth, df_holdout

In [4]:
# Create both output folders up front; exist_ok=True keeps the cell re-runnable.
OUTPUT_DIR_CSV.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR_FIG.mkdir(parents=True, exist_ok=True)

# Only the real dataset is needed here, so take the first element of the
# returned tuple. Unpacking into `_` is avoided on purpose: assigning to `_`
# in the notebook namespace shadows IPython's "last output" variable.
df = load_data(RAW_FILE)[0]

In [5]:
# Columns grouped by the dtype they are cast to.
cat_cols = ['gender', 'ethnicity', 'chief_complaint', 'icd_block']
int_cols = ['age', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'respiratory_rate', 'oxygen_saturation']

# One lookup table drives both casts: unordered categories first, then
# nullable integers ('Int64' can hold missing values, unlike plain int64).
target_dtypes = {name: 'category' for name in cat_cols}
target_dtypes.update({name: 'Int64' for name in int_cols})
for col, target in target_dtypes.items():
    df[col] = df[col].astype(target)

# Ordered categoricals with an explicit level order. Per pandas, values that
# are not among the listed categories are replaced by NaN.
ordered_levels = {
    'consciousness_level': ['A', 'C', 'V', 'P', 'U'],
    'news_score': list(range(19)),
}
for name, levels in ordered_levels.items():
    df[name] = pd.Categorical(df[name], categories=levels, ordered=True)

df.dtypes

icu_admission_24h          bool
age                       Int64
gender                 category
ethnicity              category
consciousness_level    category
temperature             float64
heart_rate                Int64
respiratory_rate          Int64
oxygen_saturation         Int64
systolic_bp               Int64
diastolic_bp              Int64
news_score             category
night_arrival              bool
weekend_arrival            bool
chief_complaint        category
icd_block              category
dtype: object

In [6]:
# Boolean mask: True for every row that has at least one missing value.
rows_with_na = df.isna().any(axis=1)
# Keep only those rows and renumber them from 0.
df_missing = df.loc[rows_with_na].reset_index(drop=True)

# Report how many rows are affected.
print(f"Anzahl der Zeilen mit mind. 1 NA-Wert in einer Spalte: {len(df_missing)}")

Anzahl der Zeilen mit mind. 1 NA-Wert in einer Spalte: 1120


In [8]:
# -------------------------------------------------
# 1) Mapping dictionaries (edit as needed)
# -------------------------------------------------
# Column name -> role of the column in the analysis.
# Columns missing here fall back to "predictor" when the table is built.
role_mapping = {
    "icu_admission_24h": "target (binary)",
    "age": "predictor, privacy-sensitive, fairness",
    "gender": "predictor, privacy-sensitive, fairness, clinical plausibility check",
    "ethnicity": "predictor, privacy-sensitive, fairness",
    "consciousness_level": "predictor",
    "temperature": "predictor",
    "heart_rate": "predictor",
    "respiratory_rate": "predictor",
    "oxygen_saturation": "predictor",
    "systolic_bp": "predictor",
    "diastolic_bp": "predictor",
    "news_score": "predictor (derived severity)",
    "night_arrival": "predictor (operational)",
    "weekend_arrival": "predictor (operational)",
    "chief_complaint": "predictor (high-cardinality), privacy-sensitive, clinical plausibility check",
    "icd_block": "stratifier (diagnostic code), privacy-sensitive, clinical plausibility check",
}

# Column name -> free-text description of the preprocessing steps.
# Columns missing here are shown as "-" when the table is built.
preproc_mapping = {
    "icu_admission_24h": "Unchanged; One-Hot Encoding",
    "age": "Clipped [18, 91]; Standardized (z-score)",
    "gender": "Unchanged; One-Hot Encoding",
    "ethnicity": "Regex-map to 5 categories; One-Hot Encoding",
    "consciousness_level": "Unchanged; Ordinal Encoding (A<C<V<P<U)",
    "temperature": "If >=50 assumed °F; Converted to °C; <30°C or >45°C set to NaN; Round 1 dp; Standardized (z-score)",
    "heart_rate": "Clipped [20,240]; Standardized (z-score)",
    "respiratory_rate": "Clipped [1,70]; Standardized (z-score)",
    "oxygen_saturation": "Clipped [20,100]; Standardized (z-score)",
    "systolic_bp": "Clipped [30,300]; Standardized (z-score)",
    "diastolic_bp": "Clipped [25,150]; Standardized (z-score)",
    "news_score": "Computed from vital params; Ordinal Encoding (0-18)",
    "night_arrival": "Unchanged; One-Hot Encoding",
    "weekend_arrival": "Unchanged; One-Hot Encoding",
    "chief_complaint": "Text normalize + fuzzy cluster (Top 309 + Others); NaN if missing; One-Hot Encoding",
    "icd_block": "Mapped to 285 ICD-10 subclasses; One-Hot Encoding",
}

# -------------------------------------------------
# 2) Helper for a compact value / category summary
# -------------------------------------------------
def value_summary(series, top_k=5):
    """Return a short, human-readable summary of the values in a column.

    Parameters
    ----------
    series : pd.Series
        Column to summarise.
    top_k : int, default 5
        Maximum number of distinct values listed for non-numeric columns.

    Returns
    -------
    str
        * boolean columns (NumPy ``bool`` or nullable ``boolean``):
          ``"True / False"``
        * numeric columns: ``"<min> - <max> (Median <m>)"``; the median is
          rounded to one decimal for float columns and to an integer otherwise
        * numeric columns without any non-missing value: ``"-"``
        * everything else: the ``top_k`` most frequent values joined by
          ``", "``, followed by ``", ..."`` when more distinct values exist
    """
    # is_bool_dtype also recognises the nullable "boolean" extension dtype,
    # which a plain `dtype == bool` comparison misses.
    if pd.api.types.is_bool_dtype(series):
        return "True / False"

    if is_numeric_dtype(series):
        # With no observed value, min/max/median are NaN or <NA>; int(<NA>)
        # would raise, so report a placeholder instead.
        if not series.notna().any():
            return "-"
        lowest, highest, median = series.min(), series.max(), series.median()
        if is_float_dtype(series):
            return f"{lowest} - {highest} (Median {np.round(median, 1)})"
        return f"{lowest} - {highest} (Median {int(np.round(median, 0))})"

    # Count once and reuse the result for both the listing and the
    # "more values exist" check.
    counts = series.value_counts(dropna=True)
    res = ", ".join(str(v) for v in counts.head(top_k).index.tolist())
    if len(counts) > top_k:
        res += ", ..."
    return res

# -------------------------------------------------
# 3) Build the metadata table
# -------------------------------------------------
# One record per column of df: dtype, value summary, role, share of missing
# values in percent (two decimals) and the preprocessing description.
rows = [
    {
        "Feature": col,
        "Type": str(df[col].dtype),
        "Range/Categories": value_summary(df[col]),
        # Columns without an explicit role are treated as plain predictors.
        "Role": role_mapping.get(col, "predictor"),
        "Missing values (%)": round(df[col].isna().mean()*100, 2),
        # "-" marks columns without a documented preprocessing step.
        "Preprocessing": preproc_mapping.get(col, "-"),
    }
    for col in df.columns
]

# Build the frame once from the collected records.
df_meta = pd.DataFrame(rows)

# -------------------------------------------------
# 4) Show the result and (optionally) export it
# -------------------------------------------------
# Rich display inside the notebook.
display(df_meta)

# CSV export, only when saving is enabled in the configuration cell.
# The file uses ";" as field separator and "," as decimal mark.
if SAVE_DATA:
    feature_table_path = OUTPUT_DIR_CSV / "feature_table.csv"
    df_meta.to_csv(feature_table_path, index=False, sep=";", decimal=",", encoding="utf-8")

# Alternative for a LaTeX manuscript (currently disabled):
# df_meta.to_latex("feature_table.tex", index=False)


Unnamed: 0,Feature,Type,Range/Categories,Role,Missing values (%),Preprocessing
0,icu_admission_24h,bool,True / False,target (binary),0.0,Unchanged; One-Hot Encoding
1,age,Int64,18 - 91 (Median 61),"predictor, privacy-sensitive, fairness",0.0,"Clipped [18, 91]; Standardized (z-score)"
2,gender,category,"F, M","predictor, privacy-sensitive, fairness, clinic...",0.0,Unchanged; One-Hot Encoding
3,ethnicity,category,"White, Black, Other, Asian, Unknown","predictor, privacy-sensitive, fairness",0.0,Regex-map to 5 categories; One-Hot Encoding
4,consciousness_level,category,"A, C, V, P, U",predictor,0.0,Unchanged; Ordinal Encoding (A<C<V<P<U)
5,temperature,float64,30.0 - 44.1 (Median 36.7),predictor,0.04,If >=50 assumed °F; Converted to °C; <30°C or ...
6,heart_rate,Int64,20 - 227 (Median 85),predictor,0.01,"Clipped [20,240]; Standardized (z-score)"
7,respiratory_rate,Int64,1 - 65 (Median 18),predictor,0.01,"Clipped [1,70]; Standardized (z-score)"
8,oxygen_saturation,Int64,42 - 100 (Median 98),predictor,0.04,"Clipped [20,100]; Standardized (z-score)"
9,systolic_bp,Int64,50 - 274 (Median 132),predictor,0.04,"Clipped [30,300]; Standardized (z-score)"
