In [175]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def cast_df(df):
    df = df.with_columns([
        pl.col("valuenum").cast(pl.Float64),
pl.col("charttime").str.strptime(pl.Datetime, format="%d-%m-%Y %H:%M:%S", strict=False)
    ])
    return df

df_all = pl.read_parquet('../data/compare/silver_icu.parquet')
df_pneumonia = pl.read_parquet('../data/silver_df.parquet')

cast_df_all = cast_df(df_all)
cast_df_pneumonia = cast_df(df_pneumonia)

In [176]:
def calc_stats(df, label):
    numeric = df.filter((pl.col("label") == label) & pl.col("valuenum").is_not_null())

    stats = (
        numeric.group_by("label")
        .agg([
            pl.col("valuenum").mean().alias("mean"),
            pl.col("valuenum").median().alias("median"),
            pl.col("valuenum").std().alias("std"),
            pl.col("valuenum").min().alias("min"),
            pl.col("valuenum").max().alias("max"),
            pl.col("valuenum").quantile(0.25).alias("q25"),
            pl.col("valuenum").quantile(0.75).alias("q75"),
        ])
    )
    print(stats)
    return stats

#Anzahl der Messungen pro Parameter
def total_measurement(df, label):
    return (
        df.filter((pl.col("label") == label) & pl.col("valuenum").is_not_null())
          .height
    )

#Anteil fehlender Werte
def total_missing(df, label):
    sub = df.filter(pl.col("label") == label)
    total = sub.height
    notnull = sub.filter(pl.col("valuenum").is_not_null()).height
    if total == 0:
        return 0.0
    missing_fraction = (total - notnull) / total
    return missing_fraction

#Anzahl ICU-Aufenthalte (stay_id), Parameter mind 1x gemessen
def total_icu_stay(df, label):
    return (
        df.filter((pl.col("label") == label) & pl.col("valuenum").is_not_null())
          .select(pl.col("stay_id").n_unique())
          .item()
    )

# Mittlere Messintervalle
def mean_interval(df, label):
    sub = df.filter((pl.col("label") == label) & pl.col("valuenum").is_not_null())
    if sub.is_empty():
        return None

    intervals = (
        sub.group_by(["subject_id", "stay_id"])
        .agg([pl.col("charttime").sort().alias("times")])
        .filter(pl.col("times").arr.lengths > 1)  # <--- Hier geändert!
        .with_columns(
            (pl.col("times")
                .arr.eval(pl.element().diff().cast(pl.Duration("s")).dt.seconds() / 3600, parallel=True)
                .arr.exclude(0)
            ).alias("intervals")
        )
        .with_columns(
            pl.col("intervals").arr.mean().alias("mean_interval")
        )
    )

    if intervals.is_empty():
        return None
    return float(intervals["mean_interval"].mean())




In [177]:
labels = cast_df_all["label"].unique().to_list()

for label in labels:
    calc_stats(cast_df_all, label)
    total_measurement(cast_df_all, label)
    total_missing(cast_df_all, label)
    total_icu_stay(cast_df_all, label)
    mean_interval(cast_df_all, label)
    

shape: (1, 8)
┌────────────────┬────────────┬────────┬──────────┬───────┬───────┬───────┬───────┐
│ label          ┆ mean       ┆ median ┆ std      ┆ min   ┆ max   ┆ q25   ┆ q75   │
│ ---            ┆ ---        ┆ ---    ┆ ---      ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
│ str            ┆ f64        ┆ f64    ┆ f64      ┆ f64   ┆ f64   ┆ f64   ┆ f64   │
╞════════════════╪════════════╪════════╪══════════╪═══════╪═══════╪═══════╪═══════╡
│ Sodium (serum) ┆ 140.165414 ┆ 140.0  ┆ 7.049802 ┆ 126.0 ┆ 154.0 ┆ 135.0 ┆ 145.0 │
└────────────────┴────────────┴────────┴──────────┴───────┴───────┴───────┴───────┘


AttributeError: 'ExprArrayNameSpace' object has no attribute 'lengths'

In [None]:
labels = cast_df_all["label"].unique().to_list()

for label in labels:
    print(f"\n===== Statistik für Parameter: {label} =====")
    stats = calc_stats(cast_df_all, label)
    # Statistiken (als DataFrame mit einer Zeile)
    if stats.height > 0:
        row = stats.row(0)
        print(f"  Mittelwert (mean): {row[1]:.2f}")
        print(f"  Median:           {row[2]:.2f}")
        print(f"  Std-Abw.:         {row[3]:.2f}")
        print(f"  Minimum:          {row[4]:.2f}")
        print(f"  Maximum:          {row[5]:.2f}")
        print(f"  25%-Quantil:      {row[6]:.2f}")
        print(f"  75%-Quantil:      {row[7]:.2f}")
    else:
        print("  Keine numerischen Werte für diesen Parameter vorhanden.")

    n_mess = total_measurement(cast_df_all, label)
    print(f"  Anzahl Messungen:             {n_mess}")

    missing = total_missing(cast_df_all, label)
    print(f"  Anteil fehlender Werte:       {missing:.2%}")

    n_icu = total_icu_stay(cast_df_all, label)
    print(f"  ICU-Aufenthalte mit Messung:  {n_icu}")

    interval = mean_interval(cast_df_all, label)
    if interval is not None:
        print(f"  Mittleres Messintervall (h):  {interval:.2f}")
    else:
        print(f"  Mittleres Messintervall (h):  n/a (keine Serienmessung)")
