In [25]:
import os
import polars as pl

In [30]:
# Display configuration: draw tables with full ASCII borders ("UTF8_FULL" is the
# UTF-8 alternative) and raise the row/column limits so printed frames are not
# truncated. Each setter returns the Config class, so the calls can be chained.
(
    pl.Config
    .set_tbl_formatting("ASCII_FULL")
    .set_tbl_rows(1000)
    .set_tbl_cols(50)
)

# Load the bronze table from its Parquet file into a Polars DataFrame.
file_path = "data/bronze_df.parquet"
df = pl.read_parquet(file_path)

# Show which columns the bronze table provides.
columns = df.columns
print(columns)

['subject_id', 'hadm_id', 'stay_id', 'itemid', 'value', 'valuenum', 'valueuom', 'label', 'abbreviation', 'unitname']


In [37]:
# Lookup of code -> human-readable description, grouped by clinical concept.
# The processing loop below walks these groups in insertion order and matches
# every key against the `itemid` column: int keys are compared directly, str
# keys are compared after casting `itemid` to a string.
all_code_groups = {
    "Gender Codes": {
        226228: "Gender"
    },
    "Age Codes": {
        226984: "Apache IV Age"
    },
    "Confusion / Mental Instability": {
        228395: "Confusion / Mental Instability",
        228394: "Orientation to Place",
        229381: "Orientation to Person",
        223898: "Orientation",
        228396: "Orientation to Time",
        226104: "Level of Consciousness",
        229382: "Orientation Score",
        228688: "Delirium",
        # NOTE(review): the entries from here to the end of this group carry
        # diagnosis-style descriptions and look like ICD codes rather than item
        # ids -- confirm they are really meant to be matched against `itemid`.
        2930: "Delirium due to conditions classified elsewhere",
        2903: "Senile dementia with delirium",
        2931: "Subacute delirium",
        2982: "Reactive confusion",
        29281: "Drug-induced delirium",
        29011: "Presenile dementia with delirium",
        78097: "Altered mental status",
        29041: "Vascular dementia, with delirium",
        "F05": "Delirium due to known physiological condition"
    },
    "Respiratory Rate": {
        230040: "Paradoxical breathing"
    },
    "Blood Pressure": {
        225309: "ART BP Systolic",
        227243: "Manual Blood Pressure Systolic Right",
        220179: "Non Invasive Blood Pressure Systolic",
        220050: "Arterial Blood Pressure Systolic",
        224167: "Manual Blood Pressure Systolic Left"
    },
    "Temperature": {
        224027: "Skin Temperature",
        223761: "Temperature Fahrenheit",
        223762: "Temperature Celsius",
        226329: "Blood Temperature CCO (C)",
        50825: "Temperature Blood Blood Gas"
    },
    "Pulse Rates": {
        229770: "Resting Pulse Rate (COWS)",
        223942: "Graft/Flap Pulse",
        223936: "Radial Pulse R",
        223948: "Radial Pulse L",
        223941: "Popliteal Pulse R",
        223946: "Popliteal Pulse L",
        223949: "Ulnar Pulse L",
        223945: "Femoral Pulse L",
        223939: "Brachial Pulse R",
        223944: "Brachial Pulse L",
        223940: "Femoral Pulse R",
        223938: "Ulnar Pulse R"
    },
    "BUN": {
        225624: "BUN",
        51842: "Bun"
    },
    "pH": {
        50820: "pH",
        223830: "PH (Arterial)"
    },
    "Sodium": {
        220645: "Sodium (serum)",
        226534: "Sodium (whole blood)",
        228389: "Sodium (serum) (soft)",
        228390: "Sodium (whole blood) (soft)",
        50983: "Sodium",
        52623: "Sodium"
    },
    "Glucose": {
        50809: "Glucose",
        50931: "Glucose",
        52569: "Glucose",
        226537: "Glucose (whole blood)",
        225664: "Glucose finger stick (range 70-100)",
        220621: "Glucose (serum)",
        228388: "Glucose (whole blood) (soft)"
    },
    "Hematocrit": {
        52028: "Hematocrit Blood",
        51638: "Hematocrit Blood",
        51639: "Hematocrit Blood",
        51221: "Hematocrit Blood",
        226540: "Hematocrit (whole blood - calc)",
        220545: "Hematocrit (serum)"
    },
    "Oxygen Saturation": {
        220227: "Arterial O2 Saturation",
        220277: "O2 saturation pulseoxymetry",
        # NOTE(review): described as an inspired O2 fraction, not a saturation
        # -- confirm it belongs in this group.
        223835: "Inspired O2 Fraction",
        50817: "Oxygen Saturation"
    },
    "Pleural Effusion": {
        # NOTE(review): all entries here have diagnosis-style descriptions; the
        # int ones share the 5-digit range of the 5xxxx item codes used in other
        # groups, so an accidental `itemid` match may be possible -- verify.
        51181: "Malignant pleural effusion",
        5119: "Unspecified pleural effusion",
        "J910": "Malignant pleural effusion",
        "J918": "Pleural effusion in other conditions classified elsewhere",
        "J91": "Pleural effusion in conditions classified elsewhere"
    }
}

# Results of the scan below:
#   final_data       group name -> concatenation of all rows matching any code of
#                    the group, or None when no code of the group matched.
#   missing_in_group code -> description for every code that matched no rows
#                    (flat across all groups).
final_data = {}
missing_in_group = {}

# Filtering never drops columns, so the presence of `valuenum` can be checked
# once on the source frame instead of once per code.
has_valuenum = "valuenum" in df.columns

for group_name, codes in all_code_groups.items():
    group_data = []
    print(f"\nProcessing {group_name}:")

    for code, description in codes.items():
        # Integer codes are compared with `itemid` directly; string codes need
        # `itemid` cast to a string first.
        if isinstance(code, int):
            matches_code = pl.col("itemid") == code
        else:
            matches_code = pl.col("itemid").cast(pl.Utf8) == code
        filtered_df = df.filter(matches_code)

        # Codes without any rows are recorded and skipped.
        if filtered_df.is_empty():
            print(f"No data found for {description} (code {code}). Skipping.")
            missing_in_group[code] = description
            continue

        # Distinct units recorded for this code.
        unique_units_list = filtered_df["valueuom"].unique().to_list()

        # Report the numeric range only when at least one non-null value exists.
        # The frame itself is known to be non-empty here, so checking the raw
        # column for emptiness would always pass and an all-null column would be
        # reported as a "None to None" range.
        numeric_values = filtered_df["valuenum"].drop_nulls() if has_valuenum else None
        if numeric_values is not None and not numeric_values.is_empty():
            min_value = numeric_values.min()
            max_value = numeric_values.max()
            print(f"Range of numerical values for {description} (code {code}): {min_value} to {max_value}")
        else:
            print(f"No numerical values found for {description} (code {code}).")

        print(f"Unique units for {description} (code {code}): {unique_units_list}")

        group_data.append(filtered_df)

    # Combine the group's matches into a single DataFrame, if there were any.
    final_data[group_name] = pl.concat(group_data) if group_data else None

print(missing_in_group)

# Print final DataFrames for each group
#for group_name, group_df in final_data.items():
#    print(f"\nFinal DataFrame for {group_name}:")
#    if group_df is not None:
#        print(group_df)
#    else:
#        print("No data found.")


Processing Gender Codes:
No data found for Gender (code 226228). Skipping.

Processing Age Codes:
No data found for Apache IV Age (code 226984). Skipping.

Processing Confusion / Mental Instability:
Range of numerical values for Confusion / Mental Instability (code 228395): 0.0 to 2.0
Unique units for Confusion / Mental Instability (code 228395): [None]
Range of numerical values for Orientation to Place (code 228394): 0.0 to 2.0
Unique units for Orientation to Place (code 228394): [None]
Range of numerical values for Orientation to Person (code 229381): 1.0 to 1.0
Unique units for Orientation to Person (code 229381): [None]
No data found for Orientation (code 223898). Skipping.
Range of numerical values for Orientation to Time (code 228396): 0.0 to 3.0
Unique units for Orientation to Time (code 228396): [None]
No data found for Level of Consciousness (code 226104). Skipping.
No data found for Orientation Score (code 229382). Skipping.
No data found for Delirium (code 228688). Skipping

In [None]:
# 50817 Oxygen Saturation → correct for Partial Pressure of Arterial Oxygen
# 220277 O2 saturation pulseoxymetry → is oxygen saturation, not the arterial partial pressure of oxygen
# The remaining codes are checked manually to see whether they are needed for calculating the scores.
# In addition, we research whether the range of the values makes sense, or whether a later step
# still has to search for outliers.

In [18]:
# Gender item codes. There is only one, and it is not the field we are looking
# for, so it has most likely been filtered out of the data already.
# NOTE: the variable is still called `age_codes` for backward compatibility,
# although the mapping holds the gender code.
age_codes = {226228: "Gender"}

# Frames matched by the codes above. They are collected in their own list:
# `final_data` is a dict keyed by group name, so calling `.append` on it raises
# AttributeError.
gender_frames = []

for code, description in age_codes.items():
    # Rows recorded under this item code.
    filtered_df = df.filter(pl.col("itemid") == code)

    # Distinct units for the code (printed as a one-column DataFrame).
    unique_units = filtered_df.select(pl.col("valueuom").unique())
    print(f"Unique units for {description} (code {code}): {unique_units}")

    # Keep the filtered rows for optional later combination.
    gender_frames.append(filtered_df)

# since there are no listings with this code anymore, it was already filtered / not relevant

Unique units for Gender (code 226228): shape: (0, 1)
┌──────────┐
│ valueuom │
│ ---      │
│ str      │
╞══════════╡
└──────────┘


In [27]:
# Save the bronze DataFrame as a Parquet file.
# NOTE(review): `bronze_df` is not defined in any cell visible in this notebook
# (the frame loaded at the top is named `df`), and the target path is the same
# file that the load cell reads from -- confirm where `bronze_df` comes from and
# whether overwriting the input file is intended.
bronze_df.write_parquet("data/bronze_df.parquet")