In [2]:
import os
import glob
import pandas as pd
from tqdm import tqdm
import pickle

# Show every column and every row whenever a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Glob patterns for the gzip-compressed CSV files of the icu and hosp folders
icu_path = "../mimic-iv-3.1/icu/*.csv.gz"
hosp_path = "../mimic-iv-3.1/hosp/*.csv.gz"

### Import

In [52]:
# Load the two inputs of this notebook: the gzip-compressed lab events CSV
# and the pickled microbiology targets frame.
labevents_df = pd.read_csv(
    filepath_or_buffer="../mimic-iv-3.1/labevents_df_4b_wd.csv.gz",
    compression="gzip",
)
targets = pd.read_pickle(filepath_or_buffer="../mimic-iv-3.1/final_microbiology_df.pkl")
# subject_hadm_driver = pd.read_pickle("../mimic-iv-3.1/subject_hadm_driver.pkl")
# subject_hadm_time_driver = pd.read_pickle("../mimic-iv-3.1/subject_hadm_time_driver.pkl")

### Import top 100 labs

In [53]:
# Unpickle the saved list of lab tests and display it.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../mimic-iv-3.1/top_100_labs.pkl", mode="rb") as pickle_file:
    top_100_labs = pickle.load(pickle_file)

top_100_labs

[['Hematocrit', 'Blood'],
 ['Platelet Count', 'Blood'],
 ['MCV', 'Blood'],
 ['Red Blood Cells', 'Blood'],
 ['RDW', 'Blood'],
 ['MCH', 'Blood'],
 ['MCHC', 'Blood'],
 ['White Blood Cells', 'Blood'],
 ['Creatinine', 'Blood'],
 ['Urea Nitrogen', 'Blood'],
 ['Potassium', 'Blood'],
 ['Sodium', 'Blood'],
 ['Chloride', 'Blood'],
 ['Bicarbonate', 'Blood'],
 ['Anion Gap', 'Blood'],
 ['Hemoglobin', 'Blood'],
 ['Glucose', 'Blood'],
 ['Urine Color', 'Urine'],
 ['Urine Appearance', 'Urine'],
 ['Urobilinogen', 'Urine'],
 ['Specific Gravity', 'Urine'],
 ['Leukocytes', 'Urine'],
 ['Ketone', 'Urine'],
 ['Blood', 'Urine'],
 ['Glucose', 'Urine'],
 ['Protein', 'Urine'],
 ['Bilirubin', 'Urine'],
 ['Nitrite', 'Urine'],
 ['pH', 'Urine'],
 ['RBC', 'Urine'],
 ['WBC', 'Urine'],
 ['Bacteria', 'Urine'],
 ['Yeast', 'Urine'],
 ['Epithelial Cells', 'Urine'],
 ['Estimated GFR (MDRD equation)', 'Blood'],
 ['Monocytes', 'Blood'],
 ['Neutrophils', 'Blood'],
 ['Basophils', 'Blood'],
 ['Lymphocytes', 'Blood'],
 ['Eosinophi

In [129]:
# Lab tests whose 'value' column is used as a numeric feature.
# Names follow the "<fluid> - <label>" convention; the Blood tests come first,
# followed by the Urine tests.
numeric_features = [
    f"Blood - {blood_label}"
    for blood_label in (
        "Hematocrit",
        "Platelet Count",
        "MCV",
        "Red Blood Cells",
        "RDW",
        "MCH",
        "MCHC",
        "White Blood Cells",
        "Creatinine",
        "Urea Nitrogen",
        "Potassium",
        "Sodium",
        "Chloride",
        "Bicarbonate",
        "Anion Gap",
        "Hemoglobin",
        "Basophils",
        "Lymphocytes",
        "Eosinophils",
        "Calcium, Total",
        "Alanine Aminotransferase (ALT)",
        "Asparate Aminotransferase (AST)",
        "Magnesium",
        "Phosphate",
        "INR(PT)",
        "PT",
        "Alkaline Phosphatase",
        "Bilirubin, Total",
        "Albumin",
        "RDW-SD",
        "Neutrophils",
        "Immature Granulocytes",
        "Lactate",
        "PTT",
    )
] + [
    f"Urine - {urine_label}"
    for urine_label in ("Specific Gravity", "pH")
]

# Lab tests for which the 'flag' column (not the raw value) is used as the feature.
flag_features = [
    f"Blood - {blood_label}" for blood_label in ("Glucose", "Monocytes")
] + [
    f"Urine - {urine_label}"
    for urine_label in (
        "Urine Color",
        "Urine Appearance",
        "Ketone",
        "Glucose",
        "Protein",
        "Epithelial Cells",
        "RBC",
        "WBC",
    )
]

### Append features

In [106]:
# Parse the timestamps once up front: the targets get a separate datetime
# column named charttime_target, the lab events have charttime converted in place.
labevents_df["charttime"] = labevents_df["charttime"].pipe(pd.to_datetime)
targets["charttime_target"] = targets["charttime"].pipe(pd.to_datetime)

def get_lab_value_with_fallback(label_name, fluid_name, targets, labevents_df, column_name, is_flag=False):
    """
    Pick one lab result per target event for a single lab test.

    A target event is a distinct (subject_id, charttime) pair in ``targets``.
    For each event the latest result charted strictly before the event's
    charttime is used. If there is none, the earliest result charted on the
    same calendar date as the event is used instead. Events with neither
    get no row in the output.

    Parameters
    ----------
    label_name : str
        Value of the ``label`` column identifying the lab test.
    fluid_name : str
        Value of the ``fluid`` column identifying the specimen type.
    targets : pandas.DataFrame
        Must contain ``subject_id`` and ``charttime``. Repeated rows for the
        same event are collapsed before matching.
    labevents_df : pandas.DataFrame
        Must contain ``subject_id``, ``charttime``, ``label``, ``fluid`` and
        ``column_name``.
    column_name : str
        Lab column to return: 'value' for numeric features, 'flag' for flag features.
    is_flag : bool, default False
        When True the column is converted to 1 for 'abnormal' and 0 for
        anything else (including missing values).

    Returns
    -------
    pandas.DataFrame
        Columns ``subject_id``, ``charttime_target`` (datetime) and
        ``"<fluid_name> - <label_name>"``, with at most one row per event.
    """
    event_keys = ["subject_id", "charttime_target"]

    # Rows of the requested test, restricted to the columns that are needed
    is_requested = (labevents_df["label"] == label_name) & (labevents_df["fluid"] == fluid_name)
    lab_df = labevents_df.loc[is_requested, ["subject_id", "charttime", column_name]].copy()
    # No-op when the caller already parsed the column; protects against string timestamps
    lab_df["charttime"] = pd.to_datetime(lab_df["charttime"])

    # Convert flag values to binary (1 for 'abnormal', 0 otherwise, NaN included)
    if is_flag:
        lab_df[column_name] = lab_df[column_name].eq("abnormal").astype("int64")

    # One row per distinct event. Without this every lab row would be repeated
    # once per duplicate target row, only to be thrown away by the
    # drop_duplicates calls below.
    target_times = targets[["subject_id", "charttime"]].copy()
    target_times["charttime"] = pd.to_datetime(target_times["charttime"])
    target_times = target_times.drop_duplicates()

    # Pair every lab result with every event of the same subject. The shared
    # "charttime" name becomes charttime_lab / charttime_target. An inner join
    # suffices: labs without any event can never be selected.
    merged_df = lab_df.merge(target_times, on="subject_id", how="inner", suffixes=("_lab", "_target"))

    # 1. Latest lab result strictly before the event
    before_event = merged_df["charttime_lab"] < merged_df["charttime_target"]
    pre_infection_df = (
        merged_df[before_event]
        .sort_values(by=event_keys + ["charttime_lab"], ascending=[True, True, False])
        .drop_duplicates(subset=event_keys, keep="first")
    )

    # 2. Fallback: earliest lab result on the same calendar date as the event
    on_same_date = merged_df["charttime_lab"].dt.normalize() == merged_df["charttime_target"].dt.normalize()
    same_date_df = (
        merged_df[on_same_date]
        .sort_values(by=event_keys + ["charttime_lab"], ascending=[True, True, True])
        .drop_duplicates(subset=event_keys, keep="first")
    )

    # Pre-event rows come first, so keep="first" only falls back to the
    # same-date row when no pre-event row exists for that event.
    combined_df = pd.concat([pre_infection_df, same_date_df]).drop_duplicates(subset=event_keys, keep="first")

    # Name the output column in the "Fluid - Label" format
    new_col_name = f"{fluid_name} - {label_name}"
    combined_df = combined_df.rename(columns={column_name: new_col_name})

    return combined_df[["subject_id", "charttime_target", new_col_name]]

In [107]:
# Start from a copy of the targets so the loaded frame is not modified
intm_df_1 = targets.copy()

# Attach one column per numeric lab feature
for feature in tqdm(numeric_features, desc="Processing Numeric Lab Features", unit="feature"):
    # maxsplit=1: only the first " - " separates the fluid from the label
    fluid, label = feature.split(" - ", 1)
    lab_feature_df = get_lab_value_with_fallback(label, fluid, targets, labevents_df, column_name="value", is_flag=False)
    # validate raises MergeError if the feature frame has duplicate keys,
    # which would otherwise silently multiply the target rows
    intm_df_1 = intm_df_1.merge(
        lab_feature_df,
        on=["subject_id", "charttime_target"],
        how="left",
        validate="many_to_one",
    )

intm_df_1.head()

Processing Numeric Lab Features: 100%|████████████████████████████████████████████| 36/36 [05:25<00:00,  9.04s/feature]


Unnamed: 0,subject_id,hadm_id,org_name,ab_name,charttime,interpretation,susceptible_flag,charttime_target,Blood - Hematocrit,Blood - Platelet Count,Blood - MCV,Blood - Red Blood Cells,Blood - RDW,Blood - MCH,Blood - MCHC,Blood - White Blood Cells,Blood - Creatinine,Blood - Urea Nitrogen,Blood - Potassium,Blood - Sodium,Blood - Chloride,Blood - Bicarbonate,Blood - Anion Gap,Blood - Hemoglobin,Blood - Basophils,Blood - Lymphocytes,Blood - Eosinophils,"Blood - Calcium, Total",Blood - Alanine Aminotransferase (ALT),Blood - Asparate Aminotransferase (AST),Blood - Magnesium,Blood - Phosphate,Blood - INR(PT),Blood - PT,Blood - Alkaline Phosphatase,"Blood - Bilirubin, Total",Blood - Albumin,Blood - RDW-SD,Blood - Neutrophils,Blood - Immature Granulocytes,Blood - Lactate,Blood - PTT,Urine - Specific Gravity,Urine - pH
0,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5
1,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5
2,10000826,20032235.0,ESCHERICHIA COLI,CEFAZOLIN,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5
3,10000826,20032235.0,ESCHERICHIA COLI,CEFEPIME,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5
4,10000826,20032235.0,ESCHERICHIA COLI,CEFTAZIDIME,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5


In [130]:
# Continue from a copy of the frame that already holds the numeric features
intm_df_2 = intm_df_1.copy()

# Attach one binary column per flag lab feature
for feature in tqdm(flag_features, desc="Processing Flag Lab Features", unit="feature"):
    # maxsplit=1: only the first " - " separates the fluid from the label
    fluid, label = feature.split(" - ", 1)
    lab_feature_df = get_lab_value_with_fallback(label, fluid, targets, labevents_df, column_name="flag", is_flag=True)
    # validate raises MergeError if the feature frame has duplicate keys,
    # which would otherwise silently multiply the target rows
    intm_df_2 = intm_df_2.merge(
        lab_feature_df,
        on=["subject_id", "charttime_target"],
        how="left",
        validate="many_to_one",
    )


intm_df_2.head()

Processing Flag Lab Features: 100%|███████████████████████████████████████████████| 10/10 [00:56<00:00,  5.68s/feature]


Unnamed: 0,subject_id,hadm_id,org_name,ab_name,charttime,interpretation,susceptible_flag,charttime_target,Blood - Hematocrit,Blood - Platelet Count,Blood - MCV,Blood - Red Blood Cells,Blood - RDW,Blood - MCH,Blood - MCHC,Blood - White Blood Cells,Blood - Creatinine,Blood - Urea Nitrogen,Blood - Potassium,Blood - Sodium,Blood - Chloride,Blood - Bicarbonate,Blood - Anion Gap,Blood - Hemoglobin,Blood - Basophils,Blood - Lymphocytes,Blood - Eosinophils,"Blood - Calcium, Total",Blood - Alanine Aminotransferase (ALT),Blood - Asparate Aminotransferase (AST),Blood - Magnesium,Blood - Phosphate,Blood - INR(PT),Blood - PT,Blood - Alkaline Phosphatase,"Blood - Bilirubin, Total",Blood - Albumin,Blood - RDW-SD,Blood - Neutrophils,Blood - Immature Granulocytes,Blood - Lactate,Blood - PTT,Urine - Specific Gravity,Urine - pH,Blood - Glucose,Blood - Monocytes,Urine - Urine Color,Urine - Urine Appearance,Urine - Ketone,Urine - Glucose,Urine - Protein,Urine - Epithelial Cells,Urine - RBC,Urine - WBC
0,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,10000826,20032235.0,ESCHERICHIA COLI,CEFAZOLIN,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,10000826,20032235.0,ESCHERICHIA COLI,CEFEPIME,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,10000826,20032235.0,ESCHERICHIA COLI,CEFTAZIDIME,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189,106,3.67,13.2,34.8,32.7,11.6,0.4,5,3.3,134,96,30,11,12.8,,,,7.7,46,187,1.7,2.1,1.7,18.5,299,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [131]:
# Row counts should match: the feature merges are left joins onto the targets
print(targets.shape, intm_df_2.shape, sep="\n")

(504738, 8)
(504738, 54)


In [132]:
# Export checkpoint 1: targets with the numeric and flag lab features attached.
# `final_dataset` is not defined anywhere in this notebook (NameError on a
# fresh kernel); the frame built by the preceding cells is `intm_df_2`.
intm_df_2.to_csv("../mimic-iv-3.1/targets_w_lab_features.csv.gz", compression="gzip", index=False)

### Cleaning 1

In [133]:
import numpy as np

# Placeholder strings that carry no usable result; every exact occurrence
# anywhere in the frame is turned into NaN in a single pass.
intm_df_2.replace(["___", "ERROR", "NotDone", "UNABLE TO REPORT"], np.nan, inplace=True)

In [134]:
# Cast every numeric lab feature to a numeric dtype.
# errors='coerce' turns anything unparsable into NaN instead of raising, so
# the former `except ValueError` branch could never run and has been removed.
for col in numeric_features:
    intm_df_2[col] = pd.to_numeric(intm_df_2[col], errors="coerce")

In [135]:
# Export checkpoint 2: the cleaned frame with numeric features cast to numbers
intm_df_2.to_csv(
    path_or_buf="../mimic-iv-3.1/targets_w_lab_features_v2.csv.gz",
    index=False,
    compression="gzip",
)

In [136]:
# Number of missing values in every column of the cleaned frame
# (identifier columns included, not only the lab features)
nan_counts = intm_df_2.isnull().sum()

print(nan_counts)

subject_id                                      0
hadm_id                                    321967
org_name                                        0
ab_name                                         0
charttime                                       0
interpretation                                  0
susceptible_flag                                0
charttime_target                                0
Blood - Hematocrit                          48598
Blood - Platelet Count                      51996
Blood - MCV                                 47240
Blood - Red Blood Cells                     41891
Blood - RDW                                 41876
Blood - MCH                                 41828
Blood - MCHC                                41826
Blood - White Blood Cells                   50783
Blood - Creatinine                          50738
Blood - Urea Nitrogen                       47044
Blood - Potassium                          121961
Blood - Sodium                             110031


### Categorical Features

In [137]:
# Continue from a copy of the cleaned numeric/flag frame
intm_df_3 = intm_df_2.copy()

# Urine tests kept as categorical text features, named "<fluid> - <label>"
cat_features = [
    f"Urine - {urine_label}"
    for urine_label in ("Leukocytes", "Nitrite", "Bilirubin", "Blood", "Yeast")
]

In [138]:
def get_lab_value_with_comments_fallback(label_name, fluid_name, targets, labevents_df):
    """
    Pick one lab result per target event, using 'comments' when 'value' is missing.

    A target event is a distinct (subject_id, charttime) pair in ``targets``.
    For each event the latest result charted strictly before the event's
    charttime is used. If there is none, the earliest result charted on the
    same calendar date as the event is used instead. Events with neither
    get no row in the output.

    Parameters
    ----------
    label_name : str
        Value of the ``label`` column identifying the lab test.
    fluid_name : str
        Value of the ``fluid`` column identifying the specimen type.
    targets : pandas.DataFrame
        Must contain ``subject_id`` and ``charttime``. Repeated rows for the
        same event are collapsed before matching.
    labevents_df : pandas.DataFrame
        Must contain ``subject_id``, ``charttime``, ``label``, ``fluid``,
        ``value`` and ``comments``.

    Returns
    -------
    pandas.DataFrame
        Columns ``subject_id``, ``charttime_target`` (datetime) and
        ``"<fluid_name> - <label_name>"``, with at most one row per event.
    """
    event_keys = ["subject_id", "charttime_target"]

    # Rows of the requested test, restricted to the columns that are needed
    is_requested = (labevents_df["label"] == label_name) & (labevents_df["fluid"] == fluid_name)
    lab_df = labevents_df.loc[is_requested, ["subject_id", "charttime", "value", "comments"]].copy()
    # No-op when the caller already parsed the column; protects against string timestamps
    lab_df["charttime"] = pd.to_datetime(lab_df["charttime"])

    # Where 'value' is missing, fall back to the text in 'comments'
    lab_df["value"] = lab_df["value"].fillna(lab_df["comments"])

    # One row per distinct event. Without this every lab row would be repeated
    # once per duplicate target row, only to be thrown away by the
    # drop_duplicates calls below.
    target_times = targets[["subject_id", "charttime"]].copy()
    target_times["charttime"] = pd.to_datetime(target_times["charttime"])
    target_times = target_times.drop_duplicates()

    # Pair every lab result with every event of the same subject. The shared
    # "charttime" name becomes charttime_lab / charttime_target. An inner join
    # suffices: labs without any event can never be selected.
    merged_df = lab_df.merge(target_times, on="subject_id", how="inner", suffixes=("_lab", "_target"))

    # 1. Latest lab result strictly before the event
    before_event = merged_df["charttime_lab"] < merged_df["charttime_target"]
    pre_infection_df = (
        merged_df[before_event]
        .sort_values(by=event_keys + ["charttime_lab"], ascending=[True, True, False])
        .drop_duplicates(subset=event_keys, keep="first")
    )

    # 2. Fallback: earliest lab result on the same calendar date as the event
    on_same_date = merged_df["charttime_lab"].dt.normalize() == merged_df["charttime_target"].dt.normalize()
    same_date_df = (
        merged_df[on_same_date]
        .sort_values(by=event_keys + ["charttime_lab"], ascending=[True, True, True])
        .drop_duplicates(subset=event_keys, keep="first")
    )

    # Pre-event rows come first, so keep="first" only falls back to the
    # same-date row when no pre-event row exists for that event.
    combined_df = pd.concat([pre_infection_df, same_date_df]).drop_duplicates(subset=event_keys, keep="first")

    # Name the output column in the "Fluid - Label" format
    new_col_name = f"{fluid_name} - {label_name}"
    combined_df = combined_df.rename(columns={"value": new_col_name})

    return combined_df[["subject_id", "charttime_target", new_col_name]]


In [139]:
# Attach one text column per categorical lab feature
for feature in tqdm(cat_features, desc="Processing Comment-Based Features", unit="feature"):
    # maxsplit=1: only the first " - " separates the fluid from the label
    fluid, label = feature.split(" - ", 1)
    lab_feature_df = get_lab_value_with_comments_fallback(label, fluid, targets, labevents_df)
    # validate raises MergeError if the feature frame has duplicate keys,
    # which would otherwise silently multiply the target rows
    intm_df_3 = intm_df_3.merge(
        lab_feature_df,
        on=["subject_id", "charttime_target"],
        how="left",
        validate="many_to_one",
    )

Processing Comment-Based Features: 100%|████████████████████████████████████████████| 5/5 [00:25<00:00,  5.07s/feature]


In [140]:
# Frequency table of the raw categories of each categorical feature,
# each followed by two blank lines
for feature in cat_features:
    print(intm_df_3[feature].value_counts(), end="\n\n\n")

Urine - Leukocytes
NEG.                 150996
LG.                   72609
LG*.                  45338
NEG                   36680
MOD.                  24443
SM .                  21602
TR.                   18935
MOD                   17096
MOD*.                 14976
SM*.                  12654
SM                    10881
TR                     9576
TR*.                   8976
LG                     5735
SMALL                   369
UNABLE TO REPORT.       180
TRACE                   170
___                     167
LARGE                   154
Name: count, dtype: int64


Urine - Nitrite
NEG.                 300809
NEG                   62317
POS.                  42438
POS*.                 27289
POS                   18311
___                     206
UNABLE TO REPORT.       167
Name: count, dtype: int64


Urine - Bilirubin
NEG.                 356180
NEG                   69911
SM                     7327
SM .                   5816
SM*.                   3550
MOD.                   

In [141]:
# Raw category spelling -> standard category. None marks "no usable result".
category_mapping = {
    "NEG.": "NEG", "NEG": "NEG",
    "POS.": "POS", "POS*": "POS", "POS": "POS",
    "LG.": "LARGE", "LG*": "LARGE", "LG": "LARGE",
    "MOD.": "MODERATE", "MOD*": "MODERATE", "MOD": "MODERATE",
    "SM .": "SMALL", "SM*": "SMALL", "SM": "SMALL", "SMALL": "SMALL",
    "TR.": "TRACE", "TR*": "TRACE", "TR": "TRACE", "TRACE": "TRACE",
    "LGE": "LARGE", "LGE.": "LARGE", "LARGE": "LARGE",
    "OCC": "OCCASIONAL", "OCC.": "OCCASIONAL", "OCC*": "OCCASIONAL",
    "FEW": "FEW", "FEW.": "FEW", "FEW*": "FEW",
    "RARE": "RARE", "RARE.": "RARE", "RARE*": "RARE",
    "MANY": "MANY", "MANY.": "MANY", "MANY*": "MANY",

    "___": None, "UNABLE TO REPORT.": None,
}


def _standardize_category(raw):
    """
    Map one raw category to its standard spelling.

    Exact matches in ``category_mapping`` win. Otherwise trailing spaces, dots
    and asterisks are stripped before the lookup, so spellings such as
    'LG*.' or 'POS*.' (which occur in the data but are not keys of the
    mapping) resolve through 'LG' / 'POS'. Non-string values (e.g. NaN) and
    strings with no match are returned unchanged.
    """
    if not isinstance(raw, str):
        return raw
    if raw in category_mapping:
        return category_mapping[raw]
    return category_mapping.get(raw.strip().rstrip(" .*"), raw)


# Standardize every categorical feature. Re-running the cell is harmless
# because the standard spellings map to themselves.
for feature in cat_features:
    intm_df_3[feature] = intm_df_3[feature].map(_standardize_category)

intm_df_3.head()

Unnamed: 0,subject_id,hadm_id,org_name,ab_name,charttime,interpretation,susceptible_flag,charttime_target,Blood - Hematocrit,Blood - Platelet Count,Blood - MCV,Blood - Red Blood Cells,Blood - RDW,Blood - MCH,Blood - MCHC,Blood - White Blood Cells,Blood - Creatinine,Blood - Urea Nitrogen,Blood - Potassium,Blood - Sodium,Blood - Chloride,Blood - Bicarbonate,Blood - Anion Gap,Blood - Hemoglobin,Blood - Basophils,Blood - Lymphocytes,Blood - Eosinophils,"Blood - Calcium, Total",Blood - Alanine Aminotransferase (ALT),Blood - Asparate Aminotransferase (AST),Blood - Magnesium,Blood - Phosphate,Blood - INR(PT),Blood - PT,Blood - Alkaline Phosphatase,"Blood - Bilirubin, Total",Blood - Albumin,Blood - RDW-SD,Blood - Neutrophils,Blood - Immature Granulocytes,Blood - Lactate,Blood - PTT,Urine - Specific Gravity,Urine - pH,Blood - Glucose,Blood - Monocytes,Urine - Urine Color,Urine - Urine Appearance,Urine - Ketone,Urine - Glucose,Urine - Protein,Urine - Epithelial Cells,Urine - RBC,Urine - WBC,Urine - Leukocytes,Urine - Nitrite,Urine - Bilirubin,Urine - Blood,Urine - Yeast
0,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189.0,106.0,3.67,13.2,34.8,32.7,11.6,0.4,5.0,3.3,134.0,96.0,30.0,11.0,12.8,,,,7.7,46.0,187.0,1.7,2.1,1.7,18.5,299.0,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,SMALL,POS,SMALL,NEG,NONE
1,10000826,20032235.0,ESCHERICHIA COLI,AMPICILLIN/SULBACTAM,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189.0,106.0,3.67,13.2,34.8,32.7,11.6,0.4,5.0,3.3,134.0,96.0,30.0,11.0,12.8,,,,7.7,46.0,187.0,1.7,2.1,1.7,18.5,299.0,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,SMALL,POS,SMALL,NEG,NONE
2,10000826,20032235.0,ESCHERICHIA COLI,CEFAZOLIN,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189.0,106.0,3.67,13.2,34.8,32.7,11.6,0.4,5.0,3.3,134.0,96.0,30.0,11.0,12.8,,,,7.7,46.0,187.0,1.7,2.1,1.7,18.5,299.0,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,SMALL,POS,SMALL,NEG,NONE
3,10000826,20032235.0,ESCHERICHIA COLI,CEFEPIME,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189.0,106.0,3.67,13.2,34.8,32.7,11.6,0.4,5.0,3.3,134.0,96.0,30.0,11.0,12.8,,,,7.7,46.0,187.0,1.7,2.1,1.7,18.5,299.0,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,SMALL,POS,SMALL,NEG,NONE
4,10000826,20032235.0,ESCHERICHIA COLI,CEFTAZIDIME,2146-12-08 22:22:00,S,1.0,2146-12-08 22:22:00,39.0,189.0,106.0,3.67,13.2,34.8,32.7,11.6,0.4,5.0,3.3,134.0,96.0,30.0,11.0,12.8,,,,7.7,46.0,187.0,1.7,2.1,1.7,18.5,299.0,2.1,2.7,,,,,32.4,1.016,6.5,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,SMALL,POS,SMALL,NEG,NONE


### Remove low count features

In [148]:
# Variant "3a": drop every column with more than 200,000 missing values.
# NOTE(review): the count runs over all columns, so the identifier hadm_id
# is dropped too (see the printed column list below) — confirm this is intended.
nan_threshold = 200000

columns_to_drop_200 = intm_df_3.columns[intm_df_3.isnull().sum().gt(nan_threshold)]

intm_df_3a = intm_df_3.drop(columns_to_drop_200, axis="columns")

In [150]:
# Variant "3b": drop every column with more than 150,000 missing values.
# NOTE(review): the count runs over all columns, so the identifier hadm_id
# is dropped too (see the printed column list below) — confirm this is intended.
nan_threshold = 150000

columns_to_drop_150 = intm_df_3.columns[intm_df_3.isnull().sum().gt(nan_threshold)]

intm_df_3b = intm_df_3.drop(columns_to_drop_150, axis="columns")

In [151]:
# Shapes of the full frame and of the two reduced variants, one per line
print(intm_df_3.shape, intm_df_3a.shape, intm_df_3b.shape, sep="\n")

(504738, 59)
(504738, 54)
(504738, 45)


In [153]:
# Columns removed by the 200,000 and the 150,000 NaN thresholds, in that order
print(columns_to_drop_200, columns_to_drop_150, sep="\n")

Index(['hadm_id', 'Blood - Albumin', 'Blood - RDW-SD',
       'Blood - Immature Granulocytes', 'Blood - Lactate'],
      dtype='object')
Index(['hadm_id', 'Blood - Alanine Aminotransferase (ALT)',
       'Blood - Asparate Aminotransferase (AST)', 'Blood - Magnesium',
       'Blood - Phosphate', 'Blood - INR(PT)', 'Blood - PT',
       'Blood - Alkaline Phosphatase', 'Blood - Bilirubin, Total',
       'Blood - Albumin', 'Blood - RDW-SD', 'Blood - Immature Granulocytes',
       'Blood - Lactate', 'Blood - PTT'],
      dtype='object')


### Export

In [152]:
# Export checkpoint 3: the full frame (v3) and the two reduced variants
# (v3a: 200,000 NaN threshold, v3b: 150,000 NaN threshold)
intm_df_3.to_csv(
    path_or_buf="../mimic-iv-3.1/targets_w_lab_features_v3.csv.gz",
    index=False,
    compression="gzip",
)
intm_df_3a.to_csv(
    path_or_buf="../mimic-iv-3.1/targets_w_lab_features_v3a.csv.gz",
    index=False,
    compression="gzip",
)
intm_df_3b.to_csv(
    path_or_buf="../mimic-iv-3.1/targets_w_lab_features_v3b.csv.gz",
    index=False,
    compression="gzip",
)

In [144]:
# Missing values per column of the full (v3) frame
intm_df_3.isnull().sum()

subject_id                                      0
hadm_id                                    321967
org_name                                        0
ab_name                                         0
charttime                                       0
interpretation                                  0
susceptible_flag                                0
charttime_target                                0
Blood - Hematocrit                          48598
Blood - Platelet Count                      51996
Blood - MCV                                 47240
Blood - Red Blood Cells                     41891
Blood - RDW                                 41876
Blood - MCH                                 41828
Blood - MCHC                                41826
Blood - White Blood Cells                   50783
Blood - Creatinine                          50738
Blood - Urea Nitrogen                       47044
Blood - Potassium                          121961
Blood - Sodium                             110031
