In [1]:
import optuna
from optuna.samplers import TPESampler
import warnings

import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBClassifier
from pathlib import Path

# Suppress every warning category for the rest of the session.
# NOTE(review): this would also hide any pandas/sklearn warnings emitted by later cells.
warnings.filterwarnings("ignore")

### Extract Features from omr.csv

In [2]:
# Load the cohort table; later cells read its "subject_id" column to build the patient set.
# NOTE(review): the DataFrame shares its name with that column, which is easy to misread.
subject_id = pd.read_csv("patients_subject_id.csv")

In [3]:
# ---------- USER: set these ----------
# NOTE(review): omr_path is an absolute, machine-specific location; adjust before running elsewhere.
omr_path = Path("E:/Chrome Dls/MIMIC_IV_Core/hosp/omr.csv")   # or "omr.csv"
pat_set = set(subject_id["subject_id"].values)   # cohort subject_ids used to filter OMR rows
chunksize = 500_000             # rows per chunk handed to pd.read_csv; tune depending on memory
out_path = Path("omr_summary.csv")   # cache file for the per-subject summary
# ------------------------------------

# Reuse the cached summary when it exists; otherwise the else-branch rebuilds it from omr_path.
if out_path.exists():
    omr_summary = pd.read_csv(out_path)
else:
    # Maps a lower-cased, stripped result_name -> (canonical measure name, unit hint).
    # A unit hint of None means the value is used without any unit conversion.
    NAME_MAP = {
        # weight: some rows are "Weight (Lbs)" (imperial) and some "Weight" (unit unknown)
        # NOTE(review): plain "weight"/"height" values are stored unconverted under the
        # kg/cm measure names -- confirm their units against the data dictionary.
        "weight (lbs)": ("weight_kg", "lbs"),
        "weight": ("weight_kg", None),
        # height variants
        "height (inches)": ("height_cm", "in"),
        "height": ("height_cm", None),
        # BMI variants
        "bmi (kg/m2)": ("bmi", None),
        "bmi": ("bmi", None),
        # blood pressure variants (all postures share the "bp" measure; split into sys/dia later)
        "blood pressure": ("bp", None),
        "blood pressure sitting": ("bp", None),
        "blood pressure standing (1 min)": ("bp", None),
        "blood pressure lying": ("bp", None),
        "blood pressure standing": ("bp", None),
        "blood pressure standing (3 mins)": ("bp", None),
        # kidney function
        "egfr": ("egfr", None),
    }

    # Running accumulators so per-subject means can be computed without holding all rows in memory.
    sums = {}            # key: (subject_id, measure) -> float sum
    counts = {}          # key -> int count

    # helper functions
    def normalize_name(name):
        """Return ``name`` stripped and lower-cased, or ``None`` for non-string input."""
        return name.strip().lower() if isinstance(name, str) else None

    # parse a result_value into numeric(s)
    def parse_value_for_measure(measure_key, raw_value):
        """Parse a raw ``result_value`` cell into numeric form.

        Parameters
        ----------
        measure_key : str
            Canonical measure name; ``"bp"`` selects blood-pressure parsing.
        raw_value : object
            Raw cell value; it is converted with ``str()`` before parsing.

        Returns
        -------
        tuple | float | None
            For ``"bp"``: ``(systolic, diastolic)`` taken from the first two digit
            runs separated by at least one non-digit (``"120/80"``, ``"120 / 80"``,
            ``"120/80 mmHg"``), each converted with ``pd.to_numeric``.
            For any other measure: the first number found in the text, as ``float``.
            ``None`` when the value is missing or holds no usable number.
        """
        if pd.isna(raw_value):
            return None

        text = str(raw_value).strip()

        if measure_key == "bp":
            # Any text containing two digit runs matches this pattern, so no
            # separate "find two numbers" fallback is needed.
            bp_match = re.search(r'(\d+)\D+(\d+)', text)
            if bp_match is None:
                return None
            systolic = pd.to_numeric(bp_match.group(1), errors='coerce')
            diastolic = pd.to_numeric(bp_match.group(2), errors='coerce')
            return (systolic, diastolic)

        # The alternation is grouped so an optional sign applies to integers as
        # well as decimals ("-5" -> -5.0, not 5.0).
        num_match = re.search(r'[-+]?(?:\d*\.\d+|\d+)', text)
        if num_match is None:
            return None
        try:
            return float(num_match.group(0))
        except ValueError:
            return None

    # conversion helpers
    def lbs_to_kg(x):
        """Convert a weight in pounds to kilograms."""
        return x * 0.45359237

    def inches_to_cm(x):
        """Convert a length in inches to centimetres."""
        return x * 2.54

    # process file in chunks
    print("Reading and aggregating OMR in chunks...")
    # Only the columns the aggregation actually reads are loaded from disk.
    read_kwargs = {"usecols": ['subject_id', 'result_name', 'result_value'],
                   "chunksize": chunksize}
    if omr_path.suffix == ".gz":
        read_kwargs["compression"] = "gzip"

    def _accumulate(subject, measure_name, value):
        """Add one observation to the running sum/count for (subject, measure_name)."""
        key = (subject, measure_name)
        sums[key] = sums.get(key, 0.0) + float(value)
        counts[key] = counts.get(key, 0) + 1

    for chunk in pd.read_csv(omr_path, **read_kwargs, low_memory=False):
        # Filter by subject set early to reduce memory. The explicit copy makes the
        # column assignment below write to an independent frame instead of a slice.
        chunk = chunk[chunk['subject_id'].isin(pat_set)].copy()
        if chunk.empty:
            continue

        # normalize result_name so it can be looked up in NAME_MAP
        chunk['result_name_norm'] = chunk['result_name'].str.lower().str.strip()

        # Drop unmapped result names up front so the Python-level loop only
        # visits rows that can contribute to a measure.
        chunk = chunk[chunk['result_name_norm'].isin(list(NAME_MAP))]

        for row in chunk.itertuples(index=False):
            # measure: 'weight_kg', 'height_cm', 'bp', etc.
            measure, unit_hint = NAME_MAP[row.result_name_norm]
            parsed = parse_value_for_measure(measure, row.result_value)
            if parsed is None:
                continue

            if measure == "bp":
                # parsed is a tuple (sys, dia); each side is accumulated independently
                sys_val, dia_val = parsed
                if pd.notna(sys_val):
                    _accumulate(row.subject_id, "bp_sys", sys_val)
                if pd.notna(dia_val):
                    _accumulate(row.subject_id, "bp_dia", dia_val)
                continue

            val = parsed
            if isinstance(val, float) and np.isnan(val):
                continue

            # convert units only when the result_name carried an explicit unit hint
            if measure == "weight_kg" and unit_hint == "lbs":
                val = lbs_to_kg(val)
            elif measure == "height_cm" and unit_hint == "in":
                val = inches_to_cm(val)

            _accumulate(row.subject_id, measure, val)

    print("Finished reading chunks. Building per-subject summary...")

    # Fixed output schema: subject_id followed by <measure>_mean / <measure>_count pairs.
    measures = ["weight_kg", "height_cm", "bmi", "bp_sys", "bp_dia", "egfr"]
    summary_columns = ["subject_id"] + [
        f"{m}_{stat}" for m in measures for stat in ("mean", "count")
    ]

    # build rows per subject (every subject with at least one accumulated value)
    subjects = sorted({k[0] for k in counts.keys()})
    rows = []
    for subj in subjects:
        row = {"subject_id": subj}
        for m in measures:
            key = (subj, m)
            c = counts.get(key, 0)
            row[f"{m}_mean"] = (sums.get(key, 0.0) / c) if c > 0 else np.nan
            row[f"{m}_count"] = c
        rows.append(row)

    # Passing the columns explicitly keeps the schema intact even when no subject
    # matched, so the BMI back-fill below cannot fail on a missing column.
    omr_summary = pd.DataFrame(rows, columns=summary_columns)

    # Optional: if BMI missing but weight & height available, compute BMI = weight_kg / (height_m ** 2).
    # A non-positive mean height is skipped so the division cannot yield an infinite BMI.
    # The derived value leaves bmi_count at 0, since no BMI row was actually recorded.
    mask_bmi_na = omr_summary['bmi_mean'].isna()
    have_wt_ht = (
        mask_bmi_na
        & omr_summary['weight_kg_mean'].notna()
        & omr_summary['height_cm_mean'].notna()
        & (omr_summary['height_cm_mean'] > 0)
    )
    if have_wt_ht.any():
        wt = omr_summary.loc[have_wt_ht, 'weight_kg_mean']
        ht_m = omr_summary.loc[have_wt_ht, 'height_cm_mean'] / 100.0
        omr_summary.loc[have_wt_ht, 'bmi_mean'] = wt / (ht_m ** 2)

    # Save compact file (this is the cache read back when out_path already exists)
    omr_summary.to_csv(out_path, index=False)
    print("Saved summary to:", out_path)
    print("Summary shape:", omr_summary.shape)


Reading and aggregating OMR in chunks...
Finished reading chunks. Building per-subject summary...
Saved summary to: omr_summary.csv
Summary shape: (25233, 13)


In [4]:
# Preview the first rows of the per-subject OMR summary.
omr_summary.head()

Unnamed: 0,subject_id,weight_kg_mean,weight_kg_count,height_cm_mean,height_cm_count,bmi_mean,bmi_count,bp_sys_mean,bp_sys_count,bp_dia_mean,bp_dia_count,egfr_mean,egfr_count
0,10000032,42.231264,25,152.4,2,18.5375,8,106.166667,6,64.666667,6,,0
1,10000084,77.110703,1,177.8,1,24.4,1,,0,,0,,0
2,10000117,50.407558,28,163.98875,16,18.548,25,114.805556,72,71.847222,72,,0
3,10000161,,0,,0,,0,126.5,2,87.5,2,,0
4,10000248,76.203518,1,172.72,1,25.5,1,,0,,0,,0


# Optional: which patients are missing from OMR

In [5]:
# Distinct subject_ids on each side (duplicates collapse when the sets are built)
patients_ids = set(subject_id['subject_id'].unique())
omr_ids = set(omr_summary['subject_id'].unique())

# Cohort patients that have no row in the OMR summary
missing_in_omr = patients_ids.difference(omr_ids)
print("Count missing in omr:", len(missing_in_omr))

# Same ids as a one-column DataFrame, in ascending order
missing_df = pd.DataFrame(sorted(missing_in_omr), columns=['subject_id'])


Count missing in omr: 11666


In [7]:
# Patients without an OMR summary row, taken from missing_df['subject_id']
missing_ids = set(missing_df['subject_id'].unique())

# Only these chartevents columns are loaded
usecols = ['subject_id', 'itemid', 'value', 'valueuom']
chunk_size = 1_000_000  # rows per chunk; adjust depending on your RAM

results = []

for chunk in pd.read_csv("E:/Chrome Dls/MIMIC_IV_Core/icu/chartevents.csv", usecols=usecols, chunksize=chunk_size):
    # Keep rows that belong to a missing patient AND carry a kg / Inches unit label.
    # NOTE(review): rows are selected by unit label rather than by itemid, and the
    # preview printed below shows only 'kg' rows -- confirm 'Inches' matches the file.
    chunk = chunk.loc[
        chunk['subject_id'].isin(missing_ids)
        & chunk['valueuom'].isin(['kg', 'Inches'])
    ]
    results.append(chunk)

chartevents_subset = pd.concat(results, ignore_index=True)
print(chartevents_subset.head())
print("Unique patients found in chartevents:", chartevents_subset['subject_id'].nunique())


   subject_id  itemid value valueuom
0    10002114  224639  71.5       kg
1    10002114  226512  64.1       kg
2    10002114  224639  64.1       kg
3    10002667  226512  87.7       kg
4    10002667  224639  89.6       kg
Unique patients found in chartevents: 3598
