In [1]:
import optuna
from optuna.samplers import TPESampler
import warnings

import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import math
from collections import defaultdict
import seaborn as sns
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBClassifier
from pathlib import Path

warnings.filterwarnings("ignore")

### Select the most important chart-event items from chartevents.csv

In [2]:
# Load the item dictionary (d_items.csv); the cells below use its
# itemid / label / unitname / param_type columns.
d_items_df = pd.read_csv("E:/Chrome Dls/MIMIC_IV_Core/icu/d_items.csv")

In [3]:
# Requires `d_items_df` (the d_items table) to be loaded already.
# Terms searched for, case-insensitively, inside the item labels.
keywords = [
    "arterial bp",
    "blood pressure",
    "heart rate",
    "respiratory rate",
    "temperature",
    "weight",
    "height",
    "bmi",
]
# Labels mentioning alarms, thresholds, upper/lower limits or alerts are
# discarded in a second pass.
exclude_keywords = ["alarm", "threshold", "upper", "lower", "alert"]

# Pass 1: keep every item whose lower-cased label contains any keyword.
mask = d_items_df['label'].str.lower().str.contains("|".join(keywords), na=False)
candidates = d_items_df[mask].copy()

# Pass 2: drop the excluded label families, then renumber the rows
# (the previous index is kept as an 'index' column).
mask2 = ~candidates['label'].str.lower().str.contains("|".join(exclude_keywords), na=False)
candidates = candidates[mask2].reset_index()

candidates[['itemid','label','unitname','param_type']]


Unnamed: 0,itemid,label,unitname,param_type
0,220045,Heart Rate,bpm,Numeric
1,220050,Arterial Blood Pressure systolic,mmHg,Numeric
2,220051,Arterial Blood Pressure diastolic,mmHg,Numeric
3,220052,Arterial Blood Pressure mean,mmHg,Numeric
4,220179,Non Invasive Blood Pressure systolic,mmHg,Numeric
5,220180,Non Invasive Blood Pressure diastolic,mmHg,Numeric
6,220181,Non Invasive Blood Pressure mean,mmHg,Numeric
7,220210,Respiratory Rate,insp/min,Numeric
8,223761,Temperature Fahrenheit,°F,Numeric
9,223762,Temperature Celsius,°C,Numeric


In [4]:
# The 7 chart items summarised per subject (itemid -> label).
candidate_items = (
    220045,  # Heart Rate
    220179,  # Non Invasive Blood Pressure systolic
    220180,  # Non Invasive Blood Pressure diastolic
    220210,  # Respiratory Rate
    223762,  # Temperature Celsius
    226512,  # Admission Weight (Kg)
    226730,  # Height (cm)
)

### Create summary

In [5]:
# Patient cohort table; it is expected to carry a `subject_id` column, which
# the settings cell below turns into the set of subjects to keep.
subject_id = pd.read_csv("patients_subject_id.csv")

In [6]:
# ---------------- USER SETTINGS ----------------
# Input: the large chartevents table, streamed in chunks further down.
chartevents_path = "E:/Chrome Dls/MIMIC_IV_Core/icu/chartevents.csv"   # or full path
chunksize = 1_000_000   # safe starting point for 16GB RAM
usecols = [
    'subject_id',
    'hadm_id',
    'itemid',
    'charttime',
    'valuenum',
    'value',
    'valueuom',
    'warning',
]
# Output: per-subject summary CSV (also used as a cache by the next step).
out_path = Path("charteevents_subject_summary.csv")
# Only subjects listed in the patient table are summarised.
pat_set = set(subject_id["subject_id"].values)
# ------------------------------------------------

# Normalise candidate_items to a set of plain ints for fast membership tests.
candidate_items = {int(x) for x in candidate_items}

# Build (or load from cache) one row per subject with mean / std / min / max /
# count of every candidate item.
# NOTE(review): a later cell in this notebook writes a means-only table back to
# `out_path`; once that has happened, the cache branch below loads that reduced
# table instead of the full per-item summary built in the else-branch — confirm
# this is intended.
if out_path.exists():
    result = pd.read_csv(out_path)
else:
    # Global running aggregates keyed by (subject_id, itemid). Only sum, count,
    # sum of squares, min and max are kept, so each chunk can be folded in
    # without retaining the raw measurements.
    sum_dict   = defaultdict(float)
    count_dict = defaultdict(int)
    sumsq_dict = defaultdict(float)
    min_dict   = {}
    max_dict   = {}

    def update_accumulators(grp_df):
        """Fold one chunk's grouped aggregates into the global accumulators.

        Parameters
        ----------
        grp_df : pandas.DataFrame
            Indexed by a (subject_id, itemid) MultiIndex, with the columns
            'sum', 'count', 'sumsq', 'min' and 'max'.
        """
        # Walk the columns in lockstep instead of DataFrame.iterrows(), which
        # materialises a Series for every row.
        stats = zip(
            grp_df.index,
            grp_df['sum'],
            grp_df['count'],
            grp_df['sumsq'],
            grp_df['min'],
            grp_df['max'],
        )
        for (subj, item), s, c, ssq, mi, ma in stats:
            key = (int(subj), int(item))
            mi = float(mi)
            ma = float(ma)

            sum_dict[key] += float(s)
            count_dict[key] += int(c)
            sumsq_dict[key] += float(ssq)
            # First sighting of a key seeds min/max with the chunk's own value.
            min_dict[key] = min(min_dict.get(key, mi), mi)
            max_dict[key] = max(max_dict.get(key, ma), ma)

    # read in chunks
    read_kwargs = {"usecols": usecols, "chunksize": chunksize, "low_memory": False}
    if Path(chartevents_path).suffix == ".gz":
        read_kwargs["compression"] = "gzip"

    print("Reading chartevents in chunks...")
    for chunk_idx, chunk in enumerate(pd.read_csv(chartevents_path, **read_kwargs)):
        # Filter on itemid (and subject, when a patient set is given) as early
        # as possible. The explicit .copy() makes the filtered frame independent
        # of the raw chunk, so the column assignments below do not write to a
        # slice (SettingWithCopyWarning).
        relevant = chunk['itemid'].isin(candidate_items)
        if pat_set is not None:
            relevant &= chunk['subject_id'].isin(pat_set)
        chunk = chunk.loc[relevant].copy()
        if chunk.empty:
            print(f"Chunk {chunk_idx}: no relevant rows, skipping.")
            continue

        # ensure numeric valuenum; unparsable entries become NaN
        chunk['valuenum'] = pd.to_numeric(chunk['valuenum'], errors='coerce')
        # drop rows without numeric measurements (you may choose to keep non-numeric parsing)
        chunk = chunk.dropna(subset=['valuenum'])
        if chunk.empty:
            continue

        # squared values feed the sum-of-squares accumulator used for the std
        chunk = chunk.assign(valuenum_sq=chunk['valuenum'] * chunk['valuenum'])

        # group per (subject_id, itemid)
        grp = chunk.groupby(['subject_id','itemid'], sort=False).agg(
            sum = ('valuenum', 'sum'),
            count = ('valuenum', 'count'),
            sumsq = ('valuenum_sq', 'sum'),
            min = ('valuenum','min'),
            max = ('valuenum','max')
        )

        # update global accumulators
        update_accumulators(grp)

        # housekeeping
        del chunk, grp
        if (chunk_idx + 1) % 5 == 0:
            print(f"Processed {chunk_idx+1} chunks...")

    print("Finished scanning all chunks. Building final table...")

    # collect unique subject_ids found
    subject_ids = sorted({k[0] for k in count_dict.keys()})
    print("Subjects with at least one candidate measurement:", len(subject_ids))

    # build rows per subject
    item_order = sorted(candidate_items)
    rows = []
    for subj in subject_ids:
        row = {"subject_id": subj}
        for item in item_order:
            key = (subj, item)
            c = count_dict.get(key, 0)
            s = sum_dict.get(key, 0.0)
            ssq = sumsq_dict.get(key, 0.0)
            mi = min_dict.get(key, np.nan)
            ma = max_dict.get(key, np.nan)

            mean = (s / c) if c > 0 else np.nan
            # std: sample std if count>1, else NaN
            if c > 1:
                var = (ssq - (s * s) / c) / (c - 1)
                # rounding in the sum-of-squares formula can leave a negative
                # value; such cases are reported as 0.0
                std = math.sqrt(var) if var >= 0 else 0.0
            else:
                std = np.nan

            row[f"item_{item}_mean"] = mean
            row[f"item_{item}_std"] = std
            row[f"item_{item}_min"] = mi
            row[f"item_{item}_max"] = ma
            row[f"item_{item}_count"] = c
        rows.append(row)

    # Explicit column list: keeps the same column order as the row dicts and
    # guarantees `subject_id` exists even when no measurement was found, so the
    # set_index() below cannot fail on an empty table.
    summary_columns = ["subject_id"] + [
        f"item_{item}_{stat}"
        for item in item_order
        for stat in ("mean", "std", "min", "max", "count")
    ]
    result = pd.DataFrame(rows, columns=summary_columns)

    # Include subjects with no measurements (from pat_set) as all-NaN rows
    if pat_set is not None:
        all_subjects = sorted(pat_set)
        result = result.set_index('subject_id').reindex(all_subjects).reset_index()

    # save
    result.to_csv(out_path, index=False)
    print("Saved summary to:", out_path)
    print("Result shape:", result.shape)


In [10]:
# Sanity check of the summary table: its dimensions, then the first rows.
print(result.shape)
result.head()

(36899, 36)


Unnamed: 0,subject_id,item_220045_mean,item_220045_std,item_220045_min,item_220045_max,item_220045_count,item_220179_mean,item_220179_std,item_220179_min,item_220179_max,...,item_226512_mean,item_226512_std,item_226512_min,item_226512_max,item_226512_count,item_226730_mean,item_226730_std,item_226730_min,item_226730_max,item_226730_count
0,10000032,96.5,4.196559,91.0,105.0,10.0,88.9,4.629615,82.0,95.0,...,39.4,,39.4,39.4,1.0,152.0,,152.0,152.0,1.0
1,10000068,,,,,,,,,,...,,,,,,,,,,
2,10000084,,,,,,,,,,...,,,,,,,,,,
3,10000108,,,,,,,,,,...,,,,,,,,,,
4,10000117,,,,,,,,,,...,,,,,,,,,,


### Map item IDs to item names

In [7]:
# Requires `candidates` (item selection cell) and `result` (summary cell).
# Lookup table: numeric itemid -> label.
mapping = candidates.set_index('itemid')['label'].to_dict()

# Translate every item_<id>_mean column into <label>_mean, with spaces and
# slashes in the label replaced by underscores. Items missing from the lookup
# keep their numeric id as the name. Only the *_mean columns are renamed; the
# std/min/max/count columns are dropped just below.
mean_cols = [c for c in result.columns if c.startswith('item_') and c.endswith('_mean')]
item_ids = [int(c.split('_')[1]) for c in mean_cols]
new_cols = {
    col: f"{mapping.get(item, str(item)).replace(' ','_').replace('/','_')}_mean"
    for col, item in zip(mean_cols, item_ids)
}

result = result.rename(columns=new_cols)

# Keep only subject_id and *_mean columns
result = result.loc[:, ["subject_id"] + [c for c in result.columns if c.endswith("_mean")]]

# NOTE(review): this overwrites the file the summary cell uses as its cache, so
# a later run of that cell loads this means-only table — confirm intended.
result.to_csv(out_path, index=False)

print(result.shape)
result.head(10)

(36899, 8)


Unnamed: 0,subject_id,Heart_Rate_mean,Non_Invasive_Blood_Pressure_systolic_mean,Non_Invasive_Blood_Pressure_diastolic_mean,Respiratory_Rate_mean,Temperature_Celsius_mean,Admission_Weight_(Kg)_mean,Height_(cm)_mean
0,10000032,96.5,88.9,54.1,20.7,,39.4,152.0
1,10000068,,,,,,,
2,10000084,,,,,,,
3,10000108,,,,,,,
4,10000117,,,,,,,
5,10000161,,,,,,,
6,10000248,,,,,,,
7,10000280,,,,,,,
8,10000560,,,,,,,
9,10000635,,,,,,,


In [12]:
# Share of missing entries per column, first as a fraction, then as a percentage.
missing_values_prop = result.isna().mean()
a = 100.0 * missing_values_prop
print("missings proportion:\n", a)

missings proportion:
 subject_id                                     0.000000
Heart_Rate_mean                               70.272907
Non_Invasive_Blood_Pressure_systolic_mean     70.530367
Non_Invasive_Blood_Pressure_diastolic_mean    70.530367
Respiratory_Rate_mean                         70.300008
Temperature_Celsius_mean                      95.685520
Admission_Weight_(Kg)_mean                    70.294588
Height_(cm)_mean                              83.080842
dtype: float64


### Compare subject_id coverage: ICU vs. Hosp

In [15]:
import pandas as pd

# Inputs: the patient list and the big chartevents file.
patients_subject_id = pd.read_csv("patients_subject_id.csv")
chartevents_path = "E:/Chrome Dls/MIMIC_IV_Core/icu/chartevents.csv"
chunksize = 10**6   # adjust if needed (1M rows per chunk)

# Patients of interest.
pat_set = set(patients_subject_id['subject_id'])

# Collect every subject_id present in chartevents. Only that single column is
# read, chunk by chunk, to keep memory use low.
chartevents_subjects = set()
for chunk in pd.read_csv(chartevents_path, usecols=['subject_id'], chunksize=chunksize):
    chartevents_subjects |= set(chunk['subject_id'].unique())

# Patients of interest that never appear in chartevents.
missing_subjects = pat_set - chartevents_subjects

print(
    "\n".join(
        [
            f"Total patients in your set: {len(pat_set)}",
            f"Total patients found in chartevents: {len(pat_set & chartevents_subjects)}",
            f"Patients NOT found in chartevents: {len(missing_subjects)}",
        ]
    )
)

# Save missing IDs for inspection
missing_df = pd.DataFrame(sorted(missing_subjects), columns=['subject_id'])
missing_df.to_csv("missing_from_chartevents.csv", index=False)


Total patients in your set: 36899
Total patients found in chartevents: 10969
Patients NOT found in chartevents: 25930
