In [72]:
from pathlib import Path
import pandas as pd
import pickle
import torch
import torch.backends.cudnn as cudnn
import numpy as np
import random

In [73]:
def read_txt(filename):
    """Read a text file and return its lines as a list of strings.

    Args:
        filename: Path (str or path-like) of the text file to read.

    Returns:
        A list with one string per line, with line terminators removed.
    """
    with open(filename, "r") as file:
        # splitlines() already yields a fresh list without trailing newlines.
        return file.read().splitlines()

In [None]:

# Example locations of the per-task split files:
# /home/data/2025_MIMICIV_processed/mimic4/task:mortality_90days/train_admission_ids_seed_2026.txt
# /home/data/2025_MIMICIV_processed/mimic4/task:readmission_15days
task = 'readmission_15days'  # mortality_90days, readmission_15days

# The task name is "<label attribute>_<horizon>"; split it once and reuse both parts.
task_parts = task.split('_')
task_label_attr = task_parts[0]  # attribute read from each admission object below
task_horizon = task_parts[1]     # suffix of the admission-dictionary pickle file name

all_hosp_adm_dict_path = Path(f"/home/data/2025_MIMICIV_processed/mimic4/hosp_adm_dict_{task_horizon}.pkl")

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle once loading is done.
with open(all_hosp_adm_dict_path, "rb") as f:
    all_hosp_adm_dict = pickle.load(f)

# Keep only admissions whose label for the selected task is set.
all_hosp_adm_dict = {
    k: v for k, v in all_hosp_adm_dict.items()
    if getattr(v, task_label_attr) is not None
}


In [39]:
seed = 2028

# All three split files for this task/seed live in the same directory.
split_dir = Path("/home/data/2025_MIMICIV_processed/") / f"mimic4/task:{task}"
train_ids = np.array(read_txt(split_dir / f"train_admission_ids_seed_{seed}.txt"))
val_ids = np.array(read_txt(split_dir / f"val_admission_ids_seed_{seed}.txt"))
test_ids = np.array(read_txt(split_dir / f"test_admission_ids_seed_{seed}.txt"))
print(f"Train: {len(train_ids)}, Val: {len(val_ids)}, Test: {len(test_ids)}")

# Membership tests against a NumPy array rescan the whole array for every key;
# build a set per split once so each lookup is constant time.
# Assumes the dictionary keys are strings like the IDs read from the text files.
train_id_set = set(train_ids.tolist())
val_id_set = set(val_ids.tolist())
test_id_set = set(test_ids.tolist())

train_dict = {k: v for k, v in all_hosp_adm_dict.items() if k in train_id_set}
print(len(train_dict))
val_dict = {k: v for k, v in all_hosp_adm_dict.items() if k in val_id_set}
print(len(val_dict))
test_dict = {k: v for k, v in all_hosp_adm_dict.items() if k in test_id_set}
print(len(test_dict))

Train: 161062, Val: 23009, Test: 46018
161062
23009
46018


In [56]:
# Spot-check: display the record stored under one admission ID in the training split.
train_dict['23598978']

HOSP_ADM ID-23598978 (2601.0 min): mortality-None, readmission-True

In [69]:
# Count of each readmission label value in the test split, shown next to the split size.
test_readmission_labels = [adm.readmission for adm in test_dict.values()]
np.bincount(test_readmission_labels), len(test_dict)

(array([34727, 11291]), 46018)

In [40]:
def check_code(hosp_adm):
    """Return True when every code-modality field of an admission is present.

    The fields checked are ``age``, ``gender``, ``ethnicity`` and the first two
    entries of ``trajectory`` (read as ``trajectory[0]`` and ``trajectory[1]``).

    Args:
        hosp_adm: Admission object exposing the attributes listed above.

    Returns:
        False if ``trajectory`` itself is None or any checked field is None,
        True otherwise.
    """
    trajectory = hosp_adm.trajectory
    if trajectory is None:
        # Nothing to index into; treat the admission as lacking code data.
        return False

    # Compare the raw age rather than str(age): str(None) is the string "None",
    # which would hide a missing age from the None check.
    required_fields = (
        hosp_adm.age,
        hosp_adm.gender,
        hosp_adm.ethnicity,
        trajectory[0],
        trajectory[1],
    )
    # Identity checks avoid calling == on the fields, which may be overloaded.
    return all(field is not None for field in required_fields)

def check_labvector(hosp_adm):
    """Return True when the admission's ``labvectors`` attribute is not None.

    Args:
        hosp_adm: Admission object exposing a ``labvectors`` attribute.

    Returns:
        bool: False if ``labvectors`` is None, True otherwise.
    """
    return hosp_adm.labvectors is not None
    
def check_discharge(hosp_adm):
    """Return True when the admission's ``discharge`` attribute is not None.

    Args:
        hosp_adm: Admission object exposing a ``discharge`` attribute.

    Returns:
        bool: False if ``discharge`` is None, True otherwise.
    """
    return hosp_adm.discharge is not None
    

In [41]:
# Admission IDs, per split, for which check_code() reports all code fields present.
train_code_ids = [adm_id for adm_id, adm in train_dict.items() if check_code(adm)]
val_code_ids = [adm_id for adm_id, adm in val_dict.items() if check_code(adm)]
test_code_ids = [adm_id for adm_id, adm in test_dict.items() if check_code(adm)]

# Report how many admissions of each split survive the filter.
print(f"Train {len(train_ids)} -> {len(train_code_ids)}, Val {len(val_ids)} -> {len(val_code_ids)}, Test {len(test_ids)} -> {len(test_code_ids)}")

Train 161062 -> 161062, Val 23009 -> 23009, Test 46018 -> 46018


In [42]:
# Admission IDs, per split, for which check_labvector() reports lab vectors present.
train_lab_ids = [adm_id for adm_id, adm in train_dict.items() if check_labvector(adm)]
val_lab_ids = [adm_id for adm_id, adm in val_dict.items() if check_labvector(adm)]
test_lab_ids = [adm_id for adm_id, adm in test_dict.items() if check_labvector(adm)]

# Report how many admissions of each split survive the filter.
print(f"Train {len(train_ids)} -> {len(train_lab_ids)}, Val {len(val_ids)} -> {len(val_lab_ids)}, Test {len(test_ids)} -> {len(test_lab_ids)}")

Train 161062 -> 128865, Val 23009 -> 18517, Test 46018 -> 36844


In [43]:
# Admission IDs, per split, for which check_discharge() reports a discharge entry present.
train_discharge_ids = [adm_id for adm_id, adm in train_dict.items() if check_discharge(adm)]
val_discharge_ids = [adm_id for adm_id, adm in val_dict.items() if check_discharge(adm)]
test_discharge_ids = [adm_id for adm_id, adm in test_dict.items() if check_discharge(adm)]

# Report how many admissions of each split survive the filter.
print(f"Train {len(train_ids)} -> {len(train_discharge_ids)}, Val {len(val_ids)} -> {len(val_discharge_ids)}, Test {len(test_ids)} -> {len(test_discharge_ids)}")

Train 161062 -> 95070, Val 23009 -> 13607, Test 46018 -> 27334


In [44]:
# Per-split admission-ID lists, grouped by which modality check they passed.
# The full (unfiltered) ID list is stored for every split, not only for train,
# so the saved file is symmetric across splits. The previously present keys are
# unchanged; 'val_ids' and 'test_ids' are additions.
ids_dict = {
    'train_code_ids': train_code_ids,
    'train_lab_ids' : train_lab_ids,
    'train_discharge_ids' : train_discharge_ids,
    'train_ids' : train_ids.tolist(),  # ndarray -> plain list before pickling
    'val_code_ids': val_code_ids,
    'val_lab_ids' : val_lab_ids,
    'val_discharge_ids' : val_discharge_ids,
    'val_ids' : val_ids.tolist(),
    'test_code_ids': test_code_ids,
    'test_lab_ids' : test_lab_ids,
    'test_discharge_ids' : test_discharge_ids,
    'test_ids' : test_ids.tolist(),
}

In [45]:
# Write the per-modality ID lists into the directory of the current task, tagged with the seed.
ids_pkl_path = Path(f"/home/data/2025_MIMICIV_processed/mimic4/task:{task}/admission_ids_seed_{seed}.pkl")
with ids_pkl_path.open("wb") as out_file:
    pickle.dump(ids_dict, out_file)

In [46]:
from pathlib import Path
import pickle


In [47]:
# Directory holding the per-seed ID pickles; this cell is fixed to the readmission task.
intersection_task_dir = Path("/home/data/2025_MIMICIV_processed/mimic4/task:readmission_15days")

# NOTE: the loop rebinds `seed` and the *_ids names on every pass; after it ends
# they hold the values of the last seed in the range.
for seed in range(2026, 2029):
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the input file after loading.
    with open(intersection_task_dir / f"admission_ids_seed_{seed}.pkl", "rb") as f:
        data = pickle.load(f)

    train_code_ids = data['train_code_ids']
    train_lab_ids = data['train_lab_ids']
    train_discharge_ids = data['train_discharge_ids']

    valid_code_ids = data['val_code_ids']
    valid_lab_ids = data['val_lab_ids']
    valid_discharge_ids = data['val_discharge_ids']

    test_code_ids = data['test_code_ids']
    test_lab_ids = data['test_lab_ids']
    test_discharge_ids = data['test_discharge_ids']

    # IDs present in all three modality lists. Sorting gives the saved lists a
    # stable order; iterating a set of strings can differ between interpreter runs.
    train_common_ids = sorted(set(train_code_ids) & set(train_lab_ids) & set(train_discharge_ids))
    valid_common_ids = sorted(set(valid_code_ids) & set(valid_lab_ids) & set(valid_discharge_ids))
    test_common_ids = sorted(set(test_code_ids) & set(test_lab_ids) & set(test_discharge_ids))

    print(f"Train: {len(train_common_ids)}, Valid: {len(valid_common_ids)}, Test: {len(test_common_ids)}")

    with open(intersection_task_dir / f"admission_ids_seed_{seed}_INTERSECTION.pkl", "wb") as f:
        pickle.dump({
            'train_common_ids': train_common_ids,
            'valid_common_ids': valid_common_ids,
            'test_common_ids': test_common_ids
        }, f)

Train: 89468, Valid: 12848, Test: 25553
Train: 89642, Valid: 12723, Test: 25504
Train: 89358, Valid: 12837, Test: 25674


In [None]:
# Pool the three splits into one ID list per modality (train, then valid, then test).
code_ids = [*train_code_ids, *valid_code_ids, *test_code_ids]
lab_ids = [*train_lab_ids, *valid_lab_ids, *test_lab_ids]
discharge_ids = [*train_discharge_ids, *valid_discharge_ids, *test_discharge_ids]

In [None]:
# Build each modality's ID set once and reuse it for every set operation below.
code_set = set(code_ids)
lab_set = set(lab_ids)
discharge_set = set(discharge_ids)

total_ids = code_set.union(lab_set, discharge_set)
code_only_ids = code_set.difference(lab_set, discharge_set)
# "Lab only" and "discharge only" are computed relative to each other; code IDs are not subtracted.
lab_only_ids = lab_set.difference(discharge_set)
discharge_only_ids = discharge_set.difference(lab_set)
all_ids = code_set.intersection(lab_set, discharge_set)

# The Code/Lab/Discharge figures on the first line are list lengths, not set sizes.
print(f"Total: {len(total_ids)}, Code: {len(code_ids)}, Lab: {len(lab_ids)}, Discharge: {len(discharge_ids)}")
print(f"Code only: {len(code_only_ids)}, Lab only: {len(lab_only_ids)}, Discharge only: {len(discharge_only_ids)}")
print(f"All: {len(all_ids)}")

In [None]:
# The denominator must be defined before any ratio is computed; otherwise the
# first division raises NameError on a fresh kernel.
total_len = len(total_ids)

# Share of the union of IDs that falls into each group.
all_ratio = len(all_ids) / total_len
code_ratio = len(code_only_ids) / total_len
lab_ratio = len(lab_only_ids) / total_len
discharge_ratio = len(discharge_only_ids) / total_len

print(f"All : {len(all_ids)} {all_ratio:.2%}")
print(f"Code only : {len(code_only_ids)} {code_ratio:.2%}")
print(f"Lab only : {len(lab_only_ids)} {lab_ratio:.2%}")
print(f"Discharge only : {len(discharge_only_ids)} {discharge_ratio:.2%}")
print(f"Total : {total_len}")