### Dataset statistics

In [3]:
from cmehr.dataset import EHRDataModule
from cmehr.paths import ROOT_PATH


datamodule = EHRDataModule(
    file_path=str(ROOT_PATH / "output/pheno"),
    batch_size=1
)

In [4]:
print(len(datamodule.train_dataloader()))
print(len(datamodule.val_dataloader()))
print(len(datamodule.test_dataloader()))

Number of original samples:  18612
Number of filtered samples in train set: 18591
18591
Number of original samples:  4093
Number of filtered samples in val set: 4090
4090
Number of original samples:  4027
Number of filtered samples in test set: 4019
4019


### Create test set for MIMIC IV

In [5]:
# testset_path = '/home/fwu/Documents/myProjects/CM-EHR/cmehr/ext/mimic4benchmark/resources/mimiciii_testset.csv'
testset_path = "/home/**/Documents/CM-EHR/cmehr/ext/mimic4benchmark/resources/testset.csv"

test_set = set()
train_set = set()
with open(testset_path, 'r') as f:
    for line in f:
        x, y = line.split(',')
        if int(y) == 1:
            test_set.add(x)
        else:
            train_set.add(x)
total = len(test_set) + len(train_set)
print('test_set_len:', len(test_set), 'train_set_len:', len(train_set), 'total:', total)
print(len(train_set)/total, len(test_set)/total)

test_set_len: 5070 train_set_len: 28728 total: 33798
0.8499911237351323 0.15000887626486775


train_val_set : test_set = 0.85 : 0.15

In [7]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def is_subject_folder(x):
    return str.isdigit(x)


mimiciv_test_path = '/home/**/Documents/CM-EHR/cmehr/ext/mimic4benchmark/resources/testset.csv'
all_patiens_dir = '/home/fwu/Documents/myProjects/CM-EHR/cmehr/ext/data/root/train'

subdirectories = os.listdir(all_patiens_dir)
subjects = list(filter(is_subject_folder, subdirectories))
print(len(subjects))

train_data, test_data = train_test_split(subjects, test_size=0.15, random_state=42)

df = pd.DataFrame({'subject_id': subjects})

df['label'] = 1
df.loc[df['subject_id'].isin(train_data), 'label'] = 0

df.to_csv(mimiciv_test_path, index=False, header=False)

45127


### Create val set for MIMIC IV

In [9]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def is_subject_folder(x):
    return str.isdigit(x)


mimiciv_val_path = '/home/**/Documents/CM-EHR/cmehr/ext/mimic4models/resources/valset.csv'
all_patiens_dir = '/home/**/Documents/CM-EHR/data/mimiciv_benchmark/train'

subdirectories = os.listdir(all_patiens_dir)
subjects = list(filter(is_subject_folder, subdirectories))
print(len(subjects))

train_data, test_data = train_test_split(subjects, test_size=0.15, random_state=42)

df = pd.DataFrame({'subject_id': subjects})

df['label'] = 1
df.loc[df['subject_id'].isin(train_data), 'label'] = 0

df.to_csv(mimiciv_val_path, index=False, header=False)

38357


In [10]:
df

Unnamed: 0,subject_id,label
0,13461807,0
1,16514111,0
2,13713662,0
3,13921768,0
4,17978572,1
...,...,...
38352,13660391,0
38353,13607258,0
38354,11256275,0
38355,15563723,0


In [1]:
import os

In [7]:
import shutil

file_path = "/home/**/Documents/CM-EHR/data/mimiciv_benchmark"
subject_dir = os.listdir(file_path)
for subject in subject_dir:
    if subject.isdigit():
        # print(subject)
        # os.rmdir(os.path.join(file_path, subject))
        shutil.rmtree(os.path.join(file_path, subject))
    # if os.path.isdir(os.path.join(file_path, subject)):
    #     print(subject)

In [8]:
import pickle

In [17]:
filepath = "/home/**/Documents/CM-EHR/output/pheno/mean_std.pkl"
with open(filepath, 'rb') as f:
    mean_std = pickle.load(f)

In [3]:
from cmehr.ext.mimic4benchmark.preprocessing import read_itemid_to_variable_map
import pandas as pd

variable_map_file = "/home/**/Documents/CM-EHR/cmehr/ext/mimic4benchmark/resources/itemid_to_variable_map.csv"
d_items_path = "/home/**/Documents/CM-EHR/data/mimiciv/tables/d_items.csv"
var_map = read_itemid_to_variable_map(variable_map_file)
var_map.columns = var_map.columns.str.lower()

d_items_df = pd.read_csv(d_items_path)
var_map = var_map.loc[var_map.index.isin(
    d_items_df['itemid'])]
variables = var_map.variable.unique()

In [4]:
print(variables)

['Diastolic blood pressure' 'Fraction inspired oxygen'
 'Glascow coma scale eye opening' 'Glascow coma scale motor response'
 'Glascow coma scale verbal response' 'Glucose' 'Heart Rate' 'Height'
 'Mean blood pressure' 'Oxygen saturation' 'pH' 'Respiratory rate'
 'Systolic blood pressure' 'Temperature' 'Weight']


In [2]:
import pickle

In [6]:
# file_path = "/home/**/Documents/CM-EHR/output/ihm/test_p2x_data.pkl"
file_path = "/home/**/Documents/CM-EHR/data/mimiciii_benchmark/test_starttime.pkl"

with open(file_path, 'rb') as f:
    data = pickle.load(f)

In [9]:
data.keys()

dict_keys(['12290_1', '3437_1', '3437_2', '13020_1', '25368_1', '20498_1', '3253_1', '7631_1', '23035_1', '8427_1', '8427_2', '8427_3', '8427_4', '6918_1', '23401_1', '91950_1', '42454_1', '82312_1', '5823_1', '24094_1', '24094_2', '96950_1', '96950_2', '96950_3', '96950_4', '58351_1', '58351_2', '40351_1', '13813_1', '50471_1', '86087_1', '89929_1', '98194_1', '14186_1', '29680_1', '1636_1', '1636_2', '97937_1', '20938_1', '20938_2', '19684_1', '15336_1', '90451_1', '90451_2', '71190_1', '71190_2', '19116_1', '16141_1', '40160_1', '25818_1', '41515_1', '41515_2', '16904_1', '68813_1', '24795_1', '82352_1', '8824_1', '16178_1', '16178_2', '6143_1', '30371_1', '27874_1', '87990_1', '80825_1', '3183_1', '1104_1', '1104_2', '57491_1', '13165_1', '90957_1', '70221_1', '47529_1', '46315_1', '8799_1', '8799_2', '8799_3', '8799_4', '8799_5', '8799_6', '8799_7', '8799_8', '8799_9', '13881_1', '13881_2', '50113_1', '59829_1', '86842_1', '26361_1', '41724_1', '14651_1', '14651_2', '74817_1', '99