In [19]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (such as the local utils module used by this notebook) are re-imported
# automatically before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import pandas as pd 
import numpy as np
import pickle as pk

from utils import *

# DRG-based cohorts

In [21]:
# Raw DRG tables, queried using the *sql files on BigQuery and exported as CSV.
ms_df = pd.read_csv('raw-ms-drg.csv')

# Only the APR-DRG table is passed through clean_apr; the MS-DRG table is
# used exactly as it was read.
apr_df = clean_apr(pd.read_csv('raw-apr-drg.csv'))

In [22]:
# Encode both DRG tables, MS first and APR second, so the per-table
# "number of unique codes" messages keep that order in the cell output.
ms_df, apr_df = (encode_drg(frame) for frame in (ms_df, apr_df))


The number of unique codes: 570
The number of unique codes: 849


In [23]:
# Build the MS-DRG cohort from the encoded MS-DRG table; the result is what
# gets pickled to 'splits_drg_ms_raw.p' further down.
# NOTE(review): split_drg_cohort presumably comes from utils — confirm what
# structure it returns (not visible in this notebook).
ms_cohort = split_drg_cohort(ms_df)

In [24]:
# Build the APR-DRG cohort from the cleaned and encoded APR-DRG table; the
# result is what gets pickled to 'splits_drg_apr_raw.p' further down.
# NOTE(review): split_drg_cohort presumably comes from utils — confirm what
# structure it returns (not visible in this notebook).
apr_cohort = split_drg_cohort(apr_df)

In [17]:
# Persist both DRG cohorts, one pickle file per DRG flavour (MS first, APR second).
for split_path, drg_cohort in (
    ('splits_drg_ms_raw.p', ms_cohort),
    ('splits_drg_apr_raw.p', apr_cohort),
):
    with open(split_path, 'wb') as outf:
        pk.dump(drg_cohort, outf)
    

# MIMIC-Extract-outcome cohort

In [42]:
# Settings for the outcome cohort.
# Stays are kept only when max_hours > WINDOW_SIZE + GAP_TIME (see the Ys cell).
GAP_TIME          = 6  # In hours
WINDOW_SIZE       = 24 # In hours
# Seed passed to np.random.seed before the subject-level shuffle.
SEED              = 1
# Identifier columns; not referenced by any cell visible in this notebook.
ID_COLS           = ['subject_id', 'hadm_id', 'icustay_id']


In [43]:
# loading from h5 file
# NOTE: '/path/to/...' is a placeholder — point DATAFILE at the local copy of
# all_hourly_data.h5 before running.
# The 'patients' table is read into `statics`; later cells use its max_hours,
# intime, mort_hosp, mort_icu and los_icu columns and its subject_id index level.
DATAFILE = '/path/to/all_hourly_data.h5' 
statics = pd.read_hdf(DATAFILE, 'patients')

In [44]:
# Outcome table: keep only stays whose max_hours exceeds the observation
# window plus the gap, then derive the two length-of-stay flags.
# Built as a single chain from `statics` so nothing is assigned onto a
# chained-indexing result and nothing is mutated in place; re-running the
# cell always rebuilds Ys from scratch.
# The los_3 / los_7 flags stay boolean here; _fix_df casts the outcome
# columns to int when the per-split frames are built.
Ys = (
    statics
    .loc[
        statics.max_hours > WINDOW_SIZE + GAP_TIME,
        ['intime', 'mort_hosp', 'mort_icu', 'los_icu'],
    ]
    .assign(
        # NOTE(review): thresholds assume los_icu is measured in days — confirm.
        los_3=lambda stays: stays['los_icu'] > 3,
        los_7=lambda stays: stays['los_icu'] > 7,
    )
    # The raw length of stay is only needed to derive the flags above.
    .drop(columns=['los_icu'])
)


In [45]:
# Subject-level train/dev/test split, same as the MIMIC-Extract notebook.
train_frac, dev_frac, test_frac = 0.7, 0.1, 0.2

# Unique subject ids present in the outcome table.
Ys_subj_idx = Ys.index.get_level_values('subject_id')
lvl2_subjects = set(Ys_subj_idx)

# Seeded shuffle of the subject ids.
np.random.seed(SEED)
subjects = np.random.permutation(list(lvl2_subjects))
N = len(lvl2_subjects)

# Split sizes are truncated to whole subjects. N_test is computed for
# reference only: the test split takes every subject left after train and dev.
N_train, N_dev, N_test = (int(frac * N) for frac in (train_frac, dev_frac, test_frac))
train_subj, dev_subj, test_subj = np.split(subjects, [N_train, N_train + N_dev])

In [46]:
def _fix_df(df):
    df = df.copy()
    cols = ['mort_hosp', 'mort_icu', 'los_3', 'los_7']
    for col in cols:
        df[col] = df[col].astype(int)
    return df.reset_index().rename(columns=lambda x: x.upper())

In [47]:
# One flattened outcome frame per split: select the rows whose subject_id
# belongs to the split, then normalise them with _fix_df.
Ys_train, Ys_dev, Ys_test = map(
    lambda subj_ids: _fix_df(
        Ys.loc[Ys.index.get_level_values('subject_id').isin(subj_ids)]
    ),
    (train_subj, dev_subj, test_subj),
)

In [48]:
tr, val, te = Ys_train, Ys_dev, Ys_test

# Distinct subjects in train / val / test.
print('Number of subjects: ',
      *(part['SUBJECT_ID'].nunique() for part in (tr, val, te)))

# Distinct hospital admissions in train / val / test.
print('Number of hadms: ',
      *(part['HADM_ID'].nunique() for part in (tr, val, te)))

# Row counts in train / val / test (printed under the same 'hadms' label as
# the line above).
print('Number of hadms: ', *(len(part) for part in (tr, val, te)))

Number of subjects:  16760 2394 4790
Number of hadms:  16760 2394 4790
Number of hadms:  16760 2394 4790


In [49]:
# Bundle the three outcome frames under the split names "train", "val", "test".
cohort = dict(train=Ys_train, val=Ys_dev, test=Ys_test)

In [50]:
# Persist the train/val/test outcome frames as a single pickle file.
with open('splits_mextract.p', mode='wb') as split_file:
    pk.dump(cohort, split_file)
