## Test HospReadmDataset

In [3]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import yaml

with open((impresources.files(configs) / 'data_config.yaml')) as f:
    data_config = yaml.safe_load(f)

training_data_path = "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients/train_test/train_full.csv"
print(training_data_path)

/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients/train_test/train_full.csv


In [4]:
import pandas as pd

df = pd.read_csv(training_data_path)
df.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'HOSPITALIZATION_DAYS', 'NUM_COMORBIDITIES', 'TYPES_COMORBIDITIES',
       'HAS_DIABETES', 'HAS_COPD', 'HAS_CONGESTIVE_HF', 'NEXT_ADMISSION_TYPE',
       'NUM_PREV_HOSPITALIZATIONS', 'PREV_DISCHTIME', 'NEXT_ADMITTIME',
       'DAYS_SINCE_LAST_HOSPITALIZATION', 'DAYS_UNTIL_NEXT_HOSPITALIZATION',
       'LOG_DAYS_SINCE_LAST_HOSPITALIZATION',
       'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION', 'PREV_READMISSION_30_DAYS',
       'READMISSION_30_DAYS', 'READM_30_DAYS_PAST_MEAN',
       'READM_30_DAYS_PAST_SUM', 'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEAN',
       'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN',
       'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_STD', 'TOTAL_HOSPITALIZATIONS',
       'DAYS_IN_ICU', 'NUM_DRUGS', 'NUM_PROCEDURES', 'DOB', 'DOD', 'DOD_HOSP',
       'AGE', 'CHARLSON_INDEX', 'FIRST_ADMITTIME', 'LAST_DISCHTIME',
       'PARTICIPATION_DAYS', 'TOTAL_PARTICIPATION_DAYS',
       'DEATH_TIME_AFTER_LAST_DISCHARGE', 'READMISSION_EVENT',
      

In [10]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
0,124,172461,2160-06-24 21:25:00,2160-07-15 15:10:00,20.739583,3,"['other', 'chronic_pulmonary_disease', 'cerebr...",False,True,False,...,False,False,False,False,True,False,False,True,True,False
1,124,112906,2161-12-17 03:39:00,2161-12-24 15:35:00,7.497222,3,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,True,False
2,124,134369,2165-05-21 21:02:00,2165-06-06 16:00:00,15.790278,4,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,False,True
3,138,108120,2131-10-31 08:00:00,2131-11-06 12:54:00,6.204167,4,"['other', 'congestive_heart_failure', 'myocard...",False,True,True,...,False,False,False,False,True,False,False,True,True,False
4,138,188284,2133-12-21 23:52:00,2134-01-11 13:00:00,20.547222,3,"['other', 'congestive_heart_failure', 'maligna...",False,False,True,...,False,False,False,False,True,False,False,True,True,False


In [5]:
from recurrent_health_events_prediction.datasets.HospReadmDataset import HospReadmDataset

long_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "NUM_COMORBIDITIES", "LOG_NUM_DRUGS", "NUM_PROCEDURES",
    "HAS_DIABETES","HAS_COPD","HAS_CONGESTIVE_HF"]

current_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX",
    "NUM_PROCEDURES",
    "LOG_NUM_DRUGS",
    "AGE",
    "ADMISSION_TYPE_ELECTIVE",
    "ETHNICITY_WHITE",
    "ETHNICITY_BLACK",
    "ETHNICITY_HISPANIC",
    "INSURANCE_MEDICAID",
    "INSURANCE_PRIVATE",
]

mimic_dataset = HospReadmDataset(
    csv_path=training_data_path,
    longitudinal_feat_cols=long_feat_cols,
    current_feat_cols=current_feat_cols,
    subject_id_col="SUBJECT_ID",
    max_seq_len=5,
)

In [6]:
print(f"Number of samples in the dataset: {len(mimic_dataset)}")

Number of samples in the dataset: 4434


## Inspecting Patient 107

In [11]:
cols = set(["SUBJECT_ID", "HADM_ID"] + long_feat_cols + current_feat_cols + ["READMISSION_30_DAYS"])
df[df['SUBJECT_ID'] == 124][list(cols)]

Unnamed: 0,ETHNICITY_WHITE,LOG_DAYS_IN_ICU,HAS_CONGESTIVE_HF,LOG_HOSPITALIZATION_DAYS,HAS_COPD,ETHNICITY_BLACK,SUBJECT_ID,INSURANCE_PRIVATE,CHARLSON_INDEX,INSURANCE_MEDICAID,HAS_DIABETES,NUM_PROCEDURES,AGE,ETHNICITY_HISPANIC,READMISSION_30_DAYS,HADM_ID,NUM_COMORBIDITIES,ADMISSION_TYPE_ELECTIVE,LOG_NUM_DRUGS
0,True,1.591651,False,3.079135,True,False,124,True,4,False,False,4,70,False,0,172461,3,False,3.663562
1,True,2.102874,False,2.139739,True,False,124,False,5,False,False,4,71,False,0,112906,3,False,3.637586
2,True,0.848527,False,2.8208,True,False,124,False,7,False,False,12,75,False,0,134369,4,True,3.7612


In [16]:
mimic_dataset[1]

(tensor([ 2.8208,  0.8485,  7.0000, 12.0000,  3.7612, 75.0000,  1.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000]),
 tensor([[2.1397, 2.1029, 3.0000, 3.6376, 4.0000, 0.0000, 1.0000, 0.0000],
         [3.0791, 1.5917, 3.0000, 3.6636, 4.0000, 0.0000, 1.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]),
 tensor([ True,  True, False, False, False]),
 tensor(0.))

In [15]:
mimic_dataset.samples[1]

{'x_past': array([[2.1397393, 2.1028743, 3.       , 3.637586 , 4.       , 0.       ,
         1.       , 0.       ],
        [3.0791347, 1.5916511, 3.       , 3.6635616, 4.       , 0.       ,
         1.       , 0.       ],
        [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
         0.       , 0.       ],
        [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
         0.       , 0.       ],
        [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
         0.       , 0.       ]], dtype=float32),
 'x_current': array([ 2.8208    ,  0.84852725,  7.        , 12.        ,  3.7612002 ,
        75.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ], dtype=float32),
 'mask_past': array([ True,  True, False, False, False]),
 'y': 0.0,
 'subject_id': 124,
 'seq_len': 2,
 't_index': 3}

In [17]:
mimic_dataset.sample_ids[1]

np.int64(134369)

In [19]:
mimic_dataset.samples[2]

{'x_past': array([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32),
 'x_current': array([ 1.9746596 ,  0.67731065,  3.        ,  2.        ,  3.6888795 ,
        48.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([False, False, False, False, False]),
 'y': 0.0,
 'subject_id': 138,
 'seq_len': 0,
 't_index': 1}

In [None]:
mimic_dataset[3]

np.int64(191941)

## Inspecting Patient 188

In [6]:
cols = set(["SUBJECT_ID", "HADM_ID"] + long_feat_cols + current_feat_cols + ["READMISSION_30_DAYS"])
df[df['SUBJECT_ID'] == 188][list(cols)]

Unnamed: 0,HADM_ID,INSURANCE_MEDICAID,ADMISSION_TYPE_ELECTIVE,INSURANCE_PRIVATE,ETHNICITY_WHITE,NUM_COMORBIDITIES,HAS_CONGESTIVE_HF,NUM_PROCEDURES,ETHNICITY_HISPANIC,HAS_DIABETES,SUBJECT_ID,CHARLSON_INDEX,ETHNICITY_BLACK,HAS_COPD,AGE,READMISSION_30_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_DRUGS,LOG_HOSPITALIZATION_DAYS


In [26]:
mimic_dataset.samples[39]

{'x_past': array([[ 0.06964453,  0.09484041,  5.        ,  0.3842078 , -0.3324155 ,
          1.        ,  0.        ,  0.        ],
        [-1.3005396 , -0.9561102 ,  4.        , -0.18365645, -1.1613822 ,
          1.        ,  0.        ,  0.        ],
        [-1.140655  , -0.7964251 ,  2.        , -0.18365645, -0.05609327,
          0.        ,  0.        ,  0.        ],
        [-1.1139672 , -0.78897846,  3.        ,  0.18620409, -0.6087377 ,
          1.        ,  0.        ,  0.        ],
        [ 0.00840348, -0.38091052,  4.        ,  0.3616835 ,  1.0491956 ,
          0.        ,  0.        ,  0.        ]], dtype=float32),
 'x_current': array([ 0.17604949,  0.41107926,  5.        ,  1.60184   ,  0.50879043,
        -0.4206693 ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([ True,  True,  True,  True,  True]),
 'y': 0.0,
 'subject_id': 188,
 'seq_len': 5,
 't_index': 6}

## Checking Reverse Cronological Order False

In [27]:
from recurrent_health_events_prediction.datasets.HospReadmDataset import HospReadmDataset

long_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX", "NUM_PROCEDURES", "LOG_NUM_DRUGS",
    "HAS_DIABETES","HAS_COPD","HAS_CONGESTIVE_HF"]

current_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX",
    "NUM_PROCEDURES",
    "LOG_NUM_DRUGS",
    "AGE",
    "ADMISSION_TYPE_ELECTIVE",
    "ETHNICITY_WHITE",
    "ETHNICITY_BLACK",
    "ETHNICITY_HISPANIC",
    "INSURANCE_MEDICAID",
    "INSURANCE_PRIVATE",
]

mimic_dataset = HospReadmDataset(
    csv_path=training_data_path,
    longitudinal_feat_cols=long_feat_cols,
    current_feat_cols=current_feat_cols,
    subject_id_col="SUBJECT_ID",
    max_seq_len=5,
    reverse_chronological_order=False
)

In [28]:
mimic_dataset.samples[39]

{'x_past': array([[ 0.00840348, -0.38091052,  6.        ,  1.0491956 ,  0.3616835 ,
          0.        ,  0.        ,  0.        ],
        [-1.1139672 , -0.78897846,  3.        , -0.6087377 ,  0.18620409,
          1.        ,  0.        ,  0.        ],
        [-1.140655  , -0.7964251 ,  2.        , -0.05609327, -0.18365645,
          0.        ,  0.        ,  0.        ],
        [-1.3005396 , -0.9561102 ,  5.        , -1.1613822 , -0.18365645,
          1.        ,  0.        ,  0.        ],
        [ 0.06964453,  0.09484041,  7.        , -0.3324155 ,  0.3842078 ,
          1.        ,  0.        ,  0.        ]], dtype=float32),
 'x_current': array([ 0.17604949,  0.41107926,  5.        ,  1.60184   ,  0.50879043,
        -0.4206693 ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([ True,  True,  True,  True,  True]),
 'y': 0.0,
 'subject_id': 188,
 'seq_len': 5,
 't_index': 6}