## Test HospReadmDataset

In [4]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import yaml

with open((impresources.files(configs) / 'data_config.yaml')) as f:
    data_config = yaml.safe_load(f)
    
training_data_path = data_config['training_data']['mimic']['data_directory'] + "/train_events_preprocessed.csv"
print(training_data_path)

/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/mimic_cleaned_v2/train_events_preprocessed.csv


In [5]:
import pandas as pd

df = pd.read_csv(training_data_path)
df.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'HOSPITALIZATION_DAYS', 'NUM_COMORBIDITIES', 'TYPES_COMORBIDITIES',
       'HAS_DIABETES', 'HAS_COPD', 'HAS_CONGESTIVE_HF', 'NEXT_ADMISSION_TYPE',
       'NUM_PREV_HOSPITALIZATIONS', 'PREV_DISCHTIME', 'NEXT_ADMITTIME',
       'DAYS_SINCE_LAST_HOSPITALIZATION', 'DAYS_UNTIL_NEXT_HOSPITALIZATION',
       'LOG_DAYS_SINCE_LAST_HOSPITALIZATION',
       'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION', 'PREV_READMISSION_30_DAYS',
       'READMISSION_30_DAYS', 'READM_30_DAYS_PAST_MEAN',
       'READM_30_DAYS_PAST_SUM', 'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEAN',
       'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN',
       'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_STD', 'TOTAL_HOSPITALIZATIONS',
       'DAYS_IN_ICU', 'NUM_DRUGS', 'NUM_PROCEDURES', 'GENDER', 'DOB', 'DOD',
       'DOD_HOSP', 'AGE', 'CHARLSON_INDEX', 'FIRST_ADMITTIME',
       'LAST_DISCHTIME', 'PARTICIPATION_DAYS', 'TOTAL_PARTICIPATION_DAYS',
       'DEATH_TIME_AFTER_LAST_DISCHARGE', 'READMISSION_EVEN

In [6]:
from recurrent_health_events_prediction.datasets.HospReadmDataset import HospReadmDataset

long_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "NUM_COMORBIDITIES", "LOG_NUM_DRUGS", "NUM_PROCEDURES",
    "HAS_DIABETES","HAS_COPD","HAS_CONGESTIVE_HF"]

current_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX",
    "NUM_PROCEDURES",
    "LOG_NUM_DRUGS",
    "AGE",
    "ADMISSION_TYPE_ELECTIVE",
    "ETHNICITY_WHITE",
    "ETHNICITY_BLACK",
    "ETHNICITY_HISPANIC",
    "INSURANCE_MEDICAID",
    "INSURANCE_PRIVATE",
]

mimic_dataset = HospReadmDataset(
    csv_path=training_data_path,
    longitudinal_feat_cols=long_feat_cols,
    current_feat_cols=current_feat_cols,
    subject_id_col="SUBJECT_ID",
    max_seq_len=5,
)

In [7]:
print(f"Number of samples in the dataset: {len(mimic_dataset)}")

Number of samples in the dataset: 13415


## Inspecting Patient 107

In [22]:
cols = set(["SUBJECT_ID", "HADM_ID"] + long_feat_cols + current_feat_cols + ["READMISSION_30_DAYS"])
df[df['SUBJECT_ID'] == 107][list(cols)]

Unnamed: 0,SUBJECT_ID,ADMISSION_TYPE_ELECTIVE,ETHNICITY_WHITE,AGE,ETHNICITY_BLACK,CHARLSON_INDEX,INSURANCE_PRIVATE,READMISSION_30_DAYS,NUM_COMORBIDITIES,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_DRUGS,ETHNICITY_HISPANIC,NUM_PROCEDURES,HAS_CONGESTIVE_HF,INSURANCE_MEDICAID,HADM_ID,HAS_DIABETES,HAS_COPD
14,107,False,False,0.007632,False,4,True,0,2,-2.195446,-0.900948,-0.815937,True,0.220229,False,False,191941,False,False
15,107,False,False,0.374748,False,6,False,0,4,-0.618545,-0.859414,-0.183656,True,-0.88506,True,False,182383,False,False


In [16]:
mimic_dataset[14]

(tensor([-0.6185, -0.8594,  6.0000, -0.8851, -0.1837,  0.3747,  0.0000,  0.0000,
          0.0000,  1.0000,  0.0000,  0.0000]),
 tensor([[-2.1954, -0.9009,  2.0000, -0.8159,  0.2202,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]),
 tensor([ True, False, False, False, False]),
 tensor(0.))

In [14]:
mimic_dataset.samples[14]

{'x_past': array([[-2.195446  , -0.9009482 ,  2.        , -0.8159372 ,  0.22022894,
          0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ]], dtype=float32),
 'x_current': array([-0.6185455 , -0.8594142 ,  6.        , -0.88505995, -0.18365645,
         0.3747483 ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ], dtype=float32),
 'mask_past': array([ True, False, False, False, False]),
 'y': 0.0,
 'subject_id': 107,
 'seq_len': 1,
 't_index': 2}

In [15]:
mimic_dataset.samples[13]

{'x_past': array([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32),
 'x_current': array([-2.195446  , -0.9009482 ,  4.        ,  0.22022894, -0.8159372 ,
         0.00763248,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([False, False, False, False, False]),
 'y': 0.0,
 'subject_id': 107,
 'seq_len': 0,
 't_index': 1}

## Inspecting Patient 188

In [23]:
cols = set(["SUBJECT_ID", "HADM_ID"] + long_feat_cols + current_feat_cols + ["READMISSION_30_DAYS"])
df[df['SUBJECT_ID'] == 188][list(cols)]

Unnamed: 0,SUBJECT_ID,ADMISSION_TYPE_ELECTIVE,ETHNICITY_WHITE,AGE,ETHNICITY_BLACK,CHARLSON_INDEX,INSURANCE_PRIVATE,READMISSION_30_DAYS,NUM_COMORBIDITIES,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_DRUGS,ETHNICITY_HISPANIC,NUM_PROCEDURES,HAS_CONGESTIVE_HF,INSURANCE_MEDICAID,HADM_ID,HAS_DIABETES,HAS_COPD
36,188,False,True,-0.665413,False,6,True,0,4,0.008403,-0.380911,0.361683,False,1.049196,False,False,160697,False,False
37,188,False,True,-0.665413,False,3,True,0,3,-1.113967,-0.788978,0.186204,False,-0.608738,False,False,191517,True,False
38,188,False,True,-0.665413,False,2,True,0,2,-1.140655,-0.796425,-0.183656,False,-0.056093,False,False,150463,False,False
39,188,False,True,-0.481855,False,5,True,0,4,-1.30054,-0.95611,-0.183656,False,-1.161382,False,False,192557,True,False
40,188,False,True,-0.481855,False,7,True,0,5,0.069645,0.09484,0.384208,False,-0.332415,False,False,123860,True,False
41,188,False,True,-0.420669,False,5,True,0,4,0.176049,0.411079,0.50879,False,1.60184,False,False,164735,True,False


In [26]:
mimic_dataset.samples[39]

{'x_past': array([[ 0.06964453,  0.09484041,  5.        ,  0.3842078 , -0.3324155 ,
          1.        ,  0.        ,  0.        ],
        [-1.3005396 , -0.9561102 ,  4.        , -0.18365645, -1.1613822 ,
          1.        ,  0.        ,  0.        ],
        [-1.140655  , -0.7964251 ,  2.        , -0.18365645, -0.05609327,
          0.        ,  0.        ,  0.        ],
        [-1.1139672 , -0.78897846,  3.        ,  0.18620409, -0.6087377 ,
          1.        ,  0.        ,  0.        ],
        [ 0.00840348, -0.38091052,  4.        ,  0.3616835 ,  1.0491956 ,
          0.        ,  0.        ,  0.        ]], dtype=float32),
 'x_current': array([ 0.17604949,  0.41107926,  5.        ,  1.60184   ,  0.50879043,
        -0.4206693 ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([ True,  True,  True,  True,  True]),
 'y': 0.0,
 'subject_id': 188,
 'seq_len': 5,
 't_index': 6}

## Checking Reverse Cronological Order False

In [27]:
from recurrent_health_events_prediction.datasets.HospReadmDataset import HospReadmDataset

long_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX", "NUM_PROCEDURES", "LOG_NUM_DRUGS",
    "HAS_DIABETES","HAS_COPD","HAS_CONGESTIVE_HF"]

current_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX",
    "NUM_PROCEDURES",
    "LOG_NUM_DRUGS",
    "AGE",
    "ADMISSION_TYPE_ELECTIVE",
    "ETHNICITY_WHITE",
    "ETHNICITY_BLACK",
    "ETHNICITY_HISPANIC",
    "INSURANCE_MEDICAID",
    "INSURANCE_PRIVATE",
]

mimic_dataset = HospReadmDataset(
    csv_path=training_data_path,
    longitudinal_feat_cols=long_feat_cols,
    current_feat_cols=current_feat_cols,
    subject_id_col="SUBJECT_ID",
    max_seq_len=5,
    reverse_chronological_order=False
)

In [28]:
mimic_dataset.samples[39]

{'x_past': array([[ 0.00840348, -0.38091052,  6.        ,  1.0491956 ,  0.3616835 ,
          0.        ,  0.        ,  0.        ],
        [-1.1139672 , -0.78897846,  3.        , -0.6087377 ,  0.18620409,
          1.        ,  0.        ,  0.        ],
        [-1.140655  , -0.7964251 ,  2.        , -0.05609327, -0.18365645,
          0.        ,  0.        ,  0.        ],
        [-1.3005396 , -0.9561102 ,  5.        , -1.1613822 , -0.18365645,
          1.        ,  0.        ,  0.        ],
        [ 0.06964453,  0.09484041,  7.        , -0.3324155 ,  0.3842078 ,
          1.        ,  0.        ,  0.        ]], dtype=float32),
 'x_current': array([ 0.17604949,  0.41107926,  5.        ,  1.60184   ,  0.50879043,
        -0.4206693 ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([ True,  True,  True,  True,  True]),
 'y': 0.0,
 'subject_id': 188,
 'seq_len': 5,
 't_index': 6}