## Test HospReadmDataset

In [14]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import yaml

# Read the packaged YAML config through the Traversable API rather than the
# built-in open(): importlib.resources.files() only guarantees a Traversable,
# which is not necessarily a real filesystem path (e.g. zip-imported packages).
data_config = yaml.safe_load(
    (impresources.files(configs) / "data_config.yaml").read_text(encoding="utf-8")
)

# NOTE(review): absolute, machine-specific path. `data_config` is loaded above
# but is not used to derive this location in this cell — consider reading the
# path from the config so the notebook runs outside this workspace.
training_data_path = "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/train_events_preprocessed.csv"
print(training_data_path)

/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/train_events_preprocessed.csv


In [15]:
import pandas as pd

# Load the CSV at `training_data_path` into a DataFrame and display its column
# index so the available feature names can be checked against the lists below.
df = pd.read_csv(training_data_path)
df.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'HOSPITALIZATION_DAYS', 'NUM_COMORBIDITIES', 'TYPES_COMORBIDITIES',
       'HAS_DIABETES', 'HAS_COPD', 'HAS_CONGESTIVE_HF', 'NEXT_ADMISSION_TYPE',
       'NUM_PREV_HOSPITALIZATIONS', 'PREV_DISCHTIME', 'NEXT_ADMITTIME',
       'DAYS_SINCE_LAST_HOSPITALIZATION', 'DAYS_UNTIL_NEXT_HOSPITALIZATION',
       'LOG_DAYS_SINCE_LAST_HOSPITALIZATION',
       'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION', 'PREV_READMISSION_30_DAYS',
       'READMISSION_30_DAYS', 'READM_30_DAYS_PAST_MEAN',
       'READM_30_DAYS_PAST_SUM', 'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEAN',
       'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN',
       'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_STD', 'TOTAL_HOSPITALIZATIONS',
       'DAYS_IN_ICU', 'NUM_DRUGS', 'NUM_PROCEDURES', 'DOB', 'DOD', 'DOD_HOSP',
       'AGE', 'CHARLSON_INDEX', 'FIRST_ADMITTIME', 'LAST_DISCHTIME',
       'PARTICIPATION_DAYS', 'TOTAL_PARTICIPATION_DAYS',
       'DEATH_TIME_AFTER_LAST_DISCHARGE', 'READMISSION_EVENT',
      

In [16]:
from recurrent_health_events_prediction.datasets.HospReadmDataset import HospReadmDataset

# Columns passed as `longitudinal_feat_cols`. In the recorded samples these
# appear, in this order, as the 8 columns of each `x_past` row.
long_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "NUM_COMORBIDITIES",
    "LOG_NUM_DRUGS",
    "NUM_PROCEDURES",
    "HAS_DIABETES",
    "HAS_COPD",
    "HAS_CONGESTIVE_HF",
]

# Columns passed as `current_feat_cols`. In the recorded samples these appear,
# in this order, as the 12 entries of `x_current`.
current_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS", "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX", "NUM_PROCEDURES", "LOG_NUM_DRUGS", "AGE",
    "ADMISSION_TYPE_ELECTIVE",
    "ETHNICITY_WHITE", "ETHNICITY_BLACK", "ETHNICITY_HISPANIC",
    "INSURANCE_MEDICAID", "INSURANCE_PRIVATE",
]

# Build the dataset straight from the CSV. `reverse_chronological_order` is
# left at the class default here.
mimic_dataset = HospReadmDataset(
    csv_path=training_data_path,
    longitudinal_feat_cols=long_feat_cols,
    current_feat_cols=current_feat_cols,
    subject_id_col="SUBJECT_ID",
    max_seq_len=5,  # x_past is 5 rows per sample in the recorded outputs
)

In [17]:
# Report how many samples the dataset exposes through len().
print(f"Number of samples in the dataset: {len(mimic_dataset)}")

Number of samples in the dataset: 13415


## Inspecting Patient 107

In [18]:
# Columns to inspect: identifiers, every feature handed to the dataset, and the
# 30-day readmission column. Several names occur in both feature lists, so
# de-duplicate — but with dict.fromkeys, which keeps first-seen order. A set of
# strings iterates in an order that depends on per-process hash randomisation,
# so the displayed column order would change between kernel restarts.
cols = list(dict.fromkeys(
    ["SUBJECT_ID", "HADM_ID"] + long_feat_cols + current_feat_cols + ["READMISSION_30_DAYS"]
))

# Show all rows of subject 107, restricted to those columns.
df.loc[df["SUBJECT_ID"] == 107, cols]

Unnamed: 0,HADM_ID,INSURANCE_MEDICAID,ADMISSION_TYPE_ELECTIVE,INSURANCE_PRIVATE,ETHNICITY_WHITE,NUM_COMORBIDITIES,HAS_CONGESTIVE_HF,NUM_PROCEDURES,ETHNICITY_HISPANIC,HAS_DIABETES,SUBJECT_ID,CHARLSON_INDEX,ETHNICITY_BLACK,HAS_COPD,AGE,READMISSION_30_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_DRUGS,LOG_HOSPITALIZATION_DAYS
14,191941,False,False,True,False,-1.079037,False,0.220229,True,False,107,-0.373917,False,False,0.007632,0,-0.900948,-0.815937,-2.195446
15,182383,False,False,False,False,0.539749,True,-0.88506,True,False,107,0.457016,False,False,0.374748,0,-0.859414,-0.183656,-0.618545


In [19]:
# Fetch item 14 through the dataset's indexing interface. In the recorded run
# this returned a 4-tuple of tensors: a length-12 vector, a 5x8 matrix, a
# length-5 boolean mask and a scalar.
mimic_dataset[14]

(tensor([-0.6185, -0.8594,  0.4570, -0.8851, -0.1837,  0.3747,  0.0000,  0.0000,
          0.0000,  1.0000,  0.0000,  0.0000]),
 tensor([[-2.1954, -0.9009, -1.0790, -0.8159,  0.2202,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]),
 tensor([ True, False, False, False, False]),
 tensor(0.))

In [20]:
# Inspect the stored sample at position 14. In the recorded run it is a dict
# with keys x_past, x_current, mask_past, y, subject_id, seq_len and t_index.
mimic_dataset.samples[14]

{'x_past': array([[-2.195446  , -0.9009482 , -1.0790372 , -0.8159372 ,  0.22022894,
          0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ]], dtype=float32),
 'x_current': array([-0.6185455 , -0.8594142 ,  0.45701614, -0.88505995, -0.18365645,
         0.3747483 ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ], dtype=float32),
 'mask_past': array([ True, False, False, False, False]),
 'y': 0.0,
 'subject_id': 107,
 'seq_len': 1,
 't_index': 2}

In [21]:
# Look up the identifier stored for sample 14. In the recorded run it is
# 182383, which equals one of the HADM_ID values listed for subject 107.
mimic_dataset.sample_ids[14]

np.int64(182383)

In [22]:
# Inspect the stored sample at position 13. In the recorded run it belongs to
# subject 107 and has seq_len 0 with an all-zero x_past and all-False mask_past.
mimic_dataset.samples[13]

{'x_past': array([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32),
 'x_current': array([-2.195446  , -0.9009482 , -0.37391692,  0.22022894, -0.8159372 ,
         0.00763248,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([False, False, False, False, False]),
 'y': 0.0,
 'subject_id': 107,
 'seq_len': 0,
 't_index': 1}

In [23]:
# Look up the identifier stored for sample 13. In the recorded run it is
# 191941, which equals one of the HADM_ID values listed for subject 107.
mimic_dataset.sample_ids[13]

np.int64(191941)

## Inspecting Patient 188

In [6]:
# Columns to inspect: identifiers, every feature handed to the dataset, and the
# 30-day readmission column. De-duplicate with dict.fromkeys so the first-seen
# order is kept; a set of strings iterates in an order that depends on
# per-process hash randomisation, which makes the table layout irreproducible.
cols = list(dict.fromkeys(
    ["SUBJECT_ID", "HADM_ID"] + long_feat_cols + current_feat_cols + ["READMISSION_30_DAYS"]
))

# Show all rows of subject 188, restricted to those columns.
df.loc[df["SUBJECT_ID"] == 188, cols]

Unnamed: 0,HADM_ID,INSURANCE_MEDICAID,ADMISSION_TYPE_ELECTIVE,INSURANCE_PRIVATE,ETHNICITY_WHITE,NUM_COMORBIDITIES,HAS_CONGESTIVE_HF,NUM_PROCEDURES,ETHNICITY_HISPANIC,HAS_DIABETES,SUBJECT_ID,CHARLSON_INDEX,ETHNICITY_BLACK,HAS_COPD,AGE,READMISSION_30_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_DRUGS,LOG_HOSPITALIZATION_DAYS


In [26]:
# Inspect the stored sample at position 39. In the recorded run it belongs to
# subject 188 and has seq_len 5 (the max_seq_len passed to the dataset), so
# every entry of mask_past is True.
mimic_dataset.samples[39]

{'x_past': array([[ 0.06964453,  0.09484041,  5.        ,  0.3842078 , -0.3324155 ,
          1.        ,  0.        ,  0.        ],
        [-1.3005396 , -0.9561102 ,  4.        , -0.18365645, -1.1613822 ,
          1.        ,  0.        ,  0.        ],
        [-1.140655  , -0.7964251 ,  2.        , -0.18365645, -0.05609327,
          0.        ,  0.        ,  0.        ],
        [-1.1139672 , -0.78897846,  3.        ,  0.18620409, -0.6087377 ,
          1.        ,  0.        ,  0.        ],
        [ 0.00840348, -0.38091052,  4.        ,  0.3616835 ,  1.0491956 ,
          0.        ,  0.        ,  0.        ]], dtype=float32),
 'x_current': array([ 0.17604949,  0.41107926,  5.        ,  1.60184   ,  0.50879043,
        -0.4206693 ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([ True,  True,  True,  True,  True]),
 'y': 0.0,
 'subject_id': 188,
 'seq_len': 5,
 't_index': 6}

## Checking Reverse Chronological Order False

In [27]:
from recurrent_health_events_prediction.datasets.HospReadmDataset import HospReadmDataset

# Columns passed as `longitudinal_feat_cols` for this second dataset.
# NOTE(review): this list is not the same as the one used for the first dataset
# in this notebook — CHARLSON_INDEX replaces NUM_COMORBIDITIES, and
# NUM_PROCEDURES / LOG_NUM_DRUGS swap places. Differences in `x_past` between
# the two runs therefore mix column changes with the ordering change; confirm
# whether that is intended.
long_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS",
    "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX",
    "NUM_PROCEDURES",
    "LOG_NUM_DRUGS",
    "HAS_DIABETES",
    "HAS_COPD",
    "HAS_CONGESTIVE_HF",
]

# Columns passed as `current_feat_cols`.
current_feat_cols = [
    "LOG_HOSPITALIZATION_DAYS", "LOG_DAYS_IN_ICU",
    "CHARLSON_INDEX", "NUM_PROCEDURES", "LOG_NUM_DRUGS", "AGE",
    "ADMISSION_TYPE_ELECTIVE",
    "ETHNICITY_WHITE", "ETHNICITY_BLACK", "ETHNICITY_HISPANIC",
    "INSURANCE_MEDICAID", "INSURANCE_PRIVATE",
]

# Rebuild the dataset with reverse_chronological_order explicitly disabled.
# This rebinds `mimic_dataset`, so later cells see this instance.
mimic_dataset = HospReadmDataset(
    csv_path=training_data_path,
    longitudinal_feat_cols=long_feat_cols,
    current_feat_cols=current_feat_cols,
    subject_id_col="SUBJECT_ID",
    max_seq_len=5,
    reverse_chronological_order=False,
)

In [28]:
# Re-inspect sample 39 on the dataset built with
# reverse_chronological_order=False. In the recorded run the x_past rows appear
# in the opposite order to the earlier inspection of this index, while
# x_current is unchanged.
mimic_dataset.samples[39]

{'x_past': array([[ 0.00840348, -0.38091052,  6.        ,  1.0491956 ,  0.3616835 ,
          0.        ,  0.        ,  0.        ],
        [-1.1139672 , -0.78897846,  3.        , -0.6087377 ,  0.18620409,
          1.        ,  0.        ,  0.        ],
        [-1.140655  , -0.7964251 ,  2.        , -0.05609327, -0.18365645,
          0.        ,  0.        ,  0.        ],
        [-1.3005396 , -0.9561102 ,  5.        , -1.1613822 , -0.18365645,
          1.        ,  0.        ,  0.        ],
        [ 0.06964453,  0.09484041,  7.        , -0.3324155 ,  0.3842078 ,
          1.        ,  0.        ,  0.        ]], dtype=float32),
 'x_current': array([ 0.17604949,  0.41107926,  5.        ,  1.60184   ,  0.50879043,
        -0.4206693 ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ], dtype=float32),
 'mask_past': array([ True,  True,  True,  True,  True]),
 'y': 0.0,
 'subject_id': 188,
 'seq_len': 5,
 't_index': 6}