In [1]:
import pandas as pd
import numpy as np
import logging
from typing import Tuple, List
from omegaconf import ListConfig
logger = logging.getLogger(__name__)
import os, sys
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)
from src.data.mimic.semi_synthetic_dataset import MIMIC3SyntheticDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initial exploration of the DataFrames loaded from the all_hourly_data HDF5 file
def process_static_features(static_features: pd.DataFrame, drop_first=False) -> pd.DataFrame:
    """
    Global standard normalisation of float features & one-hot encoding of all other features.

    A column is treated as continuous when its dtype is a float dtype (any width);
    every other column is one-hot encoded with pd.get_dummies. The decision is made
    from the column dtype rather than from the first value, so float32 columns are
    normalised and frames without rows do not raise.

    Args:
        static_features: pd.DataFrame with unprocessed static features
        drop_first: Dropping first class of one-hot-encoded features

    Returns: pd.DataFrame with pre-processed static features. Output columns follow the
        order of the input features; one-hot columns are named after the category values.

    """
    processed_static_features = []
    for feature in static_features.columns:
        column = static_features[feature]
        if pd.api.types.is_float_dtype(column):
            mean = column.mean()
            # ddof=0 (population std) is what np.std applies by default.
            std = column.std(ddof=0)
            processed_static_features.append((column - mean) / std)
        else:
            one_hot = pd.get_dummies(column, drop_first=drop_first)
            processed_static_features.append(one_hot.astype(float))

    if not processed_static_features:
        # pd.concat rejects an empty list; keep the index so callers can still align on it.
        return pd.DataFrame(index=static_features.index)

    return pd.concat(processed_static_features, axis=1)

def load_mimic3_data_raw(data_path: str,
                         min_seq_length: int = None,
                         max_seq_length: int = None,
                         max_number: int = None,
                         vital_list: List[str] = None,
                         static_list: List[str] = None,
                         data_seed: int = 100,
                         drop_first=False,
                         **kwargs) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load MIMIC-3 hourly averaged dataset, without preprocessing (for semi-synthetic experiments)
    :param data_path: Path with MIMIC-3 dataset (HDFStore)
    :param min_seq_length: Min sequence length in cohort (no lower bound when None)
    :param max_seq_length: Max sequence length in cohort (no cropping when None)
    :param vital_list: List of vitals (time-varying covariates); a built-in default list is used when None
    :param static_list: List of static features; defaults to gender, ethnicity and age when None
    :param max_number: Maximum number of patients in cohort (all filtered patients when None)
    :param data_seed: Seed for random cohort patient selection
    :param drop_first: Dropping first class of one-hot-encoded features
    :param kwargs: Accepted and ignored
    :return: Tuple of DataFrames (all_vitals, static_features), both normalised
    """
    logger.info(f'Loading MIMIC-III dataset from {data_path}.')

    if vital_list is None:
        vital_list = [
            'heart rate',
            'red blood cell count',
            'sodium',
            'mean blood pressure',
            'systemic vascular resistance',
            'glucose',
            'chloride urine',
            'glascow coma scale total',
            'hematocrit',
            'positive end-expiratory pressure set',
            'respiratory rate',
            'prothrombin time pt',
            'cholesterol',
            'hemoglobin',
            'creatinine',
            'blood urea nitrogen',
            'bicarbonate',
            'calcium ionized',
            'partial pressure of carbon dioxide',
            'magnesium',
            'anion gap',
            'phosphorous',
            'platelets'
        ]
    if static_list is None:
        static_list = [
            'gender',
            'ethnicity',
            'age'
        ]

    # The context manager closes the store even when a key or column lookup raises.
    with pd.HDFStore(data_path, 'r') as h5:
        all_vitals = h5['/vitals_labs_mean'][vital_list]
        static_features = h5['/patients'][static_list]

    all_vitals = all_vitals.droplevel(['hadm_id', 'icustay_id'])
    # Column labels may be tuples (multi-level header); keep only the first level.
    all_vitals.columns = [
        column if isinstance(column, str) else column[0]
        for column in all_vitals.columns
    ]
    static_features = static_features.droplevel(['hadm_id', 'icustay_id'])

    # Filling NA: forward fill first, then backward fill whatever is still missing.
    # NOTE(review): the fill runs over the whole frame, not per subject, so values can
    # carry across subject boundaries - confirm this is intended.
    all_vitals = all_vitals.ffill()
    all_vitals = all_vitals.bfill()

    # Filtering longer than min_seq_length and cropping to max_seq_length
    user_sizes = all_vitals.groupby('subject_id').size()
    filtered_users = user_sizes.index[user_sizes >= min_seq_length] if min_seq_length is not None else user_sizes.index
    if max_number is not None:
        # Seeds numpy's global RNG so the sampled cohort is reproducible for a given data_seed.
        np.random.seed(data_seed)
        filtered_users = np.random.choice(filtered_users, size=max_number, replace=False)
    all_vitals = all_vitals.loc[filtered_users]
    static_features = static_features.loc[filtered_users]
    if max_seq_length is not None:
        all_vitals = all_vitals.groupby('subject_id').head(max_seq_length)
    logger.info(f'Number of patients filtered: {len(filtered_users)}.')

    # Global Mean-Std Normalisation (statistics computed per column over the selected cohort)
    mean = np.mean(all_vitals, axis=0)
    std = np.std(all_vitals, axis=0)
    all_vitals = (all_vitals - mean) / std

    static_features = process_static_features(static_features, drop_first=drop_first)

    return all_vitals, static_features

In [3]:
# Load the cohort: keep subjects with at least 24 hourly rows and crop each to its first 48 rows;
# max_number=None keeps every subject that passes the length filter.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
data_path = r'C:\Users\mhr_k\Data\mimic_extract\all_hourly_data_100.h5'
all_vitals, static_features = load_mimic3_data_raw(data_path, min_seq_length=24, max_seq_length=48, max_number=None)

  all_vitals = all_vitals.fillna(method='ffill')
  all_vitals = all_vitals.fillna(method='bfill')


# Preview the first rows of the processed static features.
static_features.head()

In [23]:
# Preview the first rows of the normalised vitals (indexed by subject_id and hours_in).
all_vitals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,heart rate,red blood cell count,sodium,mean blood pressure,systemic vascular resistance,glucose,chloride urine,glascow coma scale total,hematocrit,positive end-expiratory pressure set,...,hemoglobin,creatinine,blood urea nitrogen,bicarbonate,calcium ionized,partial pressure of carbon dioxide,magnesium,anion gap,phosphorous,platelets
subject_id,hours_in,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10006,0,1.007209,2.005556,0.012826,-0.911506,-2.371708,1.289831,0.431609,0.468275,2.762765,0.294488,...,2.251161,1.857278,-0.922549,1.143392,-2.250698,1.390189,-1.782192,1.640699,1.260629,-0.942348
10006,1,1.007209,2.005556,0.012826,-0.911506,-2.371708,1.289831,0.431609,0.468275,2.762765,0.294488,...,2.251161,1.857278,-0.922549,1.143392,-2.250698,1.390189,-1.782192,1.640699,1.260629,-0.942348
10006,2,0.683042,2.005556,0.012826,-0.043978,-2.371708,0.120058,0.431609,0.468275,2.762765,0.294488,...,2.251161,1.857278,-0.922549,1.143392,-2.250698,1.390189,-1.782192,1.640699,1.260629,-0.942348
10006,3,0.624102,2.005556,0.012826,-0.586182,-2.371708,0.120058,0.431609,0.468275,2.762765,0.294488,...,2.251161,1.857278,-0.922549,1.143392,-2.250698,1.390189,-1.782192,1.640699,1.260629,-0.942348
10006,4,0.447284,2.005556,0.012826,-0.304237,-2.371708,0.120058,0.431609,0.468275,2.762765,0.294488,...,2.251161,1.857278,-0.922549,1.143392,-2.250698,1.390189,-1.782192,1.640699,1.260629,-0.942348


In [24]:
# Preview the static features: one-hot category columns followed by the normalised age column.
static_features.head()

Unnamed: 0_level_0,F,M,AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE,ASIAN,BLACK/AFRICAN AMERICAN,HISPANIC OR LATINO,HISPANIC/LATINO - PUERTO RICAN,OTHER,UNKNOWN/NOT SPECIFIED,WHITE,age
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10006,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.343801
10013,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.087383
10017,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.296261
10019,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.682628
10026,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.23154


In [4]:
import os, sys
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)
from src.data.mimic.semi_synthetic_dataset import MIMIC3SyntheticDataset

ImportError: cannot import name 'ROOT_PATH' from 'src' (c:\Users\mhr_k\OneDrive\Documents\Thesis\Neural-R-Learner\src\__init__.py)

In [4]:
# Open the HDF5 store read-only and print every key it contains.
# The `h5` handle stays open after this cell: the next cell reads tables from it,
# and no visible cell closes it.
data_path = r'C:\Users\mhr_k\Data\mimic_extract\all_hourly_data_100.h5'
h5 = pd.HDFStore(data_path, 'r')
for key in h5.keys():
    print(key)

/codes
/interventions
/patients
/vitals_labs
/vitals_labs_mean
/patients/meta/values_block_6/meta
/patients/meta/values_block_5/meta
/patients/meta/values_block_4/meta
/patients/meta/values_block_0/meta


In [5]:
# Read four of the stored objects from the open store (requires `h5` from the previous cell).
vitals = h5['/vitals_labs']
vitals_mean = h5['/vitals_labs_mean']
patients = h5['/patients']
treatments = h5['/interventions']

In [17]:
# Rebuild the unnormalised vitals frame step by step to inspect the raw values.
vital_list = [
    'heart rate',
    'red blood cell count',
    'sodium',
    'mean blood pressure',
    'systemic vascular resistance',
    'glucose',
    'chloride urine',
    'glascow coma scale total',
    'hematocrit',
    'positive end-expiratory pressure set',
    'respiratory rate',
    'prothrombin time pt',
    'cholesterol',
    'hemoglobin',
    'creatinine',
    'blood urea nitrogen',
    'bicarbonate',
    'calcium ionized',
    'partial pressure of carbon dioxide',
    'magnesium',
    'anion gap',
    'phosphorous',
    'platelets'
]
all_vitals = vitals_mean[vital_list]
all_vitals = all_vitals.droplevel(['hadm_id', 'icustay_id'])
# Column labels may be tuples (multi-level header); keep only the first level.
column_names = [
    column if isinstance(column, str) else column[0]
    for column in all_vitals.columns
]
all_vitals.columns = column_names
# Forward fill only (no backward fill), so leading gaps remain NaN.
# .ffill() replaces the deprecated fillna(method='ffill').
all_vitals = all_vitals.ffill()

  all_vitals = all_vitals.fillna(method='ffill')


In [18]:
# Show the first 48 rows of the forward-filled, unnormalised vitals.
all_vitals.head(n = 48)

Unnamed: 0_level_0,Unnamed: 1_level_0,heart rate,red blood cell count,sodium,mean blood pressure,systemic vascular resistance,glucose,chloride urine,glascow coma scale total,hematocrit,positive end-expiratory pressure set,...,hemoglobin,creatinine,blood urea nitrogen,bicarbonate,calcium ionized,partial pressure of carbon dioxide,magnesium,anion gap,phosphorous,platelets
subject_id,hours_in,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10006,0,,4.46,139.0,,,217.0,,,42.4,,...,13.7,3.0,9.0,29.0,,,1.3,20.0,,116.0
10006,1,101.5,4.46,139.0,63.666698,,217.0,,15.0,42.4,,...,13.7,3.0,9.0,29.0,,,1.3,20.0,,116.0
10006,2,96.0,4.46,139.0,77.0,,149.0,,15.0,42.4,,...,13.7,3.0,9.0,29.0,,,1.3,20.0,,116.0
10006,3,95.0,4.46,139.0,68.666702,,149.0,,15.0,42.4,,...,13.7,3.0,9.0,29.0,,,1.3,20.0,,116.0
10006,4,92.0,4.46,139.0,73.0,,149.0,,15.0,42.4,,...,13.7,3.0,9.0,29.0,,,1.3,20.0,,116.0
10006,5,88.0,3.77,139.0,76.0,,87.5,,15.0,36.900001,,...,11.7,3.5,11.0,31.0,,,1.4,12.0,5.0,106.0
10006,6,92.0,3.77,139.0,73.666702,,87.5,,15.0,36.900001,,...,11.7,3.5,11.0,31.0,,,1.4,12.0,5.0,106.0
10006,7,92.0,3.77,139.0,70.333298,,87.5,,15.0,36.900001,,...,11.7,3.5,11.0,31.0,,,1.4,12.0,5.0,106.0
10006,8,93.0,3.77,139.0,74.666702,,55.0,,15.0,36.900001,,...,11.7,3.5,11.0,31.0,,,1.4,12.0,5.0,106.0
10006,9,93.0,3.77,139.0,74.666702,,55.0,,15.0,36.900001,,...,11.7,3.5,11.0,31.0,,,1.4,12.0,5.0,106.0


In [19]:
# Plain-text dump of the frame; the printed output elides the middle rows with "...".
print(all_vitals)

                     heart rate  red blood cell count  sodium  \
subject_id hours_in                                             
10006      0                NaN                  4.46   139.0   
           1              101.5                  4.46   139.0   
           2               96.0                  4.46   139.0   
           3               95.0                  4.46   139.0   
           4               92.0                  4.46   139.0   
...                         ...                   ...     ...   
44228      106             81.0                  3.12   139.0   
           107             73.0                  3.12   139.0   
           108             77.0                  3.12   139.0   
           109             77.0                  3.12   139.0   
           110             77.0                  3.12   139.0   

                     mean blood pressure  systemic vascular resistance  \
subject_id hours_in                                                      
10006 