## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import shutil
import random
from tqdm import tqdm
from random import seed
import matplotlib
import matplotlib.pyplot as plt
import missingno as msno
  
seed(1121)

## Load data


gather all files into folder "processed_all"

In [60]:
# !rsync -a /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_non_sepsis/ /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_all/
# !rsync -a /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_sepsis/ /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_all/
# !ls -1U /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_non_sepsis | wc -l
# !ls -1U /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_sepsis | wc -l
# !ls -1U /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_all| wc -l

In [2]:
path_all = '../datasets/MIMICIII/neonates/processed_all/'
path_non_sepsis = '../datasets/MIMICIII/neonates/processed_non_sepsis/'
path_sepsis = '../datasets/MIMICIII/neonates/processed_sepsis/'

Sort all the patient id

In [3]:
patient_id_sepsis = sorted(os.listdir(path_sepsis))
patient_id_sepsis = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id_sepsis))  # filter out the DS file from MacOS
len(patient_id_sepsis)

113

In [4]:
patient_id_non_sepsis = sorted(os.listdir(path_non_sepsis))
patient_id_non_sepsis = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id_non_sepsis))  # filter out the DS file from MacOS
len(patient_id_non_sepsis)

3319

In [5]:
patient_id = sorted(os.listdir(path_all))
patient_id = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id))  # filter out the DS file from MacOS
len(patient_id)

3432

## Train-Val-Test Split -> 0.7 + 0.15 + 0.15 

In [6]:
len_train_sepsis = round(0.7*len(patient_id_sepsis))
len_val_sepsis = round(0.15*len(patient_id_sepsis))
len_test_sepsis = round(0.15*len(patient_id_sepsis))
len_train_sepsis + len_val_sepsis + len_test_sepsis == len(patient_id_sepsis)

True

In [7]:
len_train_non_sepsis = round(0.7*len(patient_id_non_sepsis))
len_val_non_sepsis = round(0.15*len(patient_id_non_sepsis))
len_test_non_sepsis = round(0.15*len(patient_id_non_sepsis))
len_train_non_sepsis + len_val_non_sepsis + len_test_non_sepsis == len(patient_id_non_sepsis)

True

In [8]:
train_id_sepsis = random.sample(patient_id_sepsis, len_train_sepsis)
val_id_sepsis = random.sample(set(patient_id_sepsis) - set(train_id_sepsis), len_val_sepsis)
test_id_sepsis = list(set(patient_id_sepsis) - set(train_id_sepsis) - set(val_id_sepsis))

In [9]:
train_id_non_sepsis = random.sample(patient_id_non_sepsis, len_train_non_sepsis)
val_id_non_sepsis = random.sample(set(patient_id_non_sepsis) - set(train_id_non_sepsis), len_val_non_sepsis)
test_id_non_sepsis = list(set(patient_id_non_sepsis) - set(train_id_non_sepsis) - set(val_id_non_sepsis))

In [10]:
train_id = train_id_sepsis + train_id_non_sepsis
val_id = val_id_sepsis + val_id_non_sepsis
test_id = test_id_sepsis + test_id_non_sepsis

sepsis_id = train_id_sepsis + val_id_sepsis
nonsepsis_id = train_id_non_sepsis + val_id_non_sepsis

In [11]:
print('Number of train:', len(train_id))
print('Number of validation:', len(val_id))
print('Number of test:', len(test_id))


Number of train: 2402
Number of validation: 515
Number of test: 515


In [12]:
np.save('./data/test_set.npy', test_id)
np.save('./data/train_sepsis.npy', sepsis_id)
np.save('./data/train_nonsepsis.npy', nonsepsis_id)
np.save('./data/all_sepsis.npy', patient_id_sepsis)
np.save('./data/all_nonsepsis.npy', patient_id_non_sepsis)

## Pre-process dataset
### Helper Functions

* Function 1: Fill the missing value

In [13]:
# function to fill missing values
def impute_missing_vals(df, attributes):

    """
    function that imputes missing values.
    
    @param df: dataframe that has missing values to be
               imputed
           attributes: list of String, attributes of dataframe
    @return df_clean: dataframe without missing values

    """
    
    """
    fill missing values by the closest values first
    ffill to fill missing values in the tail
    bfill to fill missing values in the head
    """
    # copy df
    df_clean = df.copy()
    for att in attributes:
        if df_clean[att].isnull().sum() == len(df_clean):
            df_clean[att] = df_clean[att].fillna(0)
        elif df_clean[att].isnull().sum() == len(df_clean) - 1:
            df_clean[att] = df_clean[att].ffill().bfill()
        else:
            df_clean[att] = df_clean[att].interpolate(method='nearest', limit_direction='both')
            df_clean[att] = df_clean[att].ffill().bfill()
    
    return df_clean

* Function 2: Genearte new feature columns

In [14]:
# function to add new columns containing as mang features as possible
def new_features(df):
  #attributes that are worked on: HR, Resp, Temp

  #copy df
  df_dev = df.copy()

  # three standard deviation for HR -> HR_dev_1, HR_dev_2, HR_dev_3
  hr = df_dev['HR']
  hr_len = len(hr)
  hr_dev_1 = hr.copy()
  hr_dev_2 = hr.copy()
  hr_dev_3 = hr.copy()
  for i, j in hr_dev_1.iteritems():
    if i > hr_len - 5:
      hr_dev_1[i] = hr[i:hr_len].std()
    else:
      hr_dev_1[i] = hr[i:i+5].std()
  hr_dev_1[hr_len-1] = hr_dev_1[hr_len-2]
  df_dev['HR_dev_1'] = hr_dev_1

  for i, j in hr_dev_2.iteritems():
    if i > hr_len - 10:
      hr_dev_2[i] = hr[i:hr_len].std()
    else:
      hr_dev_2[i] = hr[i:i+10].std()
  hr_dev_2[hr_len-1] = hr_dev_2[hr_len-2]
  df_dev['HR_dev_2'] = hr_dev_2

  for i, j in hr_dev_3.iteritems():
    if i > hr_len - 20:
      hr_dev_3[i] = hr[i:hr_len].std()
    else:
      hr_dev_3[i] = hr[i:i+20].std()
  hr_dev_3[hr_len-1] = hr_dev_3[hr_len-2]
  df_dev['HR_dev_3'] = hr_dev_3


  # three standard deviation for Resp -> Resp_dev_1, Resp_dev_2, Resp_dev_3
  rr = df_dev['RR']
  rr_len = len(rr)
  rr_dev_1 = rr.copy()
  rr_dev_2 = rr.copy()
  rr_dev_3 = rr.copy()
  for i, j in rr_dev_1.iteritems():
    if i < rr_len -5:
      rr_dev_1[i] = rr[i:rr_len].std()
    else:
      rr_dev_1[i] = rr[i:i+5].std()
  rr_dev_1[rr_len-1] = rr_dev_1[rr_len-2]
  df_dev['RR_dev_1'] = rr_dev_1

  for i, j in rr_dev_2.iteritems():
    if i < rr_len -10:
      rr_dev_2[i] = rr[i:rr_len].std()
    else:
      rr_dev_2[i] = rr[i:i+10].std()
  rr_dev_2[rr_len-1] = rr_dev_2[rr_len-2]
  df_dev['RR_dev_2'] = rr_dev_2

  for i, j in rr_dev_3.iteritems():
    if i < rr_len -20:
      rr_dev_3[i] = rr[i:rr_len].std()
    else:
      rr_dev_3[i] = rr[i:i+20].std()
  rr_dev_3[rr_len-1] = rr_dev_3[rr_len-2]
  df_dev['RR_dev_3'] = rr_dev_3

  # three standard deviation for Temp -> Temp_dev_1, Temp_dev_2, Temp_dev_3
  t = df_dev['Temp']
  t_len = len(t)
  t_dev_1 = t.copy()
  t_dev_2 = t.copy()
  t_dev_3 = t.copy()
  for i, j in t_dev_1.iteritems():
    if i < t_len - 5:
      t_dev_1[i] = t[i:t_len].std()
    else:
      t_dev_1[i] = t[i:i+5].std()
  t_dev_1[t_len-1] = t_dev_1[t_len-2]
  df_dev['Temp_dev_1'] = t_dev_1

  for i, j in t_dev_2.iteritems():
    if i < t_len - 10:
      t_dev_2[i] = t[i:t_len].std()
    else:
      t_dev_2[i] = t[i:i+10].std()
  t_dev_2[t_len-1] = t_dev_2[t_len-2]
  df_dev['Temp_dev_2'] = t_dev_2

  for i, j in t_dev_3.iteritems():
    if i < t_len - 20:
      t_dev_3[i] = t[i:t_len].std()
    else:
      t_dev_3[i] = t[i:i+20].std()
  t_dev_3[t_len-1] = t_dev_3[t_len-2]
  df_dev['Temp_dev_3'] = t_dev_3

  # features for indicating bradycardia and tachycardia
  brady = hr.copy()
  tachy = hr.copy()
  for i,j in brady.iteritems():
    if hr[i] < 100:
      brady[i] = True
    else:
      brady[i] = False
    if hr[i] > 220:
      tachy[i] = True
    else:
      tachy[i] = False
  df_dev['Bradycardia'] = brady
  df_dev['Tachycardia'] = tachy

  # features for indicating Hypothermia, Hyperthermia(Fever) and Hyperpyrexia
  hypothermia = t.copy()
  fever = t.copy()
  hyperpyrexia = t.copy()
  for i,j in t.iteritems():
    if t[i] < 36.5:
      hypothermia[i] = True
    else:
      hypothermia[i] = False
    if t[i] > 38:
      fever[i] = True
    else:
      fever[i] = False
    if t[i] > 40:
      hyperpyrexia[i] = True
    else:
      hyperpyrexia[i] = False
  df_dev['Hypothermia'] = hypothermia
  df_dev['Fever'] = fever
  df_dev['Hyperpyrexia'] = hyperpyrexia

  return df_dev


* Function 3 : Mark positive sepsis label 6 hrs ahead

In [15]:
def relabel_positive(patient):
  df = patient
  df_len = len(df)
  if df.iloc[df_len-1].sepsis == 1:
    for i in range(7):
      df.loc[df_len-1-i,'sepsis'] = 1
  return df

Function 4: modify extra features that are different from Cinc2019

In [16]:
def modify_extra_features(df):
    df['Lactic'] = 0
    df['Magnesium'] = 0
    df['HCO3'] = 0
    df['age'] = 0
    df = df.drop('FiO2', axis=1)
    df = df.drop('PCO2', axis=1)

    return df 

### Generate Raw Data Pickle

In [17]:
#raw data of the whole dataset
all_data = pd.DataFrame([])

for p in tqdm(patient_id):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    all_data = pd.concat([all_data, df], ignore_index=False)

filename = './data/data_all.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data, f)

100%|██████████| 3432/3432 [01:41<00:00, 33.95it/s]


In [18]:
#raw data of the septic dataset
sepsis_data = pd.DataFrame([])

for p in tqdm(patient_id_sepsis):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    sepsis_data = pd.concat([sepsis_data, df], ignore_index=False)

filename = './data/sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(sepsis_data, f)

100%|██████████| 113/113 [00:00<00:00, 185.83it/s]


In [19]:
#raw data of the non-septic dataset
non_sepsis_data = pd.DataFrame([])

for p in tqdm(patient_id_non_sepsis):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    non_sepsis_data = pd.concat([non_sepsis_data, df], ignore_index=False)

filename = './data/non_sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(non_sepsis_data, f)

100%|██████████| 3319/3319 [01:38<00:00, 33.85it/s]


### Generate Baseline Data

In [52]:
!mkdir -p ../datasets/MIMICIII/neonates/baseline_all

In [20]:
# impute missing values and create clean dfs for all patients
for p in tqdm(patient_id):
    
    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")

    # drop features that doesn't exist in Cinc2019
    df_dropped = df.drop(['Sodium'], axis = 1)
    
    # impute missing values
    attributes = df_dropped.columns
    df_filled = impute_missing_vals(df_dropped, attributes)

    # change the temperature value
    df_filled['Temp'] = df_filled['Temp'].apply(lambda x: (x-32)/1.8)

    # change the False/True into 0/1
    df_replace_label = df_filled.replace([True,False],[1,0]) # from the label
    df_replace_gender = df_replace_label.replace(['M','F'],[1,0]) #from the gender

    #re-label sepsis 6 hours ahead
    df_final = relabel_positive(df_replace_gender)

    df_final = modify_extra_features(df_final)
    
    # save new patient data
    save_path = '../datasets/MIMICIII/neonates/baseline_all/'
    df_final.to_csv(save_path + p , sep=',', index = False)     

    if np.any(pd.isnull(df_final)): break

100%|██████████| 3432/3432 [01:55<00:00, 29.72it/s]


In [21]:
# check the features in Cinc2019 and MIMIC-III are the same
attris_cinc2019 = list(pd.read_csv('../datasets/Cinc2019/baseline_all/p000001.csv').columns)
print(attris_cinc2019)
attris_mimiciii = list(pd.read_csv('../datasets/MIMICIII/neonates/baseline_all/8.csv').columns)
attris_mimiciii.remove('time')
print(attris_mimiciii)
sorted(attris_cinc2019) == sorted(attris_mimiciii)

['HR', 'SaO2', 'Temp', 'SBP', 'MAP', 'DBP', 'RR', 'BaseExcess', 'HCO3', 'PH', 'BUN', 'Calcium', 'Chloride', 'Creatinine', 'Glucose', 'Lactic', 'Magnesium', 'Potassium', 'PTT', 'WBC', 'Platelet', 'age', 'gender', 'sepsis', 'subject_id']
['sepsis', 'subject_id', 'gender', 'SBP', 'DBP', 'MAP', 'Temp', 'HR', 'RR', 'BaseExcess', 'SaO2', 'PH', 'Calcium', 'Potassium', 'Creatinine', 'Chloride', 'Glucose', 'WBC', 'BUN', 'PTT', 'Platelet', 'Lactic', 'Magnesium', 'HCO3', 'age']


True

### Generate Engineered Data

In [57]:
!mkdir -p ../datasets/MIMICIII/neonates/engineered_all

In [22]:
# impute missing values and add new features for all patients
for p in tqdm(patient_id):
    
    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")

    # drop features that doesn't exist in Cinc2019
    df_dropped = df.drop(['Sodium'], axis = 1)
    
    # impute missing values
    attributes = df_dropped.columns
    df_filled = impute_missing_vals(df_dropped, attributes)

    # change the temperature value
    df_filled['Temp'] = df_filled['Temp'].apply(lambda x: (x-32)/1.8)

    # change the False/True into 0/1
    df_replace_label = df_filled.replace([True,False],[1,0]) # from the label
    df_replace_gender = df_replace_label.replace(['M','F'],[1,0]) #from the gender

    # add new features
    df_new_f = new_features(df_replace_gender)

    #re-label sepsis 6 hours ahead
    df_final = relabel_positive(df_new_f)

    df_final = modify_extra_features(df_final)
    
    # save new patient data
    save_path = '../datasets/MIMICIII/neonates/engineered_all/'
    df_final.to_csv(save_path + p , sep=',', index = False)     

    if np.any(pd.isnull(df_final)): break

100%|██████████| 3432/3432 [06:19<00:00,  9.04it/s]
