## Import Libraries

In [50]:
import numpy as np
import pandas as pd
import pickle
import os
import shutil
import random
from tqdm import tqdm
from random import seed
import matplotlib
import matplotlib.pyplot as plt
import missingno as msno
  
seed(1121)

## Load data


In [26]:
! mkdir -p ./data/balanced_2000/data_all/
! mkdir -p ./data/balanced_2000/baseline_all/
! mkdir -p ./data/balanced_2000/engineered_all/

In [27]:
# !rsync -a ./data/balanced_2000/non_sepsis/ ./data/balanced_2000/data_all/
# !rsync -a ./data/balanced_2000/sepsis/ ./data/balanced_2000/data_all/

In [51]:
path_all = './data/balanced_2000/data_all/'
path_non_sepsis = './data/balanced_2000/non_sepsis/'
path_sepsis = './data/balanced_2000/sepsis/'

Sort all the patient id

In [52]:
patient_id_sepsis = sorted(os.listdir(path_sepsis))
patient_id_sepsis = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id_sepsis))  # filter out the DS file from MacOS
len(patient_id_sepsis)

1000

In [53]:
patient_id_non_sepsis = sorted(os.listdir(path_non_sepsis))
patient_id_non_sepsis = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id_non_sepsis))  # filter out the DS file from MacOS
len(patient_id_non_sepsis)

1000

In [54]:
patient_id = sorted(os.listdir(path_all))
patient_id = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id))  # filter out the DS file from MacOS
len(patient_id)

2000

Train-Val-Test Split -> 0.7 + 0.15 + 0.15 

In [55]:
len_train_sepsis = round(0.7*len(patient_id_sepsis))
len_val_sepsis = round(0.15*len(patient_id_sepsis))
len_test_sepsis = round(0.15*len(patient_id_sepsis))
len_train_sepsis + len_val_sepsis + len_test_sepsis == len(patient_id_sepsis)

True

In [56]:
len_train_non_sepsis = round(0.7*len(patient_id_non_sepsis))
len_val_non_sepsis = round(0.15*len(patient_id_non_sepsis))
len_test_non_sepsis = round(0.15*len(patient_id_non_sepsis))
len_train_non_sepsis + len_val_non_sepsis + len_test_non_sepsis == len(patient_id_non_sepsis)

True

In [57]:
train_id_sepsis = random.sample(patient_id_sepsis, len_train_sepsis)
val_id_sepsis = random.sample(set(patient_id_sepsis) - set(train_id_sepsis), len_val_sepsis)
test_id_sepsis = list(set(patient_id_sepsis) - set(train_id_sepsis) - set(val_id_sepsis))

In [58]:
train_id_non_sepsis = random.sample(patient_id_non_sepsis, len_train_non_sepsis)
val_id_non_sepsis = random.sample(set(patient_id_non_sepsis) - set(train_id_non_sepsis), len_val_non_sepsis)
test_id_non_sepsis = list(set(patient_id_non_sepsis) - set(train_id_non_sepsis) - set(val_id_non_sepsis))

In [59]:
train_id = train_id_sepsis + train_id_non_sepsis
val_id = val_id_sepsis + val_id_non_sepsis
test_id = test_id_sepsis + test_id_non_sepsis

In [60]:
print('Number of train:', len(train_id))
print('Number of validation:', len(val_id))
print('Number of test:', len(test_id))


Number of train: 1400
Number of validation: 300
Number of test: 300


In [39]:
test_set_balanced = np.concatenate((patient_id_sepsis, patient_id_non_sepsis))
np.save('./data/balanced_2000/test_set.npy', test_set_balanced)
np.save('./data/balanced_2000/train_sepsis.npy', patient_id_sepsis)
np.save('./data/balanced_2000/train_nonsepsis.npy', patient_id_non_sepsis)

## Pre-process dataset

### Helper Functions

* Function 1: Fill the missing values

In [61]:
# function to fill missing values
def impute_missing_vals(df, attributes):

    """
    function that imputes missing values.
    
    @param df: dataframe that has missing values to be
               imputed
           attributes: list of String, attributes of dataframe
    @return df_clean: dataframe without missing values

    """
    
    """
    fill missing values by the closest values first
    ffill to fill missing values in the tail
    bfill to fill missing values in the head
    """
    # copy df
    df_clean = df.copy()
    for att in attributes:
        if df_clean[att].isnull().sum() == len(df_clean):
            df_clean[att] = df_clean[att].fillna(0)
        elif df_clean[att].isnull().sum() == len(df_clean) - 1:
            df_clean[att] = df_clean[att].ffill().bfill()
        else:
            df_clean[att] = df_clean[att].interpolate(method='nearest', limit_direction='both')
            df_clean[att] = df_clean[att].ffill().bfill()
    
    return df_clean

* Function 2: Genearte new feature columns

In [62]:
# function to add new columns containing as mang features as possible
def new_features(df):
  #attributes that are worked on: HR, Resp, Temp

  #copy df
  df_dev = df.copy()

  # three standard deviation for HR -> HR_dev_1, HR_dev_2, HR_dev_3
  hr = df_dev['HR']
  hr_len = len(hr)
  hr_dev_1 = hr.copy()
  hr_dev_2 = hr.copy()
  hr_dev_3 = hr.copy()
  for i, j in hr_dev_1.iteritems():
    if i > hr_len - 5:
      hr_dev_1[i] = hr[i:hr_len].std()
    else:
      hr_dev_1[i] = hr[i:i+5].std()
  hr_dev_1[hr_len-1] = hr_dev_1[hr_len-2]
  df_dev['HR_dev_1'] = hr_dev_1

  for i, j in hr_dev_2.iteritems():
    if i > hr_len - 10:
      hr_dev_2[i] = hr[i:hr_len].std()
    else:
      hr_dev_2[i] = hr[i:i+10].std()
  hr_dev_2[hr_len-1] = hr_dev_2[hr_len-2]
  df_dev['HR_dev_2'] = hr_dev_2

  for i, j in hr_dev_3.iteritems():
    if i > hr_len - 20:
      hr_dev_3[i] = hr[i:hr_len].std()
    else:
      hr_dev_3[i] = hr[i:i+20].std()
  hr_dev_3[hr_len-1] = hr_dev_3[hr_len-2]
  df_dev['HR_dev_3'] = hr_dev_3


  # three standard deviation for Resp -> Resp_dev_1, Resp_dev_2, Resp_dev_3
  rr = df_dev['RR']
  rr_len = len(rr)
  rr_dev_1 = rr.copy()
  rr_dev_2 = rr.copy()
  rr_dev_3 = rr.copy()
  for i, j in rr_dev_1.iteritems():
    if i < rr_len -5:
      rr_dev_1[i] = rr[i:rr_len].std()
    else:
      rr_dev_1[i] = rr[i:i+5].std()
  rr_dev_1[rr_len-1] = rr_dev_1[rr_len-2]
  df_dev['RR_dev_1'] = rr_dev_1

  for i, j in rr_dev_2.iteritems():
    if i < rr_len -10:
      rr_dev_2[i] = rr[i:rr_len].std()
    else:
      rr_dev_2[i] = rr[i:i+10].std()
  rr_dev_2[rr_len-1] = rr_dev_2[rr_len-2]
  df_dev['RR_dev_2'] = rr_dev_2

  for i, j in rr_dev_3.iteritems():
    if i < rr_len -20:
      rr_dev_3[i] = rr[i:rr_len].std()
    else:
      rr_dev_3[i] = rr[i:i+20].std()
  rr_dev_3[rr_len-1] = rr_dev_3[rr_len-2]
  df_dev['RR_dev_3'] = rr_dev_3

  # three standard deviation for Temp -> Temp_dev_1, Temp_dev_2, Temp_dev_3
  t = df_dev['Temp']
  t_len = len(t)
  t_dev_1 = t.copy()
  t_dev_2 = t.copy()
  t_dev_3 = t.copy()
  for i, j in t_dev_1.iteritems():
    if i < t_len - 5:
      t_dev_1[i] = t[i:t_len].std()
    else:
      t_dev_1[i] = t[i:i+5].std()
  t_dev_1[t_len-1] = t_dev_1[t_len-2]
  df_dev['Temp_dev_1'] = t_dev_1

  for i, j in t_dev_2.iteritems():
    if i < t_len - 10:
      t_dev_2[i] = t[i:t_len].std()
    else:
      t_dev_2[i] = t[i:i+10].std()
  t_dev_2[t_len-1] = t_dev_2[t_len-2]
  df_dev['Temp_dev_2'] = t_dev_2

  for i, j in t_dev_3.iteritems():
    if i < t_len - 20:
      t_dev_3[i] = t[i:t_len].std()
    else:
      t_dev_3[i] = t[i:i+20].std()
  t_dev_3[t_len-1] = t_dev_3[t_len-2]
  df_dev['Temp_dev_3'] = t_dev_3

  # features for indicating bradycardia and tachycardia
  brady = hr.copy()
  tachy = hr.copy()
  for i,j in brady.iteritems():
    if hr[i] < 100:
      brady[i] = True
    else:
      brady[i] = False
    if hr[i] > 220:
      tachy[i] = True
    else:
      tachy[i] = False
  df_dev['Bradycardia'] = brady
  df_dev['Tachycardia'] = tachy

  # features for indicating Hypothermia, Hyperthermia(Fever) and Hyperpyrexia
  hypothermia = t.copy()
  fever = t.copy()
  hyperpyrexia = t.copy()
  for i,j in t.iteritems():
    if t[i] < 36.5:
      hypothermia[i] = True
    else:
      hypothermia[i] = False
    if t[i] > 38:
      fever[i] = True
    else:
      fever[i] = False
    if t[i] > 40:
      hyperpyrexia[i] = True
    else:
      hyperpyrexia[i] = False
  df_dev['Hypothermia'] = hypothermia
  df_dev['Fever'] = fever
  df_dev['Hyperpyrexia'] = hyperpyrexia

  return df_dev


* Function 3: modify extra features that are different from Cinc2019

In [63]:
def modify_extra_features(df):
    df['Lactic'] = 0
    df['Magnesium'] = 0
    df['HCO3'] = 0
    df['age'] = 0
    df = df.drop('Sodium', axis=1)
    df = df.drop('FiO2', axis=1)
    df = df.drop('PCO2', axis=1)

    return df 

### Generate Raw Data Pickle

In [49]:
#raw data of the whole dataset
all_data = pd.DataFrame([])

for p in tqdm(patient_id):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    all_data = pd.concat([all_data, df], ignore_index=False)

filename = './data/balanced_2000/data_all.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data, f)

100%|██████████| 2000/2000 [00:56<00:00, 35.54it/s]


In [64]:
#raw data of the septic dataset
sepsis_data = pd.DataFrame([])

for p in tqdm(patient_id_sepsis):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    sepsis_data = pd.concat([sepsis_data, df], ignore_index=False)

filename = './data/balanced_2000/sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(sepsis_data, f)

100%|██████████| 1000/1000 [00:19<00:00, 51.63it/s]


In [65]:
#raw data of the non-septic dataset
non_sepsis_data = pd.DataFrame([])

for p in tqdm(patient_id_non_sepsis):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    non_sepsis_data = pd.concat([non_sepsis_data, df], ignore_index=False)

filename = './data/balanced_2000/non_sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(non_sepsis_data, f)

100%|██████████| 1000/1000 [00:19<00:00, 50.46it/s]


### Generate Baseline Data

In [45]:
# impute missing values and create clean dfs for all patients
for p in tqdm(patient_id):
    
    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    
    # impute missing values
    attributes = df.columns
    df_filled = impute_missing_vals(df, attributes)

    # change the temperature value
    # df_filled['Temp'] = df_filled['Temp'].apply(lambda x: (x-32)/1.8)

    # change the False/True into 0/1
    df_replace_label = df_filled.replace([True,False],[1,0]) # from the label
    df_replace_gender = df_replace_label.replace(['M','F'],[1,0]) #from the gender

    #re-label sepsis 6 hours ahead
    # df_final = relabel_positive(df_replace_gender)

    df_final = modify_extra_features(df_replace_gender)
    
    # save new patient data
    save_path = './data/balanced_2000/baseline_all/'
    df_final.to_csv(save_path + p , sep=',', index = False)     

    if np.any(pd.isnull(df_final)): break

100%|██████████| 2000/2000 [03:21<00:00,  9.94it/s]


In [47]:
# check the features in Cinc2019 and MIMIC-III are the same
attris_cinc2019 = list(pd.read_csv('../datasets/Cinc2019/baseline_all/p000001.csv').columns)
print(attris_cinc2019)
attris_mimiciii = list(pd.read_csv('./data/balanced_2000/baseline_all/1001.csv').columns)
print(attris_mimiciii)
sorted(attris_cinc2019) == sorted(attris_mimiciii)

['HR', 'SaO2', 'Temp', 'SBP', 'MAP', 'DBP', 'RR', 'BaseExcess', 'HCO3', 'PH', 'BUN', 'Calcium', 'Chloride', 'Creatinine', 'Glucose', 'Lactic', 'Magnesium', 'Potassium', 'PTT', 'WBC', 'Platelet', 'age', 'gender', 'sepsis', 'subject_id']
['sepsis', 'subject_id', 'gender', 'SBP', 'DBP', 'MAP', 'Temp', 'HR', 'RR', 'BaseExcess', 'SaO2', 'PH', 'Calcium', 'Potassium', 'Creatinine', 'Chloride', 'Glucose', 'WBC', 'BUN', 'PTT', 'Platelet', 'Lactic', 'Magnesium', 'HCO3', 'age']


True

### Generate Engineered Data

In [48]:
# impute missing values and add new features for all patients
for p in tqdm(patient_id):
    
    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    
    # impute missing values
    attributes = df.columns
    df_filled = impute_missing_vals(df, attributes)

    # change the temperature value
    # df_filled['Temp'] = df_filled['Temp'].apply(lambda x: (x-32)/1.8)
    
    # add new features
    df_new_f = new_features(df_filled)
    # change the False/True into 0/1
    df_replace_label = df_new_f.replace([True,False],[1,0]) # from the label
    df_replace_gender = df_replace_label.replace(['M','F'],[1,0]) #from the gender

    df_final = modify_extra_features(df_replace_gender)
    
    # save new patient data
    save_path = './data/balanced_2000/engineered_all/'
    df_final.to_csv(save_path + p , sep=',', index = False)     

    if np.any(pd.isnull(df_final)): break

100%|██████████| 2000/2000 [07:41<00:00,  4.33it/s]


## Others?

## Process raw data


In [25]:
data_train = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/training/'
data_val = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/validation/'
data_test = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/test/'

Split into folders

In [27]:
for p in train_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df.to_csv(data_train  + p, sep=',', index = False) 

In [28]:
for p in val_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df.to_csv(data_val + p, sep=',', index = False) 

In [29]:
for p in test_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df.to_csv(data_test + p, sep=',', index = False)

Files -> Pickle

In [30]:
#raw data of the whole dataset
all_data = pd.DataFrame([])

for p in patient_id:

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    attributes = df.columns[:-1]
    all_data = pd.concat([all_data, df], ignore_index=False)

filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/all_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data, f)

In [31]:
all_data_sepsis = pd.DataFrame([])

for p in patient_id_sepsis:

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    all_data_sepsis = pd.concat([all_data_sepsis, df], ignore_index=False)

filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/all_data_sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data_sepsis, f)

In [32]:
all_data_non_sepsis = pd.DataFrame([])

for p in patient_id_non_sepsis:

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    all_data_non_sepsis = pd.concat([all_data_non_sepsis, df], ignore_index=False)

filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/all_data_non_sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data_non_sepsis, f)

In [33]:
train_data = pd.DataFrame([])

for p in train_id:

    # read in patient data
    df = pd.read_csv(data_train + p, sep = ",")
    attributes = df.columns[:-1]
    train_data = pd.concat([train_data, df], ignore_index=False)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/train_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(train_data, f)

In [34]:
val_data = pd.DataFrame([])

for p in val_id:

    # read in patient data
    df = pd.read_csv(data_val + p, sep = ",")
    attributes = df.columns[:-1]
    val_data = pd.concat([val_data, df], ignore_index=False)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/val_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(val_data, f)

In [35]:
test_data = pd.DataFrame([])

for p in test_id:

    # read in patient data
    df = pd.read_csv(data_test + p, sep = ",")
    attributes = df.columns[:-1]
    test_data = pd.concat([test_data, df], ignore_index=False)
    
import pickle
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/test_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(test_data, f)

Check the shape of traning, val and test dataset

In [36]:
print(all_data.shape)
print(train_data.shape)
print(val_data.shape)
print(test_data.shape)

(177748, 25)
(123853, 25)
(27287, 25)
(26608, 25)


Pickle -> CSV

In [37]:
#all
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/all_data.pickle'
with open(filename, 'rb') as f:
    all = pickle.load(f)

all.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/all_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/all_data.csv', sep = ',')

In [39]:
#train
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/train_data.pickle'
with open(filename, 'rb') as f:
    train = pickle.load(f)

train.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/train_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_datatrain_data.csv', sep = ',')

In [40]:
#val
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/val_data.pickle'
with open(filename, 'rb') as f:
    val = pickle.load(f)

val.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/val_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_dataval_data.csv', sep = ',')

In [41]:
#test
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/test_data.pickle'
with open(filename, 'rb') as f:
    test = pickle.load(f)

test.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/test_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/raw_data/test_data.csv', sep = ',')

## Helper Functions

Function 2: Fill the missing value

In [49]:
# function to fill missing values
def impute_missing_vals(df, attributes):

    """
    function that imputes missing values.
    
    @param df: dataframe that has missing values to be
               imputed
           attributes: list of String, attributes of dataframe
    @return df_clean: dataframe without missing values

    """
    
    """
    fill missing values by the closest values first
    ffill to fill missing values in the tail
    bfill to fill missing values in the head
    """
    # copy df
    df_clean = df.copy()
    for att in attributes:
        if df_clean[att].isnull().sum() == len(df_clean):
            df_clean[att] = df_clean[att].fillna(0)
        elif df_clean[att].isnull().sum() == len(df_clean) - 1:
            df_clean[att] = df_clean[att].ffill().bfill()
        else:
            df_clean[att] = df_clean[att].interpolate(method='nearest', limit_direction='both')
            df_clean[att] = df_clean[att].ffill().bfill()
    
    return df_clean

Function 3: Generate new feature columns

In [50]:
# function to add new columns containing as mang features as possible
def new_features(df):
  #attributes that are worked on: HR, Resp, Temp

  #copy df
  df_dev = df.copy()

  # three standard deviation for HR -> HR_dev_1, HR_dev_2, HR_dev_3
  hr = df_dev['HR']
  hr_len = len(hr)
  hr_dev_1 = hr.copy()
  hr_dev_2 = hr.copy()
  hr_dev_3 = hr.copy()
  for i, j in hr_dev_1.iteritems():
    if i > hr_len - 5:
      hr_dev_1[i] = hr[i:hr_len].std()
    else:
      hr_dev_1[i] = hr[i:i+5].std()
  hr_dev_1[hr_len-1] = hr_dev_1[hr_len-2]
  df_dev['HR_dev_1'] = hr_dev_1

  for i, j in hr_dev_2.iteritems():
    if i > hr_len - 10:
      hr_dev_2[i] = hr[i:hr_len].std()
    else:
      hr_dev_2[i] = hr[i:i+10].std()
  hr_dev_2[hr_len-1] = hr_dev_2[hr_len-2]
  df_dev['HR_dev_2'] = hr_dev_2

  for i, j in hr_dev_3.iteritems():
    if i > hr_len - 20:
      hr_dev_3[i] = hr[i:hr_len].std()
    else:
      hr_dev_3[i] = hr[i:i+20].std()
  hr_dev_3[hr_len-1] = hr_dev_3[hr_len-2]
  df_dev['HR_dev_3'] = hr_dev_3


  # three standard deviation for Resp -> Resp_dev_1, Resp_dev_2, Resp_dev_3
  rr = df_dev['RR']
  rr_len = len(rr)
  rr_dev_1 = rr.copy()
  rr_dev_2 = rr.copy()
  rr_dev_3 = rr.copy()
  for i, j in rr_dev_1.iteritems():
    if i < rr_len -5:
      rr_dev_1[i] = rr[i:rr_len].std()
    else:
      rr_dev_1[i] = rr[i:i+5].std()
  rr_dev_1[rr_len-1] = rr_dev_1[rr_len-2]
  df_dev['RR_dev_1'] = rr_dev_1

  for i, j in rr_dev_2.iteritems():
    if i < rr_len -10:
      rr_dev_2[i] = rr[i:rr_len].std()
    else:
      rr_dev_2[i] = rr[i:i+10].std()
  rr_dev_2[rr_len-1] = rr_dev_2[rr_len-2]
  df_dev['RR_dev_2'] = rr_dev_2

  for i, j in rr_dev_3.iteritems():
    if i < rr_len -20:
      rr_dev_3[i] = rr[i:rr_len].std()
    else:
      rr_dev_3[i] = rr[i:i+20].std()
  rr_dev_3[rr_len-1] = rr_dev_3[rr_len-2]
  df_dev['RR_dev_3'] = rr_dev_3

  # three standard deviation for Temp -> Temp_dev_1, Temp_dev_2, Temp_dev_3
  t = df_dev['Temp']
  t_len = len(t)
  t_dev_1 = t.copy()
  t_dev_2 = t.copy()
  t_dev_3 = t.copy()
  for i, j in t_dev_1.iteritems():
    if i < t_len - 5:
      t_dev_1[i] = t[i:t_len].std()
    else:
      t_dev_1[i] = t[i:i+5].std()
  t_dev_1[t_len-1] = t_dev_1[t_len-2]
  df_dev['Temp_dev_1'] = t_dev_1

  for i, j in t_dev_2.iteritems():
    if i < t_len - 10:
      t_dev_2[i] = t[i:t_len].std()
    else:
      t_dev_2[i] = t[i:i+10].std()
  t_dev_2[t_len-1] = t_dev_2[t_len-2]
  df_dev['Temp_dev_2'] = t_dev_2

  for i, j in t_dev_3.iteritems():
    if i < t_len - 20:
      t_dev_3[i] = t[i:t_len].std()
    else:
      t_dev_3[i] = t[i:i+20].std()
  t_dev_3[t_len-1] = t_dev_3[t_len-2]
  df_dev['Temp_dev_3'] = t_dev_3

  # features for indicating bradycardia and tachycardia
  brady = hr.copy()
  tachy = hr.copy()
  for i,j in brady.iteritems():
    if hr[i] < 100:
      brady[i] = True
    else:
      brady[i] = False
    if hr[i] > 220:
      tachy[i] = True
    else:
      tachy[i] = False
  df_dev['Bradycardia'] = brady
  df_dev['Tachycardia'] = tachy

  # features for indicating Hypothermia, Hyperthermia(Fever) and Hyperpyrexia
  hypothermia = t.copy()
  fever = t.copy()
  hyperpyrexia = t.copy()
  for i,j in t.iteritems():
    if t[i] < 97.7:
      hypothermia[i] = True
    else:
      hypothermia[i] = False
    if t[i] >100.4:
      fever[i] = True
    else:
      fever[i] = False
    if t[i] > 104:
      hyperpyrexia[i] = True
    else:
      hyperpyrexia[i] = False
  df_dev['Hypothermia'] = hypothermia
  df_dev['Fever'] = fever
  df_dev['Hyperpyrexia'] = hyperpyrexia

  return df_dev


## Process baseline data


raw data can not be used for model training due to the null in the data;

baseline data are just raw data with the blank filled

In [45]:
data_train = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/training/'
data_val = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/validation/'
data_test = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/test/'

Split into folders

In [51]:
for p in train_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    attributes = df.columns
    df = impute_missing_vals(df,attributes) # impute missing values
    df.to_csv(data_train  + p, sep=',', index = False) 

In [52]:
for p in val_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    attributes = df.columns
    df = impute_missing_vals(df,attributes) # impute missing values
    df.to_csv(data_val + p, sep=',', index = False) 

In [53]:
for p in test_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    attributes = df.columns
    df = impute_missing_vals(df,attributes) # impute missing values
    df.to_csv(data_test + p, sep=',', index = False)

Files -> Pickle

In [71]:
train_data = pd.DataFrame([])

for p in train_id:

    # read in patient data
    df = pd.read_csv(data_train + p, sep = ",")
    attributes = df.columns[:-1]
    train_data = pd.concat([train_data, df], ignore_index=False)
    train_data = train_data.drop(df.columns[0], axis = 1)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/train_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(train_data, f)

In [73]:
val_data = pd.DataFrame([])

for p in val_id:

    # read in patient data
    df = pd.read_csv(data_val + p, sep = ",")
    attributes = df.columns[:-1]
    val_data = pd.concat([val_data, df], ignore_index=True)
    val_data = val_data.drop(df.columns[0], axis = 1)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/val_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(val_data, f)

In [74]:
test_data = pd.DataFrame([])

for p in test_id:

    # read in patient data
    df = pd.read_csv(data_test + p, sep = ",")
    attributes = df.columns[:-1]
    test_data = pd.concat([test_data, df], ignore_index=False)
    test_data = test_data.drop(df.columns[0], axis = 1)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/test_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(test_data, f)

Check the shape of traning, val and test dataset

In [75]:
print(train_data.shape)
print(val_data.shape)
print(test_data.shape)

(123853, 24)
(27287, 24)
(26608, 24)


Pickle -> CSV

In [76]:
#train
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/train_data.pickle'
with open(filename, 'rb') as f:
    train = pickle.load(f)

train.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/train_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/train_data.csv', sep = ',')

In [77]:
#val
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/val_data.pickle'
with open(filename, 'rb') as f:
    val = pickle.load(f)

val.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/val_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/val_data.csv', sep = ',')

In [78]:
#raw_test
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/test_data.pickle'
with open(filename, 'rb') as f:
    test = pickle.load(f)

test.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/test_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/baseline_data/test_data.csv', sep = ',')

##Process feature-engineered data

Impute missing data and add engineered features

In [79]:
# impute missing values and create clean dfs for all patients
for p in patient_id:
    
    # read in patient data
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    attributes = df.columns
    
    # impute missing values
    df_filled = impute_missing_vals(df, attributes)

    # add features of deviation
    df_engineered = new_features(df_filled)
    
    # drop unit1 and unit2 with half missing values
    # because these two features have few information
    # drop EtCO2 with all missing values
    df_clean = df_engineered.drop(df_engineered.columns[0], axis=1)
    
    # save new patient data
    if p in train_id:
        save_path = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/training/'
        df_clean.to_csv(save_path + p, sep=',', index = False)        
    
    elif p in val_id:
        save_path = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/validation/'
        df_clean.to_csv(save_path + p, sep=',', index = False)        
    
    else:
        
        save_path = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/test/'
        df_clean.to_csv(save_path + p, sep=',', index = False)
    
    print(p)

1.csv
10.csv
100.csv
1000.csv
1001.csv
1002.csv
1003.csv
1004.csv
1005.csv
1006.csv
1007.csv
1008.csv
1009.csv
101.csv
1010.csv
1011.csv
1012.csv
1013.csv
1014.csv
1015.csv
1016.csv
1017.csv
1018.csv
1019.csv
102.csv
1020.csv
1021.csv
1022.csv
1023.csv
1024.csv
1025.csv
1026.csv
1027.csv
1028.csv
1029.csv
103.csv
1030.csv
1031.csv
1032.csv
1033.csv
1034.csv
1035.csv
1036.csv
1037.csv
1038.csv
1039.csv
104.csv
1040.csv
1041.csv
1042.csv
1043.csv
1044.csv
1045.csv
1046.csv
1047.csv
1048.csv
1049.csv
105.csv
1050.csv
1051.csv
1052.csv
1053.csv
1054.csv
1055.csv
1056.csv
1057.csv
1058.csv
1059.csv
106.csv
1060.csv
1061.csv
1062.csv
1063.csv
1064.csv
1065.csv
1066.csv
1067.csv
1068.csv
1069.csv
107.csv
1070.csv
1071.csv
1072.csv
1073.csv
1074.csv
1075.csv
1076.csv
1077.csv
1078.csv
1079.csv
108.csv
1080.csv
1081.csv
1082.csv
1083.csv
1084.csv
1085.csv
1086.csv
1087.csv
1088.csv
1089.csv
109.csv
1090.csv
1091.csv
1092.csv
1093.csv
1094.csv
1095.csv
1096.csv
1097.csv
1098.csv
1099.csv
11.csv


Files -> Pickle

In [80]:
engineered_train_data = pd.DataFrame([])

for p in train_id:
    df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/training/' + p, sep = ",")
    engineered_train_data = pd.concat([engineered_train_data, df], ignore_index=False)  #True or False
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/train_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(engineered_train_data, f)

In [81]:
engineered_val_data = pd.DataFrame([])

for p in val_id:
    df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/validation/' + p, sep = ",")
    engineered_val_data = pd.concat([engineered_val_data, df], ignore_index=False)  #True or False
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/val_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(engineered_val_data, f)

In [82]:
engineered_test_data = pd.DataFrame([])

for p in test_id:
    df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/test/' + p, sep = ",")
    engineered_test_data = pd.concat([engineered_test_data, df], ignore_index=False)  #True or False
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/test_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(engineered_test_data, f)

In [83]:
print(engineered_train_data.shape)
print(engineered_val_data.shape)
print(engineered_test_data.shape)

(123853, 38)
(27287, 38)
(26608, 38)


Pickle -> CSV

In [84]:
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/train_data.pickle'
with open(filename, 'rb') as f:
    engineered_train = pickle.load(f)

engineered_train.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/train_data.csv', sep = ',', index = False)
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/train_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,RR_dev_2,RR_dev_3,Temp_dev_1,Temp_dev_2,Temp_dev_3,Bradycardia,Tachycardia,Hypothermia,Fever,Hyperpyrexia
0,False,665,1,62.840644,5.734510,58.089622,97.914951,128.459351,41.864642,5.296790,...,16.659873,16.659873,0.580194,0.580194,0.580194,False,False,False,False,False
1,False,665,1,62.840644,5.734510,58.089622,97.914951,144.501993,41.864642,5.296790,...,16.688524,16.688524,0.582453,0.582453,0.582453,False,False,False,False,False
2,False,665,1,62.840644,5.734510,58.089622,97.914951,132.596588,41.864642,5.296790,...,16.716338,16.716338,0.584733,0.584733,0.584733,False,False,False,False,False
3,False,665,1,62.840644,5.734510,58.089622,97.914951,121.902163,41.864642,5.296790,...,16.743243,16.743243,0.587033,0.587033,0.587033,False,False,False,False,False
4,False,665,1,62.840644,5.734510,58.089622,97.914951,126.429033,47.394515,5.296790,...,16.769162,16.769162,0.589354,0.589354,0.589354,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123848,False,1073,0,164.453382,55.174784,97.458325,96.802494,152.158735,16.132928,-33.526991,...,5.973354,5.973354,0.395858,0.395858,0.395858,False,False,True,False,False
123849,False,1073,0,164.453382,55.174784,97.458325,96.802494,152.158735,17.294812,-33.526991,...,6.809208,6.809208,0.417271,0.417271,0.417271,False,False,True,False,False
123850,False,1073,0,117.241324,55.174784,97.458325,96.802494,155.680511,12.381573,-33.526991,...,8.302033,8.302033,0.417271,0.417271,0.417271,False,False,True,False,False
123851,False,1073,0,117.241324,55.174784,97.458325,97.525229,129.028689,15.357255,-33.526991,...,8.951194,8.951194,0.000000,0.000000,0.000000,False,False,True,False,False


In [85]:
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/val_data.pickle'
with open(filename, 'rb') as f:
    engineered_val = pickle.load(f)

engineered_val.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/val_data.csv', sep = ',', index = False)
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/val_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,RR_dev_2,RR_dev_3,Temp_dev_1,Temp_dev_2,Temp_dev_3,Bradycardia,Tachycardia,Hypothermia,Fever,Hyperpyrexia
0,False,191,1,86.642980,31.924519,12.889798,97.923393,146.577895,37.787744,-9.459031,...,11.889296,11.889296,0.523023,0.523023,0.523023,False,False,False,False,False
1,False,191,1,86.642980,31.924519,12.889798,97.923393,131.416146,56.875332,-9.459031,...,11.977427,11.977427,0.527892,0.527892,0.527892,False,False,False,False,False
2,False,191,1,86.642980,31.924519,12.889798,97.923393,158.953846,41.253360,-9.459031,...,11.941891,11.941891,0.532900,0.532900,0.532900,False,False,False,False,False
3,False,191,1,86.642980,31.924519,12.889798,97.923393,146.395909,44.204160,-9.459031,...,12.055732,12.055732,0.538053,0.538053,0.538053,False,False,False,False,False
4,False,191,1,86.642980,31.924519,12.889798,97.923393,138.888659,56.806749,-9.459031,...,12.174638,12.174638,0.543358,0.543358,0.543358,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27282,False,1869,1,85.762604,43.860206,70.834211,103.785962,221.249999,39.910141,-11.212679,...,17.649122,17.649122,0.975046,0.975046,0.975046,False,True,False,True,False
27283,False,1869,1,85.762604,124.558308,70.834211,103.785962,221.249999,33.208830,-11.212679,...,19.520897,19.520897,1.025984,1.025984,1.025984,False,True,False,True,False
27284,False,1869,1,85.762604,124.558308,70.834211,101.410378,239.853568,53.150431,-11.212679,...,23.606981,23.606981,1.020550,1.020550,1.020550,False,True,False,True,False
27285,False,1869,1,85.762604,124.558308,70.834211,103.178023,239.853568,20.728944,-11.212679,...,9.555171,9.555171,0.000000,0.000000,0.000000,False,True,False,True,False


In [86]:
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/test_data.pickle'
with open(filename, 'rb') as f:
    engineered_test = pickle.load(f)

engineered_test.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/test_data.csv', sep = ',', index = False)
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/artificial_neonates/models/engineered_data/test_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,RR_dev_2,RR_dev_3,Temp_dev_1,Temp_dev_2,Temp_dev_3,Bradycardia,Tachycardia,Hypothermia,Fever,Hyperpyrexia
0,False,119,0,74.800175,10.410552,47.247130,97.180918,123.207887,48.257224,-13.579506,...,13.779532,13.779532,0.334325,0.334325,0.334325,False,False,True,False,False
1,False,119,0,74.800175,10.410552,47.247130,97.180918,145.172469,48.257224,-13.579506,...,13.838639,13.838639,0.336477,0.336477,0.336477,False,False,True,False,False
2,False,119,0,74.800175,10.410552,47.247130,97.180918,121.516758,32.220504,-13.579506,...,13.894127,13.894127,0.338611,0.338611,0.338611,False,False,True,False,False
3,False,119,0,74.800175,10.410552,47.247130,97.180918,145.962110,55.909502,-13.579506,...,14.067176,14.067176,0.340717,0.340717,0.340717,False,False,True,False,False
4,False,119,0,74.800175,10.410552,47.247130,97.180918,134.249036,38.028743,-13.579506,...,13.878545,13.878545,0.342784,0.342784,0.342784,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26603,False,1789,1,16.012443,95.198022,53.718158,101.865009,133.314000,26.772464,-11.305648,...,7.939554,7.939554,0.000000,0.000000,0.000000,False,False,False,True,False
26604,False,1789,1,16.012443,95.198022,53.718158,101.865009,139.652727,26.769985,-11.305648,...,8.114253,8.114253,0.000000,0.000000,0.000000,False,False,False,True,False
26605,False,1789,1,16.012443,95.198022,53.718158,101.865009,144.656176,7.474069,-11.305648,...,7.298627,7.298627,0.000000,0.000000,0.000000,False,False,False,True,False
26606,False,1789,1,16.012443,95.198022,55.916371,101.865009,134.330772,18.544328,-11.305648,...,1.912602,1.912602,0.000000,0.000000,0.000000,False,False,False,True,False
