## Import Libraries

In [68]:
import numpy as np
import pandas as pd
import pickle
import os
import shutil
import random
from tqdm import tqdm
from random import seed
import matplotlib
import matplotlib.pyplot as plt
import missingno as msno
  
seed(1121)

## Load data


gather all files into folder "processed_all"

In [60]:
# !rsync -a /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_non_sepsis/ /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_all/
# !rsync -a /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_sepsis/ /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_all/
# !ls -1U /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_non_sepsis | wc -l
# !ls -1U /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_sepsis | wc -l
# !ls -1U /content/drive/MyDrive/Colab_Notebooks/MIMIC/Results/neonates/processed_all| wc -l

In [69]:
path_all = '../datasets/MIMICIII/neonates/processed_all/'
path_non_sepsis = '../datasets/MIMICIII/neonates/processed_non_sepsis/'
path_sepsis = '../datasets/MIMICIII/neonates/processed_sepsis/'

Sort all the patient id

In [70]:
patient_id_sepsis = sorted(os.listdir(path_sepsis))
patient_id_sepsis = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id_sepsis))  # filter out the DS file from MacOS
len(patient_id_sepsis)

113

In [71]:
patient_id_non_sepsis = sorted(os.listdir(path_non_sepsis))
patient_id_non_sepsis = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id_non_sepsis))  # filter out the DS file from MacOS
len(patient_id_non_sepsis)

3319

In [72]:
patient_id = sorted(os.listdir(path_all))
patient_id = list(filter(lambda x: x !="index.html" and x != ".DS_Store", patient_id))  # filter out the DS file from MacOS
len(patient_id)

3432

## Train-Val-Test Split -> 0.7 + 0.15 + 0.15 

In [73]:
len_train_sepsis = round(0.7*len(patient_id_sepsis))
len_val_sepsis = round(0.15*len(patient_id_sepsis))
len_test_sepsis = round(0.15*len(patient_id_sepsis))
len_train_sepsis + len_val_sepsis + len_test_sepsis == len(patient_id_sepsis)

True

In [74]:
len_train_non_sepsis = round(0.7*len(patient_id_non_sepsis))
len_val_non_sepsis = round(0.15*len(patient_id_non_sepsis))
len_test_non_sepsis = round(0.15*len(patient_id_non_sepsis))
len_train_non_sepsis + len_val_non_sepsis + len_test_non_sepsis == len(patient_id_non_sepsis)

True

In [75]:
train_id_sepsis = random.sample(patient_id_sepsis, len_train_sepsis)
val_id_sepsis = random.sample(set(patient_id_sepsis) - set(train_id_sepsis), len_val_sepsis)
test_id_sepsis = list(set(patient_id_sepsis) - set(train_id_sepsis) - set(val_id_sepsis))

In [76]:
train_id_non_sepsis = random.sample(patient_id_non_sepsis, len_train_non_sepsis)
val_id_non_sepsis = random.sample(set(patient_id_non_sepsis) - set(train_id_non_sepsis), len_val_non_sepsis)
test_id_non_sepsis = list(set(patient_id_non_sepsis) - set(train_id_non_sepsis) - set(val_id_non_sepsis))

In [77]:
train_id = train_id_sepsis + train_id_non_sepsis
val_id = val_id_sepsis + val_id_non_sepsis
test_id = test_id_sepsis + test_id_non_sepsis

sepsis_id = train_id_sepsis + val_id_sepsis
nonsepsis_id = train_id_non_sepsis + val_id_non_sepsis

In [78]:
print('Number of train:', len(train_id))
print('Number of validation:', len(val_id))
print('Number of test:', len(test_id))


Number of train: 2402
Number of validation: 515
Number of test: 515


In [67]:
np.save('./data/test_set.npy', test_id)
np.save('./data/train_sepsis.npy', sepsis_id)
np.save('./data/train_nonsepsis.npy', nonsepsis_id)
np.save('./data/all_sepsis.npy', patient_id_sepsis)
np.save('./data/all_nonsepsis.npy', patient_id_non_sepsis)

## Pre-process dataset
### Helper Functions

* Function 1: Fill the missing value

In [48]:
# function to fill missing values
def impute_missing_vals(df, attributes):

    """
    function that imputes missing values.
    
    @param df: dataframe that has missing values to be
               imputed
           attributes: list of String, attributes of dataframe
    @return df_clean: dataframe without missing values

    """
    
    """
    fill missing values by the closest values first
    ffill to fill missing values in the tail
    bfill to fill missing values in the head
    """
    # copy df
    df_clean = df.copy()
    for att in attributes:
        if df_clean[att].isnull().sum() == len(df_clean):
            df_clean[att] = df_clean[att].fillna(0)
        elif df_clean[att].isnull().sum() == len(df_clean) - 1:
            df_clean[att] = df_clean[att].ffill().bfill()
        else:
            df_clean[att] = df_clean[att].interpolate(method='nearest', limit_direction='both')
            df_clean[att] = df_clean[att].ffill().bfill()
    
    return df_clean

* Function 2: Genearte new feature columns

In [49]:
# function to add new columns containing as mang features as possible
def new_features(df):
  #attributes that are worked on: HR, Resp, Temp

  #copy df
  df_dev = df.copy()

  # three standard deviation for HR -> HR_dev_1, HR_dev_2, HR_dev_3
  hr = df_dev['HR']
  hr_len = len(hr)
  hr_dev_1 = hr.copy()
  hr_dev_2 = hr.copy()
  hr_dev_3 = hr.copy()
  for i, j in hr_dev_1.iteritems():
    if i > hr_len - 5:
      hr_dev_1[i] = hr[i:hr_len].std()
    else:
      hr_dev_1[i] = hr[i:i+5].std()
  hr_dev_1[hr_len-1] = hr_dev_1[hr_len-2]
  df_dev['HR_dev_1'] = hr_dev_1

  for i, j in hr_dev_2.iteritems():
    if i > hr_len - 10:
      hr_dev_2[i] = hr[i:hr_len].std()
    else:
      hr_dev_2[i] = hr[i:i+10].std()
  hr_dev_2[hr_len-1] = hr_dev_2[hr_len-2]
  df_dev['HR_dev_2'] = hr_dev_2

  for i, j in hr_dev_3.iteritems():
    if i > hr_len - 20:
      hr_dev_3[i] = hr[i:hr_len].std()
    else:
      hr_dev_3[i] = hr[i:i+20].std()
  hr_dev_3[hr_len-1] = hr_dev_3[hr_len-2]
  df_dev['HR_dev_3'] = hr_dev_3


  # three standard deviation for Resp -> Resp_dev_1, Resp_dev_2, Resp_dev_3
  rr = df_dev['RR']
  rr_len = len(rr)
  rr_dev_1 = rr.copy()
  rr_dev_2 = rr.copy()
  rr_dev_3 = rr.copy()
  for i, j in rr_dev_1.iteritems():
    if i < rr_len -5:
      rr_dev_1[i] = rr[i:rr_len].std()
    else:
      rr_dev_1[i] = rr[i:i+5].std()
  rr_dev_1[rr_len-1] = rr_dev_1[rr_len-2]
  df_dev['RR_dev_1'] = rr_dev_1

  for i, j in rr_dev_2.iteritems():
    if i < rr_len -10:
      rr_dev_2[i] = rr[i:rr_len].std()
    else:
      rr_dev_2[i] = rr[i:i+10].std()
  rr_dev_2[rr_len-1] = rr_dev_2[rr_len-2]
  df_dev['RR_dev_2'] = rr_dev_2

  for i, j in rr_dev_3.iteritems():
    if i < rr_len -20:
      rr_dev_3[i] = rr[i:rr_len].std()
    else:
      rr_dev_3[i] = rr[i:i+20].std()
  rr_dev_3[rr_len-1] = rr_dev_3[rr_len-2]
  df_dev['RR_dev_3'] = rr_dev_3

  # three standard deviation for Temp -> Temp_dev_1, Temp_dev_2, Temp_dev_3
  t = df_dev['Temp']
  t_len = len(t)
  t_dev_1 = t.copy()
  t_dev_2 = t.copy()
  t_dev_3 = t.copy()
  for i, j in t_dev_1.iteritems():
    if i < t_len - 5:
      t_dev_1[i] = t[i:t_len].std()
    else:
      t_dev_1[i] = t[i:i+5].std()
  t_dev_1[t_len-1] = t_dev_1[t_len-2]
  df_dev['Temp_dev_1'] = t_dev_1

  for i, j in t_dev_2.iteritems():
    if i < t_len - 10:
      t_dev_2[i] = t[i:t_len].std()
    else:
      t_dev_2[i] = t[i:i+10].std()
  t_dev_2[t_len-1] = t_dev_2[t_len-2]
  df_dev['Temp_dev_2'] = t_dev_2

  for i, j in t_dev_3.iteritems():
    if i < t_len - 20:
      t_dev_3[i] = t[i:t_len].std()
    else:
      t_dev_3[i] = t[i:i+20].std()
  t_dev_3[t_len-1] = t_dev_3[t_len-2]
  df_dev['Temp_dev_3'] = t_dev_3

  # features for indicating bradycardia and tachycardia
  brady = hr.copy()
  tachy = hr.copy()
  for i,j in brady.iteritems():
    if hr[i] < 100:
      brady[i] = True
    else:
      brady[i] = False
    if hr[i] > 220:
      tachy[i] = True
    else:
      tachy[i] = False
  df_dev['Bradycardia'] = brady
  df_dev['Tachycardia'] = tachy

  # features for indicating Hypothermia, Hyperthermia(Fever) and Hyperpyrexia
  hypothermia = t.copy()
  fever = t.copy()
  hyperpyrexia = t.copy()
  for i,j in t.iteritems():
    if t[i] < 36.5:
      hypothermia[i] = True
    else:
      hypothermia[i] = False
    if t[i] > 38:
      fever[i] = True
    else:
      fever[i] = False
    if t[i] > 40:
      hyperpyrexia[i] = True
    else:
      hyperpyrexia[i] = False
  df_dev['Hypothermia'] = hypothermia
  df_dev['Fever'] = fever
  df_dev['Hyperpyrexia'] = hyperpyrexia

  return df_dev


* Function 3 : Mark positive sepsis label 6 hrs ahead

In [50]:
def relabel_positive(patient):
  df = patient
  df_len = len(df)
  if df.iloc[df_len-1].sepsis == 1:
    for i in range(7):
      df.loc[df_len-1-i,'sepsis'] = 1
  return df

Function 4: modify extra features that are different from Cinc2019

In [51]:
def modify_extra_features(df):
    df['Lactic'] = 0
    df['Magnesium'] = 0
    df['HCO3'] = 0
    df['age'] = 0
    df = df.drop('FiO2', axis=1)
    df = df.drop('PCO2', axis=1)

    return df 

### Generate Raw Data Pickle

In [79]:
#raw data of the whole dataset
all_data = pd.DataFrame([])

for p in tqdm(patient_id):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    all_data = pd.concat([all_data, df], ignore_index=False)

filename = './data/data_all.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data, f)

100%|██████████| 3432/3432 [03:21<00:00, 17.01it/s]


In [80]:
#raw data of the septic dataset
sepsis_data = pd.DataFrame([])

for p in tqdm(patient_id_sepsis):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    sepsis_data = pd.concat([sepsis_data, df], ignore_index=False)

filename = './data/sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(sepsis_data, f)

100%|██████████| 113/113 [00:01<00:00, 74.20it/s]


In [81]:
#raw data of the non-septic dataset
non_sepsis_data = pd.DataFrame([])

for p in tqdm(patient_id_non_sepsis):

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    non_sepsis_data = pd.concat([non_sepsis_data, df], ignore_index=False)

filename = './data/non_sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(non_sepsis_data, f)

100%|██████████| 3319/3319 [03:08<00:00, 17.57it/s]


### Generate Baseline Data

In [52]:
!mkdir -p ../datasets/MIMICIII/neonates/baseline_all

In [53]:
# impute missing values and create clean dfs for all patients
for p in tqdm(patient_id):
    
    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")

    # drop features that doesn't exist in Cinc2019
    df_dropped = df.drop(['Sodium'], axis = 1)
    
    # impute missing values
    attributes = df_dropped.columns
    df_filled = impute_missing_vals(df_dropped, attributes)

    # change the temperature value
    df_filled['Temp'] = df_filled['Temp'].apply(lambda x: (x-32)/1.8)

    # change the False/True into 0/1
    df_replace_label = df_filled.replace([True,False],[1,0]) # from the label
    df_replace_gender = df_replace_label.replace(['M','F'],[1,0]) #from the gender

    #re-label sepsis 6 hours ahead
    df_final = relabel_positive(df_replace_gender)

    df_final = modify_extra_features(df_final)
    
    # save new patient data
    save_path = '../datasets/MIMICIII/neonates/baseline_all/'
    df_final.to_csv(save_path + p , sep=',', index = False)     

    if np.any(pd.isnull(df_final)): break

100%|██████████| 3432/3432 [01:46<00:00, 32.29it/s]


In [54]:
# check the features in Cinc2019 and MIMIC-III are the same
attris_cinc2019 = list(pd.read_csv('../datasets/Cinc2019/baseline_all/p000001.csv').columns)
print(attris_cinc2019)
attris_mimiciii = list(pd.read_csv('../datasets/MIMICIII/neonates/baseline_all/8.csv').columns)
attris_mimiciii.remove('time')
print(attris_mimiciii)
sorted(attris_cinc2019) == sorted(attris_mimiciii)

['HR', 'SaO2', 'Temp', 'SBP', 'MAP', 'DBP', 'RR', 'BaseExcess', 'HCO3', 'PH', 'BUN', 'Calcium', 'Chloride', 'Creatinine', 'Glucose', 'Lactic', 'Magnesium', 'Potassium', 'PTT', 'WBC', 'Platelet', 'age', 'gender', 'sepsis', 'subject_id']
['sepsis', 'subject_id', 'gender', 'SBP', 'DBP', 'MAP', 'Temp', 'HR', 'RR', 'BaseExcess', 'SaO2', 'PH', 'Calcium', 'Potassium', 'Creatinine', 'Chloride', 'Glucose', 'WBC', 'BUN', 'PTT', 'Platelet', 'Lactic', 'Magnesium', 'HCO3', 'age']


True

### Generate Engineered Data

In [57]:
!mkdir -p ../datasets/MIMICIII/neonates/engineered_all

In [58]:
# impute missing values and add new features for all patients
for p in tqdm(patient_id):
    
    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")

    # drop features that doesn't exist in Cinc2019
    df_dropped = df.drop(['Sodium'], axis = 1)
    
    # impute missing values
    attributes = df_dropped.columns
    df_filled = impute_missing_vals(df_dropped, attributes)

    # change the temperature value
    df_filled['Temp'] = df_filled['Temp'].apply(lambda x: (x-32)/1.8)

    # change the False/True into 0/1
    df_replace_label = df_filled.replace([True,False],[1,0]) # from the label
    df_replace_gender = df_replace_label.replace(['M','F'],[1,0]) #from the gender

    # add new features
    df_new_f = new_features(df_replace_gender)

    #re-label sepsis 6 hours ahead
    df_final = relabel_positive(df_new_f)

    df_final = modify_extra_features(df_final)
    
    # save new patient data
    save_path = '../datasets/MIMICIII/neonates/engineered_all/'
    df_final.to_csv(save_path + p , sep=',', index = False)     

    if np.any(pd.isnull(df_final)): break

100%|██████████| 3432/3432 [08:24<00:00,  6.81it/s]


## Others?

## Process raw data


In [None]:
data_train = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/training/'
data_val = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/validation/'
data_test = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/test/'

Split into folders

In [None]:
for p in train_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df.to_csv(data_train  + p, sep=',', index = False) 

In [None]:
for p in val_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df.to_csv(data_val + p, sep=',', index = False) 

In [None]:
for p in test_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df.to_csv(data_test + p, sep=',', index = False)

Files -> Pickle

In [None]:
#raw data of the whole dataset
all_data = pd.DataFrame([])

for p in patient_id:

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    attributes = df.columns[:-1]
    all_data = pd.concat([all_data, df], ignore_index=False)

filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/all_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data, f)

In [8]:
all_data_sepsis = pd.DataFrame([])

for p in patient_id_sepsis:

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    all_data_sepsis = pd.concat([all_data_sepsis, df], ignore_index=False)

filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/all_data_sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data_sepsis, f)

In [9]:
all_data_non_sepsis = pd.DataFrame([])

for p in patient_id_non_sepsis:

    # read in patient data
    df = pd.read_csv(path_all + p, sep = ",")
    all_data_non_sepsis = pd.concat([all_data_non_sepsis, df], ignore_index=False)

filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/all_data_non_sepsis.pickle'
with open(filename, "wb") as f:
    pickle.dump(all_data_non_sepsis, f)

In [None]:
train_data = pd.DataFrame([])

for p in train_id:

    # read in patient data
    df = pd.read_csv(data_train + p, sep = ",")
    attributes = df.columns[:-1]
    train_data = pd.concat([train_data, df], ignore_index=False)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/train_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(train_data, f)

In [None]:
val_data = pd.DataFrame([])

for p in val_id:

    # read in patient data
    df = pd.read_csv(data_val + p, sep = ",")
    attributes = df.columns[:-1]
    val_data = pd.concat([val_data, df], ignore_index=False)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/val_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(val_data, f)

In [None]:
test_data = pd.DataFrame([])

for p in test_id:

    # read in patient data
    df = pd.read_csv(data_test + p, sep = ",")
    attributes = df.columns[:-1]
    test_data = pd.concat([test_data, df], ignore_index=False)
    
import pickle
filename = '//content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/test_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(test_data, f)

Check the shape of traning, val and test dataset

In [None]:
print(all_data.shape)
print(train_data.shape)
print(val_data.shape)
print(test_data.shape)

(293181, 25)
(204784, 25)
(45036, 25)
(43361, 25)


Pickle -> CSV

In [None]:
#all
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/all_data.pickle'
with open(filename, 'rb') as f:
    all = pickle.load(f)

all.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/all_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/all_data.csv', sep = ',')

In [None]:
#train
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/train_data.pickle'
with open(filename, 'rb') as f:
    train = pickle.load(f)

train.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/train_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_datatrain_data.csv', sep = ',')

In [None]:
#val
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/val_data.pickle'
with open(filename, 'rb') as f:
    val = pickle.load(f)

val.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/val_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_dataval_data.csv', sep = ',')

In [None]:
#raw_test
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/test_data.pickle'
with open(filename, 'rb') as f:
    test = pickle.load(f)

test.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/test_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/raw_data/test_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,Sodium,Potassium,Creatinine,Chloride,Glucose,WBC,BUN,PTT,Platelet,time
0,False,10826,M,,,,97.800003,128.0,60.0,,...,,,,,,,,,,2182-07-18 17:00:00
1,False,10826,M,,,,,128.0,66.0,,...,,,,,,,,,,2182-07-18 18:00:00
2,False,10826,M,,,,98.199997,138.0,54.0,,...,,,,,,,,,,2182-07-18 19:00:00
3,False,10826,M,,,,,139.0,68.0,,...,,,,,,,,,,2182-07-18 20:00:00
4,False,10826,M,,,,,144.0,58.0,,...,,,,,,,,,,2182-07-18 21:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43356,False,31416,F,,,,,148.0,52.0,,...,,,,,,,,,,2175-07-22 10:00:00
43357,False,31416,F,,,,97.800003,150.0,48.0,,...,,,,,,,,,,2175-07-22 11:00:00
43358,False,31416,F,,,,,124.0,59.0,,...,,,,,,,,,,2175-07-22 12:00:00
43359,False,31416,F,,,,,111.0,55.0,,...,,,,,,,,,,2175-07-22 13:00:00


## Helper Functions

Function 1: Mark sepsis label as positive 6 hours before onset

In [None]:
def mark_positive(patient):
  df = patient
  df_len = len(df)
  if df.iloc[df_len-1].sepsis == True:
    for i in range(7):
      df.loc[df_len-1-i,'sepsis'] = True
  return df

Function 2: Fill the missing value

In [None]:
# function to fill missing values
def impute_missing_vals(df, attributes):

    """
    function that imputes missing values.
    
    @param df: dataframe that has missing values to be
               imputed
           attributes: list of String, attributes of dataframe
    @return df_clean: dataframe without missing values

    """
    
    """
    fill missing values by the closest values first
    ffill to fill missing values in the tail
    bfill to fill missing values in the head
    """
    # copy df
    df_clean = df.copy()
    for att in attributes:
        if df_clean[att].isnull().sum() == len(df_clean):
            df_clean[att] = df_clean[att].fillna(0)
        elif df_clean[att].isnull().sum() == len(df_clean) - 1:
            df_clean[att] = df_clean[att].ffill().bfill()
        else:
            df_clean[att] = df_clean[att].interpolate(method='nearest', limit_direction='both')
            df_clean[att] = df_clean[att].ffill().bfill()
    
    return df_clean

Function 3: Generate new feature columns

In [None]:
# function to add new columns containing as mang features as possible
def new_features(df):
  #attributes that are worked on: HR, Resp, Temp

  #copy df
  df_dev = df.copy()

  # three standard deviation for HR -> HR_dev_1, HR_dev_2, HR_dev_3
  hr = df_dev['HR']
  hr_len = len(hr)
  hr_dev_1 = hr.copy()
  hr_dev_2 = hr.copy()
  hr_dev_3 = hr.copy()
  for i, j in hr_dev_1.iteritems():
    if i > hr_len - 5:
      hr_dev_1[i] = hr[i:hr_len].std()
    else:
      hr_dev_1[i] = hr[i:i+5].std()
  hr_dev_1[hr_len-1] = hr_dev_1[hr_len-2]
  df_dev['HR_dev_1'] = hr_dev_1

  for i, j in hr_dev_2.iteritems():
    if i > hr_len - 10:
      hr_dev_2[i] = hr[i:hr_len].std()
    else:
      hr_dev_2[i] = hr[i:i+10].std()
  hr_dev_2[hr_len-1] = hr_dev_2[hr_len-2]
  df_dev['HR_dev_2'] = hr_dev_2

  for i, j in hr_dev_3.iteritems():
    if i > hr_len - 20:
      hr_dev_3[i] = hr[i:hr_len].std()
    else:
      hr_dev_3[i] = hr[i:i+20].std()
  hr_dev_3[hr_len-1] = hr_dev_3[hr_len-2]
  df_dev['HR_dev_3'] = hr_dev_3


  # three standard deviation for Resp -> Resp_dev_1, Resp_dev_2, Resp_dev_3
  rr = df_dev['RR']
  rr_len = len(rr)
  rr_dev_1 = rr.copy()
  rr_dev_2 = rr.copy()
  rr_dev_3 = rr.copy()
  for i, j in rr_dev_1.iteritems():
    if i < rr_len -5:
      rr_dev_1[i] = rr[i:rr_len].std()
    else:
      rr_dev_1[i] = rr[i:i+5].std()
  rr_dev_1[rr_len-1] = rr_dev_1[rr_len-2]
  df_dev['RR_dev_1'] = rr_dev_1

  for i, j in rr_dev_2.iteritems():
    if i < rr_len -10:
      rr_dev_2[i] = rr[i:rr_len].std()
    else:
      rr_dev_2[i] = rr[i:i+10].std()
  rr_dev_2[rr_len-1] = rr_dev_2[rr_len-2]
  df_dev['RR_dev_2'] = rr_dev_2

  for i, j in rr_dev_3.iteritems():
    if i < rr_len -20:
      rr_dev_3[i] = rr[i:rr_len].std()
    else:
      rr_dev_3[i] = rr[i:i+20].std()
  rr_dev_3[rr_len-1] = rr_dev_3[rr_len-2]
  df_dev['RR_dev_3'] = rr_dev_3

  # three standard deviation for Temp -> Temp_dev_1, Temp_dev_2, Temp_dev_3
  t = df_dev['Temp']
  t_len = len(t)
  t_dev_1 = t.copy()
  t_dev_2 = t.copy()
  t_dev_3 = t.copy()
  for i, j in t_dev_1.iteritems():
    if i < t_len - 5:
      t_dev_1[i] = t[i:t_len].std()
    else:
      t_dev_1[i] = t[i:i+5].std()
  t_dev_1[t_len-1] = t_dev_1[t_len-2]
  df_dev['Temp_dev_1'] = t_dev_1

  for i, j in t_dev_2.iteritems():
    if i < t_len - 10:
      t_dev_2[i] = t[i:t_len].std()
    else:
      t_dev_2[i] = t[i:i+10].std()
  t_dev_2[t_len-1] = t_dev_2[t_len-2]
  df_dev['Temp_dev_2'] = t_dev_2

  for i, j in t_dev_3.iteritems():
    if i < t_len - 20:
      t_dev_3[i] = t[i:t_len].std()
    else:
      t_dev_3[i] = t[i:i+20].std()
  t_dev_3[t_len-1] = t_dev_3[t_len-2]
  df_dev['Temp_dev_3'] = t_dev_3

  # features for indicating bradycardia and tachycardia
  brady = hr.copy()
  tachy = hr.copy()
  for i,j in brady.iteritems():
    if hr[i] < 100:
      brady[i] = True
    else:
      brady[i] = False
    if hr[i] > 220:
      tachy[i] = True
    else:
      tachy[i] = False
  df_dev['Bradycardia'] = brady
  df_dev['Tachycardia'] = tachy

  # features for indicating Hypothermia, Hyperthermia(Fever) and Hyperpyrexia
  hypothermia = t.copy()
  fever = t.copy()
  hyperpyrexia = t.copy()
  for i,j in t.iteritems():
    if t[i] < 97.7:
      hypothermia[i] = True
    else:
      hypothermia[i] = False
    if t[i] >100.4:
      fever[i] = True
    else:
      fever[i] = False
    if t[i] > 104:
      hyperpyrexia[i] = True
    else:
      hyperpyrexia[i] = False
  df_dev['Hypothermia'] = hypothermia
  df_dev['Fever'] = fever
  df_dev['Hyperpyrexia'] = hyperpyrexia

  return df_dev


## Process baseline data


raw data can not be used for model training due to the null in the data;

baseline data are just raw data with the blank filled

In [None]:
data_train = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/training/'
data_val = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/validation/'
data_test = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/test/'

Split into folders

In [None]:
for p in train_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df = df.drop(['time'], axis=1)  #drop time
    df = mark_positive(df)  # 6 hrs ahead
    attributes = df.columns
    df = impute_missing_vals(df,attributes) # impute missing values
    df.to_csv(data_train  + p, sep=',', index = False) 

In [None]:
for p in val_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df = df.drop(['time'], axis=1)  #drop time
    df = mark_positive(df)  # 6 hrs ahead
    attributes = df.columns
    df = impute_missing_vals(df,attributes) # impute missing values
    df.to_csv(data_val + p, sep=',', index = False) 

In [None]:
for p in test_id:
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    df = df.drop(['time'], axis=1)  #drop time
    df = mark_positive(df)  # 6 hrs ahead
    attributes = df.columns
    df = impute_missing_vals(df,attributes) # impute missing values
    df.to_csv(data_test + p, sep=',', index = False)

Files -> Pickle

In [None]:
train_data = pd.DataFrame([])

for p in train_id:

    # read in patient data
    df = pd.read_csv(data_train + p, sep = ",")
    attributes = df.columns[:-1]
    train_data = pd.concat([train_data, df], ignore_index=False)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/train_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(train_data, f)

In [None]:
val_data = pd.DataFrame([])

for p in val_id:

    # read in patient data
    df = pd.read_csv(data_val + p, sep = ",")
    attributes = df.columns[:-1]
    val_data = pd.concat([val_data, df], ignore_index=False)
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/val_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(val_data, f)

In [None]:
test_data = pd.DataFrame([])

for p in test_id:

    # read in patient data
    df = pd.read_csv(data_test + p, sep = ",")
    attributes = df.columns[:-1]
    test_data = pd.concat([test_data, df], ignore_index=False)
    
import pickle
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/test_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(test_data, f)

Check the shape of traning, val and test dataset

In [None]:
print(train_data.shape)
print(val_data.shape)
print(test_data.shape)

(205720, 24)
(44087, 24)
(43374, 24)


Pickle -> CSV

In [None]:
#train
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/train_data.pickle'
with open(filename, 'rb') as f:
    train = pickle.load(f)

train.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/train_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/train_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,Calcium,Sodium,Potassium,Creatinine,Chloride,Glucose,WBC,BUN,PTT,Platelet
0,False,3146,M,57.0,32.0,39.0,98.599998,152.0,40.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.5,0.0,0.0,515.0
1,False,3146,M,57.0,32.0,39.0,98.599998,165.0,28.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.5,0.0,0.0,515.0
2,False,3146,M,57.0,32.0,39.0,98.599998,140.0,40.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.5,0.0,0.0,515.0
3,False,3146,M,57.0,32.0,39.0,98.599998,141.0,34.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.5,0.0,0.0,515.0
4,False,3146,M,57.0,32.0,39.0,98.599998,151.0,34.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.5,0.0,0.0,515.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205715,False,3624,M,74.0,42.0,54.0,98.900002,160.0,54.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,223.0
205716,False,3624,M,74.0,42.0,54.0,98.900002,140.0,52.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,223.0
205717,False,3624,M,74.0,42.0,54.0,98.900002,129.0,47.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,223.0
205718,False,3624,M,74.0,42.0,54.0,98.900002,141.0,48.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,223.0


In [None]:
#val
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/val_data.pickle'
with open(filename, 'rb') as f:
    val = pickle.load(f)

val.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/val_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/val_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,Calcium,Sodium,Potassium,Creatinine,Chloride,Glucose,WBC,BUN,PTT,Platelet
0,False,13786,M,70.0,37.0,49.0,98.400002,137.0,32.0,0.0,...,0.0,137.0,5.0,0.0,104.0,0.0,46.5,0.0,0.0,505.0
1,False,13786,M,68.0,39.0,49.0,98.400002,131.0,26.0,0.0,...,0.0,137.0,5.0,0.0,104.0,0.0,46.5,0.0,0.0,505.0
2,False,13786,M,76.0,61.0,66.0,98.400002,134.0,49.0,0.0,...,0.0,137.0,5.0,0.0,104.0,0.0,46.5,0.0,0.0,505.0
3,False,13786,M,77.0,40.0,52.0,98.400002,138.0,26.0,0.0,...,0.0,137.0,5.0,0.0,104.0,0.0,46.5,0.0,0.0,505.0
4,False,13786,M,77.0,36.0,52.0,99.199997,147.0,41.0,0.0,...,0.0,137.0,5.0,0.0,104.0,0.0,46.5,0.0,0.0,505.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44082,False,12069,M,60.0,33.0,43.0,98.800003,139.0,46.0,-2.0,...,0.0,140.0,4.9,0.0,108.0,0.0,7.4,0.0,0.0,181.0
44083,False,12069,M,60.0,33.0,43.0,98.800003,137.0,48.0,-2.0,...,0.0,140.0,4.9,0.0,108.0,0.0,7.4,0.0,0.0,181.0
44084,False,12069,M,60.0,33.0,43.0,98.599998,154.0,26.0,-2.0,...,0.0,140.0,4.9,0.0,108.0,0.0,7.4,0.0,0.0,181.0
44085,False,12069,M,60.0,33.0,43.0,98.599998,135.0,30.0,-2.0,...,0.0,140.0,4.9,0.0,108.0,0.0,7.4,0.0,0.0,181.0


In [None]:
#raw_test
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/test_data.pickle'
with open(filename, 'rb') as f:
    test = pickle.load(f)

test.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/test_data.csv', sep = ',', index = False)

In [None]:
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/baseline_data/test_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,Calcium,Sodium,Potassium,Creatinine,Chloride,Glucose,WBC,BUN,PTT,Platelet
0,False,24440,F,47.0,22.0,30.0,99.800003,134.0,18.0,0.0,...,0.0,146.0,4.5,0.0,114.0,0.0,0.0,0.0,0.0,0.0
1,False,24440,F,47.0,22.0,30.0,99.800003,141.0,18.0,0.0,...,0.0,146.0,4.5,0.0,114.0,0.0,0.0,0.0,0.0,0.0
2,False,24440,F,53.0,22.0,32.0,99.800003,137.0,18.0,0.0,...,0.0,146.0,4.5,0.0,114.0,0.0,0.0,0.0,0.0,0.0
3,False,24440,F,53.0,33.0,40.0,99.800003,127.0,25.0,0.0,...,0.0,146.0,4.5,0.0,114.0,0.0,0.0,0.0,0.0,0.0
4,False,24440,F,53.0,33.0,40.0,99.800003,128.0,56.0,0.0,...,0.0,146.0,4.5,0.0,114.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43369,False,21203,M,72.0,35.0,48.0,98.199997,124.0,36.0,0.0,...,0.0,142.0,4.2,0.0,108.0,0.0,15.7,0.0,0.0,373.0
43370,False,21203,M,72.0,35.0,48.0,98.199997,132.0,56.0,0.0,...,0.0,142.0,4.2,0.0,108.0,0.0,15.7,0.0,0.0,373.0
43371,False,21203,M,72.0,35.0,48.0,98.400002,124.0,60.0,0.0,...,0.0,142.0,4.2,0.0,108.0,0.0,15.7,0.0,0.0,373.0
43372,False,21203,M,72.0,35.0,48.0,98.400002,126.0,44.0,0.0,...,0.0,142.0,4.2,0.0,108.0,0.0,15.7,0.0,0.0,373.0


##Process feature-engineered data

Impute missing data and add ebngineered features

In [None]:
! touch /content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/

touch: setting times of '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/': No such file or directory


In [None]:
# impute missing values and create clean dfs for all patients
for p in patient_id:
    
    # read in patient data
    df = pd.read_csv(path_all + '/' + p, sep = ",")
    attributes = df.columns
    
    # impute missing values
    df_filled = impute_missing_vals(df, attributes)

    # add features of deviation
    df_engineered = new_features(df_filled)
    
    # drop unit1 and unit2 with half missing values
    # because these two features have few information
    # drop EtCO2 with all missing values
    df_clean = df_engineered.drop(['PTT','Calcium','time'], axis=1)
    
    # save new patient data
    if p in train_id:
        save_path = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/training/'
        df_clean.to_csv(save_path + p, sep=',', index = False)        
    
    elif p in val_id:
        save_path = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/validation/'
        df_clean.to_csv(save_path + p, sep=',', index = False)        
    
    else:
        
        save_path = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/test/'
        df_clean.to_csv(save_path + p, sep=',', index = False)
    
    print(p)

10.csv
10005.csv
10008.csv
10024.csv
10054.csv
10055.csv
10062.csv
10066.csv
10079.csv
10081.csv
10082.csv
10100.csv
10109.csv
10113.csv
10125.csv
10135.csv
10141.csv
10156.csv
10166.csv
10169.csv
10176.csv
10191.csv
10192.csv
102.csv
10201.csv
10218.csv
10219.csv
1022.csv
10221.csv
10228.csv
10240.csv
10249.csv
1025.csv
10252.csv
10253.csv
10259.csv
10260.csv
10263.csv
10267.csv
10271.csv
10273.csv
10282.csv
10285.csv
10291.csv
10296.csv
10318.csv
10336.csv
10343.csv
10354.csv
10365.csv
10367.csv
10372.csv
10378.csv
10393.csv
10396.csv
104.csv
10405.csv
10438.csv
10441.csv
10450.csv
10472.csv
10476.csv
10486.csv
10488.csv
10507.csv
10516.csv
10526.csv
10528.csv
10533.csv
10544.csv
10548.csv
1055.csv
10553.csv
10554.csv
10566.csv
10575.csv
10576.csv
10582.csv
10583.csv
10587.csv
10590.csv
10596.csv
1065.csv
10688.csv
10699.csv
10703.csv
1071.csv
10722.csv
10724.csv
10730.csv
1074.csv
10740.csv
10754.csv
10756.csv
10777.csv
10781.csv
10798.csv
1080.csv
10824.csv
10826.csv
10828.csv
1083

Files -> Pickle

In [None]:
engineered_train_data = pd.DataFrame([])

for p in train_id:
    df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/training/' + p, sep = ",")
    engineered_train_data = pd.concat([engineered_train_data, df], ignore_index=False)  #True or False
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/train_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(engineered_train_data, f)

In [None]:
engineered_val_data = pd.DataFrame([])

for p in val_id:
    df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/validation/' + p, sep = ",")
    engineered_val_data = pd.concat([engineered_val_data, df], ignore_index=False)  #True or False
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/val_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(engineered_val_data, f)

In [None]:
engineered_test_data = pd.DataFrame([])

for p in test_id:
    df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/test/' + p, sep = ",")
    engineered_test_data = pd.concat([engineered_test_data, df], ignore_index=False)  #True or False
    
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/test_data.pickle'
with open(filename, "wb") as f:
    pickle.dump(engineered_test_data, f)

In [None]:
print(engineered_train_data.shape)
print(engineered_val_data.shape)
print(engineered_test_data.shape)

(205720, 36)
(44087, 36)
(43374, 36)


Pickle -> CSV

In [None]:
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/train_data.pickle'
with open(filename, 'rb') as f:
    engineered_train = pickle.load(f)

engineered_train.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/train_data.csv', sep = ',', index = False)
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/train_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,RR_dev_2,RR_dev_3,Temp_dev_1,Temp_dev_2,Temp_dev_3,Bradycardia,Tachycardia,Hypothermia,Fever,Hyperpyrexia
0,False,3146,M,57.0,32.0,39.0,98.599998,152.0,40.0,0.0,...,9.113693,9.113693,0.711066,0.711066,0.711066,False,False,False,False,False
1,False,3146,M,57.0,32.0,39.0,98.599998,165.0,28.0,0.0,...,9.155801,9.155801,0.714164,0.714164,0.714164,False,False,False,False,False
2,False,3146,M,57.0,32.0,39.0,98.599998,140.0,40.0,0.0,...,9.081572,9.081572,0.717302,0.717302,0.717302,False,False,False,False,False
3,False,3146,M,57.0,32.0,39.0,98.599998,141.0,34.0,0.0,...,9.123830,9.123830,0.720480,0.720480,0.720480,False,False,False,False,False
4,False,3146,M,57.0,32.0,39.0,98.599998,151.0,34.0,0.0,...,9.126521,9.126521,0.723699,0.723699,0.723699,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205715,False,3624,M,74.0,42.0,54.0,98.900002,160.0,54.0,0.0,...,2.966479,2.966479,0.000000,0.000000,0.000000,False,False,False,False,False
205716,False,3624,M,74.0,42.0,54.0,98.900002,140.0,52.0,0.0,...,2.629956,2.629956,0.000000,0.000000,0.000000,False,False,False,False,False
205717,False,3624,M,74.0,42.0,54.0,98.900002,129.0,47.0,0.0,...,2.645751,2.645751,0.000000,0.000000,0.000000,False,False,False,False,False
205718,False,3624,M,74.0,42.0,54.0,98.900002,141.0,48.0,0.0,...,2.828427,2.828427,0.000000,0.000000,0.000000,False,False,False,False,False


In [None]:
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/val_data.pickle'
with open(filename, 'rb') as f:
    engineered_val = pickle.load(f)

engineered_val.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/val_data.csv', sep = ',', index = False)
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/val_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,RR_dev_2,RR_dev_3,Temp_dev_1,Temp_dev_2,Temp_dev_3,Bradycardia,Tachycardia,Hypothermia,Fever,Hyperpyrexia
0,False,13786,M,70.0,37.0,49.0,98.400002,137.0,32.0,0.0,...,17.894531,17.894531,0.722172,0.722172,0.722172,False,False,False,False,False
1,False,13786,M,68.0,39.0,49.0,98.400002,131.0,26.0,0.0,...,17.970020,17.970020,0.725231,0.725231,0.725231,False,False,False,False,False
2,False,13786,M,76.0,61.0,66.0,98.400002,134.0,49.0,0.0,...,18.011231,18.011231,0.728327,0.728327,0.728327,False,False,False,False,False
3,False,13786,M,77.0,40.0,52.0,98.400002,138.0,26.0,0.0,...,18.077274,18.077274,0.731461,0.731461,0.731461,False,False,False,False,False
4,False,13786,M,77.0,36.0,52.0,99.199997,147.0,41.0,0.0,...,18.119917,18.119917,0.734633,0.734633,0.734633,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44082,False,12069,M,60.0,33.0,43.0,98.800003,139.0,46.0,-2.0,...,19.728152,19.728152,0.109547,0.109547,0.109547,False,False,False,False,False
44083,False,12069,M,60.0,33.0,43.0,98.800003,137.0,48.0,-2.0,...,22.774254,22.774254,0.100002,0.100002,0.100002,False,False,False,False,False
44084,False,12069,M,60.0,33.0,43.0,98.599998,154.0,26.0,-2.0,...,27.784888,27.784888,0.000000,0.000000,0.000000,False,False,False,False,False
44085,False,12069,M,60.0,33.0,43.0,98.599998,135.0,30.0,-2.0,...,32.526912,32.526912,0.000000,0.000000,0.000000,False,False,False,False,False


In [None]:
filename = '/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/test_data.pickle'
with open(filename, 'rb') as f:
    engineered_test = pickle.load(f)

engineered_test.to_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/test_data.csv', sep = ',', index = False)
pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/MIMIC/models_neoantes/engineered_data/test_data.csv', sep = ',')

Unnamed: 0,sepsis,subject_id,gender,SBP,DBP,MAP,Temp,HR,RR,BaseExcess,...,RR_dev_2,RR_dev_3,Temp_dev_1,Temp_dev_2,Temp_dev_3,Bradycardia,Tachycardia,Hypothermia,Fever,Hyperpyrexia
0,False,24440,F,47.0,22.0,30.0,99.800003,134.0,18.0,0.0,...,20.499515,20.499515,0.517945,0.517945,0.517945,False,False,False,False,False
1,False,24440,F,47.0,22.0,30.0,99.800003,141.0,18.0,0.0,...,20.277579,20.277579,0.501790,0.501790,0.501790,False,False,False,False,False
2,False,24440,F,53.0,22.0,32.0,99.800003,137.0,18.0,0.0,...,20.041814,20.041814,0.484346,0.484346,0.484346,False,False,False,False,False
3,False,24440,F,53.0,33.0,40.0,99.800003,127.0,25.0,0.0,...,19.791157,19.791157,0.465431,0.465431,0.465431,False,False,False,False,False
4,False,24440,F,53.0,33.0,40.0,99.800003,128.0,56.0,0.0,...,19.651184,19.651184,0.444818,0.444818,0.444818,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43369,False,21203,M,72.0,35.0,48.0,98.199997,124.0,36.0,0.0,...,9.633276,9.633276,0.109547,0.109547,0.109547,False,False,False,False,False
43370,False,21203,M,72.0,35.0,48.0,98.199997,132.0,56.0,0.0,...,6.831301,6.831301,0.100002,0.100002,0.100002,False,False,False,False,False
43371,False,21203,M,72.0,35.0,48.0,98.400002,124.0,60.0,0.0,...,8.000000,8.000000,0.000000,0.000000,0.000000,False,False,False,False,False
43372,False,21203,M,72.0,35.0,48.0,98.400002,126.0,44.0,0.0,...,5.656854,5.656854,0.000000,0.000000,0.000000,False,False,False,False,False
