In [0]:
# Mount Google Drive when the notebook runs on Colab. The google.colab package
# only exists inside Colab, so on any other kernel the mount is skipped instead
# of aborting the whole notebook with an ImportError.
try:
  from google.colab import drive
except ImportError:
  drive = None

if drive is not None:
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
import pandas as pd
import numpy as np
from itertools import groupby
from statistics import median,mean,stdev
from scipy import stats as s
from warnings import simplefilter
import math
from warnings import simplefilter
simplefilter(action='ignore')

Reading and Preparing Input from Project_Data

Parameters raw_to_summary:
1.   input_dir

> Specify the input directory, which contains subdirectories for each Fold. The Fold1_Outcomes.csv,Fold2_Outcomes.csv,Fold3_Outcomes.csv,Fold4_Outcomes.csv are assumed to be under the input directory. Same structure as given in Project_Data.zip.

2.   output_dir

> Output directory where the processed data is stored as CSV files.

Note: Please make sure that *output_dir* is not a sub folder inside *input_dir*.

3. summary_type


> The type of summarization applied to the raw data. The values used in this notebook are 'mean', 'mode' and 'stddev' (standard deviation).



One file is produced for each summary_type. Each file contains the summarized values of all patients. The records in each CSV are ordered by fold. The fold outcome CSV files are the Imputed Outcome Files provided in luminus.



In [0]:
### Helpers and entry point for summarizing the raw per-patient measurement files
def _read_patient_file(path, attr_list):
  """Parse one raw patient file into a dict of attribute -> list of value strings.

  The first line of the file is skipped as a header. Every following line is
  split on commas as time,parameter,value. Lines with fewer than three
  fields, or whose parameter is not in attr_list, are ignored.
  """
  measurements = {attr: [] for attr in attr_list}
  # The context manager closes the handle even if parsing raises.
  with open(path, 'r') as handle:
    next(handle, None)  # skip the header line
    for line in handle:
      fields = line.rstrip().split(',')
      if len(fields) < 3 or fields[1] not in measurements:
        continue
      measurements[fields[1]].append(fields[2])
  return measurements


def _summarize(values, summary_type):
  """Collapse the list of value strings of one feature into a single summary.

  Returns 'NA' when there are no values and the value itself (unchanged, as a
  string) when there is exactly one. With several values they are converted
  to float and reduced according to summary_type.
  """
  if not values:
    return 'NA'
  # NOTE(review): a feature with exactly one measurement keeps that raw value
  # for every summary_type, including 'stddev' - confirm this is intended for
  # the time-series features.
  if len(values) == 1:
    return values[0]
  numbers = [float(v) for v in values]
  if 'stddev' in summary_type:
    return stdev(numbers)
  if 'mean' in summary_type:
    return sum(numbers) / len(numbers)
  if 'mode' in summary_type:
    # scipy returns a single mode even when several values tie; atleast_1d
    # copes with both the scalar and the one-element-array return forms.
    return float(np.atleast_1d(s.mode(numbers)[0])[0])
  if 'median' in summary_type:
    return median(numbers)
  raise ValueError('Unsupported summary_type: %r' % (summary_type,))


def raw_to_summary(input_dir,output_dir,summary_type):
  """Summarize every raw patient file under input_dir into one CSV.

  Parameters
  ----------
  input_dir : str
      Root folder that is walked recursively for patient files ending in
      '.txt'. It must also directly contain Fold1_Outcomes.csv ...
      Fold4_Outcomes.csv.
  output_dir : str
      Folder that receives 'summary_<summary_type>.csv'; it is created when
      missing.
  summary_type : str
      Must contain one of 'stddev', 'mean', 'mode' or 'median'; they are
      checked in that order.

  Raises
  ------
  ValueError
      If summary_type names none of the supported statistics.
  """
  attr_list = [ "RecordID", "Age", "Gender", "Height", "ICUType", "Weight",
                "Albumin", "ALP", "ALT", "AST", "Bilirubin", "BUN", "Cholesterol",
                "Creatinine", "DiasABP", "FiO2", "GCS", "Glucose", "HCO3", "HCT",
                "HR", "K", "Lactate", "Mg", "MAP", "MechVent", "Na", "NIDiasABP",
                "NIMAP", "NISysABP", "PaCO2", "PaO2", "pH", "Platelets",
                "RespRate", "SaO2", "SysABP", "Temp", "TroponinI", "TroponinT",
                "Urine", "WBC"]

  # Fail before reading any file instead of writing raw Python lists to the CSV.
  supported = ('stddev', 'mean', 'mode', 'median')
  if not any(kind in summary_type for kind in supported):
    raise ValueError('Unsupported summary_type %r; expected one of %s'
                     % (summary_type, ', '.join(supported)))

  dir_path = input_dir

  # Folders are visited in sorted order and the files inside each folder in
  # sorted order, so patients come out fold by fold, ordered by file name.
  records = []
  for root, dirs, files in sorted(os.walk(dir_path, topdown=False), key=lambda entry: entry[0]):
    for name in sorted(files):
      # Only files with the .txt extension are patient files
      if not name.endswith('.txt'):
        continue
      measurements = _read_patient_file(os.path.join(root, name), attr_list)
      records.append({attr: _summarize(measurements[attr], summary_type)
                      for attr in attr_list})

  ## One row per patient, one column per feature; built once from the records
  my_df = pd.DataFrame(records, columns=attr_list)

  # Each fold's outcomes are sorted by RecordID and stacked in fold order so
  # that row i of the outcomes lines up with row i of the patients frame.
  # Column assignment aligns on the index, so the labels are matched by that
  # position; no further global sort is needed (it would not change the pairing).
  fold_outcomes = [
      pd.read_csv(os.path.join(dir_path, 'Fold%d_Outcomes.csv' % fold)).sort_values(by=['RecordID'])
      for fold in range(1, 5)
  ]
  result = pd.concat(fold_outcomes, ignore_index=True)
  my_df['Length_of_stay'] = result['Length_of_stay']
  my_df['In-hospital_death'] = result['In-hospital_death']

  os.makedirs(output_dir, exist_ok=True)
  fullname = os.path.join(output_dir,'summary_'+summary_type+'.csv')
  my_df.to_csv(fullname,index=False)

Modify the input_dir or output_dir variable for respective file paths.

Code to generate summarized files of each summary_type.


In [0]:
# Summary statistics to export, keyed by their run order.
sum_dict = dict(enumerate(('mean', 'mode', 'stddev')))

# Please enter the output and input folder paths here.
input_dir = './Project_Data/'
output_dir = './Prep_Data/'

# Write one summary CSV per statistic, in key order.
for k in sorted(sum_dict):
  v = sum_dict[k]
  raw_to_summary(input_dir,output_dir,v)

This code creates the design matrix: the mean summary plus additional columns holding the standard deviation of selected features.

In [0]:
# Build the design matrix: the mean summary plus the standard deviation of
# selected features as extra "<feature>_std" columns.

# The following columns are chosen based on Exploratory Data Analysis
std_features = ['DiasABP', 'FiO2', 'GCS', 'HR', 'MAP','NIDiasABP','NIMAP','SysABP','Temp','Urine', 'Weight']

df = pd.read_csv(output_dir+'summary_stddev.csv')
my_df = pd.read_csv(output_dir+'summary_mean.csv')

# Keep only the id and the chosen columns, ordered by RecordID. The column
# assignments below align on the row index, so this ordering does not change
# which rows are paired.
df = df[['RecordID'] + std_features].sort_values(by=['RecordID'])

# Design matrix is summary_mean along with stddev of some timeseries columns
for feature in std_features:
  my_df[feature + '_std'] = df[feature]

fullname = os.path.join(output_dir,'combined_mean_std.csv')
my_df.to_csv(fullname,index=False)