# Post processing the data after generation
## Import libraries and mount Google drive


In [1]:
import os
from datetime import datetime
from datetime import timedelta
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

In [2]:
from google.colab import drive

# Mount Google Drive so that the result folders used below are reachable.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


## Read data path

In [3]:
# Folders holding one CSV per patient (change the paths here).
patients_sepsis = '/content/drive/MyDrive/MIMIC/Results/adults/sepsis/'
patients_non_sepsis = '/content/drive/MyDrive/MIMIC/Results/adults/non_sepsis/'

# File names found in each folder; the processing loops iterate over these.
sepsis_dir = os.listdir(patients_sepsis)
non_sepsis_dir = os.listdir(patients_non_sepsis)

In [None]:
# test: load a single neonate table and display it
neonate_csv = '/content/drive/MyDrive/MIMIC/Results/neonates/sepsis/10055.csv'
a_neonate = pd.read_csv(neonate_csv)
a_neonate

In [4]:
# test: load a single adult table and display it
adult_csv = '/content/drive/MyDrive/MIMIC/Results/adults/sepsis/1000.csv'
an_adult = pd.read_csv(adult_csv)
an_adult

Unnamed: 0,sepsis,subject_id,gender,age,SBP,DBP,MAP,Temp,HR,RR,...,Potassium,HCO3,Creatinine,Chloride,Glucose,WBC,BUN,PTT,Platelet,time
0,False,1000,M,70.0,,,,97.500000,85.0,21.0,...,,,,,,,,,,2144-01-20 13:15:00
1,False,1000,M,70.0,,,,97.399943,81.0,16.0,...,,,,,,,,,,2144-01-20 14:00:00
2,False,1000,M,70.0,,,,97.300003,89.0,21.0,...,,,,,,,,,,2144-01-20 14:30:00
3,False,1000,M,70.0,,,,96.400043,86.0,16.0,...,,,,,,,,,,2144-01-20 15:00:00
4,False,1000,M,70.0,,,,,82.0,20.0,...,,,,,,,,,,2144-01-20 15:15:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064,False,1000,M,70.0,,,,,59.0,20.0,...,,,,,,,,,,2144-02-25 16:45:00
1065,False,1000,M,70.0,,,,,60.0,24.0,...,,,,,,,,,,2144-02-25 16:50:00
1066,False,1000,M,70.0,,,,96.800000,70.0,18.0,...,,,,,,,,,,2144-02-25 17:00:00
1067,False,1000,M,70.0,,,,,78.0,24.0,...,,,,,,,,,,2144-02-25 17:15:00


## Helper functions

In [5]:
def rounder_nearest(t):
    """Round a datetime to the nearest full hour.

    Minutes >= 30 round up to the next hour, anything below rounds down.

    Args:
        t: a ``datetime`` (or compatible object offering ``minute``,
            ``replace`` and ``timedelta`` addition).

    Returns:
        A new object of the same kind with minute, second and microsecond
        set to zero.
    """
    floored = t.replace(minute=0, second=0, microsecond=0)
    if t.minute >= 30:
        # Add an hour with timedelta instead of replace(hour=t.hour + 1):
        # the latter raises ValueError for 23:30-23:59, whereas this rolls
        # over into the next day (and month/year) correctly.
        return floored + timedelta(hours=1)
    return floored

def rounder_forward(t):
    """Truncate ``t`` to the start of its current hour.

    Minute, second and microsecond are zeroed; the date and the hour are
    left untouched.
    """
    zeroed_fields = {"minute": 0, "second": 0, "microsecond": 0}
    return t.replace(**zeroed_fields)

In [6]:
def merge_values(value_1, value_2):
  """Merge one field of an hourly row with the same field of a later record.

  Args:
    value_1: value currently stored in the hourly row.
    value_2: value of the record being folded into that row.

  Returns:
    * boolean flags (``bool`` or ``numpy.bool_``): logical OR of both values;
    * strings and timestamps: ``value_1`` (the row keeps its own label,
      e.g. the time point of the hour);
    * anything else: the later value ``value_2`` unless it is missing, in
      which case ``value_1``; ``np.nan`` when both are missing.
  """
  second_missing = pd.isnull(value_2)

  # numpy.bool_ must be accepted as well: `False or np.True_` already yields a
  # numpy bool, so an exact `type(...) == bool` test stops matching after the
  # first merge and the flag would no longer be OR-ed.
  if isinstance(value_1, (bool, np.bool_)):
    if second_missing:
      return value_1  # a missing flag must not overwrite a known one
    return value_1 or value_2

  first_missing = pd.isnull(value_1)

  # always keep the row's own label; the hour stamp is a datetime, not a str
  if not first_missing and isinstance(value_1, (str, datetime)):
    return value_1

  if first_missing and second_missing:
    return np.nan
  if first_missing:
    return value_2
  if second_missing:
    return value_1
  return value_2  # return the later value if both are present

In [7]:
# Column names that merge_rows iterates over, taken from one of the sample
# tables loaded in the test cells above. The commented-out line is the neonate
# counterpart (presumably to be swapped in when processing neonates).
# columns_info = a_neonate.columns
columns_info = an_adult.columns

def merge_rows(row_1, row_2):
  """Fold ``row_2`` into ``row_1`` field by field and return ``row_1``.

  Every column listed in the module-level ``columns_info`` is combined with
  ``merge_values``; ``row_1`` is updated in place.
  """
  for name in columns_info:
    merged = merge_values(row_1[name], row_2[name])
    row_1[name] = merged
  return row_1



## This block processes the neonates/adults with sepsis


1.   discard the patient when the length of stay (LOS) before sepsis onset is < 6 hours
2.   discard the patient when there is no recording within the 6 hours before sepsis onset
3.   the maximum recording window is limited to 100 hours
4.   missing hours are filled with an empty row containing only the static info



In [None]:
# Build one hourly table per sepsis patient, covering at most the 100 hours
# that precede (and include) the sepsis onset, and write it to the
# processed_sepsis folder. Patients that fail a quality check are skipped and
# counted in type_1 / type_2.
step = 0  # for indicating processing step
type_1 = 0  # neonates/adults with less than 6 hours los before sepsis onset
type_2 = 0  # neonates/adults with no recordings within the 6 hours before sepsis onset
type_3 = 0  # neonates/adults with successful generation of table

FMT = '%Y-%m-%d %H:%M:%S'  # timestamp format of the 'time' column
one_hour = timedelta(hours = 1)

for csv_file in sepsis_dir:
  patient = pd.read_csv(patients_sepsis+csv_file)

  step +=1
  print("processing {}/{}, subject_id = {}".format(step,len(sepsis_dir),patient.subject_id[0]))

  # empty_row = {'subject_id':patient.subject_id[0], 'gender':patient.gender[0],
  #              'sepsis':False, 'SBP':np.nan, 'DBP':np.nan, 'MAP':np.nan, 'Temp':np.nan, 'HR':np.nan,
  #              'RR':np.nan, 'BaseExcess':np.nan, 'FiO2':np.nan, 'SaO2':np.nan, 'PCO2':np.nan, 
  #              'PH':np.nan, 'Calcium':np.nan, 'Sodium':np.nan, 'Potassium':np.nan, 'Creatinine':np.nan, 
  #              'Chloride':np.nan, 'Glucose':np.nan, 'WBC':np.nan, 'BUN':np.nan, 'PTT':np.nan,
  #              'Platelet':np.nan, 'time':np.nan}  # for neonates

  # template for one hourly row: static info filled in, every measurement missing
  empty_row = {'subject_id':patient.subject_id[0], 'gender':patient.gender[0], 'age':patient.age[0],
               'sepsis':False, 'SBP':np.nan, 'DBP':np.nan, 'MAP':np.nan, 'Temp':np.nan, 'HR':np.nan,
               'RR':np.nan, 'BaseExcess':np.nan, 'SaO2':np.nan, 'PH':np.nan, 'Magnesium':np.nan,
               'Lactic':np.nan, 'Calcium':np.nan, 'Sodium':np.nan, 'Potassium':np.nan, 'HCO3':np.nan,
               'Creatinine':np.nan, 'Chloride':np.nan, 'Glucose':np.nan, 'WBC':np.nan, 'BUN':np.nan, 
               'PTT':np.nan, 'Platelet':np.nan, 'time':np.nan}  # for adults

  time_points = len(patient)
  start_time = patient.time[0]
  end_time = patient.time[time_points-1]
  onset_index = patient.index[patient['sepsis']][0]  # first row flagged as sepsis
  onset_time = patient.time[onset_index]
  start = datetime.strptime(start_time, FMT)
  onset = datetime.strptime(onset_time, FMT)
  los = ( datetime.strptime(end_time, FMT) - start ).total_seconds()/3600 #in hours
  los_before_sepsis = ( onset - start ).total_seconds()/3600  #in hours
  if los_before_sepsis < 6:
    print("Patient {} has less then 6 hours recordings, discard from the final dataset".format(patient.subject_id[0]))
    type_1 += 1
    continue
  # onset_index >= 1 here: an onset on the first row gives los_before_sepsis == 0
  last_record_before_sepsis_time = patient.time[onset_index-1]
  last_record_before_sepsis = ( onset - datetime.strptime(last_record_before_sepsis_time, FMT) ).total_seconds()/3600  #in hours
  if last_record_before_sepsis >= 6:
    print("Patient {} has no recordings within the 6 hours before onset, discard from the final dataset".format(patient.subject_id[0]))
    type_2 += 1
    continue

  type_3 += 1

  # parse every timestamp up to the onset once instead of inside the hourly loop
  record_times = [datetime.strptime(t, FMT) for t in patient.time.iloc[:onset_index+1]]

  # determine the start time
  delta = timedelta(hours = 100)  # maximum 100 hours before onset for each patient
  new_start = onset - delta
  # get the starting index and starting time
  if new_start < start: 
    new_start = rounder_forward(start)
    new_start_index = 0
  else:
    new_start = rounder_forward(new_start)
    # first record at or after the window start (the onset row always qualifies)
    new_start_index = next(i for i, t in enumerate(record_times) if t >= new_start)

  # one row per hour from new_start up to the hour that contains the onset
  hourly_rows = []
  time_diff = (onset - new_start).total_seconds()/3600  # in hours
  for _ in range(int(time_diff)+1):
    new_row = pd.Series(empty_row)
    new_row['time'] = new_start
    window_end = new_start + one_hour
    for index in range(new_start_index, onset_index+1):
      if new_start <= record_times[index] < window_end:
        new_row = merge_rows(new_row,patient.iloc[index])
    hourly_rows.append(new_row)
    new_start = window_end

  # Build the table in one go: DataFrame.append was removed in pandas 2.0 and
  # copied the whole frame on every call. Column order matches what append
  # produced: the input columns first, then any extra keys of empty_row.
  hourly = pd.DataFrame(hourly_rows)
  extra_columns = [c for c in hourly.columns if c not in patient.columns]
  new_df = hourly.reindex(columns = list(patient.columns) + extra_columns)

  new_df.to_csv('/content/drive/MyDrive/MIMIC/Results/adults/processed_sepsis/'+ str(patient.subject_id[0]) + '.csv', index = False)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
processing 1540/4709, subject_id = 28278
Patient 28278 has no recordings within the 6 hours before onset, discard from the final dataset
processing 1541/4709, subject_id = 28292
Patient 28292 has no recordings within the 6 hours before onset, discard from the final dataset
processing 1542/4709, subject_id = 28304
Patient 28304 has less then 6 hours recordings, discard from the final dataset
processing 1543/4709, subject_id = 28309
Patient 28309 has less then 6 hours recordings, discard from the final dataset
processing 1544/4709, subject_id = 28315
processing 1545/4709, subject_id = 28325
Patient 28325 has less then 6 hours recordings, discard from the final dataset
processing 1546/4709, subject_id = 28329
Patient 28329 has less then 6 hours recordings, discard from the final dataset
processing 1547/4709, subject_id = 28345
Patient 28345 has less then 6 hours recordings, discard from the final dataset
processing 1548/4709

In [None]:
# Report the three sepsis counters, one per line:
# type_1, then type_2, then type_3.
for count in (type_1, type_2, type_3):
  print(count)

1764
1059
1886


## This block processes the neonates/adults with no sepsis

1.   always start with the first row
2.   if the whole length of stay (LOS) is < 6 hours, discard the patient
3.   if the LOS is longer than 100 hours, only keep the first 100 hours
4.   aggregate the data into hourly time points



In [8]:
# Build one hourly table per non-sepsis patient, covering at most the first
# 100 hours of the stay, and write it to the processed_non_sepsis folder.
# Patients with less than 6 hours between first and last record are skipped
# and counted in type_4.
step = 0  # for indicating processing step
type_4 = 0  # neonates/adults with less than 6 hrs of recordings (non_sepsis)
type_5 = 0  # neonates/adults with successful generation of table (non_sepsis)

FMT = '%Y-%m-%d %H:%M:%S'  # timestamp format of the 'time' column
one_hour = timedelta(hours = 1)

for csv_file in non_sepsis_dir:
  patient = pd.read_csv(patients_non_sepsis + csv_file)

  step +=1
  print("processing {}/{}, subject_id = {}".format(step,len(non_sepsis_dir),patient.subject_id[0]))

  # empty_row = {'subject_id':patient.subject_id[0], 'gender':patient.gender[0],
  #              'sepsis':False, 'SBP':np.nan, 'DBP':np.nan, 'MAP':np.nan, 'Temp':np.nan, 'HR':np.nan,
  #              'RR':np.nan, 'BaseExcess':np.nan, 'FiO2':np.nan, 'SaO2':np.nan, 'PCO2':np.nan, 
  #              'PH':np.nan, 'Calcium':np.nan, 'Sodium':np.nan, 'Potassium':np.nan, 'Creatinine':np.nan, 
  #              'Chloride':np.nan, 'Glucose':np.nan, 'WBC':np.nan, 'BUN':np.nan, 'PTT':np.nan,
  #              'Platelet':np.nan, 'time':np.nan}  # for neonates

  # template for one hourly row: static info filled in, every measurement missing
  empty_row = {'subject_id':patient.subject_id[0], 'gender':patient.gender[0], 'age':patient.age[0],
               'sepsis':False, 'SBP':np.nan, 'DBP':np.nan, 'MAP':np.nan, 'Temp':np.nan, 'HR':np.nan,
               'RR':np.nan, 'BaseExcess':np.nan, 'SaO2':np.nan, 'PH':np.nan, 'Magnesium':np.nan,
               'Lactic':np.nan, 'Calcium':np.nan, 'Sodium':np.nan, 'Potassium':np.nan, 'HCO3':np.nan,
               'Creatinine':np.nan, 'Chloride':np.nan, 'Glucose':np.nan, 'WBC':np.nan, 'BUN':np.nan, 
               'PTT':np.nan, 'Platelet':np.nan, 'time':np.nan}  # for adults

  time_points = len(patient)
  start_time = patient.time[0]
  end_time = patient.time[time_points-1]
  start = datetime.strptime(start_time, FMT)
  end = datetime.strptime(end_time, FMT)
  los = ( end - start ).total_seconds()/3600 #in hours
  if los < 6:
    print("Patient{} has less then 6 hours recordings, discard from the final dataset".format(patient.subject_id[0]))
    type_4 += 1
    continue

  type_5 += 1

  # parse every timestamp once instead of inside the hourly loop
  record_times = [datetime.strptime(t, FMT) for t in patient.time]

  # determine the end time
  delta = timedelta(hours = 100)  # maximum 100 hours after the first record for each patient
  new_end = start + delta
  # get the ending index and ending time
  if new_end > end: 
    new_end = rounder_forward(end)
    new_end_index = time_points - 1
  else:
    new_end = rounder_forward(new_end)
    # first record at or after the window end (the last record always qualifies)
    new_end_index = next(i for i, t in enumerate(record_times) if t >= new_end)

  # NOTE(review): the hourly bins are anchored at the raw first timestamp
  # (`start` is not floored to the hour, unlike the sepsis block), while the
  # number of bins is derived from the floored `new_end`. Records in the last
  # partial hour can therefore fall outside every bin - confirm this is intended.
  hourly_rows = []
  time_diff = (new_end - start).total_seconds()/3600  # in hours
  for _ in range(int(time_diff)+1):
    new_row = pd.Series(empty_row)
    new_row['time'] = start
    window_end = start + one_hour
    for index in range(0, new_end_index+1):
      if start <= record_times[index] < window_end:
        new_row = merge_rows(new_row,patient.iloc[index])
    hourly_rows.append(new_row)
    start = window_end

  # Build the table in one go: DataFrame.append was removed in pandas 2.0 and
  # copied the whole frame on every call. Column order matches what append
  # produced: the input columns first, then any extra keys of empty_row.
  hourly = pd.DataFrame(hourly_rows)
  extra_columns = [c for c in hourly.columns if c not in patient.columns]
  new_df = hourly.reindex(columns = list(patient.columns) + extra_columns)

  new_df.to_csv('/content/drive/MyDrive/MIMIC/Results/adults/processed_non_sepsis/'+ str(patient.subject_id[0]) + '.csv', index = False)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
processing 4287/7499, subject_id = 13646
Patient13646 has less then 6 hours recordings, discard from the final dataset
processing 4288/7499, subject_id = 13648
Patient13648 has less then 6 hours recordings, discard from the final dataset
processing 4289/7499, subject_id = 13654
Patient13654 has less then 6 hours recordings, discard from the final dataset
processing 4290/7499, subject_id = 13657
Patient13657 has less then 6 hours recordings, discard from the final dataset
processing 4291/7499, subject_id = 13660
processing 4292/7499, subject_id = 13664
processing 4293/7499, subject_id = 13666
Patient13666 has less then 6 hours recordings, discard from the final dataset
processing 4294/7499, subject_id = 13668
processing 4295/7499, subject_id = 13669
Patient13669 has less then 6 hours recordings, discard from the final dataset
processing 4296/7499, subject_id = 13672
Patient13672 has less then 6 hours recordings, discard fr

In [9]:
# Report the two non-sepsis counters, one per line: type_4, then type_5.
for count in (type_4, type_5):
  print(count)

4119
3380
