In [None]:

import pandas as pd
from tqdm.auto import tqdm
import os
import re
import numpy as np
import ujson as json
import dill
import pickle

In [None]:
# Physiological time-series variables to extract (35 in total).
# The order is significant: observations are emitted in exactly this order.
variables = [
    'ALP', 'HR', 'DiasABP', 'Na', 'Lactate',
    'NIDiasABP', 'PaO2', 'WBC', 'pH', 'Albumin',
    'ALT', 'Glucose', 'SaO2', 'Temp', 'AST',
    'Bilirubin', 'BUN', 'RespRate', 'Mg', 'HCT',
    'SysABP', 'FiO2', 'K', 'GCS', 'Cholesterol',
    'NISysABP', 'TroponinT', 'MAP', 'TroponinI', 'PaCO2',
    'Platelets', 'Urine', 'NIMAP', 'Creatinine', 'HCO3',
]

Helper functions for extracting per-patient observations and interventions

In [None]:


def extract_interventions(patient_record, intervention_str):
    """Look up one intervention parameter in a slice of a patient's record.

    Args:
        patient_record: DataFrame with (at least) 'Parameter' and 'Value' columns.
        intervention_str: name of the intervention parameter to look up.

    Returns:
        A one-element list holding the value recorded for ``intervention_str``,
        or ``np.nan`` when that parameter is absent from ``patient_record``.
    """
    # Parameter -> Value mapping for the rows that were passed in.
    value_by_parameter = patient_record.set_index('Parameter').to_dict()['Value']
    return [value_by_parameter.get(intervention_str, np.nan)]


def extract_observations(patient_record, variables):
    """Collect the recorded value of each requested variable from a record slice.

    Args:
        patient_record: DataFrame with (at least) 'Parameter' and 'Value' columns.
        variables: iterable of parameter names; the output follows this order.

    Returns:
        A list with one entry per name in ``variables``: the recorded value, or
        ``np.nan`` when the parameter is absent from ``patient_record``.
    """
    # Parameter -> Value mapping; a parameter that occurs several times ends up
    # with a single value (the last row's, as far as pandas' to_dict goes -- TODO confirm).
    value_by_parameter = patient_record.set_index('Parameter').to_dict()['Value']
    return [value_by_parameter.get(name, np.nan) for name in variables]

def group_time_hr(value):
    """Return the hour component of an ``'HH:MM'`` timestamp string as an int.

    Both fields must parse as integers; any other layout raises ``ValueError``.
    """
    fields = [int(field) for field in value.split(':')]
    # Exactly two fields are expected; the minutes are parsed but discarded.
    hour_field, _minute_field = fields
    return hour_field

def get_dictionary(values, intervention):
    """Bundle a patient's observations and interventions into one dictionary.

    Args:
        values: the observation data; stored unchanged under the ``'raw'`` key.
        intervention: the intervention data; stored unchanged under the
            ``'intervention'`` key.

    Returns:
        ``{"intervention": intervention, "raw": values}`` (keys in that order).
    """
    # The inputs are stored as-is; no copy or conversion is performed.
    return {"intervention": intervention, "raw": values}

def myconverter(obj):
    """JSON ``default`` hook converting NumPy and datetime objects to plain Python.

    Args:
        obj: an object the JSON encoder could not serialise on its own.

    Returns:
        ``int`` for NumPy integers, ``float`` for NumPy floats, a (nested) list
        for NumPy arrays, and the ``str()`` form for ``datetime.datetime``.

    Raises:
        TypeError: if ``obj`` is none of the supported types.
    """
    # Imported here because the notebook's import cell does not import datetime;
    # without it the datetime branch below raises NameError.
    import datetime

    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, datetime.datetime):
        return str(obj)
    # Fail loudly rather than silently emitting null for an unknown type.
    raise TypeError(
        "Object of type %s is not JSON serializable" % type(obj).__name__
    )

In [None]:
def extract_patient_record_ts(id_, set_name, time_length, variables):
    """Build the JSON-encoded hourly record of one patient from its text file.

    The file ``<set_name>/<id_>.txt`` is read as CSV; its 'Time' column is
    reduced to the hour component and, for every hour in ``range(time_length)``,
    the observations for ``variables`` and the "MechVent" intervention are
    extracted. Rows whose hour falls outside that range are not used.

    Args:
        id_: record id; also the stem of the file name.
        set_name: directory containing the record files.
        time_length: number of hourly steps to extract, starting at hour 0.
        variables: parameter names to extract, in output order.

    Returns:
        A JSON string of the form
        ``{"id": id_, "data": {"intervention": [...], "raw": [...]}}``.
    """
    f_name = os.path.join(set_name, str(id_) + ".txt")
    data = pd.read_csv(f_name)
    # Replace each 'HH:MM' timestamp with its integer hour.
    data['Time'] = data['Time'].apply(group_time_hr)

    raw = []
    intervention = []
    for h in range(time_length):
        # Select this hour's rows once and reuse them for both extractions.
        hour_rows = data[data['Time'] == h]
        raw.append(extract_observations(hour_rows, variables))
        intervention.append(extract_interventions(hour_rows, "MechVent"))

    raw = np.array(raw)
    patient_dictionary = {'id': id_}
    patient_dictionary["data"] = get_dictionary(raw, intervention)
    # myconverter turns the NumPy array/scalars into JSON-serialisable values.
    return json.dumps(patient_dictionary, default=myconverter)

In [None]:
def data_to_tensors(json_file_name, set_name, time_length, record_ids, variables):
    """Extract every listed record of a set and write them to a JSON-lines file.

    Despite the name, nothing is converted to tensors here: each record is
    serialised by ``extract_patient_record_ts`` and written as one line of
    ``json_file_name`` (which is overwritten if it already exists).

    Args:
        json_file_name: path of the output file.
        set_name: directory containing the per-record text files.
        time_length: number of hourly steps to extract per record.
        record_ids: iterable of record ids to process, in output order.
        variables: parameter names to extract for each record.
    """
    # The context manager closes the file even if extracting a record fails.
    with open(json_file_name, 'w') as json_file:
        for id_ in tqdm(record_ids):
            json_file.write(extract_patient_record_ts(id_, set_name, time_length, variables) + '\n')


In [None]:
def load_tensors(filename):
    """Load a JSON-lines extract into physiological and intervention arrays.

    Each non-blank line of ``filename`` must be a JSON object with a ``"data"``
    entry holding ``"raw"`` and ``"intervention"`` values.

    Args:
        filename: path of the JSON-lines file to read.

    Returns:
        A tuple ``(Data_raw, Interventions)`` of NumPy arrays, each with one
        entry per record in file order.
    """
    data_raw = []
    interventions = []

    # The context manager guarantees the file handle is closed after reading.
    with open(filename) as json_lines:
        for line in json_lines:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Parse each line once and pull both components from the result.
            record = json.loads(line)["data"]
            data_raw.append(record["raw"])
            interventions.append(record["intervention"])

    return (np.array(data_raw), np.array(interventions))

Load extracted ids and outcomes

In [None]:
# Record ids and outcome tables for sets a, b and c (presumably written by an
# earlier extraction step -- confirm where these pickles come from).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
ids_a, ids_b, ids_c = (
    pd.read_pickle("extracts/ids_set_a.pkl"),
    pd.read_pickle("extracts/ids_set_b.pkl"),
    pd.read_pickle("extracts/ids_set_c.pkl"),
)
outcomes_a, outcomes_b, outcomes_c = (
    pd.read_pickle("extracts/outcomes_set_a.pkl"),
    pd.read_pickle("extracts/outcomes_set_b.pkl"),
    pd.read_pickle("extracts/outcomes_set_c.pkl"),
)

In [None]:
# Sort each id collection in place so records are processed in ascending id order.
# NOTE(review): the outcomes_* objects are not reordered here -- confirm that
# downstream code matches outcomes to records by id rather than by position.
for _id_collection in (ids_a, ids_b, ids_c):
    _id_collection.sort()
del _id_collection

In [None]:
# Number of time steps extracted per record; passed as `time_length` to
# data_to_tensors, where each step corresponds to one hour (hours 0..47).
time_steps_to_extract = 48

In [None]:
# Write the JSON-lines extract for set a (reads set-a/<id>.txt, writes ./set_a).
data_to_tensors(
    json_file_name="set_a",
    set_name="set-a",
    time_length=time_steps_to_extract,
    record_ids=ids_a,
    variables=variables,
)

In [None]:
# Write the JSON-lines extract for set b (reads set-b/<id>.txt, writes ./set_b).
data_to_tensors(
    json_file_name="set_b",
    set_name="set-b",
    time_length=time_steps_to_extract,
    record_ids=ids_b,
    variables=variables,
)

In [None]:
# Write the JSON-lines extract for set c (reads set-c/<id>.txt, writes ./set_c).
data_to_tensors(
    json_file_name="set_c",
    set_name="set-c",
    time_length=time_steps_to_extract,
    record_ids=ids_c,
    variables=variables,
)

In [None]:
# Read the set a extract back as (physiological data, interventions) arrays.
raw_data_a, interventions_a = load_tensors(filename="set_a")

In [None]:
# Read the set b extract back as (physiological data, interventions) arrays.
raw_data_b, interventions_b = load_tensors(filename="set_b")

In [None]:
# Read the set c extract back as (physiological data, interventions) arrays.
raw_data_c, interventions_c = load_tensors(filename="set_c")

Export files

In [None]:
# Pickle the physiological data array of each set to its own file.
for _target_path, _raw_array in (
    ('extracts/3d_tensor_set_a.pkl', raw_data_a),
    ('extracts/3d_tensor_set_b.pkl', raw_data_b),
    ('extracts/3d_tensor_set_c.pkl', raw_data_c),
):
    with open(_target_path, 'wb') as outfile:
        dill.dump(_raw_array, outfile, pickle.HIGHEST_PROTOCOL)


In [None]:
# Pickle the intervention array of each set to its own file.
for _target_path, _intervention_array in (
    ('extracts/interventions_a.pkl', interventions_a),
    ('extracts/interventions_b.pkl', interventions_b),
    ('extracts/interventions_c.pkl', interventions_c),
):
    with open(_target_path, 'wb') as outfile:
        dill.dump(_intervention_array, outfile, pickle.HIGHEST_PROTOCOL)
