In [1]:
import numpy as np
import pandas as pd
import os
import torch # GRUD

In [2]:
inputpath = './input/set-a/'
inputdict = {
    "ALP" : 0,
    "ALT" : 1,
    "AST" : 2,
    "Albumin" : 3,
    "BUN" : 4,
    "Bilirubin" : 5,
    "Cholesterol" : 6,
    "Creatinine" : 7,
    "DiasABP" : 8,
    "FiO2" : 9,
    "GCS" : 10,
    "Glucose" : 11,
    "HCO3" : 12,
    "HCT" : 13,
    "HR" : 14,
    "K" : 15,
    "Lactate" : 16,
    "MAP" : 17,
    "Mg" : 18,
    "Na" : 19,
    "PaCO2" : 20,
    "PaO2" : 21,
    "Platelets" : 22,
    "RespRate" : 23,
    "SaO2" : 24,
    "SysABP" : 25,
    "Temp" : 26,
    "Tropl" : 27,
    "TroponinI" : 27, #temp
    "TropT" : 28,
    "TroponinT" : 28, #temp
    "Urine" : 29,
    "WBC" : 30,
    "Weight" : 31,
    "pH" : 32,
    "NIDiasABP" : 33,
    "NIMAP" : 34,
    "NISysABP" : 35,
    "MechVent" : 36,
    "RecordID" : 37,
    "Age" : 38,
    "Gender" :39,
    "ICUType" : 40,
    "Height": 41
}

In [3]:
max_input = np.load('./input/x_max.npy')
print(max_input)
print(max_input.shape)

[2.205e+03 1.147e+04 1.843e+04 5.300e+00 1.970e+02 4.770e+01 3.300e+02
 2.210e+01 2.680e+02 1.000e+00 1.500e+01 1.143e+03 5.000e+01 6.180e+01
 3.000e+02 2.290e+01 2.930e+01 3.000e+02 9.900e+00 1.770e+02 1.000e+02
 5.000e+02 1.047e+03 9.800e+01 1.000e+02 2.950e+02 4.210e+01 4.920e+01
 2.491e+01 1.100e+04 1.875e+02 3.000e+02 7.350e+02]
(33,)


In [4]:
def timeparser(time):
    return pd.to_timedelta(time + ':00')

def timedelta_to_day_figure(timedelta):
    return timedelta.days + (timedelta.seconds/86400) #(24*60*60)

In [5]:
df = pd.read_csv(inputpath+'132539.txt',\
                   header=0,\
                   parse_dates=['Time'],\
                   date_parser=timeparser)

In [6]:
def df_to_x_m_d(df, inputdic, size, id_posistion, split, max_input):
    grouped_data = df.groupby('Time')
    
    #generate input vectors
    x = np.zeros((len(inputdic)-2, grouped_data.ngroups))
    masking = np.zeros((len(inputdic)-2, grouped_data.ngroups))
    delta = np.zeros((split, size))
    timetable = np.zeros(grouped_data.ngroups)
    id = 0
    
    s_dataset = np.zeros((3, split, size))
    
    if grouped_data.ngroups > size:
        
        # fill the x and masking vectors
        pre_time = pd.to_timedelta(0)
        t = 0
        for row_index, value in df.iterrows():
            '''
            t = colum, time frame
            agg_no = row, variable
            '''
            #print(value)
            agg_no = inputdict[value.Parameter]

            # same timeline check.        
            if pre_time != value.Time:
                pre_time = value.Time
                t += 1
                timetable[t] = timedelta_to_day_figure(value.Time)

            #print('agg_no : {}\t t : {}\t value : {}'.format(agg_no, t, value.Value))
            x[agg_no, t] = value.Value    
            masking[agg_no, t] = 1
        
        # generate random index array 
        ran_index = np.random.choice(grouped_data.ngroups, size=size, replace=False)
        ran_index.sort()
        ran_index[0] = 0
        ran_index[size-1] = grouped_data.ngroups-1
        
        # take id for outcome comparing
        id = x[id_posistion, 0]
        
        # remove unnesserly parts(rows)
        x = x[:split, :]
        masking = masking[:split, :]
        
        # coulme(time) sampling
        x_sample = np.zeros((split, size))
        m_sample = np.zeros((split, size))
        time_sample = np.zeros(size)

        t_x_sample = x_sample.T
        t_marsking = m_sample.T
        #t_time = t_sample.T
        
        t_x = x.T
        t_m = masking.T
        #t_t = t.T

        it = np.nditer(ran_index, flags=['f_index'])
        while not it.finished:
            #print('it.index = {}, it[0] = {}, ran_index = {}'.format(it.index, it[0], ran_index[it.index]))
            t_x_sample[it.index] = t_x[it[0]]
            t_marsking[it.index] = t_m[it[0]]
            time_sample[it.index] = timetable[it[0]]
            it.iternext()
        
        x = x_sample
        masking = m_sample
        timetable = time_sample
        
        # normalize the X
        nor_x = x/max_input[:, np.newaxis]
     
        # fill the delta vectors
        for index, value in np.ndenumerate(masking):
            '''
            index[0] = row, agg
            index[1] = col, time
            '''
            if index[1] == 0:
                delta[index[0], index[1]] = 0
            elif masking[index[0], index[1]-1] == 0:
                delta[index[0], index[1]] = timetable[index[1]] - timetable[index[1]-1] + delta[index[0], index[1]-1]
            else:
                delta[index[0], index[1]] = timetable[index[1]] - timetable[index[1]-1]
    
    else:
                
        # fill the x and masking vectors
        pre_time = pd.to_timedelta(0)
        t = 0
        for row_index, value in df.iterrows():
            '''
            t = colum, time frame
            agg_no = row, variable
            '''
            #print(value)
            agg_no = inputdict[value.Parameter]

            # same timeline check.        
            if pre_time != value.Time:
                pre_time = value.Time
                t += 1
                timetable[t] = timedelta_to_day_figure(value.Time)

            #print('agg_no : {}\t t : {}\t value : {}'.format(agg_no, t, value.Value))
            x[agg_no, t] = value.Value    
            masking[agg_no, t] = 1
        
        # take id for outcome comparing
        id = x[id_posistion, 0]
        
        # remove unnesserly parts(rows)
        x = x[:split, :]
        masking = masking[:split, :]
        
        x = np.pad(x, ((0,0), (size-grouped_data.ngroups, 0)), 'constant')
        masking = np.pad(masking, ((0,0), (size-grouped_data.ngroups, 0)), 'constant')
        timetable = np.pad(timetable, (size-grouped_data.ngroups, 0), 'constant')
        
        # normalize the X
        nor_x = x/max_input[:, np.newaxis]
        
        # fill the delta vectors
        for index, value in np.ndenumerate(masking):
            '''
            index[0] = row, agg
            index[1] = col, time
            '''
            if index[1] == 0:
                delta[index[0], index[1]] = 0
            elif masking[index[0], index[1]-1] == 0:
                delta[index[0], index[1]] = timetable[index[1]] - timetable[index[1]-1] + delta[index[0], index[1]-1]
            else:
                delta[index[0], index[1]] = timetable[index[1]] - timetable[index[1]-1]
                
    s_dataset[0] = x
    s_dataset[1] = masking
    s_dataset[2] = delta
    
    return s_dataset, id

In [7]:
# def df_to_x_m_d(df, inputdic, size, id_posistion, split, max_input):
size = 49
id_posistion = 37
split = 33

s_dataset, id = df_to_x_m_d(df, inputdict, size, id_posistion, split, max_input)

print(s_dataset.shape)
print(s_dataset.size)

x = s_dataset[0]
m = s_dataset[1]
d = s_dataset[2]

print(x.shape)
print('---------')
print(m.shape)
print('---------')
print(d.shape)
print('---------')
print(id)

(3, 33, 49)
4851
(33, 49)
---------
(33, 49)
---------
(33, 49)
---------
132539.0


In [8]:
dataset = np.zeros((1,3, split, size))

for filename in os.listdir(inputpath):
    df = pd.read_csv(inputpath + filename,\
                     header=0,\
                     parse_dates=['Time'],\
                     date_parser=timeparser)
    s_dataset, id = df_to_x_m_d(df, inputdict, size, id_posistion, split, max_input)
    
    dataset = np.concatenate((dataset, s_dataset[np.newaxis, :,:,:]))
    

dataset = dataset[1:, :,:,:]    
print(dataset.shape)

print(dataset[0].shape)

print(dataset[0][0])

(4000, 3, 33, 49)
(3, 33, 49)
[[ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  0.  0.  0.]
 [-1.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]


In [9]:
np.save('./input/dataset', dataset)

t_dataset = np.load('./input/dataset.npy')

print(t_dataset.shape)

(4000, 3, 33, 49)
