In [1]:
import pandas as pd
from itertools import islice
import numpy as np
import json
import os

In [2]:
timeseries_nurse = pd.read_csv('Data/eICU_data/timeseriesnurse.csv')
timeseries_periodic = pd.read_csv('Data/eICU_data/timeseriesperiodic.csv')

In [3]:
def reconfigure_timeseries(timeseries, offset_column, feature_column=None, test=False):
    if test:
        timeseries = timeseries.iloc[300000:5000000]
    timeseries.set_index(['patientunitstayid', pd.to_timedelta(timeseries[offset_column], unit='T')], inplace=True)
    timeseries.drop(columns=offset_column, inplace=True)
    if feature_column is not None:
        timeseries = timeseries.pivot_table(columns=feature_column, index=timeseries.index)
    # convert index to multi-index with both patients and timedelta stamp
    timeseries.index = pd.MultiIndex.from_tuples(timeseries.index, names=['patient', 'time'])
    return timeseries

In [None]:
test=False

print('==> Reconfiguring nurse timeseries...')
# remove non numeric data
timeseries_nurse['nursingchartvalue'] = pd.to_numeric(timeseries_nurse['nursingchartvalue'], errors='coerce')
timeseries_nurse = timeseries_nurse.loc[timeseries_nurse['nursingchartvalue'].notnull()]
timeseries_nurse = reconfigure_timeseries(timeseries_nurse,
                                          offset_column='nursingchartoffset',
                                          feature_column='nursingchartcelltypevallabel',
                                          test=test)
timeseries_nurse.columns = timeseries_nurse.columns.droplevel()
    
print('==> Reconfiguring periodic timeseries...')
timeseries_periodic = reconfigure_timeseries(timeseries_periodic,
                                             offset_column='observationoffset',
                                             test=test)

patients = timeseries_periodic.index.unique(level=0)

==> Reconfiguring nurse timeseries...


In [4]:
timeseries_resp = pd.read_csv('Data/eICU_data/timeseries_resp.csv', index_col = ['patient', 'time'])

In [5]:
timeseries_resp

Unnamed: 0_level_0,Unnamed: 1_level_0,Exhaled MV,Exhaled TV (patient),FiO2,LPM O2,Mean Airway Pressure,PEEP,Peak Insp. Pressure,Plateau Pressure,Pressure Support,RR (patient),SaO2,TV/kg IBW,Tidal Volume (set),Total RR,Vent Rate
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
141168,0 days 06:36:00,,,,4.0,,,,,,,,,,,
141168,0 days 09:23:00,,,,4.0,,,,,,,,,,,
141168,0 days 10:06:00,,,,4.0,,,,,,,,,,,
141168,0 days 11:06:00,,,,4.0,,,,,,,,,,,
141168,0 days 12:06:00,,,,4.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353251,7 days 21:43:00,,,40.0,,,8.0,,,,,,,,,
3353251,7 days 22:23:00,,525.0,40.0,,,8.0,16.0,,7.0,,,,,,
3353251,7 days 22:43:00,,,40.0,,,8.0,,,7.0,,,,,,
3353254,-1 days +20:21:00,,,,,,,,,,,,79.90,,,


In [8]:
len(list(timeseries_resp.index.unique(level=0)))

87177

In [9]:
timeseries_aperiodic = pd.read_csv('Data/eICU_data/timeseries_aperiodic.csv', index_col = ['patient', 'time'])

In [10]:
timeseries_aperiodic

Unnamed: 0_level_0,Unnamed: 1_level_0,noninvasivesystolic,noninvasivediastolic,noninvasivemean
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
141168,0 days 02:03:00,106.0,68.0,81.0
141168,0 days 02:18:00,111.0,62.0,82.0
141168,0 days 05:49:00,,,79.0
141168,0 days 07:21:00,,,62.0
141168,0 days 23:18:00,,,27.0
...,...,...,...,...
3353263,0 days 11:50:00,118.0,85.0,98.0
3353263,0 days 12:50:00,128.0,91.0,106.0
3353263,0 days 13:50:00,131.0,88.0,107.0
3353263,0 days 14:50:00,147.0,98.0,118.0


In [18]:
timeseries_lab = pd.read_csv('Data/eICU_data/timeseries_lab.csv', index_col = ['patient', 'time'])

In [19]:
timeseries_lab

Unnamed: 0_level_0,Unnamed: 1_level_0,-basos,-eos,-lymphs,-monos,-polys,ALT (SGPT),AST (SGOT),BUN,Base Excess,FiO2,...,paCO2,paO2,phosphate,platelets x 1000,potassium,sodium,total bilirubin,total protein,troponin - I,urinary specific gravity
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
141168,0 days 03:51:00,,,,,,,,,,,...,,,,,,,,,,
141168,0 days 08:36:00,0.0,1.0,19.0,19.0,61.0,40.0,59.0,26.0,,,...,,,,209.0,4.0,139.0,2.6,7.1,,
141168,0 days 18:53:00,0.0,0.0,6.0,14.0,80.0,358.0,878.0,27.0,,,...,,,,213.0,4.2,139.0,4.1,7.1,,
141168,1 days 06:05:00,,,,,,,,,,28.0,...,46.0,41.0,,,,,,,,
141168,1 days 09:30:00,,,,,,,,,,100.0,...,44.0,42.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353254,0 days 03:31:00,,,,,,,,,,,...,,,,,,,,,,
3353254,0 days 06:34:00,,,,,,,,,,,...,,,,,,,,,,
3353263,-1 days +23:23:00,,,,,,,,,,,...,,,,,,,,,,
3353263,-1 days +23:53:00,0.0,1.0,24.0,10.0,,,,13.0,,,...,,,2.5,162.5,4.1,135.0,,,,


In [11]:
len(list(timeseries_aperiodic.index.unique(level=0)))

145754

In [14]:
set_difference = set(list(timeseries_aperiodic.index.unique(level=0))) - set(list(timeseries_resp.index.unique(level=0)))
list_difference = list(set_difference)

print(len(list_difference))

60082


In [24]:
shared_indices = list(set(list(timeseries_aperiodic.index.unique(level=0))).intersection(list(timeseries_resp.index.unique(level=0))))

In [25]:
len(shared_indices)

85672

In [26]:
shared_indices = list(set(shared_indices).intersection(list(timeseries_lab.index.unique(level=0))))

In [27]:
len(shared_indices)

85194

In [31]:
patients = timeseries_aperiodic.index.unique(level=0)

In [32]:
print(patients)

Int64Index([ 141168,  141194,  141203,  141208,  141227,  141233,  141244,
             141260,  141265,  141266,
            ...
            3353200, 3353201, 3353213, 3353216, 3353226, 3353235, 3353237,
            3353251, 3353254, 3353263],
           dtype='int64', name='patient', length=145754)


In [33]:
patients = shared_indices

In [59]:
timeseries_aperiodic.index.unique(level=1)

Index(['0 days 02:03:00', '0 days 02:18:00', '0 days 05:49:00',
       '0 days 07:21:00', '0 days 23:18:00', '0 days 00:51:00',
       '0 days 01:42:00', '0 days 02:42:00', '0 days 03:42:00',
       '0 days 04:42:00',
       ...
       '98 days 15:28:00', '98 days 21:27:00', '99 days 01:27:00',
       '99 days 05:27:00', '99 days 06:46:00', '99 days 07:08:00',
       '99 days 07:10:00', '99 days 07:11:00', '99 days 09:25:00',
       '99 days 12:27:00'],
      dtype='object', name='time', length=121174)

In [86]:
def resample_and_mask(timeseries, header, mask_decay=True, decay_rate=4/3, test=False,
                       verbose=False, length_limit=24*14):
    if test:
        mask_decay = False
        verbose = True
    if verbose:
        print('Resampling to 1 hour intervals...')
    # take the mean of any duplicate index entries for unstacking
    timeseries = timeseries.groupby(level=[0, 1]).mean()

    # Round up the time-stamps to the next hour
    timeseries.reset_index(level=1, inplace=True)
    start = pd.to_datetime('2000-01-01 00:00:00')
    timeseries.time = pd.to_timedelta(timeseries.time, errors='coerce') + start
    timeseries.time = timeseries.time.dt.ceil(freq='H')
    timeseries.time = timeseries.time - start
    timeseries.time = pd.to_timedelta(timeseries.time, unit='T')
    timeseries.set_index('time', append=True, inplace=True)
    timeseries.reset_index(level=0, inplace=True)
    resampled = timeseries.groupby('patient').resample('H', closed='right', label='right').mean().drop(columns='patient')
    del (timeseries)

    def apply_mask_decay(mask_bool):
        mask = mask_bool.astype(int)
        mask.replace({0: np.nan}, inplace=True)  # so that forward fill works
        inv_mask_bool = ~mask_bool
        count_non_measurements = inv_mask_bool.cumsum() - \
                                 inv_mask_bool.cumsum().where(mask_bool).ffill().fillna(0)
        decay_mask = mask.ffill().fillna(0) / (count_non_measurements * decay_rate).replace(0, 1)
        return decay_mask

    # store which values had to be imputed
    if mask_decay:
        if verbose:
            print('Calculating mask decay features...')
        mask_bool = resampled.notnull()
        mask = mask_bool.groupby('patient').transform(apply_mask_decay)
        del (mask_bool)
    else:
        if verbose:
            print('Calculating binary mask features...')
        mask = resampled.notnull()
        mask = mask.astype(int)

    if verbose:
        print('Filling missing data forwards...')
    # carry forward missing values (note they will still be 0 in the nulls table)
    resampled = resampled.fillna(method='ffill')

    # simplify the indexes of both tables
    mask = mask.rename(index=dict(zip(mask.index.levels[1],
                                      mask.index.levels[1].days*24 + mask.index.levels[1].seconds//3600)))
    resampled = resampled.rename(index=dict(zip(resampled.index.levels[1],
                                                resampled.index.levels[1].days*24 +
                                                resampled.index.levels[1].seconds//3600)))

    # clip to length_limit
    if length_limit is not None:
        within_length_limit = resampled.index.get_level_values(1) < length_limit
        resampled = resampled.loc[within_length_limit]
        mask = mask.loc[within_length_limit]

    if verbose:
        print('Filling in remaining values with zeros...')
    resampled.fillna(0, inplace=True)

    # rename the columns in pandas for the mask so it doesn't complain
    mask.columns = [str(col) + '_mask' for col in mask.columns]

    # merge the mask with the features
    final = pd.concat([resampled, mask], axis=1)
    final.reset_index(level=1, inplace=True)
    final = final.loc[final.time > 0]

    if verbose:
        print('Saving progress...')
    # save to csv
    if test is False:
        final.to_csv('Data/eICU_data/preprocessed_timeseries.csv', mode='a', header=header)
    return

def gen_patient_chunk(patients, size=1000):
    it = iter(patients)
    chunk = list(islice(it, size))
    while chunk:
        yield chunk
        chunk = list(islice(it, size))

In [87]:
size = 4000
test=False
gen_chunks = gen_patient_chunk(patients, size=size)
i = size
header = True  # for the first chunk include the header in the csv file

print('==> Starting main processing loop...')

for patient_chunk in gen_chunks:

    merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
    merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)

    if i == size:  # fixed from first run
        # all if not all are not normally distributed
        quantiles = merged.quantile([0.05, 0.95])
        maxs = quantiles.loc[0.95]
        mins = quantiles.loc[0.05]

    merged = 2 * (merged - mins) / (maxs - mins) - 1

    # we then need to make sure that ridiculous outliers are clipped to something sensible
    merged.clip(lower=-4, upper=4, inplace=True)  # room for +- 3 on each side, as variables are scaled roughly between 0 and 1

    resample_and_mask(merged, header, mask_decay=True, decay_rate=4/3, test=test, verbose=False)
    print('==> Processed ' + str(i) + ' patients...')
    i += size
    header = False

==> Starting main processing loop...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 4000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 8000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 12000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 16000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 20000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 24000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 28000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 32000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 36000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 40000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 44000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 48000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 52000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 56000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 60000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 64000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 68000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 72000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 76000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 80000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 84000 patients...


  merged = timeseries_lab.loc[patient_chunk].append(timeseries_resp.loc[patient_chunk], sort=False)
  merged = merged.append(timeseries_aperiodic.loc[patient_chunk], sort=True)


==> Processed 88000 patients...


In [2]:
timeseries = pd.read_csv('Data/eICU_data/preprocessed_timeseries.csv', index_col = ['patient', 'time'])

  timeseries = pd.read_csv('Data/eICU_data/preprocessed_timeseries.csv', index_col = ['patient', 'time'])


In [3]:
timeseries

Unnamed: 0_level_0,Unnamed: 1_level_0,-basos,-eos,-lymphs,-monos,-polys,ALT (SGPT),AST (SGOT),BUN,Base Excess,Exhaled MV,...,paCO2_mask,paO2_mask,phosphate_mask,platelets x 1000_mask,potassium_mask,sodium_mask,total bilirubin_mask,total protein_mask,troponin - I_mask,urinary specific gravity_mask
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
262151,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.075472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262151,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022624,0.075472,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262151,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022624,0.075472,...,0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262151,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022624,0.075472,...,0.375,0.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262151,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022624,-0.122642,...,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145726,43,-0.6,-0.803922,0.046595,-0.945455,0.241026,-0.540166,-0.773594,-0.972222,-0.140271,-0.283019,...,0.017045,0.017045,0.0,0.034091,0.034091,0.034091,0.0,0.0,0.0,0.018293
3145726,44,-0.6,-0.803922,0.046595,-0.945455,0.241026,-0.540166,-0.773594,-0.972222,-0.140271,-0.283019,...,0.016667,0.016667,0.0,0.032609,0.032609,0.032609,0.0,0.0,0.0,0.017857
3145726,45,-0.6,-0.803922,0.046595,-0.945455,0.241026,-0.540166,-0.773594,-0.972222,-0.140271,-0.283019,...,0.016304,0.016304,0.0,0.03125,0.03125,0.03125,0.0,0.0,0.0,0.017442
3145726,46,-0.6,-0.803922,0.046595,-0.945455,0.241026,-0.540166,-0.773594,-0.972222,-0.140271,-0.283019,...,0.015957,0.015957,0.0,0.03,0.03,0.03,0.0,0.0,0.0,0.017045


In [3]:
def add_time_of_day(processed_timeseries, flat_features):

    print('==> Adding time of day features...')
    processed_timeseries = processed_timeseries.join(flat_features[['hour']], how='inner', on='patient')
    hour_list = np.linspace(0, 1, 24)  # make sure it's still scaled well
    processed_timeseries['hour'] = processed_timeseries['hour'].apply(lambda x: hour_list[x%24 - 24])
    return processed_timeseries

def further_processing(processed_timeseries, test=False):
    processed_timeseries.rename(columns={'Unnamed: 1': 'time'}, inplace=True)
    #processed_timeseries.set_index('patient', inplace=True)
    flat_features = pd.read_csv('Data/eICU_data/flat_features.csv')
    flat_features.rename(columns={'patientunitstayid': 'patient'}, inplace=True)
    processed_timeseries.sort_values(['patient', 'time'], inplace=True)
    flat_features.set_index('patient', inplace=True)

    processed_timeseries = add_time_of_day(processed_timeseries, flat_features)

    if test is False:
        print('==> Saving finalised preprocessed timeseries...')
        # this will replace old one that was updated earlier in the script
        processed_timeseries.to_csv('Data/eICU_data/preprocessed_timeseries.csv')

    return

In [4]:
further_processing(timeseries, False)

==> Adding time of day features...


  key_col = Index(lvals).where(~mask_left, rvals)


==> Saving finalised preprocessed timeseries...
