In [None]:
# Fill gaps in lab data and non-invasive blood pressure (BP) using first-order (piecewise-linear) spline interpolation.

In [1]:
# import
import os
import pickle

import numpy as np
import pandas as pd
from scipy import interpolate

In [2]:
# Data directory; modify before use.
# Base path used by later cells to locate Final_ID.pickle and the
# per-patient full_data/<patient id>.csv files.
data_dir = "scripts/0_sepsis_ground_truth/full_data"

In [3]:
# Patient ids, stored as a pickled mapping with 'nonsep' and 'sepsis' entries.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point data_dir at a Final_ID.pickle you trust.
# os.path.join keeps the path valid whether or not data_dir ends with a separator,
# and the context manager guarantees the file handle is closed.
with open(os.path.join(data_dir, "Final_ID.pickle"), "rb") as id_file:
    separate_id = pickle.load(id_file)
nonsep_id, sepsis_id = separate_id['nonsep'], separate_id['sepsis']
# Presumably both entries are Python lists, so `+` concatenates them
# (numpy arrays would be added element-wise instead) - TODO confirm.
all_id = nonsep_id + sepsis_id

In [4]:
# Columns whose missing values are interpolated for every patient.
# Columns with 'noninvasive' in their name are interpolated only between
# their first and last observation; all others are additionally
# forward/backward filled by the processing loop.
interp_list = [
    'paO2_FiO2',
    'platelets_x_1000',
    'total_bilirubin',
    'urinary_creatinine',
    'creatinine',
    'HCO3',
    'pH',
    'paCO2',
    'direct_bilirubin',
    'excess',
    'ast',
    'bun',
    'calcium',
    'glucose',
    'lactate',
    'magnesium',
    'phosphate',
    'potassium',
    'hct',
    'hgb',
    'ptt',
    'wbc',
    'fibrinogen',
    'troponin',
    'GCS_Score',
    'noninvasivesystolic',
    'noninvasivediastolic',
    'noninvasivemean',
]

In [5]:
# Fill gaps in every patient's lab / non-invasive BP columns, encode the
# ventilator flag, and write one processed CSV per patient.


def _fill_column(values, offsets, extend_to_edges):
    """Interpolate the missing entries of one column against the time offsets.

    Parameters
    ----------
    values : pandas.Series
        Column to fill; NaN marks a missing measurement.
    offsets : pandas.Series
        The patient's 'offset' column, sharing the index of `values`.
    extend_to_edges : bool
        If True, rows before the first / after the last observation are
        filled by backward / forward fill; if False they are left as NaN.

    Returns
    -------
    pandas.Series
        `values` unchanged when it holds no observation at all; otherwise a
        float copy in which every row whose offset lies between the first and
        last observed offset carries the interpolated value.
    """
    observed = values.notna()
    n_observed = int(observed.sum())
    if n_observed == 0:
        # nothing to anchor an interpolation on
        return values
    known_offsets = offsets.loc[observed]
    # piecewise-linear needs two points; a single observation falls back to "zero"
    f = interpolate.interp1d(
        known_offsets,
        values.loc[observed],
        kind="slinear" if n_observed >= 2 else "zero"
    )
    # interp1d does not extrapolate here, so only evaluate inside the observed range
    in_range = (offsets >= known_offsets.min()) & (offsets <= known_offsets.max())
    # Work on a float copy and hand it back to the caller: writing through
    # `frame[col].loc[mask] = ...` is chained assignment and does not reliably
    # update the frame (it is a no-op under pandas Copy-on-Write).
    filled = values.astype(float)
    filled.loc[in_range] = f(offsets.loc[in_range])
    if extend_to_edges:
        filled = filled.ffill().bfill()
    return filled


# Output folder; created up front so to_csv does not fail when it is missing.
out_dir = "processed_data/0_filled_lab"
os.makedirs(out_dir, exist_ok=True)

# (loop index, message) pairs for the progress report
n_patients = len(all_id)
progress_marks = [
    (int(n_patients * fraction), "{}% done.".format(percent))
    for fraction, percent in ((0.1, 10), (0.25, 25), (0.5, 50), (0.75, 75), (0.9, 90))
]

# do this for each patient
for patient_idx, p_id in enumerate(all_id):
    # progress
    for mark, message in progress_marks:
        if patient_idx == mark:
            print(message)
    # load data
    patient_data = pd.read_csv(
        os.path.join(data_dir, "full_data", "{}.csv".format(p_id)),
        index_col=0
    )
    # check the completeness of each column and interpolate.
    # DO NOT forward/backward fill the non-invasive vitals so that the remaining
    # NaN values can be used to sample with equal timestamp (5 min)
    for c in interp_list:
        patient_data[c] = _fill_column(
            patient_data[c],
            patient_data['offset'],
            extend_to_edges='noninvasive' not in c
        )
    # ventilator: rows whose value is exactly 'ventilator' become 1, the flag is
    # carried forward to later rows, and rows before the first such entry get 0
    patient_data = patient_data.rename({'nursingchartvalue': 'ventilator'}, axis=1)
    on_ventilator = patient_data['ventilator'].eq('ventilator')
    patient_data['ventilator'] = (
        on_ventilator.astype(float)   # 1.0 / 0.0
        .where(on_ventilator)         # keep the 1.0 entries, everything else -> NaN
        .ffill()
        .fillna(0)
    )
    # save data (the index read via index_col=0 is not written back)
    patient_data.to_csv(os.path.join(out_dir, "{}.csv".format(p_id)), index=False)


10% done.
25% done.
50% done.
75% done.
90% done.
