In [None]:
# import
import pickle
import numpy as np
import pandas as pd
from scipy import stats as st

In [None]:
# data directory: root folder that contains `processed_data/` (the ID pickles
# are read from here); modify before use
data_dir = "scripts/1_preprocessing"

In [None]:
# ==================================================================
# train data, 1h for sepsis, 6h for nonsep

In [None]:
# use 1 h data, for sepsis patients
# load patient IDs; the context manager closes the file handle after loading
# NOTE(review): pickle.load can execute arbitrary code -- only load ID files
# produced by this project's own preprocessing step
with open(data_dir + "/processed_data/ids/1h_ID.pickle", "rb") as id_file:
    separate_id = pickle.load(id_file)
sepsis_id, nonsep_id = separate_id['sepsis'], separate_id['nonsep']
# only the sepsis IDs are processed from the 1 h data
all_id = sepsis_id

In [None]:
# Build the sepsis part of the training table from the 1 h per-patient CSVs.
# physiological name
physio_name = [
    'heartrate', 'respiration', 'noninvasivesystolic', 'noninvasivediastolic', 'noninvasivemean'
]
# lab name
lab_name = [
    'paO2_FiO2', 'platelets_x_1000', 'total_bilirubin',
    'urinary_creatinine', 'creatinine', 'HCO3', 'pH', 'paCO2',
    'direct_bilirubin', 'excess', 'ast', 'bun', 'calcium', 'glucose',
    'lactate', 'magnesium', 'phosphate', 'potassium', 'hct', 'hgb', 'ptt',
    'wbc', 'fibrinogen', 'troponin', 'GCS_Score', 'ventilator'
]
# look-back windows; compared against the `offset` column
windows = [15, 30, 60]
# Every patient row ends up in the table this many times. The rows are
# computed once and then repeated, which gives the same rows in the same
# order as running the whole extraction 21 times, without re-reading every CSV.
# NOTE(review): the repetition looks like oversampling of the sepsis class -- confirm
SEPSIS_REPEATS = 21
# one dict per patient; the table is built in a single step at the end because
# DataFrame.append was removed in pandas 2.0 (and is quadratic row by row)
sepsis_rows = []
for p_id in all_id:
    # load data ("/" separator added: data_dir has no trailing slash)
    patient_data = pd.read_csv(
        data_dir + "/processed_data/2_1_h/{}.csv".format(p_id),
        index_col=False
    )
    offsets = patient_data['offset']
    # row mask for each window, shared by all vitals
    in_window = {w: (offsets <= 0) & (offsets >= -w) for w in windows}
    feature = {}
    # for each vital
    for c in physio_name:
        for window in windows:
            window_data = patient_data.loc[in_window[window], c]
            prefix = '{}-{}'.format(c, window)
            feature[prefix + '-mean'] = window_data.mean()
            feature[prefix + '-max'] = window_data.max()
            feature[prefix + '-min'] = window_data.min()
            feature[prefix + '-std'] = window_data.std()
            feature[prefix + '-kurt'] = window_data.kurtosis()
            feature[prefix + '-skew'] = window_data.skew()
            # slope of a linear fit against a fixed 5-unit grid; linregress
            # needs the window to hold exactly window / 5 + 1 samples
            feature[prefix + '-slope'] = st.linregress(
                range(-window, 5, 5), window_data.values
            )[0]
    # add lab data, taken from the row at offset 0
    for lab in lab_name:
        feature[lab] = patient_data.loc[offsets == 0, lab].values[0]
    # add patient id
    feature['patientunitstayid'] = int(p_id)
    # add label
    feature['label'] = 1 if p_id in sepsis_id else 0
    sepsis_rows.append(feature)
feature_data = pd.DataFrame(sepsis_rows * SEPSIS_REPEATS)

In [None]:
# quick size check: (rows, columns) of feature_data at this point
feature_data.shape

In [None]:
# use 6 h data, for non-sepsis patients
# load patient IDs; the context manager closes the file handle after loading
# NOTE(review): pickle.load can execute arbitrary code -- only load ID files
# produced by this project's own preprocessing step
with open(data_dir + "/processed_data/ids/6h_ID.pickle", "rb") as id_file:
    separate_id = pickle.load(id_file)
sepsis_id, nonsep_id = separate_id['sepsis'], separate_id['nonsep']
# number of non-sepsis patients drawn without replacement
# (original note: "Good: 4320, 7750, find middle ground")
N_NONSEP_SAMPLES = 4750
# fixed seed so the sampled training cohort is the same on every run
NONSEP_SAMPLE_SEED = 0
all_id = np.random.RandomState(NONSEP_SAMPLE_SEED).choice(
    nonsep_id, N_NONSEP_SAMPLES, False
)

In [None]:
# Add rows from the 6 h per-patient CSVs to the existing `feature_data`.
# The table is NOT reset here: the new rows are concatenated onto whatever
# `feature_data` already holds, so it must exist before this cell runs, and
# re-running the cell adds the rows again.
# physiological name
physio_name = [
    'heartrate', 'respiration', 'noninvasivesystolic', 'noninvasivediastolic', 'noninvasivemean'
]
# lab name
lab_name = [
    'paO2_FiO2', 'platelets_x_1000', 'total_bilirubin',
    'urinary_creatinine', 'creatinine', 'HCO3', 'pH', 'paCO2',
    'direct_bilirubin', 'excess', 'ast', 'bun', 'calcium', 'glucose',
    'lactate', 'magnesium', 'phosphate', 'potassium', 'hct', 'hgb', 'ptt',
    'wbc', 'fibrinogen', 'troponin', 'GCS_Score', 'ventilator'
]
# look-back windows; compared against the `offset` column
windows = [15, 30, 60]
# one dict per (patient, timestamp); concatenated once at the end because
# DataFrame.append was removed in pandas 2.0 (and is quadratic row by row)
nonsep_rows = []
for p_id in all_id:
    # load data ("/" separator added: data_dir has no trailing slash)
    patient_data = pd.read_csv(
        data_dir + "/processed_data/2_6_h/{}.csv".format(p_id),
        index_col=False
    )
    offsets = patient_data['offset']
    is_sepsis = p_id in sepsis_id
    # timestamps for sepsis and nonsep patients
    if is_sepsis:
        timestamps = [0, -5, -10, -15, -20, -25, -30]
    else:
        timestamps = [0, -50, -100, -150, -200, -250, -300]
    # Lab values come from the row at offset 0 for every sampled timestamp,
    # so they are looked up once per patient.
    # NOTE(review): labs are not shifted by `t` like the vitals are -- confirm
    # this is intended rather than a leftover from the single-timestamp cell.
    lab_values = {
        lab: patient_data.loc[offsets == 0, lab].values[0] for lab in lab_name
    }
    # sample at multiple timestamps
    for t in timestamps:
        # row mask for each window ending at t, shared by all vitals
        in_window = {w: (offsets <= t) & (offsets >= t - w) for w in windows}
        feature = {}
        # for each vital
        for c in physio_name:
            for window in windows:
                window_data = patient_data.loc[in_window[window], c]
                prefix = '{}-{}'.format(c, window)
                feature[prefix + '-mean'] = window_data.mean()
                feature[prefix + '-max'] = window_data.max()
                feature[prefix + '-min'] = window_data.min()
                feature[prefix + '-std'] = window_data.std()
                feature[prefix + '-kurt'] = window_data.kurtosis()
                feature[prefix + '-skew'] = window_data.skew()
                # slope of a linear fit against a fixed 5-unit grid (the slope
                # does not depend on the shift by t); linregress needs the
                # window to hold exactly window / 5 + 1 samples
                feature[prefix + '-slope'] = st.linregress(
                    range(-window, 5, 5), window_data.values
                )[0]
        # add lab data
        feature.update(lab_values)
        # add patient id
        feature['patientunitstayid'] = int(p_id)
        # add label
        feature['label'] = 1 if is_sepsis else 0
        nonsep_rows.append(feature)
feature_data = pd.concat(
    [feature_data, pd.DataFrame(nonsep_rows)], ignore_index=True
)

In [None]:
# quick size check: (rows, columns) of feature_data at this point
feature_data.shape

In [None]:
# (rows, columns) of the subset of feature_data whose label is 0
feature_data[feature_data['label'].eq(0)].shape

In [None]:
# Replace every missing feature value with 0, then write the table to CSV
# without the index column.
# NOTE(review): the output path is relative to the working directory and does
# not use data_dir -- confirm that is intended.
feature_data = feature_data.fillna(value=0)
feature_data.to_csv(
    "processed_data/3_train/with_var/train_data.csv",
    index=False,
)

In [None]:
# =============================== Test data, feature for 12 h ================================

In [None]:
# use 12 h data, which contain the most patients
# load patient IDs; the context manager closes the file handle after loading
# NOTE(review): pickle.load can execute arbitrary code -- only load ID files
# produced by this project's own preprocessing step
with open(data_dir + "/processed_data/ids/12h_ID.pickle", "rb") as id_file:
    separate_id = pickle.load(id_file)
sepsis_id, nonsep_id = separate_id['sepsis'], separate_id['nonsep']
# combine ids; list() guarantees concatenation even if the pickle stores
# numpy arrays (for which `+` would add element-wise)
all_id = list(sepsis_id) + list(nonsep_id)

In [None]:
# Write one feature CSV per patient from the 12 h data, with one row for
# every timestamp in range(-660, 5, 5).
# physiological name
physio_name = [
    'heartrate', 'respiration', 'noninvasivesystolic', 'noninvasivediastolic', 'noninvasivemean'
]
# lab name
lab_name = [
    'paO2_FiO2', 'platelets_x_1000', 'total_bilirubin',
    'urinary_creatinine', 'creatinine', 'HCO3', 'pH', 'paCO2',
    'direct_bilirubin', 'excess', 'ast', 'bun', 'calcium', 'glucose',
    'lactate', 'magnesium', 'phosphate', 'potassium', 'hct', 'hgb', 'ptt',
    'wbc', 'fibrinogen', 'troponin', 'GCS_Score', 'ventilator'
]
# look-back windows; compared against the `offset` column
windows = [15, 30, 60]
# for each patient, same step
for p_id in all_id:
    # load data ("/" separator added: data_dir has no trailing slash)
    patient_data = pd.read_csv(
        data_dir + "/processed_data/2_12_h/{}.csv".format(p_id),
        index_col=False
    )
    offsets = patient_data['offset']
    # membership does not change across timestamps, so test it once
    is_sepsis = p_id in sepsis_id
    # Lab values come from the row at offset 0 for every timestamp, so they
    # are looked up once per patient.
    # NOTE(review): labs are not shifted by `t` like the vitals are -- confirm
    # this is intended; earlier rows receive the offset-0 lab values.
    lab_values = {
        lab: patient_data.loc[offsets == 0, lab].values[0] for lab in lab_name
    }
    # one dict per timestamp; the table is built in one step afterwards because
    # DataFrame.append was removed in pandas 2.0 (and is quadratic row by row)
    patient_rows = []
    # for each time stamp
    for t in range(-660, 5, 5):
        # row mask for each window ending at t, shared by all vitals
        in_window = {w: (offsets <= t) & (offsets >= t - w) for w in windows}
        feature = {}
        # for each vital
        for c in physio_name:
            for window in windows:
                window_data = patient_data.loc[in_window[window], c]
                prefix = '{}-{}'.format(c, window)
                feature[prefix + '-mean'] = window_data.mean()
                feature[prefix + '-max'] = window_data.max()
                feature[prefix + '-min'] = window_data.min()
                feature[prefix + '-std'] = window_data.std()
                feature[prefix + '-kurt'] = window_data.kurtosis()
                feature[prefix + '-skew'] = window_data.skew()
                # slope of a linear fit against a fixed 5-unit grid (the slope
                # does not depend on the shift by t); linregress needs the
                # window to hold exactly window / 5 + 1 samples
                feature[prefix + '-slope'] = st.linregress(
                    range(-window, 5, 5), window_data.values
                )[0]
        # add lab data
        feature.update(lab_values)
        # add patient id
        feature['patientunitstayid'] = int(p_id)
        # add label: only the t == 0 row of a sepsis patient is labelled 1
        feature['label'] = 1 if is_sepsis and t == 0 else 0
        # add time
        feature['offset'] = t
        patient_rows.append(feature)
    # missing values become 0 before the per-patient file is written
    feature_data = pd.DataFrame(patient_rows).fillna(0)
    feature_data.to_csv("processed_data/3_12h_feature/{}.csv".format(p_id), index=False)