In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dynamic table: time-stamped measurements keyed by stay_id and charttime
dynamic_data = pd.read_csv('sph_dynamic.csv')

In [3]:
# Load the static table, which is later merged with the dynamic data on stay_id
static_data = pd.read_csv('sph_static.csv')

In [4]:
# Number of missing values in each column of the dynamic table
dynamic_data.isna().sum()

stay_id                   0
charttime                 0
total_protein          6930
calcium                 933
creatinine              261
glucose                 444
sodium                  214
chloride                241
heart_rate             6833
sbp                    6895
dbp                    6895
mbp                    6887
resp_rate              6832
temperature            6974
hemoglobin             1179
wbc                    1207
alt                    3964
ast                    3936
alp                    3976
bilirubin_total        3957
bilirubin_direct       6808
bilirubin_indirect     6812
ph                     7004
lactate                7012
pt                     3068
urineoutput            6942
sofa_respiration       7005
sofa_coagulation       7023
sofa_liver             7023
sofa_cardiovascular    6872
sofa_cns               6979
sofa_renal             7024
dtype: int64

In [5]:
# Drop every column in which more than 80% of the rows are missing
null_counts = dynamic_data.isnull().sum()
sparse_columns = null_counts[null_counts > len(dynamic_data) * 0.8].index
dynamic_data.drop(columns=sparse_columns, inplace=True)

In [6]:
# Missing values per column that remain in the dynamic table
dynamic_data.isna().sum()

stay_id               0
charttime             0
calcium             933
creatinine          261
glucose             444
sodium              214
chloride            241
hemoglobin         1179
wbc                1207
alt                3964
ast                3936
alp                3976
bilirubin_total    3957
pt                 3068
dtype: int64

In [7]:
# ['alt','ast','alp','bilirubin_total','pt'] are liver-function-related test results.
# Create a new binary column 'liver_function_test': True/1 means at least one
# liver function test result has ever been recorded.
liver_test_result = ['alt','ast','alp','bilirubin_total','pt']
def liver_categorize(group):
    """Flag whether a group of rows contains any liver function test result.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to inspect (e.g. all rows sharing one stay_id). Must contain
        every column listed in ``liver_test_result``.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``group`` plus a boolean ``liver_function_test``
        column. The column is True on every row when any of the listed
        columns holds a non-null value anywhere in the group, and False
        otherwise. ``group`` itself is left unmodified.
    """
    # First any(): per column; second any(): across the liver test columns.
    has_liver_test = bool(group[liver_test_result].notnull().any().any())
    # assign() builds a new frame, so the object passed in (for example by
    # groupby.apply) is never mutated.
    return group.assign(liver_function_test=has_liver_test)

# Flag every row with whether its stay_id has any non-null liver test value.
# A vectorised transform is used instead of groupby().apply() because the
# index/column layout returned by apply() (group keys in the index, grouping
# column kept or dropped) has changed across pandas releases, while
# transform() always returns a result aligned with the original rows.
row_has_liver_test = dynamic_data[liver_test_result].notnull().any(axis=1)
dynamic_data = dynamic_data.assign(
    liver_function_test=row_has_liver_test.groupby(dynamic_data['stay_id']).transform('any')
)

In [8]:
# Display the dynamic table to inspect the new 'liver_function_test' column
dynamic_data

Unnamed: 0,stay_id,charttime,calcium,creatinine,glucose,sodium,chloride,hemoglobin,wbc,alt,ast,alp,bilirubin_total,pt,liver_function_test
0,35715575,2148-12-27 18:15:00.000,8.5,0.9,137.0,138.0,104.0,,,,,,,,True
1,34483718,2118-01-04 03:58:00.000,8.2,0.8,129.0,141.0,101.0,8.7,11.3,,,,,12.1,True
2,31826892,2163-03-10 19:59:00.000,7.7,0.4,112.0,136.0,98.0,,,,,,,,True
3,36154799,2131-12-02 19:14:00.000,,,,,,12.3,,,,,,,True
4,32732521,2116-08-12 12:45:00.000,,4.0,135.0,139.0,105.0,,,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7019,31292653,2192-03-18 03:14:00.000,,1.4,102.0,137.0,103.0,8.7,4.9,,,,,,True
7020,32964221,2127-01-30 10:00:00.000,8.6,0.5,112.0,139.0,107.0,8.9,14.3,14.0,32.0,148.0,2.6,,True
7021,33493321,2142-07-28 06:02:00.000,,1.1,130.0,142.0,105.0,8.4,4.0,,,,,,True
7022,38658392,2189-05-17 00:13:00.000,7.3,1.0,174.0,133.0,93.0,13.0,19.5,9.0,18.0,48.0,0.5,13.0,True


In [9]:
# Assumption: patients with no relevant results recorded do not have liver issues,
# so their missing values in these columns could be imputed with random numbers
# drawn from the normal range.

# However, I could not find the units and normal ranges for these tests,
# so the columns are dropped for now.
dynamic_data.drop(columns=['alt','ast','alp','bilirubin_total','pt'], inplace=True)

In [10]:
# Missing values per column after removing the liver test columns
dynamic_data.isna().sum()

stay_id                   0
charttime                 0
calcium                 933
creatinine              261
glucose                 444
sodium                  214
chloride                241
hemoglobin             1179
wbc                    1207
liver_function_test       0
dtype: int64

In [11]:
# Impute the remaining missing values with k-nearest-neighbours (k = 10)
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=10)
# Every column from the third one onward is passed to the imputer; the first
# two columns (stay_id, charttime) are left untouched.
# NOTE(review): the slice is positional, so the 'liver_function_test' flag is
# also fed to the imputer and written back - confirm this is intended.
imputed_values = imputer.fit_transform(dynamic_data.iloc[:, 2:])
dynamic_data.iloc[:, 2:] = imputed_values

In [12]:
# Confirm that no missing values remain after imputation
dynamic_data.isna().sum()

stay_id                0
charttime              0
calcium                0
creatinine             0
glucose                0
sodium                 0
chloride               0
hemoglobin             0
wbc                    0
liver_function_test    0
dtype: int64

In [15]:
# Average each patient's values over all of their time points
dynamic_data_mean = (
    dynamic_data
    .drop(columns=['charttime'])
    .groupby('stay_id')
    .mean()
    .reset_index()
)

In [21]:
# Join the static table with the per-stay means on 'stay_id'
data = pd.merge(static_data, dynamic_data_mean, on='stay_id')

In [19]:
# (rows, columns) of the per-stay mean table
dynamic_data_mean.shape

(1923, 9)

In [20]:
# (rows, columns) of the static table
static_data.shape

(1923, 5)

In [22]:
# (rows, columns) of the merged table; compare the row count with the two inputs
data.shape

(1923, 13)

In [23]:
# Display the merged table
data

Unnamed: 0,stay_id,icu_intime,vent_start,vent_end,vent_duration,calcium,creatinine,glucose,sodium,chloride,hemoglobin,wbc,liver_function_test
0,30004144,2126-04-04 13:20:25.000,4/5/26 16:00,4/6/26 17:00,25.000000,7.400000,0.700000,123.500000,135.000000,102.500000,12.400000,4.950000,1.0
1,30005366,2202-12-27 17:36:59.000,12/28/02 14:00,12/28/02 20:00,6.000000,8.800000,6.700000,41.000000,139.000000,100.000000,9.200000,9.400000,1.0
2,30006983,2159-10-12 03:56:42.000,10/12/59 18:00,10/14/59 19:00,49.000000,7.550000,0.900000,122.500000,134.000000,105.000000,9.630000,7.935000,1.0
3,30023204,2124-07-09 16:43:55.000,7/11/24 16:00,7/12/24 16:10,24.166667,8.033333,1.466667,110.000000,131.000000,101.333333,7.666667,20.733333,1.0
4,30031418,2156-03-05 14:11:00.000,3/7/56 22:06,3/8/56 8:00,9.900000,7.400000,0.400000,133.000000,139.000000,106.000000,8.960000,5.720000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1918,39977971,2115-12-11 17:42:45.000,12/12/15 12:00,12/12/15 16:00,4.000000,8.762500,2.182500,136.175000,132.925000,97.850000,10.980000,9.350000,1.0
1919,39982332,2180-03-01 22:35:04.000,3/2/80 19:00,3/3/80 8:00,13.000000,8.800000,1.200000,119.000000,140.000000,103.000000,11.500000,4.700000,1.0
1920,39985110,2141-03-03 05:57:46.000,3/4/41 20:44,3/6/41 4:00,31.266667,10.233333,5.866667,133.333333,138.333333,98.000000,12.266667,8.366667,1.0
1921,39986206,2183-06-19 23:25:31.000,6/20/83 22:00,6/30/83 4:00,222.000000,7.550000,5.800000,111.000000,139.000000,103.000000,10.200000,18.750000,1.0
