In [213]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [214]:
df = pd.read_csv('time_series.csv')

In [215]:
df['stay_key'] = df['stay_id']

In [216]:
cols = ['Time', 'stay_id', 'stay_key', 'hadm_id', 'age', 'gender', 'Heart Rate',
       'Respiratory Rate', 'SpO2/SaO2', 'pH', 'Potassium', 'Calcium',
       'Glucose', 'Sodium', 'HCO3', 'White Blood Cells', 'Hemoglobin',
       'Red Blood Cells', 'Platelet Count', 'Weight', 'Urea Nitrogen',
       'Creatinine', 'Blood Pressure', '1 hours urine output',
       '6 hours urine output', 'AKI', 'gcs',
       'ventilation', 'vasoactive medications', 'sedative medications']

In [219]:
filled = df.groupby('stay_id')[cols].ffill().bfill()

In [220]:
first_dataset = filled.groupby('stay_key').first()

In [221]:
# filled['AKI_next_hour'] = filled.groupby('stay_key')['AKI'].shift(-1)
# filled = filled[~filled['AKI_next_hour'].isna()]
filled['AKI_next_6_hour'] = filled.groupby('stay_key')['AKI'].rolling(6).max().shift(-6).reset_index(0,drop=True)
filled = filled[~filled['AKI_next_6_hour'].isna()]

In [222]:
dataset = pd.merge(filled, first_dataset, left_on='stay_key', right_index=True, suffixes=('_current', '_first'))
dataset['time_in_ICU'] = (pd.to_datetime(dataset['Time_current']) - pd.to_datetime(dataset['Time_first'])) / np.timedelta64(1, 'h')

In [223]:
dataset.head()

Unnamed: 0,Time_current,stay_key,hadm_id_current,age_current,gender_current,Heart Rate_current,Respiratory Rate_current,SpO2/SaO2_current,pH_current,Potassium_current,...,Creatinine_first,Blood Pressure_first,1 hours urine output_first,6 hours urine output_first,AKI_first,gcs_first,ventilation_first,vasoactive medications_first,sedative medications_first,time_in_ICU
0,2111-09-04 04:00:00,33225641,27845175,32,0,104.0,21.0,97.0,7.38,4.4,...,0.4,99.0,0.0,0.0,0,15.0,0,0,1,0.0
1,2111-09-04 05:00:00,33225641,27845175,32,0,104.0,25.0,95.0,7.38,4.4,...,0.4,99.0,0.0,0.0,0,15.0,0,0,1,1.0
2,2111-09-04 06:00:00,33225641,27845175,32,0,120.0,18.0,98.0,7.38,4.4,...,0.4,99.0,0.0,0.0,0,15.0,0,0,1,2.0
3,2111-09-04 07:00:00,33225641,27845175,32,0,106.0,18.0,98.0,7.38,4.4,...,0.4,99.0,0.0,0.0,0,15.0,0,0,1,3.0
4,2111-09-04 08:00:00,33225641,27845175,32,0,103.0,17.5,98.0,7.28,4.4,...,0.4,99.0,0.0,0.0,0,15.0,0,0,1,4.0


In [257]:
first_ts_features = ['Heart Rate_first', 'Respiratory Rate_first',
       'SpO2/SaO2_first', 'pH_first', 'Potassium_first', 'Calcium_first',
       'Glucose_first', 'Sodium_first', 'HCO3_first',
       'White Blood Cells_first', 'Hemoglobin_first', 'Red Blood Cells_first',
       'Platelet Count_first', 'Weight_first', 'Urea Nitrogen_first',
       'Creatinine_first', 'Blood Pressure_first',
       '1 hours urine output_first', '6 hours urine output_first', 'AKI_first',
       'gcs_first', 'ventilation_first', 'vasoactive medications_first',
       'sedative medications_first']
current_ts_features = ['Heart Rate_current', 'Respiratory Rate_current',
       'SpO2/SaO2_current', 'pH_current', 'Potassium_current',
       'Calcium_current', 'Glucose_current', 'Sodium_current', 'HCO3_current',
       'White Blood Cells_current', 'Hemoglobin_current',
       'Red Blood Cells_current', 'Platelet Count_current', 'Weight_current',
       'Urea Nitrogen_current', 'Creatinine_current', 'Blood Pressure_current',
       '1 hours urine output_current', '6 hours urine output_current',
       'AKI_current', 'gcs_current', 'ventilation_current',
       'vasoactive medications_current', 'sedative medications_current'
#                        , 'time_in_ICU'
                      ]
demo_features = ['age_first', 'gender_first']
# features = first_ts_features + current_ts_features + demo_features
features = first_ts_features + current_ts_features + demo_features
target = 'AKI_next_6_hour'

In [258]:
stays = dataset['stay_key'].unique()
total_icu_stays = len(stays)
np.random.seed(42)
np.random.shuffle(stays)
train_stays = dataset[dataset['stay_key'].isin(stays[:30000])]
X_train, y_train = train_stays[features], train_stays[target]
test_stays = dataset[dataset['stay_key'].isin(stays[30000:])]
X_test, y_test = test_stays[features], test_stays[target]

In [259]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [260]:
# model = linear_model.LogisticRegression(max_iter=10000)
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

GradientBoostingClassifier()

In [261]:
# Training data performance
pred = model.predict(X_train)
true = y_train
print('Accuracy:', accuracy_score(true, pred))
print('AUROC:', roc_auc_score(true, pred))

Accuracy: 0.9611193603655054
AUROC: 0.6127712610038117


In [262]:
# Test data performance
pred = model.predict(X_test)
true = y_test
print('Accuracy:', accuracy_score(true, pred))
print('AUROC:', roc_auc_score(true, pred))

Accuracy: 0.9609303953186461
AUROC: 0.5993553866985276


In [85]:
# from lifelines import CoxPHFitter
# train, test = train_test_split(dataset[features +['duration', target_binary]], test_size=0.2)
# cph = CoxPHFitter()
# cph.fit(train, duration_col='duration', event_col=target_binary)
# cph.print_summary()
# cph.predict_survival_function(test)
# pd.DataFrame([cph.predict_median(train), train['duration'], train['AKI_happen']]).T
# cph.predict_partial_hazard(test)