In [62]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [63]:
df = pd.read_csv('time_series.csv')

In [64]:
df['stay_key'] = df['stay_id']

In [67]:
cols = ['Time', 'stay_id', 'stay_key', 'hadm_id', 'age', 'gender', 'Heart Rate',
       'Respiratory Rate', 'SpO2/SaO2', 'pH', 'Potassium', 'Calcium',
       'Glucose', 'Sodium', 'HCO3', 'White Blood Cells', 'Hemoglobin',
       'Red Blood Cells', 'Platelet Count', 'Weight', 'Urea Nitrogen',
       'Creatinine', 'Blood Pressure', '1 hours urine output',
       '6 hours urine output', 'AKI', 'gcs',
       'ventilation', 'vasoactive medications', 'sedative medications']

In [68]:
filled = df.groupby('stay_id')[cols].ffill().bfill()

In [69]:
filled['AKI_hour'] = filled.apply(lambda x: x['Time'] if x['AKI'] == 1 else 0, axis=1)
AKI_time = filled[filled['AKI'] == 1].groupby('stay_key')['AKI_hour'].first()

In [70]:
new = pd.merge(filled, AKI_time, left_on=['stay_key'], right_index=True, how='left').drop('AKI_hour_x', axis=1).rename({'AKI_hour_y': 'AKI_time'}, axis=1)

In [73]:
new['time_to_AKI'] = (pd.to_datetime(new['AKI_time']) - pd.to_datetime(new['Time'])) / np.timedelta64(1, 'h')
new = new[~(new['time_to_AKI'] < 0)]
new['AKI_happen'] = new['time_to_AKI'].apply(lambda x: 0 if pd.isna(x) else 1)

In [76]:
dataset = new.groupby('stay_key').first()
dataset['duration'] = (pd.to_datetime(new.groupby('stay_key')['Time'].max()) - pd.to_datetime(new.groupby('stay_key')['Time'].min())) / np.timedelta64(1, 'h')

In [79]:
features = ['age', 'gender', 'Heart Rate', 'Respiratory Rate',
       'SpO2/SaO2', 'pH', 'Potassium', 'Calcium', 'Glucose', 'Sodium', 'HCO3',
       'White Blood Cells', 'Hemoglobin', 'Red Blood Cells', 'Platelet Count',
       'Weight', 'Urea Nitrogen', 'Creatinine', 'Blood Pressure',
       '1 hours urine output', 'gcs',
       'ventilation', 'vasoactive medications', 'sedative medications']
target_binary = 'AKI_happen'
target_cont = 'time_to_AKI'

In [80]:
X_train, X_test, y_train, y_test = train_test_split(dataset[features], dataset[target_binary], test_size=0.2)

In [81]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [82]:
# model = linear_model.LogisticRegression(max_iter=10000)
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

GradientBoostingClassifier()

In [83]:
# Training data performance
pred = model.predict(X_train)
true = y_train
print('Accuracy:', accuracy_score(true, pred))
print('AUROC:', roc_auc_score(true, pred))

Accuracy: 0.6869122133803038
AUROC: 0.5841603888283142


In [84]:
# Test data performance
pred = model.predict(X_test)
true = y_test
print('Accuracy:', accuracy_score(true, pred))
print('AUROC:', roc_auc_score(true, pred))

Accuracy: 0.6857466347968297
AUROC: 0.5795564140759162


In [85]:
from lifelines import CoxPHFitter

In [86]:
train, test = train_test_split(dataset[features +['duration', target_binary]], test_size=0.2)

In [87]:
cph = CoxPHFitter()
cph.fit(train, duration_col='duration', event_col=target_binary)

<lifelines.CoxPHFitter: fitted with 31793 total observations, 20584 right-censored observations>

In [88]:
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'AKI_happen'
baseline estimation,breslow
number of observations,31793
number of events observed,11209
partial log-likelihood,-109023.58
time fit was run,2021-04-02 14:31:55 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age,0.01,1.01,0.0,0.01,0.01,1.01,1.01,15.55,<0.005,178.79
gender,-0.07,0.93,0.02,-0.11,-0.03,0.9,0.97,-3.5,<0.005,11.09
Heart Rate,0.0,1.0,0.0,0.0,0.0,1.0,1.0,3.05,<0.005,8.76
Respiratory Rate,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,-3.02,<0.005,8.62
SpO2/SaO2,0.01,1.01,0.0,0.0,0.01,1.0,1.01,1.99,0.05,4.41
pH,-0.2,0.82,0.12,-0.44,0.04,0.65,1.04,-1.61,0.11,3.21
Potassium,0.07,1.07,0.01,0.04,0.1,1.04,1.1,4.75,<0.005,18.92
Calcium,0.03,1.03,0.01,0.01,0.06,1.01,1.06,2.44,0.01,6.1
Glucose,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,-0.09,0.93,0.1
Sodium,-0.01,0.99,0.0,-0.01,-0.01,0.99,0.99,-4.5,<0.005,17.16

0,1
Concordance,0.65
Partial AIC,218095.17
log-likelihood ratio test,2490.17 on 24 df
-log2(p) of ll-ratio test,inf


In [90]:
# cph.predict_survival_function(test)
pd.DataFrame([cph.predict_median(train), train['duration'], train['AKI_happen']]).T
# cph.predict_partial_hazard(test)

Unnamed: 0,0.5,duration,AKI_happen
34415349,164.0,260.0,0.0
34490301,inf,18.0,0.0
30144147,167.0,449.0,0.0
36704580,122.0,331.0,0.0
36108309,82.0,68.0,0.0
34906773,311.0,29.0,0.0
30233109,312.0,95.0,0.0
31229480,224.0,0.0,1.0
37193209,inf,48.0,0.0
30608063,528.0,0.0,1.0
