In [None]:
import pandas as pd
import numpy as np
import torch


In [None]:
# Trajectory table with the engineered features; later cells read its
# 'split' and 'hadm_id' columns. Path is relative to the notebook's directory.
df = pd.read_csv('../data/trajectories_with_features.csv')
# Length-of-stay table; later cells read its 'hadm_id' and 'length_of_stay' columns.
los_df = pd.read_csv('../data/length_of_stay.csv')


In [None]:
# Show every column name, then take a first look at the train/test partition.
# NOTE: train_set / test_set are rebuilt further down once the label columns exist.
print(df.columns.tolist())
train_set = df.loc[df['split'] == 'train']
test_set = df.loc[df['split'] == 'test']

In [None]:
# Preview the first rows. `head` has to be *called*: printing `df.head` itself
# only shows the bound-method repr rather than a short preview of the table.
print(df.head())

In [None]:
# Show the column names of the length-of-stay table.
print(los_df.columns)

In [None]:
# For each table (los_df first, then df) print the row count followed by the
# number of distinct hadm_ids.
# Author's note on the last number printed (df): only 5k unique hadms.
for frame in (los_df, df):
    print(len(frame))
    print(len(np.unique(frame['hadm_id'])))

In [None]:
def add_los(df, los_df):
    """
    Add a 'length_of_stay' column to df by matching rows on hadm_id.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'hadm_id' column. It is modified in place (the new
        column is written onto it) and also returned.
    los_df : pandas.DataFrame
        Table with 'hadm_id' and 'length_of_stay' columns.

    Returns
    -------
    pandas.DataFrame
        The same df object, now carrying 'length_of_stay'.

    Notes
    -----
    * If a hadm_id occurs several times in los_df, the maximum
      length_of_stay is used.
    * A hadm_id with no match in los_df gets NaN.
    * Binarization against a day cutoff is done separately by add_binary_los.
    """
    # Reduce los_df to one value per admission once, instead of re-scanning
    # the whole table for every row of df.
    max_los_by_hadm = los_df.groupby('hadm_id')['length_of_stay'].max()
    # .to_numpy() keeps the assignment positional, so df's own index (whatever
    # it is) plays no part in how the values line up.
    df['length_of_stay'] = df['hadm_id'].map(max_los_by_hadm).to_numpy()
    return df


# Threshold (in days) that add_binary_los compares length_of_stay against.
DAY_CUTOFF = 14
# Attach the per-admission length of stay to every trajectory row.
df = add_los(df=df, los_df=los_df)
    
    
    
    

In [None]:
# Positional index of the first feature column: every column from here to the
# end of df is treated as a feature.
FIRST_FEAT_COL = 22
feature_cols = df.columns[FIRST_FEAT_COL:]
# NOTE(review): add_los has already written 'length_of_stay' onto df, so if that
# column sits at or after FIRST_FEAT_COL it is part of feature_cols — confirm
# that is intended before using this list to select model inputs.
# A plain loop replaces the side-effect-only list comprehension.
for col in feature_cols:
    print(col)

In [None]:
# Show the selected feature columns (the slice of df.columns taken above).
print(feature_cols)

In [None]:
def add_binary_los(df, day_cutoff):
    """
    Add a boolean 'binary_los' column that is True where the row's
    'length_of_stay' is strictly greater than day_cutoff.

    df is modified in place and also returned. Rows whose length_of_stay
    is NaN compare as False.
    """
    stay_days = np.array(df['length_of_stay'])
    df['binary_los'] = stay_days > day_cutoff
    return df

# Label each row against DAY_CUTOFF, then persist the labelled table.
df = add_binary_los(df, day_cutoff=DAY_CUTOFF)
df.to_csv('../data/trajectories_los.csv')

In [None]:
# plot the distribution of length of stay (the raw value the binary label is derived from)
import matplotlib.pyplot as plt

# Histogram of df['length_of_stay'] using matplotlib's default binning.
# NOTE(review): the axis label says "ICU"; confirm length_of_stay is an ICU stay
# rather than a whole-admission stay.
plt.hist(df['length_of_stay'])
plt.xlabel('days in ICU')


In [None]:
# Count rows per class; np.unique returns the classes that are present in
# sorted order (False before True).
vals, counts = np.unique(df['binary_los'], return_counts=True)
# Build the tick labels from the classes actually present and from DAY_CUTOFF.
# The label is `length_of_stay > DAY_CUTOFF`, so the two groups are
# "<= cutoff" and "> cutoff". Deriving them from `vals` also keeps plt.bar from
# failing on a length mismatch when only one class occurs.
bar_labels = [f'>{DAY_CUTOFF} days' if is_long else f'<={DAY_CUTOFF} days' for is_long in vals]
plt.bar(bar_labels, counts)
print(counts)

In [None]:
# replace missing values with 0
def replace_missing_vals(df, method='fill'):
    """
    Clean missing (NaN) and infinite values out of df.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    method : {'fill', 'drop'}, default 'fill'
        'fill' replaces NaN, +inf and -inf with 0.
        'drop' removes every column that contains NaN, +inf or -inf.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    ValueError
        If method is neither 'fill' nor 'drop'.

    Notes
    -----
    'drop' decides per frame which columns to remove, so applying it to two
    frames separately (e.g. train and test) can leave them with different
    columns.
    """
    infinities = [np.inf, -np.inf]
    if method == 'fill':
        return df.fillna(0).replace(infinities, 0)
    if method == 'drop':
        # Treat infinities as missing so both kinds of bad value drop the column.
        return df.replace(infinities, np.nan).dropna(axis=1)
    # An unrecognised method used to hand the data back untouched, silently.
    raise ValueError(f"method must be 'fill' or 'drop', got {method!r}")

In [None]:
print(df.columns)

# Columns that hold the target; they are stripped before features are selected.
label_cols = ['binary_los', 'length_of_stay']

# Partition the rows by the precomputed 'split' column.
train_rows = df[df['split'] == 'train']
test_rows = df[df['split'] == 'test']

# Targets: the binarized length of stay.
train_labels = train_rows['binary_los']
test_labels = test_rows['binary_los']

# Features: drop the label columns, keep everything from FIRST_FEAT_COL onward,
# then clean missing/infinite values with the default 'fill' method.
train_set = replace_missing_vals(train_rows.drop(columns=label_cols).iloc[:, FIRST_FEAT_COL:])
test_set = replace_missing_vals(test_rows.drop(columns=label_cols).iloc[:, FIRST_FEAT_COL:])

In [None]:
# Gradient boosting (scikit-learn's GradientBoostingClassifier, not the XGBoost library)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.ensemble import GradientBoostingClassifier
# Build the classifier, then fit it on the training features and labels.
clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=None, random_state=0)
clf.fit(train_set, train_labels)

# Column 1 of predict_proba is used as the score for the positive class.
preds = clf.predict_proba(test_set)[:, 1]
auc = roc_auc_score(test_labels, preds)
print(f'\t AUC:: {auc}')
avg_precision = average_precision_score(test_labels, preds)
print('Average precision-recall score: {0:0.2f}'.format(avg_precision))

In [None]:

def autolabel(rects, ax=None):
    """
    Attach a text label above each bar displaying its height.

    Parameters
    ----------
    rects : iterable
        Bar patches, e.g. the container returned by a bar() call. Each item
        must provide get_height(), get_x() and get_width().
    ax : axes-like object, optional
        Where to draw the labels. When omitted, each bar is labelled on the
        axes it belongs to (rect.axes), so the function no longer relies on
        a global `ax` being defined before it is called.
    """
    for rect in rects:
        target_ax = rect.axes if ax is None else ax
        height = rect.get_height()
        # Centre the text horizontally on the bar and place it 5% above the top.
        target_ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
                       height,
                       ha='center', va='bottom')


In [None]:
# NOTE(review): these scores are hard-coded; presumably AUCs copied from earlier
# runs with the two feature sets — confirm, and re-record them if the model or
# data changes.
tasks = ['ecg categorical', 'ecg continuous']
performance = [0.582, 0.591]
fig, ax = plt.subplots()
# Draw on the explicit axes rather than through the pyplot state machine.
rects = ax.bar(tasks, performance)
autolabel(rects)
# The positive class is `length_of_stay > DAY_CUTOFF` (strictly greater), so the
# label is built from the constant instead of the hard-coded ">= 14".
ax.set_xlabel(f'auc on LOS > {DAY_CUTOFF} days')
plt.show()