In [None]:
import pandas as pd
import numpy as np
import joblib
import sklearn
from sklearn import metrics, utils, ensemble
from joblib import Parallel, delayed
import time 
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm 

In [None]:
###########
# TODO: CHANGE INSTITUTION NAME
# This value is the filename prefix for both the input file
# ('<institution>-MASTER.csv') and the output file ('<institution>-lead_times.csv').
institution = 'INSTITUTION'

In [None]:
# Input:  {INSTITUTION}-MASTER.csv
# Output: {INSTITUTION}-lead_times.csv

In [None]:
# Loads in data
master = pd.read_csv('{}-MASTER.csv'.format(institution))

# Columns this notebook needs from the master table.
# Kept as a list rather than a set: pandas >= 2.0 raises TypeError when a set
# is used as an indexer, and a list also gives a deterministic column order.
req_cols = ['hosp_id', 'y', 'y_score_fourvar', 'y_score_mcures',
            'y_scores_four_lst', 'y_scores_mcures_lst', 'race', 'age', 'sex', 'ethnicity',
            'outcome', 'outcome_time', 'admission_date',
            'final_time_min']

# Fail early with a message that names the file and the absent columns.
missing_cols = [col for col in req_cols if col not in master.columns]
if missing_cols:
    raise KeyError('{}-MASTER.csv is missing required columns: {}'.format(institution, missing_cols))

# Take an explicit copy: later cells add derived columns to `master`, and
# doing that on a column selection of the raw frame is flagged by pandas as a
# potential chained assignment.
master = master[req_cols].copy()

In [None]:
from ast import literal_eval


def _drop_first_window(window_scores):
    """Return the per-window scores with the first window removed."""
    return window_scores[1:]


# The score lists are stored as strings in the CSV; parse them back into Python lists
parsed_score_lists = master["y_scores_mcures_lst"].apply(literal_eval)
master["y_scores_mcures_lst_"] = parsed_score_lists

# Exclude first window
master["y_scores_mcures_lst_eval1"] = master["y_scores_mcures_lst_"].apply(_drop_first_window)

In [None]:
# Extract relevant data from master table
all_scores = [np.array(x) for x in master["y_scores_mcures_lst_eval1"].values]
all_thresholds = np.unique(master['y_score_mcures'])  # np.unique already returns sorted values
max_scores = master['y_score_mcures']
final_times = master['outcome_time'].astype(float)
y = master['y'].astype(int)

# Positional (index-independent) views, so the per-patient lookups below do not
# rely on `master` having a default 0..n-1 index.
y_values = y.to_numpy()
final_time_values = final_times.to_numpy()

# Loop-invariant quantities, computed once instead of once per threshold.
total_true = y_values.sum()                     # denominator of TPR
true_patient_idx = np.flatnonzero(y_values == 1)  # only y == 1 patients contribute lead times

# Timing of the score windows, per the original comments: windows are 4 hours
# apart and the 0th window prediction occurs at 4h.
WINDOW_HOURS = 4
FIRST_PREDICTION_HOUR = 4
MINUTES_PER_HOUR = 60

# Sweep through all unique score thresholds
# Save the following quantities for each threshold
# - PPV
# - TPR
# - proportion of patients who were flagged positive at this threshold
# - list of all lead times of those that were flagged positive at this threshold
ppvs = []
tprs = []
props = []
all_lead_times = []

for thresh in tqdm([0] + list(all_thresholds) + [1]):
    lead_times = []

    # loop through the patients with y == 1
    for i in true_patient_idx:
        # Indices (within the first-window-excluded list) where the score reaches the threshold
        crossings = np.flatnonzero(all_scores[i] >= thresh)
        if crossings.size == 0:
            continue  # this patient is never flagged at this threshold
        first_window = crossings[0] + 1  # add one back because we took out the first window
        # in minutes, add four hours because the 0th window prediction occurs at 4h
        time_of_first_window = ((first_window * WINDOW_HOURS) + FIRST_PREDICTION_HOUR) * MINUTES_PER_HOUR
        lead_times.append(int(final_time_values[i] - time_of_first_window))  # cast to integer minutes

    # Every flagged y == 1 patient contributed exactly one lead time
    num_positive = len(lead_times)
    flagged = max_scores >= thresh

    tprs.append(num_positive / total_true)
    # NumPy division: a threshold that flags nobody gives nan/inf here rather than raising
    ppvs.append(num_positive / flagged.sum())
    props.append(flagged.mean())

    # Thresholds with no flagged y == 1 patient are recorded with the placeholder [0]
    all_lead_times.append(lead_times if lead_times else [0])

In [None]:
# Helper function for rounding numbers in nested list
def round_nested_list(lst, num=3):
    """Round every float inside an arbitrarily nested list.

    Parameters
    ----------
    lst : str, int, float, or iterable
        A scalar, or a (possibly nested) iterable of such values.
    num : int, default 3
        Number of decimal places, applied at every nesting depth.

    Returns
    -------
    Strings and ints are returned unchanged, floats are rounded to ``num``
    places, and any other iterable becomes a list of its processed elements.
    Non-iterable values of other types (e.g. ``None``) are returned unchanged.
    """
    if isinstance(lst, (str, int)):
        return lst
    if isinstance(lst, float):
        return round(lst, num)
    try:
        elements = iter(lst)
    except TypeError:
        # Not a str/int/float and not iterable: nothing to round, pass it through
        return lst
    # Forward `num` so nested values use the requested precision, not the default
    return [round_nested_list(element, num) for element in elements]

In [None]:
# Save output: one row per swept threshold
to_save_dct = {}
to_save_dct['PPVs'] = list(ppvs)
to_save_dct['TPRs'] = list(tprs)
to_save_dct['Proportion Flagged'] = list(props)
to_save_dct['Lead Times'] = list(all_lead_times)
df_results = pd.DataFrame(to_save_dct)

# The per-threshold lead-time lists are written as their string representation
lead_times_as_text = df_results['Lead Times'].astype(str)
df_results['Lead Times'] = lead_times_as_text

# Round all numbers to 3 decimal places (string cells pass through unchanged)
for column_name in df_results.columns:
    rounded_column = df_results[column_name].apply(round_nested_list)
    df_results[column_name] = rounded_column

output_path = '{}-lead_times.csv'.format(institution)
df_results.to_csv(output_path, index=False, float_format='%.3f')