In [None]:
import pandas as pd
import numpy as np
import joblib
import sklearn
from sklearn import metrics, utils, ensemble
from joblib import Parallel, delayed
import time 
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm 

In [None]:
###########
# TODO: CHANGE INSTITUTION NAME
# This value is substituted into the input file name ('{}-MASTER.csv'), into the
# output CSV file name, and is stored in the 'Institution' field of the results.
institution = 'INSTITUTION'

In [None]:
# Input:  {INSTITUTION}-MASTER.csv
# Output: {INSTITUTION}_usecase2_.csv

In [None]:
# Load the per-hospitalization master table for this institution.
master = pd.read_csv('{}-MASTER.csv'.format(institution))

# Columns kept from the master table. This is a list rather than a set:
# indexing a DataFrame with a set is deprecated/unsupported in recent pandas
# and gives a non-deterministic column order.
req_cols = ['hosp_id', 'y', 'y_score_fourvar', 'y_score_mcures',
            'y_scores_four_lst', 'y_scores_mcures_lst', 'race', 'age', 'sex', 'ethnicity',
            'outcome', 'outcome_time', 'admission_date',
            'final_time_min']

# Raises KeyError (listing the missing names) if the file lacks a required column.
master = master[req_cols]

In [None]:
from ast import literal_eval

# The per-window score lists are stored in the CSV as strings; parse each one
# back into a Python object and keep it in a sibling column with a trailing "_".
for _raw_col in ("y_scores_mcures_lst", "y_scores_four_lst"):
    master[_raw_col + "_"] = master[_raw_col].apply(literal_eval)

In [None]:
## Compute cohort, label, and scores for secondary use case

# cohort contains hospitalizations who have not met outcome or discharged at 48h
# (final_time_min is compared against 2880 minutes = 48 hours)
in_second_case = (master['final_time_min'] > 2880).values
master['in_second_use_case'] = in_second_case

# scores considered: first 48h (12 windows) excluding zeroth window,
# i.e. the mean of list elements 1..11; rows outside the cohort get NaN.
# Rows are paired with the boolean cohort array positionally (zip) instead of
# looking up master['in_second_use_case'][i], which treats the positional
# counter as an index *label* and is only correct for a default RangeIndex.
secondary_four = [
    np.mean(x[1:12]) if in_cohort else np.nan
    for x, in_cohort in zip(master['y_scores_four_lst_'].values, in_second_case)
]
# NOTE: despite its name, this list is computed from 'y_scores_mcures_lst_'
# and is stored in the 'secondary_mcures' column below.
mcures_four = [
    np.mean(x[1:12]) if in_cohort else np.nan
    for x, in_cohort in zip(master['y_scores_mcures_lst_'].values, in_second_case)
]
master['secondary_four'] = secondary_four
master['secondary_mcures'] = mcures_four

# secondary use case y value: 1 when 'outcome_time' is present, 0 when it is NaN
master['y_secondary'] = (1 - np.isnan(master['outcome_time'])).values

In [None]:
#Figure 4

from tqdm import tqdm 

def bootstrap_fn(i, df, replace = True, val = 48):
    """Compute the Figure 4 early-discharge statistics on one resample of ``df``.

    The frame is resampled (``frac=1``) with ``random_state=i``. The function then
    finds the largest ``secondary_mcures`` threshold ``s`` for which, among rows
    with score <= ``s``, the fraction with ``y_secondary == 0`` is at least 0.95.
    Rows at or below that threshold are the "predicted negatives".

    Parameters
    ----------
    i : int
        Seed passed to ``DataFrame.sample`` as ``random_state``.
    df : pandas.DataFrame
        Must contain the columns ``secondary_mcures``, ``y_secondary`` and
        ``final_time_min``.
    replace : bool, default True
        Whether to sample with replacement.
    val : number, default 48
        Offset subtracted (as ``val / 24``) from ``final_time_min / (60 * 24)``
        when summing the days saved.

    Returns
    -------
    tuple
        ``(discharged, total_days, discharged_all, total_days_all)`` where the
        ``*_all`` values describe every row at or below the threshold and the
        other two describe only those rows with ``y_secondary == 0``.
        ``discharged*`` are fractions of the resampled rows; ``total_days*`` are
        sums of ``final_time_min / 1440 - val / 24``.
        Returns ``(0, 0, 0, 0)`` when no threshold reaches the 0.95 criterion.

    Raises
    ------
    KeyError
        If a required column is missing (previously this was silently
        swallowed by a bare ``except`` and reported as zeros).
    """
    resampled = df.sample(frac = 1, replace=replace, random_state=i)
    n_rows = len(resampled)

    # Scan thresholds in ascending order and keep the last one that qualifies.
    # Duplicate scores select the same rows, so each distinct value is tried once.
    selected = None
    for threshold in np.unique(resampled['secondary_mcures']):
        at_or_below = resampled[resampled['secondary_mcures'] <= threshold]
        if 1 - at_or_below['y_secondary'].mean() >= 0.95:
            selected = at_or_below

    # No threshold met the criterion (this also covers an empty frame).
    if selected is None:
        return 0, 0, 0, 0

    # Predicted negatives: proportion of rows and total days beyond `val` hours.
    discharged_all = selected.shape[0] / n_rows
    total_days_all = np.sum((selected['final_time_min'] / (60 * 24)) - (val / 24))

    # True negatives: the same quantities restricted to rows without the outcome.
    true_negatives = selected[selected['y_secondary'] == 0]
    discharged = true_negatives.shape[0] / n_rows
    total_days = np.sum((true_negatives['final_time_min'] / (60 * 24)) - (val / 24))

    return discharged, total_days, discharged_all, total_days_all


def get_roc_CI(df, val):
    """Run ``bootstrap_fn`` for seeds 0..999 in parallel and collect the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame forwarded unchanged to every ``bootstrap_fn`` call.
    val : number
        Forwarded to ``bootstrap_fn`` as its ``val`` argument.

    Returns
    -------
    tuple of tuples
        ``(discharged, days, discharged_all, days_all)``, each holding one
        value per bootstrap replicate (1000 entries).
    """
    # `val` must be passed by keyword: the third positional parameter of
    # bootstrap_fn is `replace`, so passing it positionally set replace=val
    # and left bootstrap_fn's own `val` at its default.
    replicates = Parallel(n_jobs=10)(
        delayed(bootstrap_fn)(i, df, val=val) for i in range(1000)
    )
    discharged, days, discharged_all, days_all = zip(*replicates)
    return discharged, days, discharged_all, days_all

    
secondary = master[master['in_second_use_case'] == True]

import time 

now = time.time() 

# Bootstrap replicates of the Figure 4 statistics on the secondary-use-case cohort.
discharged, days, discharged_all, days_all = get_roc_CI(secondary, 48)
num_ex_usecase2 = secondary.shape[0]
prop_outcome_usecase2 = secondary['y_secondary'].mean()

# Point estimates on the full (non-resampled) cohort.
df_Yte_agg = secondary
    
scores = np.sort(df_Yte_agg['secondary_mcures'])

# Reset explicitly so a value left over from a previous run of this cell
# can never be picked up when no threshold qualifies.
latest = None

# Loop through all distinct score thresholds from smallest to largest
for s in np.unique(scores): 
    curr = df_Yte_agg[df_Yte_agg['secondary_mcures'] <= s]
    if 1 - curr['y_secondary'].mean() >= 0.95: 
        latest = curr # find the largest threshold with NPV >= 95%

if latest is None: 
    # No threshold reached NPV >= 95%: report zeros.
    non_bootstrap_discharged = 0
    non_bootstrap_total_days = 0
    non_bootstrap_discharged_all = 0
    non_bootstrap_total_days_all = 0
else: 
    # Predicted Negatives: proportion and days saved
    non_bootstrap_discharged_all = latest.shape[0] / len(scores)
    non_bootstrap_total_days_all = np.sum((latest['final_time_min'] / (60 * 24)) - (48 / 24))
    
    # True Negatives: proportion and days saved
    latest = latest[latest['y_secondary'] == 0]
    non_bootstrap_discharged = latest.shape[0] / len(scores)
    non_bootstrap_total_days = np.sum((latest['final_time_min'] / (60 * 24)) - (48 / 24))

# Elapsed wall-clock seconds for the bootstrap plus point estimates.
print(time.time() - now)

In [None]:
# One result row for this institution. Key order determines the column order
# of the exported table, so it is kept as-is.
figure4_row = {
    'Institution': institution,
    # Figure 4: per-replicate bootstrap values
    'days_saved_boostraps': list(days),
    'discharged_boostraps': list(discharged),
    'days_saved_all_boostraps': list(days_all),
    'discharged_all_boostraps': list(discharged_all),
    # Cohort size and mean of y_secondary in the cohort
    'N_use_case_2': num_ex_usecase2,
    'perc_use_case_2': prop_outcome_usecase2,
    # Figure 4: point estimates on the non-resampled cohort
    'days_saved': non_bootstrap_total_days,
    'discharged': non_bootstrap_discharged,
    'discharged_all': non_bootstrap_discharged_all,
    'days_saved_all': non_bootstrap_total_days_all,
}
D_results = [figure4_row]

In [None]:
# Helper function for rounding numbers in nested list
def round_nested_list(lst, num=3):
    """Round every float inside an arbitrarily nested list structure.

    Parameters
    ----------
    lst : str, int, float, None, or iterable of these (possibly nested)
        Value to process. NumPy integer and floating scalars are accepted too.
    num : int, default 3
        Number of decimal places; applied at every nesting level.

    Returns
    -------
    Strings, integers (including bools) and ``None`` are returned unchanged,
    floats are rounded to ``num`` places, and any other iterable is returned
    as a list with the same rule applied to each element.
    """
    import numbers  # stdlib; NumPy scalar types register with these ABCs

    if lst is None or isinstance(lst, (str, numbers.Integral)):
        return lst
    if isinstance(lst, numbers.Real):
        return round(lst, num) # '%.{}f'.format(num) %lst
    # Forward `num` so nested values use the requested precision
    # instead of silently falling back to the default.
    return [round_nested_list(item, num) for item in lst]

In [None]:
df_results = pd.DataFrame(D_results)

# Round all numbers to 3 decimal places (scalars and values inside nested lists).
rounded_columns = {
    col: df_results[col].apply(round_nested_list) for col in df_results.columns
}
for col, rounded_values in rounded_columns.items():
    df_results[col] = rounded_values

# Write the single-row results table next to the notebook.
output_path = '{}_usecase2_.csv'.format(institution)
df_results.to_csv(output_path, index=False, float_format="%.3f")