In [None]:
import json
import pandas as pd
import numpy as np
import joblib
import scipy
import pickle

In [None]:
# Assumptions made by this notebook:
#   1. The loaded outcome indicates whether the outcome EVER happens, unlike the other
#      evaluation, which focused on the first 5 days.
#   2. num_windows is (number of hours) // 4 of data used to make the triaging decision.
#      The default triages using 48 hours of data.

num_windows = 48 // 4

In [None]:
# Load the serialized dictionary of models from disk and display its keys.
# NOTE: joblib.load is pickle-based and can execute arbitrary code while loading,
# so only point this at a file from a trusted source.
models_dict = joblib.load('models_dict.joblib')
models_dict.keys()

In [None]:
# Read the cohort table and keep only the rows whose window_id is below num_windows.
df_cohort = (
    pd.read_csv('sample_cohort_outcome_past_2days.csv')
    .loc[lambda frame: frame['window_id'] < num_windows]
)

# Per-row identifiers and labels of the filtered cohort.
test_hosp = df_cohort['hosp_id']
test_window = df_cohort['window_id']
test_y = df_cohort['y']

# Column-less frame indexed by ID: joining feature tables onto it keeps the
# features in the same row order as the cohort.
cohort_IDs = df_cohort.set_index('ID')[[]]

In [None]:
# Number of distinct hosp_id values in the filtered cohort.
np.unique(test_hosp).size

## Simple Model

In [None]:
# Baseline feature table, indexed by ID so it can be joined onto the cohort IDs.
df_baseline = (
    pd.read_csv('../preprocessing/sample_output/baseline.csv')
    .set_index('ID')
)

# Collection of classifiers stored under the 'Baseline' key of the loaded models.
baseline_clfs = models_dict['Baseline']

In [None]:
# Line the baseline features up with the cohort rows (join on the ID index) and
# pack them into a sparse float matrix for the classifiers.
eval_matrix = scipy.sparse.csr_matrix(
    cohort_IDs.join(df_baseline).values.astype(float)
)

# One row per classifier holding the second column of predict_proba, then
# average across classifiers to get a single score per cohort row.
all_y = np.array(list(map(lambda clf: clf.predict_proba(eval_matrix)[:,1], baseline_clfs)))
y_scores = all_y.mean(axis=0)

# Attach the scores to the cohort identifiers/labels, then collapse to one row per hosp_id.
df_Yte_all = pd.DataFrame({
    'hosp_id': test_hosp,
    'window_id': test_window,
    'y': test_y,
    'y_score': y_scores,
})
df_Yte_agg = df_Yte_all.groupby('hosp_id').mean() #Can be changed to max, depending on how we want to aggregate scores

In [None]:
# Try every observed aggregated score as a low-risk threshold: everything with a
# score at or below the threshold counts as "flagged". For each threshold whose
# flagged group contains no events (NPV == 1), report the share of the population flagged.
scores = np.sort(df_Yte_agg['y_score'])
total_negs = df_Yte_agg['y']  # NOTE(review): not read anywhere in this cell; kept in case another cell uses it
for s in scores:
    curr = df_Yte_agg[df_Yte_agg['y_score'] <= s]
    curr_npv = 1 - curr['y'].mean()
    #How many people do we flag with a perfect NPV (i.e. 0 people we flagged have the event)
    if curr_npv == 1.0:
        print('NPV: {:.2f}, Population % Flagged {:.2%}'.format(curr_npv, curr.shape[0] / len(scores)))
        # Scores are visited in ascending order, so this ends up holding the
        # flagged group for the highest threshold that still has a perfect NPV.
        latest = curr


## Sweep over NPV

In [None]:
# Sweep every observed aggregated score as a low-risk threshold and record, for each,
# the NPV of the flagged group and the fraction of the population that was flagged.
# The aggregated frame stores the model output under 'y_score'.
scores = np.sort(df_Yte_agg['y_score'])
baseline_npvs = []
baseline_flagged = []

for s in scores: 
    # Everything scoring at or below the threshold is flagged as low-risk.
    curr = df_Yte_agg[df_Yte_agg['y_score'] <= s]
    baseline_npvs.append(1 - curr['y'].mean())
    baseline_flagged.append(curr.shape[0] / len(scores))
    
# NOTE(review): `plt` is not imported in the visible cells; presumably it is
# matplotlib.pyplot -- confirm the import exists before running on a fresh kernel.
fig, ax = plt.subplots(figsize=(3.5, 3.5))

# NPV of the flagged group against the fraction of the population flagged.
ax.plot(baseline_flagged, baseline_npvs, label='Simple Model', lw=1.25)

ax.set_xlabel('Percentage Flagged as Low-Risk')
ax.set_ylabel('Negative Predictive Value')