# Statistical Analysis of the Lab Normality Paper

### Author: Song Xu

In [1]:
import stats_utils
from scripts.LabTestAnalysis.machine_learning import LabNormalityLearner_Legacy as LNL
from scripts.LabTestAnalysis.machine_learning.ml_utils import map_lab

import LocalEnv
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import metrics 

import warnings
warnings.filterwarnings("ignore")

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'agg' by the following code:
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/Users/songxu/healthrex/CDSS/env/lib/python2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/songxu/healthrex/CDSS/env/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/songxu/healthrex/CDSS/env/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 499, in start
    self.io_l

## Setting up variables

- Assign Data Set and Type here

In [11]:
data_source = 'Stanford'
lab_type = 'component'
curr_version = '10000-episodes-lastnormal'
inverse01 = True # Setting 'True' to interpret 'Normal' as 'Negative'

In [12]:
project_folderpath = os.path.join(LocalEnv.PATH_TO_CDSS, 'scripts/LabTestAnalysis')
stats_folderpath = os.path.join(project_folderpath, 'lab_statistics')
ML_folderpath = os.path.join(project_folderpath, 'machine_learning')

In [13]:
inverse_maker = '_inversed01' if inverse01 else ''

dataSet_foldername = 'data-%s-%s-%s'%(data_source, 
                                      lab_type, 
                                      curr_version)
dataML_folderpath = os.path.join(ML_folderpath, dataSet_foldername)
dataStats_folderpath = os.path.join(stats_folderpath, dataSet_foldername)


In [14]:
all_labs = stats_utils.get_all_labs(data_source, lab_type)

labDescriptions = stats_utils.get_lab_descriptions(data_source=data_source, 
                                                   lab_type=lab_type)

## Figure 1: plot_full_cartoon of LABLDH

In [7]:
lab = 'LABLDH'
score_thres = 0.756
include_threshold_colors = True

figure1_folderpath = os.path.join(stats_folderpath, 'Fig1_cartoon')
if not os.path.exists(figure1_folderpath):
    os.mkdir(figure1_folderpath)

In [58]:
xVal_base, yVal_base, score_base, xVal_best, yVal_best, score_best, p_val \
            = stats_utils.get_curve_onelab(lab,
                                           all_algs=['random-forest'],
                                           data_folder=dataML_folderpath,
                                           curve_type='ROC',
                                           get_pval=False)

In [59]:
plt.figure(figsize=(5, 4))
# plt.plot(xVal_base, yVal_base, label='baseline model, %0.2f' % (score_base), linewidth=2)
'''Representative ROC of LABLDH'''
if not inverse01:
    plt.plot(xVal_best, yVal_best, color='orange', linewidth=2) #, label='random forest', AUROC=%0.2f  % (score_best)
else:
    plt.plot(1-yVal_best, 1-xVal_best, color='orange', linewidth=2)

if include_threshold_colors:
    df_directcompare_rf = pd.read_csv(os.path.join(dataML_folderpath, lab, 'random-forest', 'direct_comparisons.csv'))
    actual_labels = df_directcompare_rf['actual'].values
    predict_probas = df_directcompare_rf['predict'].values

    sensitivity, specificity, LR_p, LR_n, PPV, NPV = stats_utils.get_confusion_metrics(actual_labels, predict_probas, score_thres, also_return_cnts=False)
    print "sensitivity", sensitivity
    print "specificity", specificity
    print "score_thres", score_thres

    '''The POINT of PPV=0.95'''
    if not inverse01:
        plt.scatter(1-specificity, sensitivity, s=50, color='orange')
    else:
        plt.scatter(1-sensitivity, specificity, s=50, color='orange')

    '''Reference line of AUC=0.5'''
    dash_num = 20
    # plt.plot([1-specificity]*dash_num, np.linspace(0,1,num=dash_num), 'k--')
    plt.plot(np.linspace(0,1,num=dash_num),np.linspace(0,1,num=dash_num), color='lightblue', linestyle='--')

plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xticks([])
plt.yticks([])
plt.ylabel('Sensitivity', fontsize=16) #lab_descriptions.get(lab, lab)
plt.xlabel('1-Specificity', fontsize=16)
# plt.legend(fontsize=12)
plt.savefig(os.path.join(figure1_folderpath, 'ROC_%s%s.png'%(lab,inverse_maker)))

plt.clf()

df = pd.read_csv(dataML_folderpath + "/%s/baseline_comparisons.csv"
                 % (lab), keep_default_na=False)
scores_actual_0 = df.ix[df['actual'] == 0, 'predict'].values
scores_actual_1 = df.ix[df['actual'] == 1, 'predict'].values

plot_baseline = False
if plot_baseline:
    plt.figure(figsize=(5, 4))


    plt.hist(scores_actual_0, bins=30, alpha=0.8, color='b', label="Abnormal")
    plt.hist(scores_actual_1, bins=30, alpha=0.8, color='g', label="Normal")
    plt.xlim([0, 1])
    plt.ylim([0, 500])
    plt.xticks([])
    plt.yticks([])
    # plt.xlabel(lab_descriptions[lab] + 'auroc=%.2f' % auc)
    # plt.xlabel('baseline', fontsize=16)
    plt.xlabel('Score, baseline', fontsize=16)
    plt.ylabel('num of orders', fontsize=16)
    plt.legend(fontsize=12)
    plt.savefig(os.path.join(figure1_folderpath, 'cartoon_baseline_%s.png'%lab))
    plt.clf()

plt.figure(figsize=(5, 4))
alg = 'random-forest'
df = pd.read_csv(dataML_folderpath + "/%s/%s/direct_comparisons.csv"
                 % (lab, alg), keep_default_na=False)

df1 = pd.read_csv(dataML_folderpath + "/%s/%s/%s-normality-prediction-%s-report.tab"
                  % (lab, alg, lab, alg), sep='\t', keep_default_na=False)
auc = df1['roc_auc'].values[0]

if include_threshold_colors:
    scores_actual_trueNega = df.ix[(df['actual']==0) & (df['predict']<score_thres), 'predict'].values
    scores_actual_falsPosi = df.ix[(df['actual'] == 0) & (df['predict'] >= score_thres), 'predict'].values

    scores_actual_falsNega = df.ix[(df['actual'] == 1) & (df['predict'] < score_thres), 'predict'].values
    scores_actual_truePosi = df.ix[(df['actual'] == 1) & (df['predict'] >= score_thres), 'predict'].values

    if not inverse01:
        plt.hist(scores_actual_trueNega, bins=22, alpha=0.8, color='royalblue', label="true negatives")
        plt.hist(scores_actual_falsNega, bins=22, alpha=0.8, color='gold', label="false negatives")
        plt.hist(scores_actual_truePosi, bins=7, alpha=0.8, color='forestgreen', label="true positives")
        plt.hist(scores_actual_falsPosi, bins=7, alpha=0.8, color='orangered', label="false positives")

        plt.plot([score_thres] * dash_num, np.linspace(0, 800, num=dash_num), 'k--')
    else:
        plt.hist(1-scores_actual_trueNega, bins=22, alpha=0.8, color='royalblue', label="true positives")
        plt.hist(1-scores_actual_falsNega, bins=22, alpha=0.8, color='gold', label="false positives")
        plt.hist(1-scores_actual_truePosi, bins=7, alpha=0.8, color='forestgreen', label="true negatives")
        plt.hist(1-scores_actual_falsPosi, bins=7, alpha=0.8, color='orangered', label="false negatives")

        plt.plot([1-score_thres] * dash_num, np.linspace(0, 800, num=dash_num), 'k--')



    plt.legend(loc=(0.45,0.6), fontsize=12)

else:

    scores_actual_0 = df.ix[df['actual'] == 0, 'predict'].values
    scores_actual_1 = df.ix[df['actual'] == 1, 'predict'].values

    if not inverse01:
        plt.hist(scores_actual_0, bins=30, alpha=0.8, color='gray', label="Abnormal") #gray red
        plt.hist(scores_actual_1, bins=30, alpha=0.8, color='black', label="Normal") #black green
    else:
        plt.hist(1-scores_actual_0, bins=30, alpha=0.8, color='gray', label="Positive")
        plt.hist(1-scores_actual_1, bins=30, alpha=0.8, color='black', label="Negative")

    plt.legend(fontsize=12)

plt.xlim([0, 1])
plt.ylim([0, 800])
plt.xticks([])
plt.yticks([])
# plt.xlabel(lab_descriptions[lab])
# plt.xlabel('random forest', fontsize=16)
plt.xlabel('Score', fontsize=16)
plt.ylabel('Number of orders', fontsize=16)

if include_threshold_colors:
    plt.savefig(os.path.join(figure1_folderpath, 'cartoon_%s_thres%s.png' % (lab, inverse_maker)))
else:
    plt.savefig(os.path.join(figure1_folderpath, 'cartoon_%s%s.png' % (lab, inverse_maker)))

sensitivity 0.636603028308
specificity 0.96447467876
score_thres 0.756


## Figure 2: Stats of overuse

In [22]:
figure2_folderpath = os.path.join(stats_folderpath, 'Fig2_Order_Intensities')

max_repeat = 5
labs=['K', 'CR', 'NA', 'WBC', 'HGB']

if not os.path.exists(figure2_folderpath):
    os.mkdir(cached_result_foldername)

if os.path.exists(figure2_folderpath + 'lab2cnt.csv'):
    lab2cnt_pd = pd.read_csv(cached_result_foldername + 'lab2cnt.csv', keep_default_na=False)\
        .set_index('lab')
    lab2cnt_pd.columns = lab2cnt_pd.columns.astype(int)
    lab2cnt = lab2cnt_pd.to_dict(orient='index')

    lab2frac_pd = pd.read_csv(cached_result_foldername + 'lab2frac.csv', keep_default_na=False).set_index('lab')
    lab2frac_pd.columns = lab2frac_pd.columns.astype(int)
    lab2frac = lab2frac_pd.to_dict(orient='index')
else:

    lab2cnt, lab2frac = {}, {}
    import pickle
    cur_cnt_folderpath = 'Normality_Saturations_Cnts'
    if not os.path.exists(cur_cnt_folderpath):
        os.mkdir(cur_cnt_folderpath)

    def save_obj(obj, path):
        with open(path, 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def load_obj(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    for lab in labs:
        print 'Getting Normality Saturations for %s..' % lab

        cur_dict_name = "cur_dict_%s.pkl"%lab
        cur_dict_path = os.path.join(cur_cnt_folderpath, cur_dict_name)
        if not os.path.exists(cur_dict_path):
            df_lab = stats_utils.get_queried_lab(lab, lab_type, time_limit=stats_utils.DEFAULT_TIMELIMIT)

            if lab_type=='panel':
                df_lab = df_lab[df_lab['order_status'] == 'Completed']

            cur_dict = stats_utils.get_prevweek_normal__dict(df_lab, lab_type)

            save_obj(cur_dict, cur_dict_path)
        else:
            cur_dict = load_obj(cur_dict_path)

        normal_fractions = {}
        record_counts = {}
        for x in range(0, max_repeat + 1):
            if x in cur_dict:
                record_count = len(cur_dict[x])
                normal_fraction = np.divide(sum(cur_dict[x]), float(record_count))
            else:
                record_count = 0
                normal_fraction = float('nan')

            record_counts[x] = record_count
            normal_fractions[x] = (normal_fraction)
        lab2cnt[lab] = record_counts
        lab2frac[lab] = normal_fractions

    df_cnts = pd.DataFrame.from_dict(lab2cnt, orient='index').reset_index().rename(columns={'index': 'lab'})
    df_fracs = pd.DataFrame.from_dict(lab2frac, orient='index').reset_index().rename(columns={'index': 'lab'})
    #
    # print df_cnts
    # print df_fracs
    # quit()

    df_cnts.to_csv(figure2_folderpath + 'lab2cnt.csv', index=False)
    df_fracs.to_csv(figure2_folderpath + 'lab2frac.csv', index=False)

NameError: name 'cached_result_foldername' is not defined

In [16]:
labDescriptions

{'ALB': 'Albumin',
 'ALKP': 'Alk Phos',
 'ALT': 'ALT (SGPT)',
 'AST': 'AST (SGOT)',
 'BUN': 'Urea Nitrogen',
 'CA': 'Calcium',
 'CL': 'Chloride',
 'CO2': 'CO2',
 'CR': 'Creatinine',
 'DBIL': 'Conjugated Bili',
 'GLU': 'Glucose',
 'HGB': 'Hemoglobin',
 'IBIL': 'Bilirubin',
 'K': 'Potassium',
 'NA': 'Sodium',
 'PCO2A': 'PCO2',
 'PHA': 'PH (A)',
 'PLT': 'Platelet Count',
 'PO2A': 'PO2',
 'TBIL': 'Total Bilirubin',
 'TP': 'Protein',
 'WBC': 'White Blood Cells'}

In [26]:
fig, ax = plt.subplots(figsize=(6.5, 3.25)) #6, 4.5 # 7, 2.565 #6.5, 3.75

#, '<', '>'
marker_types = ('o', 'v', '^', '8', 's', 'P', '*', 'X', 'D', 'd')

for k, lab in enumerate(labs):  # :

    non_empty_inds = []
    for i in range(0,max_repeat+1):
        if lab2frac[lab][i]=='':
            break
        non_empty_inds.append(i)
    y_s = [float(lab2frac[lab][i]) for i in non_empty_inds]
    print 'lab, y_s', lab, y_s
    plt.plot(non_empty_inds, y_s, '-'+marker_types[k], label=labDescriptions[lab])
    # l2, = plt.scatter(non_empty_inds, y_s, marker=marker_types[k])
    # plt.plot(y_s[0], '-'+marker_types[k], color=l2.get_color(), markerfacecolor=l1.get_color(), label='My plots')

plt.xticks(range(0, max_repeat + 1))
plt.xlabel('Consecutive normal results in the past 7 days', fontsize=14)
plt.yticks([0,0.2,0.4,0.6,0.8,1], ['0%', '20%', '40%', '60%', '80%', '100%'])
plt.tick_params('x', labelsize=15)  # 12
plt.tick_params('y', labelsize=13)  # 10
plt.ylabel("Normal rate", fontsize=14)
plt.ylim([-0.05, 1.05])
plt.legend(fontsize=13)
ax.yaxis.tick_right()
ax.yaxis.set_label_position("right")
plt.tight_layout()
plt.savefig(figure2_folderpath + 'Negative_Saturations_%s'%(lab_type))
plt.clf()

lab, y_s K [0.685076521616475, 0.8401300202584382, 0.8842465860564064, 0.9054178529559538, 0.9187139986604153, 0.9240356473412477]
lab, y_s CR [0.25605016591420665, 0.8610904098842911, 0.9235285971472095, 0.9444551294380121, 0.9531478770131772, 0.9588342670107635]
lab, y_s NA [0.4074169752289101, 0.7886118397923033, 0.8531846872694483, 0.8751483855650523, 0.8889757623143081, 0.8940932416202643]
lab, y_s WBC [0.3003706487206641, 0.7865127958172813, 0.8594942228035753, 0.8804809911332443, 0.8983913898854299, 0.9033718389010302]
lab, y_s HGB [0.07035260138626022, 0.5437370317789669, 0.6757539614925882, 0.7615794735088304, 0.8041447752481028, 0.8341165413533834]


## Component Transfer Stats

- Setting the config

In [6]:
def statistic_analysis(lab, dataset_folder):
    

    direct_comparisons = pd.read_csv(os.path.join(dataset_folder, 'direct_comparisons.csv'))
    # print direct_comparisons
    prev = direct_comparisons[direct_comparisons['actual']==0].shape[0]/float(direct_comparisons.shape[0])
    AUC = metrics.roc_auc_score(direct_comparisons['actual'].values, direct_comparisons['predict'].values)
    return prev, AUC

lab_type = 'component'

all_sites = ['Stanford', 'UCSF', 'UMich']

res_folderpath = 'data-transferring-component-%s/'%curr_version
if not os.path.exists(res_folderpath):
    os.mkdir(res_folderpath)

res_filepath = res_folderpath + 'all_transfers_new.csv'


- Get data

In [21]:
if os.path.exists(res_filepath):
    df_res = pd.read_csv(res_filepath, keep_default_na=False)

else:

    labs = stats_utils.get_important_labs(lab_type='component')


    all_res_dicts = {}
    all_res_dicts['lab'] = labs

    diagonals = []
    off_diags = []

    columns = ['lab']
    for i in range(3): # Training sources
        for j in range(3): # Testing sources
            src = all_sites[i]
            dst = all_sites[j]


            '''
            '''
        
            LNL.transfer_labs(src_dataset=src, dst_dataset=dst, lab_type=lab_type,
                                              cur_version=curr_version)
            transfer_result_folderpath = ML_folderpath + '/data-%s-src-%s-dst-%s-%s/' \
                                         % (lab_type, src, dst, curr_version)
            AUCs = []
            Prevs = []
            for lab in labs:
                direct_comparisons_folderpath = os.path.join(transfer_result_folderpath, lab)

                if i!=j:
                    cur_prev, cur_AUC = statistic_analysis(lab=lab, dataset_folder=direct_comparisons_folderpath)
                    off_diags.append(cur_AUC)
                else:
                    tmp_df = pd.read_csv('data-%s-component-10000-episodes-lastnormal' % src
                                    + '/' + 'summary-stats-bestalg-fixTrainPPV.csv', keep_default_na=False)
                    mapped_lab = map_lab(lab=lab, data_source=src, lab_type=lab_type)
                    cur_df = tmp_df[(tmp_df['lab'] == mapped_lab) & (tmp_df['fixTrainPPV'] == 0.95)]
                    cur_AUC = cur_df['AUC'].values[0]
                    cur_prev = (cur_df['TP']+cur_df['FN']).values[0]
                    diagonals.append(cur_AUC)
                AUCs.append(cur_AUC)
                Prevs.append(cur_prev)

            col = '%s ->\n %s' % (src, dst)
            all_res_dicts[col] = AUCs
            
            all_res_dicts['prev'] = Prevs

            columns.append(col)
    import numpy as np
    print "diagonals avg:", np.mean(diagonals)
    print "off_diags avg:", np.mean(off_diags)

    df_res = pd.DataFrame.from_dict(all_res_dicts)

    descriptions = stats_utils.get_lab_descriptions(lab_type='component')
    df_res['lab'] = df_res['lab'].apply(lambda x:descriptions[x])
    df_res = df_res[columns]
    df_res.to_csv(res_filepath, index=False, float_format='%.2f')

### Table 5

In [27]:
cols = df_res.columns.values
df_res_show = df_res.copy()

for col in cols:
    if col=='lab' or col=='prev':
        continue
    df_res_show[col] = df_res_show[col].apply(lambda x: '%.2f'%x)

def shorten_site(site):
    return site.replace('Stanford','S').replace('UCSF','UC').replace('UMich','UM')
cols_map = dict(zip(cols, (shorten_site(x) for x in cols)))
df_res_show = df_res_show.rename(columns=cols_map)

df_res_show.drop(['prev'], axis=1).to_csv('transfer_table_basic.csv', index=False)

### Figure 5

In [None]:
# TODO: move this stats part away
import seaborn as sns; sns.set()
import numpy as np
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(16, 12))
col = 5
for ind in range(df_res.shape[0]):
    cur_row = df_res.iloc[ind].values
    print cur_row
    cur_lab = cur_row[0]
    cur_aucs = cur_row[1:].astype(float).reshape(3,3)

    i, j = ind/col, ind%col
    plt.subplot2grid((3, col), (i, j))
    ax = sns.heatmap(cur_aucs, vmin=0, vmax=1, cbar=False, annot=True, cmap='ocean',
                     annot_kws={"size": 18},
                     xticklabels=['S', 'UC', 'UM'], yticklabels=['S', 'UC', 'UM'])
    plt.xlabel(cur_lab, fontsize=20)
    ax.xaxis.set_label_position('top')
    ax.xaxis.set_tick_params(labelsize=18)
    ax.yaxis.set_tick_params(labelsize=18)


plt.tight_layout()
fig.subplots_adjust(hspace=.5)

plt.savefig(res_folderpath + 'transfer_heatmap.png')