In [22]:
from scripts.LabTestAnalysis.machine_learning.LabNormalityPredictionPipeline \
        import NON_PANEL_TESTS_WITH_GT_500_ORDERS
import os
import pandas as pd
import numpy as np

import stats_utils
import matplotlib.pyplot as plt

from medinfo.ml.SupervisedClassifier import SupervisedClassifier

# Algorithms supported by SupervisedClassifier; passed as `algs` to
# draw__stats_Curves in the final cell of this notebook.
all_algs = SupervisedClassifier.SUPPORTED_ALGORITHMS

In [142]:
# ---- Notebook configuration ----
# NOTE(review): result_label, targeted_PPV, scale_by and use_cached_fig_data are
# not referenced by any cell visible in this notebook section — presumably
# leftovers or used elsewhere; TODO confirm before removing.
result_label="all_labs"
targeted_PPV=0.95
scale_by='enc'
use_cached_fig_data=True
# Forwarded to draw__stats_Curves; when True each curve is drawn as (1 - y, 1 - x).
inverse01=True

# data_source and lab_type are read as globals inside draw__stats_Curves
# (output file names, lab descriptions) and are also used to build folder names.
data_source = 'UCSF'
lab_type = 'panel'
# Version tag that forms the last component of the 'data-<source>-<type>-<version>' folder name.
curr_version = '10000-episodes-lastnormal'

# Lab identifiers for the chosen data source / lab type, as returned by stats_utils.
all_labs = stats_utils.get_all_labs(data_source, lab_type)

In [143]:
import LocalEnv

# Every per-dataset folder is named 'data-<source>-<lab type>-<version>'.
# Build the name once; both historical variable names are kept because other
# cells may refer to either of them.
statsByDataSet_foldername = 'data-%s-%s-%s' % (data_source, lab_type, curr_version)
dataset_foldername = statsByDataSet_foldername

# Statistics folder rooted at stats_utils.main_folder.
stats_folderpath = os.path.join(stats_utils.main_folder, 'lab_statistics/')
statsByDataSet_folderpath = os.path.join(stats_folderpath, statsByDataSet_foldername)

# Statistics folder rooted at LocalEnv.PATH_TO_CDSS.
labStats_folderpath = os.path.join(LocalEnv.PATH_TO_CDSS, 'scripts/LabTestAnalysis/lab_statistics')
statsByLab_folderpath = os.path.join(labStats_folderpath, dataset_foldername)

# Machine-learning folder that mirrors statsByLab_folderpath.
# (An earlier assignment of ml_folderpath based on stats_utils.main_folder was
# always overwritten by this one before being read, so it has been dropped.)
ml_folderpath = statsByLab_folderpath.replace("lab_statistics", "machine_learning")

In [147]:
# Re-import stats_utils so that edits to the module are picked up without
# restarting the kernel (uses the Python 2 builtin `reload`).
reload(stats_utils)
def draw__stats_Curves(statsByLab_folderpath, labs, curve_type="ROC", algs=None, result_label=None,
                       include_baseline=True, inverse01=False, col_num=5):
    """Draw one small curve plot per lab on a grid and export a score table.

    For every lab, curve data is fetched through ``stats_utils.get_curve_onelab``
    and drawn into its own grid cell on the current pyplot figure. The figure is
    saved as ``<data_source>_<lab_type>_<curve_type>.png`` and a per-lab table as
    the matching ``.csv``; both go into a ``Fig_stats_Curves[_<result_label>]``
    folder under ``statsByLab_folderpath``. A one-line summary of the average
    scores is printed.

    Relies on the notebook-level globals ``data_source`` and ``lab_type``.

    Parameters
    ----------
    statsByLab_folderpath : str
        Folder that receives the output sub-folder. The curve data is read from
        the same path with "lab_statistics" replaced by "machine_learning".
    labs : sequence
        Lab identifiers to plot, one subplot each.
    curve_type : str
        Forwarded to ``get_curve_onelab``; also used in file and column names.
        'ROC' and 'PRC' get a descriptive measure name in the printed summary,
        any other value is printed as-is.
    algs : list or None
        Algorithms forwarded to ``get_curve_onelab``. ``None`` means
        ``['random-forest']``.
    result_label : str or None
        Optional suffix for the output folder name.
    include_baseline : bool
        Whether the baseline curve is drawn next to the best-algorithm curve.
    inverse01 : bool
        When True, curves are drawn as ``(1 - y, 1 - x)`` and the legend entry of
        the best curve carries the significance marker for its p-value.
    col_num : int
        Number of grid columns handed to ``stats_utils.prepare_subfigs``.
        Defaults to 5, the value that used to be hard-coded.
    """
    measures = {'ROC': 'AUC (Area Under Curve)', 'PRC': 'APS (Average Precision Score)'}
    # Resolve the display name up front and fall back to the raw curve type, so an
    # unlisted curve type cannot raise KeyError after all the slow work is done.
    measure_name = measures.get(curve_type, curve_type)

    if algs is None:
        algs = ['random-forest']

    result_foldername = 'Fig_stats_Curves'
    if result_label:
        result_foldername += '_' + result_label
    result_folderpath = os.path.join(statsByLab_folderpath, result_foldername)
    if not os.path.exists(result_folderpath):
        # makedirs also creates missing parent folders (os.mkdir would fail there).
        os.makedirs(result_folderpath)

    result_figname = '%s_%s_%s.png'%(data_source, lab_type, curve_type)
    result_figpath = os.path.join(result_folderpath, result_figname)

    result_tablename = '%s_%s_%s.csv'%(data_source, lab_type, curve_type)
    result_tablepath = os.path.join(result_folderpath, result_tablename)

    num_labs = len(labs)
    row, col, i_s, j_s = stats_utils.prepare_subfigs(num_labs, col=col_num)

    scores_base = []
    scores_best = []
    p_vals = []

    lab_descriptions = stats_utils.get_lab_descriptions(lab_type=lab_type,
                                                        data_source=data_source,
                                                        line_break_at=18)

    # Loop-invariant: folder holding the machine-learning results.
    ml_data_folder = statsByLab_folderpath.replace("lab_statistics", "machine_learning")

    # Points of the dashed diagonal reference line, identical for every subplot.
    dash_num = 20
    diagonal = np.linspace(0, 1, num=dash_num)

    for ind, lab in enumerate(labs):
        # Getting p-values is slow.
        xVal_base, yVal_base, score_base, xVal_best, yVal_best, score_best, p_val \
            = stats_utils.get_curve_onelab(lab,
                                           all_algs=algs,
                                           data_folder=ml_data_folder,
                                           curve_type=curve_type,
                                           get_pval=True)
        scores_base.append(score_base)
        scores_best.append(score_best)
        p_vals.append(p_val)

        i, j = i_s[ind], j_s[ind]
        plt.subplot2grid((row, col), (i, j))

        plt.plot(diagonal, diagonal, color='lightblue', linestyle='--')

        lab_descrip = lab_descriptions.get(lab, lab)

        if not inverse01:
            plt.plot(xVal_best, yVal_best, label='%0.2f' % (score_best), color='orange')
            if include_baseline:
                plt.plot(xVal_base, yVal_base, label='%0.2f' % (score_base))
        else:
            # Legend entry: score followed by the significance marker of the p-value.
            label_best = '%0.2f' % (score_best) + stats_utils.map_pval_significance(p_val)
            # Axes are swapped and flipped: x = 1 - y, y = 1 - x.
            plt.plot(1-yVal_best, 1-xVal_best, label=label_best, color='orange')
            if include_baseline:
                plt.plot(1 - yVal_base, 1 - xVal_base, label='%0.2f' % (score_base))

        plt.xlim([0,1])
        plt.ylim([0,1])
        plt.xticks([])
        plt.yticks([])

        # UCSF descriptions without a line break get one inserted after 18 characters.
        if data_source == 'UCSF' and ('\n' not in lab_descrip):
            lab_descrip = lab_descrip[:18] + '\n' + lab_descrip[18:]
        plt.xlabel(lab_descrip)
        plt.legend()

    plt.tight_layout()
    plt.savefig(result_figpath)

    avg_base, avg_best = np.mean(scores_base), np.mean(scores_best)
    # Single parenthesised argument: valid both as a Python 2 print statement
    # and as a Python 3 print() call.
    print("Average %s among %i labs: %.3f baseline, %.3f bestalg (an improvement of %.3f)."
          % (measure_name, len(scores_base), avg_base, avg_best, avg_best - avg_base))

    benchmark_col = curve_type + ' benchmark'
    model_col = curve_type + ' ML model'
    pval_col = curve_type + ' p value'
    signif_col = curve_type + ' significance'

    df_output_table = pd.DataFrame({'lab': labs,
                                    benchmark_col: scores_base,
                                    model_col: scores_best,
                                    pval_col: p_vals})
    # Replace lab identifiers by their descriptions where one is available.
    df_output_table['lab'] = df_output_table['lab'].apply(lambda x: lab_descriptions.get(x, x))
    df_output_table[signif_col] = df_output_table[pval_col].apply(stats_utils.map_pval_significance)
    # Explicit column order; note that float_format also rounds the p-values to 2 decimals.
    df_output_table[['lab', benchmark_col, model_col, pval_col, signif_col]]\
        .to_csv(result_tablepath, index=False, float_format="%.2f")

In [148]:
# Labs to plot and the label used as suffix of the output folder name.
lab_set, set_label = all_labs, 'all_labs'

In [149]:
# Draw the ROC curves for every lab in lab_set, trying all supported algorithms;
# the figure and the score table are written below statsByDataSet_folderpath.
draw__stats_Curves(statsByDataSet_folderpath, lab_set, curve_type="ROC", algs=all_algs,
                                   result_label=set_label, inverse01=inverse01)

Average AUC (Area Under Curve) among 13 labs: 0.783 baseline, 0.849 bestalg (an improvement of 0.066).
