## Component Transfer Stats

In [10]:
import os, LocalEnv, stats_utils
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import metrics
import numpy as np

In [11]:
# Tag of the experiment run; it is interpolated into the result folder names.
curr_version = '10000-episodes-lastnormal'

# True means the 'Normal' label is interpreted as the 'Negative' class.
inverse01 = True

# Suffix string marking the inverted-label setting (empty when it is disabled).
if inverse01:
    inverse_maker = '_inversed01'
else:
    inverse_maker = ''

# Lab granularity tag, likewise interpolated into the per-transfer folder names.
lab_type = 'component'

## Model transfer

In [12]:
def statistic_analysis(lab, dataset_folder):
    """Compute the ROC AUC of one folder's direct-comparison results.

    Reads ``direct_comparisons.csv`` from ``dataset_folder`` and scores its
    ``predict`` column against its ``actual`` column.

    Args:
        lab: Unused; kept in the signature so existing callers keep working.
        dataset_folder: Folder that contains ``direct_comparisons.csv``.

    Returns:
        Whatever ``sklearn.metrics.roc_auc_score`` returns for the two columns.
    """
    # Imported at call time, as before, but limited to the one scorer that is
    # actually used.
    from sklearn.metrics import roc_auc_score

    comparisons_path = os.path.join(dataset_folder, 'direct_comparisons.csv')
    direct_comparisons = pd.read_csv(comparisons_path)
    return roc_auc_score(direct_comparisons['actual'].values,
                         direct_comparisons['predict'].values)

# Sites used both as training source and as testing destination.
all_sites = ['Stanford', 'UCSF', 'UMich']

# Output folder of this run. The trailing slash matters: file names are
# appended to it by plain string concatenation.
res_folderpath = 'data-transferring-component-{}/'.format(curr_version)
if not os.path.exists(res_folderpath):
    os.mkdir(res_folderpath)

# Summary table of all transfers (a '_withCI' variant of the name also exists).
res_filepath = ''.join([res_folderpath, 'all_transfers.csv'])

# Per-transfer result folders are looked up under LocalEnv.PATH_TO_CDSS.
main_folder = os.path.join(LocalEnv.PATH_TO_CDSS, 'scripts/LabTestAnalysis/')
ml_results_folderpath = os.path.join(main_folder, 'machine_learning')

if os.path.exists(res_filepath):
    # A summary from an earlier run is on disk: load it instead of re-scoring.
    # NOTE: it was written with float_format='%.2f', so values loaded here are
    # already rounded, unlike the freshly computed frame in the other branch.
    print('%s exists!' % res_filepath)
    df_res = pd.read_csv(res_filepath, keep_default_na=False)

else:
    print('%s does not exist!' % res_filepath)
    labs = stats_utils.get_important_labs(lab_type='component')

    all_res_dicts = {'lab': labs}
    columns = ['lab']

    # AUROCs pooled over all labs, split into same-site (src == dst) and
    # cross-site (src != dst) transfers; their means are printed below.
    diagonals = []
    off_diags = []

    for src in all_sites:  # Training sources
        for dst in all_sites:  # Testing sources
            transfer_result_folderpath = ml_results_folderpath + '/data-%s-src-%s-dst-%s-%s/' \
                                         % (lab_type, src, dst, curr_version)

            # Only the point-estimate AUROC is recorded per lab; confidence
            # bounds are not computed in this branch.
            cur_res = []
            for lab in labs:
                res_path = os.path.join(transfer_result_folderpath, lab,
                                        'direct_comparisons.csv')
                df_comparisons = pd.read_csv(res_path)
                auc = metrics.roc_auc_score(df_comparisons['actual'],
                                            df_comparisons['predict'])
                cur_res.append(auc)

                if src == dst:
                    diagonals.append(auc)
                else:
                    off_diags.append(auc)

            col = '%s -> %s' % (src, dst)
            all_res_dicts[col] = cur_res
            columns.append(col)

    # %-formatting inside print(...) gives the same text on Python 2 and 3.
    print('diagonals avg: %s' % np.mean(diagonals))
    print('off_diags avg: %s' % np.mean(off_diags))

    df_res = pd.DataFrame.from_dict(all_res_dicts)

    # Map every lab identifier through the description lookup from stats_utils.
    descriptions = stats_utils.get_lab_descriptions(lab_type='component')
    df_res['lab'] = df_res['lab'].apply(lambda x: descriptions[x])
    # Fix the column order: 'lab' first, then the transfers in loop order.
    df_res = df_res[columns]
    df_res.to_csv(res_filepath, index=False, float_format='%.2f')



data-transferring-component-10000-episodes-lastnormal/all_transfers.csv exists!


In [55]:
# Show the AUROC summary: one row per lab, one column per 'source -> destination' pair.
df_res

Unnamed: 0,lab,Stanford -> Stanford,Stanford -> UCSF,Stanford -> UMich,UCSF -> Stanford,UCSF -> UCSF,UCSF -> UMich,UMich -> Stanford,UMich -> UCSF,UMich -> UMich
0,White Blood Cells,0.89,0.88,0.79,0.87,0.88,0.81,0.87,0.88,0.83
1,Hemoglobin,0.93,0.94,0.89,0.86,0.9,0.79,0.92,0.94,0.9
2,Platelet Count,0.91,0.94,0.91,0.89,0.95,0.91,0.89,0.94,0.92
3,Sodium,0.87,0.88,0.91,0.85,0.89,0.91,0.86,0.86,0.91
4,Potassium,0.76,0.75,0.67,0.74,0.77,0.75,0.73,0.75,0.76
5,CO2,0.86,0.8,0.75,0.8,0.87,0.85,0.77,0.82,0.87
6,Urea Nitrogen,0.95,0.92,0.9,0.94,0.92,0.9,0.93,0.92,0.92
7,Creatinine,0.96,0.91,0.85,0.94,0.94,0.88,0.92,0.88,0.9
8,Calcium,0.88,0.86,0.81,0.87,0.87,0.85,0.85,0.86,0.89
9,Albumin,0.92,0.88,0.73,0.84,0.89,0.74,0.92,0.89,0.9


In [110]:
# Horizontal bar chart of the Stanford-trained model's AUROC at each testing
# site. The row order is reversed because barh draws the first row at the
# bottom, so the first lab ends up at the top of the chart.
stanford_cols = {'Stanford -> Stanford': 'Stanford',
                 'Stanford -> UCSF': 'UCSF',
                 'Stanford -> UMich': 'UMich'}
stanford_transfer = (df_res.iloc[::-1]
                     .set_index('lab')
                     .rename(columns=stanford_cols)
                     [['Stanford', 'UCSF', 'UMich']])
bar_ax = stanford_transfer.plot.barh(width=0.8, figsize=(8, 6), fontsize=18,
                                     legend=False,
                                     color=['brown', 'red', 'salmon'])

# AUROC axis limited to [0.5, 1] with six evenly spaced ticks; no y-axis label.
bar_ax.set_xlim([0.5, 1])
bar_ax.set_ylabel("")
bar_ax.set_xticks(np.linspace(0.5, 1, 6))

bar_ax.figure.tight_layout()
bar_ax.figure.savefig('new_transfer.png')

In [14]:
# TODO: move this stats part away
import seaborn as sns; sns.set()

# One 3x3 AUROC heatmap per lab, laid out on a grid that is `col` panels wide.
col = 5
# Keep at least the original three grid rows; add rows when there are more
# labs than 3 * col, instead of failing on an out-of-range grid position.
n_grid_rows = max(3, (df_res.shape[0] + col - 1) // col)

# A bare figure: plt.subplots() would also create a full-size axes that sits
# underneath the grid panels.
fig = plt.figure(figsize=(16, 12))
for ind in range(df_res.shape[0]):
    cur_row = df_res.iloc[ind].values
    print(cur_row)
    cur_lab = cur_row[0]
    # Assumes the nine columns after 'lab' follow the 'src -> dst' order used
    # when the table was built, so rows are sources and columns destinations.
    cur_aucs = cur_row[1:].astype(float).reshape(3, 3)

    # divmod keeps the grid position integral on both Python 2 and Python 3
    # (plain `/` yields floats on Python 3).
    i, j = divmod(ind, col)
    ax = plt.subplot2grid((n_grid_rows, col), (i, j))
    sns.heatmap(cur_aucs, vmin=0, vmax=1, cbar=False, annot=True, cmap='ocean',
                annot_kws={"size": 18},
                xticklabels=['S', 'UC', 'UM'], yticklabels=['S', 'UC', 'UM'],
                ax=ax)

    # The lab name serves as the panel title, placed above the heatmap.
    ax.set_xlabel(cur_lab, fontsize=20)
    ax.xaxis.set_label_position('top')
    ax.xaxis.set_tick_params(labelsize=18)
    ax.yaxis.set_tick_params(labelsize=18)


plt.tight_layout()
# Applied after tight_layout so the extra vertical spacing is not overridden.
fig.subplots_adjust(hspace=.5)

plt.savefig(res_folderpath + 'transfer_heatmap.png')

['White Blood Cells' 0.89 0.88 0.79 0.87 0.88 0.81 0.87 0.88 0.83]
['Hemoglobin' 0.93 0.94 0.89 0.86 0.9 0.79 0.92 0.94 0.9]
['Platelet Count' 0.91 0.94 0.91 0.89 0.95 0.91 0.89 0.94 0.92]
['Sodium' 0.87 0.88 0.91 0.85 0.89 0.91 0.86 0.86 0.91]
['Potassium' 0.76 0.75 0.67 0.74 0.77 0.75 0.73 0.75 0.76]
['CO2' 0.86 0.8 0.75 0.8 0.87 0.85 0.77 0.82 0.87]
['Urea Nitrogen' 0.95 0.92 0.9 0.94 0.92 0.9 0.93 0.92 0.92]
['Creatinine' 0.96 0.91 0.85 0.94 0.94 0.88 0.92 0.88 0.9]
['Calcium' 0.88 0.86 0.81 0.87 0.87 0.85 0.85 0.86 0.89]
['Albumin' 0.92 0.88 0.73 0.84 0.89 0.74 0.92 0.89 0.9]
['Protein' 0.91 0.89 0.87 0.88 0.89 0.85 0.89 0.88 0.9]
['Alk Phos' 0.94 0.91 0.89 0.92 0.93 0.89 0.92 0.93 0.92]
['Total Bilirubin' 0.96 0.91 0.91 0.95 0.93 0.91 0.96 0.92 0.92]
['AST (SGOT)' 0.92 0.81 0.86 0.85 0.77 0.73 0.88 0.77 0.86]
['ALT (SGPT)' 0.93 0.86 0.91 0.92 0.91 0.88 0.88 0.84 0.88]


In [15]:
# Leftover scratch expression; nothing else in the notebook uses its result.
1+1

2

In [16]:
# Draw the first lab's 3x3 AUROC heatmap on its own figure, this time with a
# horizontal colorbar, and save it as 'colorbar.png' in the result folder.
ind = 0
cur_row = df_res.iloc[ind].values
print(cur_row)
cur_lab, cur_aucs = cur_row[0], cur_row[1:].astype(float).reshape(3, 3)

# Global seaborn setting; it stays in effect for any plot drawn afterwards.
sns.set(font_scale=3)

site_ticks = ['S', 'UC', 'UM']
fig, ax = plt.subplots(figsize=(16, 12))
ax = sns.heatmap(cur_aucs, ax=ax,
                 vmin=0, vmax=1, cmap='ocean',
                 annot=True, annot_kws={"size": 18},
                 cbar=True, cbar_kws={'orientation': 'horizontal'},
                 xticklabels=site_ticks, yticklabels=site_ticks)

# The lab name is shown above the heatmap.
ax.set_xlabel(cur_lab, fontsize=20)
ax.xaxis.set_label_position('top')
for axis in (ax.xaxis, ax.yaxis):
    axis.set_tick_params(labelsize=18)


plt.tight_layout()
plt.savefig(res_folderpath + 'colorbar.png')

['White Blood Cells' 0.89 0.88 0.79 0.87 0.88 0.81 0.87 0.88 0.83]


In [105]:
# Load the transfer table that also carries the interval bounds
# ('<src> -> <dst>, left' and ', right' columns) and relabel the lab column.
ci_table_path = 'data-transferring-component-10000-episodes-lastnormal/all_transfers_withCI.csv'
df_CI = pd.read_csv(ci_table_path)
df_CI = df_CI.rename(columns={'lab': 'Lab Test'})
df_CI.head()

Unnamed: 0,Lab Test,Stanford -> Stanford,"Stanford -> Stanford, left","Stanford -> Stanford, right",Stanford -> UCSF,"Stanford -> UCSF, left","Stanford -> UCSF, right",Stanford -> UMich,"Stanford -> UMich, left","Stanford -> UMich, right",UCSF -> Stanford,"UCSF -> Stanford, left","UCSF -> Stanford, right",UCSF -> UCSF,"UCSF -> UCSF, left","UCSF -> UCSF, right",UCSF -> UMich,"UCSF -> UMich, left","UCSF -> UMich, right",UMich -> Stanford,"UMich -> Stanford, left","UMich -> Stanford, right",UMich -> UCSF,"UMich -> UCSF, left","UMich -> UCSF, right",UMich -> UMich,"UMich -> UMich, left","UMich -> UMich, right"
0,White Blood Cells,0.89,0.88,0.91,0.88,0.86,0.89,0.79,0.77,0.81,0.87,0.86,0.89,0.88,0.87,0.9,0.81,0.8,0.83,0.87,0.86,0.89,0.88,0.86,0.89,0.83,0.81,0.84
1,Hemoglobin,0.93,0.92,0.94,0.94,0.93,0.95,0.89,0.88,0.91,0.86,0.84,0.88,0.9,0.89,0.92,0.79,0.77,0.81,0.92,0.9,0.93,0.94,0.93,0.95,0.9,0.89,0.91
2,Platelet Count,0.91,0.9,0.92,0.94,0.93,0.95,0.91,0.9,0.93,0.89,0.88,0.91,0.95,0.94,0.96,0.91,0.89,0.92,0.89,0.88,0.91,0.94,0.93,0.95,0.92,0.9,0.93
3,Sodium,0.87,0.85,0.88,0.88,0.86,0.89,0.91,0.9,0.93,0.85,0.84,0.87,0.89,0.87,0.9,0.91,0.89,0.92,0.86,0.84,0.88,0.86,0.84,0.88,0.91,0.9,0.93
4,Potassium,0.76,0.73,0.79,0.75,0.73,0.78,0.67,0.63,0.7,0.74,0.71,0.77,0.77,0.74,0.79,0.75,0.72,0.78,0.73,0.69,0.76,0.75,0.72,0.77,0.76,0.73,0.79


In [106]:
shorten_dict = {'Stanford':'S', 'UCSF':'UC', 'UMich':'UM'}
# Listed explicitly rather than taken from shorten_dict.keys(); presumably to
# keep the column order fixed.
sources = ['Stanford', 'UCSF', 'UMich']

# Columns of the formatted table: the lab name followed by every transfer.
cols_CI = ['Lab Test']
for src in sources:
    for dst in sources:
        transfer = '%s -> %s' % (src, dst)
        cols_CI.append(transfer)

        # The column is overwritten in place below. Skip it when it no longer
        # holds numbers, so that re-running this cell does not append the
        # interval text a second time.
        if not pd.api.types.is_numeric_dtype(df_CI[transfer]):
            continue

        # Render each cell as "<auc>\n[<left>, <right>]".
        df_CI[transfer] = (df_CI[transfer].astype(str)
                           + '\n[' + df_CI['%s, left' % transfer].astype(str)
                           + ', ' + df_CI['%s, right' % transfer].astype(str)
                           + ']')
df_CI[cols_CI]

Unnamed: 0,Lab Test,Stanford -> Stanford,Stanford -> UCSF,Stanford -> UMich,UCSF -> Stanford,UCSF -> UCSF,UCSF -> UMich,UMich -> Stanford,UMich -> UCSF,UMich -> UMich
0,White Blood Cells,"0.89\n[0.88, 0.91]","0.88\n[0.86, 0.89]","0.79\n[0.77, 0.81]","0.87\n[0.86, 0.89]","0.88\n[0.87, 0.9]","0.81\n[0.8, 0.83]","0.87\n[0.86, 0.89]","0.88\n[0.86, 0.89]","0.83\n[0.81, 0.84]"
1,Hemoglobin,"0.93\n[0.92, 0.94]","0.94\n[0.93, 0.95]","0.89\n[0.88, 0.91]","0.86\n[0.84, 0.88]","0.9\n[0.89, 0.92]","0.79\n[0.77, 0.81]","0.92\n[0.9, 0.93]","0.94\n[0.93, 0.95]","0.9\n[0.89, 0.91]"
2,Platelet Count,"0.91\n[0.9, 0.92]","0.94\n[0.93, 0.95]","0.91\n[0.9, 0.93]","0.89\n[0.88, 0.91]","0.95\n[0.94, 0.96]","0.91\n[0.89, 0.92]","0.89\n[0.88, 0.91]","0.94\n[0.93, 0.95]","0.92\n[0.9, 0.93]"
3,Sodium,"0.87\n[0.85, 0.88]","0.88\n[0.86, 0.89]","0.91\n[0.9, 0.93]","0.85\n[0.84, 0.87]","0.89\n[0.87, 0.9]","0.91\n[0.89, 0.92]","0.86\n[0.84, 0.88]","0.86\n[0.84, 0.88]","0.91\n[0.9, 0.93]"
4,Potassium,"0.76\n[0.73, 0.79]","0.75\n[0.73, 0.78]","0.67\n[0.63, 0.7]","0.74\n[0.71, 0.77]","0.77\n[0.74, 0.79]","0.75\n[0.72, 0.78]","0.73\n[0.69, 0.76]","0.75\n[0.72, 0.77]","0.76\n[0.73, 0.79]"
5,CO2,"0.86\n[0.84, 0.88]","0.8\n[0.78, 0.82]","0.75\n[0.71, 0.78]","0.8\n[0.77, 0.82]","0.87\n[0.86, 0.89]","0.85\n[0.82, 0.87]","0.77\n[0.74, 0.8]","0.82\n[0.8, 0.84]","0.87\n[0.84, 0.88]"
6,Urea Nitrogen,"0.95\n[0.94, 0.96]","0.92\n[0.91, 0.93]","0.9\n[0.89, 0.92]","0.94\n[0.93, 0.95]","0.92\n[0.91, 0.93]","0.9\n[0.89, 0.92]","0.93\n[0.92, 0.94]","0.92\n[0.91, 0.93]","0.92\n[0.91, 0.93]"
7,Creatinine,"0.96\n[0.96, 0.97]","0.91\n[0.89, 0.92]","0.85\n[0.83, 0.86]","0.94\n[0.94, 0.95]","0.94\n[0.93, 0.95]","0.88\n[0.87, 0.9]","0.92\n[0.91, 0.93]","0.88\n[0.86, 0.89]","0.9\n[0.88, 0.91]"
8,Calcium,"0.88\n[0.87, 0.9]","0.86\n[0.85, 0.88]","0.81\n[0.79, 0.83]","0.87\n[0.85, 0.88]","0.87\n[0.86, 0.88]","0.85\n[0.83, 0.86]","0.85\n[0.83, 0.87]","0.86\n[0.84, 0.87]","0.89\n[0.88, 0.9]"
9,Albumin,"0.92\n[0.91, 0.93]","0.88\n[0.87, 0.9]","0.73\n[0.7, 0.75]","0.84\n[0.82, 0.86]","0.89\n[0.88, 0.9]","0.74\n[0.72, 0.76]","0.92\n[0.9, 0.93]","0.89\n[0.87, 0.9]","0.9\n[0.89, 0.92]"


In [108]:
# Write the cols_CI selection of df_CI to CSV, leaving out the index column.
formatted_CI_path = 'data-transferring-component-10000-episodes-lastnormal/all_transfers_withCI_formatted.csv'
df_CI[cols_CI].to_csv(formatted_CI_path, index=False)