In [1]:
import pickle
import random
from glob import glob

import numpy as np
import pandas as pd

In [3]:
def boostrapping_confidence_interval(pred_all, ci, n_boot=100, seed=0):
    """Bootstrap a confidence interval for the prediction/gold-standard correlation.

    Rows of ``pred_all`` are resampled with replacement ``n_boot`` times. The
    correlation between column 0 and column 1 is recomputed on every resample
    and the interval bounds are read off the sorted bootstrap correlations.
    Non-finite entries (NaN/inf) are masked before each correlation is computed.

    Params
    ------
    pred_all: Numpy array
        2-D array with at least two columns; columns 0 and 1 are correlated
    ci: float
        confidence level as a fraction in [0, 1], e.g. 0.95
    n_boot: int
        number of bootstrap resamples (default 100)
    seed: int
        seed of the private random generator (default 0); repeated calls with
        the same seed draw the same resamples

    Returns
    -------
    cor_mean: float
        correlation computed on the full, un-resampled data
    lb: float
        lower bound
    ub: float
        upper bound

    Raises
    ------
    ValueError
        if ``pred_all`` is not 2-D with at least two columns, is empty,
        ``ci`` is outside [0, 1], or ``n_boot`` is smaller than 1
    """
    # asanyarray keeps ndarray subclasses (e.g. masked arrays) intact.
    pred_all = np.asanyarray(pred_all)
    if pred_all.ndim != 2 or pred_all.shape[1] < 2:
        raise ValueError("pred_all must be a 2-D array with at least two columns")
    n_rows = len(pred_all)
    if n_rows == 0:
        raise ValueError("pred_all must contain at least one row")
    if not 0 <= ci <= 1:
        raise ValueError("ci must be a fraction between 0 and 1, got %r" % (ci,))
    if n_boot < 1:
        raise ValueError("n_boot must be at least 1, got %r" % (n_boot,))

    def _masked_corr(pairs):
        # Off-diagonal entry of the 2x2 correlation matrix of columns 0 and 1,
        # with non-finite values masked out first.
        return np.ma.corrcoef(
            np.ma.masked_invalid(pairs[:, 0]),
            np.ma.masked_invalid(pairs[:, 1]),
        )[0, 1]

    # Private generator: same stream as random.seed(seed) on the module-level
    # generator, but without reseeding global state for the rest of the program.
    rng = random.Random(seed)

    # calculate overall correlation
    cor_mean = _masked_corr(pred_all)
    print("Overall prediction/gold standard correlation is %.4f" % cor_mean)

    # start bootstrapping ...
    cor_all = []
    for _ in range(n_boot):
        # Draw row indices instead of rows: consumes the random stream exactly
        # like choices() over the rows, but resamples with one fancy-index.
        row_idx = rng.choices(range(n_rows), k=n_rows)
        cor_all.append(_masked_corr(pred_all[row_idx]))
    cor_all = sorted(cor_all)

    # Clamp the percentile positions so ci == 1.0 selects the last element
    # instead of indexing one past the end of the list.
    last = n_boot - 1
    lb = cor_all[min(max(round(n_boot * (0.5 - ci * 0.5)), 0), last)]
    ub = cor_all[min(max(round(n_boot * (0.5 + ci * 0.5)), 0), last)]
    # round() rather than int(): int(0.29 * 100) would print 28.
    print("%d%s Confidence interval is: (%.4f, %.4f)" % (round(ci * 100), '%', lb, ub))

    return cor_mean, lb, ub

In [4]:
# Gather the performance table of every model directory (./M*/results/).
# The in-place sort is lexicographic on the path strings, so M10 comes before M2.
all_p = glob('./M*/results/performances.csv')
all_p.sort()
all_p

['./M0/results/performances.csv',
 './M1/results/performances.csv',
 './M10/results/performances.csv',
 './M11/results/performances.csv',
 './M12/results/performances.csv',
 './M13/results/performances.csv',
 './M14/results/performances.csv',
 './M15/results/performances.csv',
 './M16/results/performances.csv',
 './M17/results/performances.csv',
 './M18/results/performances.csv',
 './M19/results/performances.csv',
 './M2/results/performances.csv',
 './M3/results/performances.csv',
 './M4/results/performances.csv',
 './M5/results/performances.csv',
 './M6/results/performances.csv',
 './M7/results/performances.csv',
 './M8/results/performances.csv',
 './M9/results/performances.csv']

In [5]:
# Read every per-model performance table and stack them into one DataFrame.
if not all_p:
    # pd.concat([]) would otherwise fail with the opaque
    # "No objects to concatenate"; fail early with an actionable message.
    raise ValueError("all_p is empty: no performances.csv files were found to concatenate")

# Collect into a separate list so `all_df` only ever holds the final DataFrame
# (an interrupted run no longer leaves a half-filled list under that name).
frames = []
for p in all_p:
    df = pd.read_csv(p)
    #df_rmse = pd.read_csv(p.replace('performances.csv', 'performances_rmse.csv'))
    #df = df.merge(df_rmse, on = ['score','dataset_train','dataset_test'])
    print(df)
    frames.append(df)
# After the loop `df` stays bound to the last table read, as before.
all_df = pd.concat(frames)

    score            dataset_train             dataset_test  \
0     css                  ALMANAC                  ALMANAC   
1     css                  ALMANAC                    ONEIL   
2     css                  ALMANAC                  FORCINA   
3     css                  ALMANAC                  Mathews   
4     css                    ONEIL                  ALMANAC   
..    ...                      ...                      ...   
139     S    ALMANAC_Mathews_ONEIL    ALMANAC_Mathews_ONEIL   
140     S  ALMANAC_FORCINA_Mathews                    ONEIL   
141     S  ALMANAC_FORCINA_Mathews  ALMANAC_FORCINA_Mathews   
142     S    FORCINA_Mathews_ONEIL                  ALMANAC   
143     S    FORCINA_Mathews_ONEIL    FORCINA_Mathews_ONEIL   

                 pearsonr                 ci2.5                 ci97.5  \
0      0.8813208257350995    0.8806890844785239     0.8823196462777361   
1      0.1631367609866463    0.1569491198079668    0.17030359964129596   
2     0.1001074043406

In [10]:
# NOTE(review): `df` is not assigned in this cell; presumably it is the loop
# variable left over from the CSV-loading cell (the last table read) — confirm,
# since this cell's execution count is out of order with its neighbours.
df

Unnamed: 0,score,dataset_train,dataset_test,pearsonr,ci2.5,ci97.5,ci25,ci75,features
0,css,ALMANAC,ALMANAC,0.923448,0.922853,0.924317,0.923208,0.923734,monotherapy_ic50+monotherapy_ri+drc_intp_linear
1,css,ALMANAC,ONEIL,0.843479,0.841315,0.846122,0.842567,0.844222,monotherapy_ic50+monotherapy_ri+drc_intp_linear
2,css,ALMANAC,FORCINA,0.741207,0.715158,0.768327,0.730197,0.751084,monotherapy_ic50+monotherapy_ri+drc_intp_linear
3,css,ALMANAC,Mathews,0.847936,0.830381,0.864653,0.840578,0.853825,monotherapy_ic50+monotherapy_ri+drc_intp_linear
4,css,ONEIL,ALMANAC,0.882270,0.881342,0.883284,0.881974,0.882610,monotherapy_ic50+monotherapy_ri+drc_intp_linear
...,...,...,...,...,...,...,...,...,...
139,S,ALMANAC_Mathews_ONEIL,ALMANAC_Mathews_ONEIL,0.823965,0.822683,0.825676,0.823358,0.824386,monotherapy_ic50+monotherapy_ri+drc_intp_linear
140,S,ALMANAC_FORCINA_Mathews,ONEIL,0.822681,0.820606,0.825381,0.821815,0.823493,monotherapy_ic50+monotherapy_ri+drc_intp_linear
141,S,ALMANAC_FORCINA_Mathews,ALMANAC_FORCINA_Mathews,0.771060,0.769091,0.772982,0.770314,0.771828,monotherapy_ic50+monotherapy_ri+drc_intp_linear
142,S,FORCINA_Mathews_ONEIL,ALMANAC,0.538867,0.535165,0.542841,0.537612,0.540256,monotherapy_ic50+monotherapy_ri+drc_intp_linear


In [6]:
# Write the combined table to all_performances.csv, omitting the index column.
all_df.to_csv('all_performances.csv', index = False)