In [1]:
# coding:utf-8
import argparse
#===============================================================
import os,sys,glob,re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import math
from scipy.stats import ttest_ind,linregress;
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import matthews_corrcoef
#from sklearn.metrics import recall_score
#from sklearn.metrics import precision_score
from sklearn.metrics import auc, mean_absolute_error, mean_squared_error, precision_recall_curve, \
r2_score,roc_auc_score, accuracy_score, log_loss
from sklearn.metrics import confusion_matrix,cohen_kappa_score
from sklearn.metrics import f1_score,confusion_matrix

In [2]:
# Flat list of metric names.
# NOTE(review): 'p-stat' is spelled with a hyphen here, while
# calc_class_other_stats_with_threshold emits the key 'p_stat' -- confirm which
# spelling downstream consumers expect.
h1_list = [
    'r2_score', 'pearson_r2', 'rmse', 'mse', 'mae',
    'roc', 'prc', 'p-stat', 'Recall', 'Precision', 'f1', 'BA', 'accuracy',
    'TN', 'FP', 'FN', 'TP', 'SP', 'SE', 'NPV', 'MCC',
]

In [3]:
def polyfit(x, y, degree):
    """Return the squared Pearson correlation coefficient between x and y.

    Parameters
    ----------
    x, y : array-like
        Paired one-dimensional samples of equal length.
    degree : int
        Accepted for backward compatibility; it is not used, because no
        polynomial is actually fitted.

    Returns
    -------
    float
        The square of the off-diagonal entry of ``np.corrcoef(x, y)``.
    """
    # The module is imported as ``np``; the bare name ``numpy`` is not defined.
    correlation = np.corrcoef(x, y)[0, 1]
    return correlation**2

def calc_class_roc_prc(y_true,y_pred,pos_label=1):
    """Return the ROC AUC and the precision-recall AUC of a score vector.

    Parameters
    ----------
    y_true : array-like
        True labels, passed unchanged to the scikit-learn metrics.
    y_pred : array-like
        Predicted scores, passed unchanged to the scikit-learn metrics.
    pos_label : int, default 1
        When 0, the ROC AUC is reported as ``1 - auc``. The precision-recall
        curve is computed the same way regardless of this argument.

    Returns
    -------
    dict
        ``{'roc': <ROC AUC>, 'prc': <area under the precision-recall curve>}``.
    """
    roc_value = roc_auc_score(y_true, y_pred)
    if pos_label == 0:
        roc_value = 1 - roc_value
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    prc_value = auc(recall, precision)
    return {'roc': roc_value, 'prc': prc_value}

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def calc_class_other_stats_with_threshold(y_true, y_pred, threshold=0.5):
    """Compute threshold-based binary classification statistics.

    Parameters
    ----------
    y_true : array-like
        True labels; entries equal to 0 and 1 define the two groups.
    y_pred : array-like
        Predicted scores, one per entry of ``y_true``.
    threshold : float, default 0.5
        A score strictly greater than this value is predicted as class 1,
        otherwise as class 0.

    Returns
    -------
    dict
        Keys: 'p_stat' (p-value of a t-test between the scores of the two
        true classes), 'Recall', 'Precision', 'f1', 'BA', 'accuracy', 'TN',
        'FP', 'FN', 'TP', 'SP', 'SE', 'NPV', 'MCC', 'cohen_kappa'.
        Ratios whose denominator is zero are reported as NaN.
    """
    # Coerce to arrays so boolean masks and index arrays also work for lists.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    neg_idx = np.where(y_true == 0)[0]
    pos_idx = np.where(y_true == 1)[0]
    p_0_1 = ttest_ind(y_pred[neg_idx], y_pred[pos_idx])[1]

    # Binarise the scores; the kappa score cannot mix binary and continuous input.
    y_hard_pred = [1 if p > threshold else 0 for p in y_pred]
    x_cohen_kappa = cohen_kappa_score(y_true, y_hard_pred, weights='linear')

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the
    # data; without it the four-way unpacking below raises ValueError.
    TN, FP, FN, TP = confusion_matrix(y_true, y_hard_pred, labels=[0, 1]).ravel()

    # SE (sensitivity), hit rate, recall, or true positive rate
    x_Recall = _safe_ratio(TP, TP + FN)
    SE = x_Recall
    # Precision or positive predictive value
    x_Precision = _safe_ratio(TP, TP + FP)
    f1 = f1_score(y_true, y_hard_pred)
    # Specificity or true negative rate
    SP = _safe_ratio(TN, TN + FP)
    # BA (balanced accuracy), also called CCR (correct classification rate)
    x_BA = (SE + SP)/2
    x_accuracy = accuracy_score(y_true, y_hard_pred)
    # Negative predictive value
    NPV = _safe_ratio(TN, TN + FN)
    x_MCC = matthews_corrcoef(y_true, y_hard_pred)

    other_stats = {'p_stat':p_0_1,'Recall':x_Recall,'Precision':x_Precision, 'f1':f1,'BA':x_BA,
                   'accuracy':x_accuracy, 'TN':TN, 'FP':FP, 'FN':FN,'TP':TP,'SP':SP,'SE':SE,
                   'NPV':NPV,'MCC':x_MCC,'cohen_kappa':x_cohen_kappa}
    return other_stats

In [4]:
def calc_ef_threholds(y_true,y_pred,ef_threholds=[0.01,0.05,0.1]):
    """Compute enrichment factors (EF) for several top-ranked fractions.

    Samples are ranked by descending ``y_pred``. For each fraction the
    share of rows with ``y_true == 1`` summed over the top
    ``ceil(n_total * fraction)`` rows is divided by the overall share of
    rows whose ``y_true`` equals 1.

    Parameters
    ----------
    y_true, y_pred : array-like
        Labels and scores of equal length (the original comment states they
        are numpy arrays).
    ef_threholds : list of float
        Fractions of the ranked list to screen.

    Returns
    -------
    dict
        Maps ``'EF_<fraction>'`` to the enrichment factor at that fraction.
    """
    ranked = pd.DataFrame()
    ranked['y_true'] = y_true
    ranked['y_pred'] = y_pred

    n_total = len(ranked.index)
    n_actives = len(ranked[ranked['y_true'] == 1].index)
    random_rate = n_actives / n_total

    ranked = ranked.sort_values(by='y_pred', ascending=False)

    EFs = {}
    for fraction in ef_threholds:
        screen_range = int(np.ceil(n_total * fraction))
        top_labels = ranked['y_true'].iloc[:screen_range]
        screen_rate = sum(top_labels) / screen_range
        print('random_rate,screen_range,screen_rate = ', random_rate,screen_range,screen_rate)
        EFs['EF_{}'.format(fraction)] = screen_rate / random_rate
    return EFs

In [5]:
def list_stat_for_class(true_file,pred_file,pos_label,ef_threholds):
    """Evaluate one prediction file and append its metrics as a CSV row.

    Parameters
    ----------
    true_file : str
        CSV whose second column holds the true labels.
    pred_file : str
        CSV holding the predicted scores; also written as the first field
        of the output row.
    pos_label : int
        Forwarded to ``calc_class_roc_prc``.
    ef_threholds : list of float
        Forwarded to ``calc_ef_threholds``.

    Returns
    -------
    None
        The result is appended as one line to the file named by the
        module-level ``eva_csv_name``.

    Notes
    -----
    Besides its arguments this function reads the module-level names
    ``model``, ``threshold``, ``header`` and ``eva_csv_name``.
    NOTE(review): ``model`` and ``header`` are not defined in the visible
    part of the notebook -- confirm they are set before this is called.
    """
    t_df = pd.read_csv(true_file)
    p_df = pd.read_csv(pred_file)
    t_values = t_df.iloc[:,1].values.astype(int)
    # Scores are read from the second column when model == 'DMPNN',
    # otherwise from the first column.
    if model == 'DMPNN':
        p_values = p_df.iloc[:,1].values
    else:
        p_values = p_df.iloc[:,0].values
    # Keep only rows whose label is exactly 0 or 1 (drops missing labels).
    ind = np.where((t_values==0) | (t_values==1))[0]
    y_true = t_values[ind]
    y_pred = p_values[ind]
    print('y_true[:10], y_pred[:10] =',y_true[:10], y_pred[:10])
    dic = {}
    dic.update(calc_class_roc_prc(y_true,y_pred,pos_label))
    dic.update(calc_ef_threholds(y_true,y_pred,ef_threholds))
    dic.update(calc_class_other_stats_with_threshold(y_true, y_pred,threshold))
    w_line= pred_file +',' + ','.join([str(dic[x]) for x in header])+'\n'
    # The context manager closes the file even if the write raises.
    with open(eva_csv_name, 'a') as f:
        f.write(w_line)
    return

In [6]:
# Evaluation settings. The short and long names are aliases of the same value
# (``efs`` and ``ef_threholds`` refer to one shared list object).
ef_threholds = [0.01, 0.02, 0.05]
efs = ef_threholds
threshold = 0.5
thre = threshold
pos_label = 1

In [7]:
# Feature-set names.
# NOTE(review): presumably substituted for {feature} in the path patterns below -- confirm.
fingerprint_list = [
    'MorganFP', 'RDkitFP', 'AtomPairFP', 'TorsionFP', 'AvalonFP', 'EstateFP',
    'MACCSFP', 'PharmacoErGFP', 'PharmacoPFP', 'PubChemFP', 'MHFP6', 'MAP4',
]
descriptor_list = [
    'Property', 'Constitution', 'Autocorr', 'Fragment', 'Charge', 'Estate', 'MOE',
    'Connectivity', 'Topology', 'Kappa', 'Path', 'Matrix', 'InfoContent',
]

models = ['xgb', 'fcnn', 'log', 'svm', 'rf', 'DMPNN']
num_folds = 5
#true_file = 'hide/rand_check/fold_0/train_full.csv'
b_list = ['train', 'val', 'test', 'Ext']   ## DMPNN uses this

# Path templates, filled in with str.format-style fields.
true_file_pat = 'rand_check_{feature}/fold_{i}/{b}_full.csv'
true_file_e = 'Ext.csv'

pred_file_pat = '{model}/p_{b}_{feature}_fold{i}.csv'  # includes DMPNN

In [8]:
# File name of the evaluation CSV that is loaded into `df` below.
eva_csv_name = 'eva_with_rmSim.csv'
print('eva_csv_name = ', eva_csv_name)
# Load the whole evaluation table; later cells read its 'col_names' column.
df = pd.read_csv(eva_csv_name)

eva_csv_name =  eva_with_rmSim.csv


In [9]:
# Split every entry of 'col_names' (shaped like "<model>/p_<data>_<feat>_<fold>.csv")
# into its model / data / feat / fold parts and store them as new columns of df.
col_names = df['col_names'].values
m_list, data_list, feat_list, fold_list = [], [], [], []
for path in col_names:
    model_name, stem = path.split('/')
    # Fuse the "Ext_rm_sim_0_<digit>" tag into a single underscore-free token
    # so that the remaining name always splits into exactly four fields.
    stem = stem.replace('.csv', '').replace('p_Ext_rm_sim_0_', 'p_ExtRmSim0')
    _, data_name, feat_name, fold_name = stem.split('_')
    m_list.append(model_name)
    data_list.append(data_name)
    feat_list.append(feat_name)
    fold_list.append(fold_name)

df['model'] = m_list
df['data'] = data_list
df['feat'] = feat_list
df['fold'] = fold_list

In [10]:
# Display the first five rows of df, including the parsed model/data/feat/fold columns.
df.head(5)

Unnamed: 0,col_names,roc,prc,Recall,Precision,f1,BA,accuracy,TN,FP,...,MCC,EF_0.01,EF_0.02,EF_0.05,p_stat,cohen_kappa,model,data,feat,fold
0,xgb/p_train_MorganFP_fold0.csv,0.996238,0.998955,0.992151,0.977707,0.984876,0.955446,0.976165,554,49,...,0.929228,1.278393,1.278393,1.278393,0.0,0.928672,xgb,train,MorganFP,fold0
1,xgb/p_train_MorganFP_fold1.csv,0.996262,0.998937,0.989777,0.97886,0.984288,0.957611,0.975442,571,46,...,0.928397,1.28671,1.28671,1.28671,0.0,0.928098,xgb,train,MorganFP,fold1
2,xgb/p_train_MorganFP_fold2.csv,0.996212,0.998933,0.993059,0.976787,0.984855,0.954589,0.976165,557,51,...,0.92964,1.281351,1.281351,1.281351,0.0,0.928941,xgb,train,MorganFP,fold2
3,xgb/p_train_MorganFP_fold3.csv,0.996273,0.998963,0.99169,0.977697,0.984644,0.955215,0.975804,554,49,...,0.928156,1.278393,1.278393,1.278393,0.0,0.927636,xgb,train,MorganFP,fold3
4,xgb/p_train_MorganFP_fold4.csv,0.996174,0.99895,0.99124,0.978607,0.984883,0.956453,0.976165,553,47,...,0.929007,1.276625,1.276625,1.276625,0.0,0.92858,xgb,train,MorganFP,fold4


In [11]:
# Display the last five rows of df, including the parsed model/data/feat/fold columns.
df.tail(5)

Unnamed: 0,col_names,roc,prc,Recall,Precision,f1,BA,accuracy,TN,FP,...,MCC,EF_0.01,EF_0.02,EF_0.05,p_stat,cohen_kappa,model,data,feat,fold
7035,DMPNN/p_Ext_rm_sim_0_9__fold0.csv,0.877142,0.951462,0.99308,0.831884,0.905363,0.655363,0.839572,27,58,...,0.486887,1.294118,1.294118,1.294118,1.513044e-43,0.404869,DMPNN,ExtRmSim09,,fold0
7036,DMPNN/p_Ext_rm_sim_0_9__fold1.csv,0.901425,0.964824,0.99308,0.827089,0.902516,0.643599,0.834225,25,60,...,0.465042,1.294118,1.294118,1.294118,1.23433e-34,0.378304,DMPNN,ExtRmSim09,,fold1
7037,DMPNN/p_Ext_rm_sim_0_9__fold2.csv,0.865988,0.95098,0.989619,0.810198,0.890966,0.600692,0.812834,18,67,...,0.366595,1.294118,1.294118,1.294118,1.0124759999999999e-35,0.27427,DMPNN,ExtRmSim09,,fold2
7038,DMPNN/p_Ext_rm_sim_0_9__fold3.csv,0.842805,0.941455,0.972318,0.828909,0.894904,0.644983,0.823529,27,58,...,0.417225,1.294118,1.294118,1.294118,1.209108e-31,0.365939,DMPNN,ExtRmSim09,,fold3
7039,DMPNN/p_Ext_rm_sim_0_9__fold4.csv,0.867535,0.950479,0.986159,0.811966,0.890625,0.604844,0.812834,19,66,...,0.365776,1.294118,1.294118,1.294118,1.716272e-27,0.282386,DMPNN,ExtRmSim09,,fold4


In [12]:
# Reload the evaluation table from disk and re-derive the model / data / feat /
# fold columns from 'col_names'.
# NOTE(review): this cell repeats the load-and-parse steps of the earlier cells;
# consider moving them into one shared helper function.
eva_csv_name = 'eva_with_rmSim.csv'
df = pd.read_csv(eva_csv_name)
col_names = df['col_names'].values
m_list, data_list, feat_list, fold_list = [], [], [], []
for path in col_names:
    model_name, stem = path.split('/')
    # Fuse the "Ext_rm_sim_0_<digit>" tag into a single underscore-free token
    # so that the remaining name always splits into exactly four fields.
    stem = stem.replace('.csv', '').replace('p_Ext_rm_sim_0_', 'p_ExtRmSim0')
    _, data_name, feat_name, fold_name = stem.split('_')
    m_list.append(model_name)
    data_list.append(data_name)
    feat_list.append(feat_name)
    fold_list.append(fold_name)

df['model'] = m_list
df['data'] = data_list
df['feat'] = feat_list
df['fold'] = fold_list

In [13]:
# Split the evaluation table by data partition (values of the parsed 'data' column).
train_df = df[df['data'] == 'train']
# The validation rows carry the tag 'val'; filtering on 'train' here would make
# every validation statistic a copy of the training statistic.
val_df = df[df['data'] == 'val']
test_df = df[df['data'] == 'test']
ext_df = df[df['data'] == 'Ext']

In [16]:
def _group_mean_std(frame, xeva):
    """Return [mean, std] of column ``xeva`` grouped by ('model', 'feat')."""
    # Select the metric column BEFORE aggregating: aggregating the whole frame
    # would also try to average the string columns, which raises TypeError on
    # pandas >= 2.0.
    grouped = frame.groupby(['model', 'feat'])[xeva]
    return [grouped.mean(), grouped.std()]


def group_model_feat_for_one_performance_sort(xeva = 'roc'):
    """Summarise one metric per (model, feat) pair across all data partitions.

    For the module-level frames ``train_df``, ``val_df``, ``test_df`` and
    ``ext_df``, and for the rows of the module-level ``df`` whose 'data' value
    is 'ExtRmSim06' ... 'ExtRmSim09', the mean and standard deviation of the
    column ``xeva`` are computed per ('model', 'feat') group and placed side
    by side.

    Parameters
    ----------
    xeva : str, default 'roc'
        Name of the metric column to summarise.

    Returns
    -------
    pandas.DataFrame
        The summary sorted by ``ext_<xeva>`` and then ``test_<xeva>``,
        descending.

    Side effects
    ------------
    Writes ``grouped_metric/grouped_<xeva>.csv`` and
    ``grouped_metric/grouped_sort_<xeva>.csv`` (the directory is created if
    it does not exist).
    """
    pieces = []
    for part in (train_df, val_df, test_df, ext_df):
        pieces.extend(_group_mean_std(part, xeva))
    for sim in [6, 7, 8, 9]:
        subset = df[df['data'] == f'ExtRmSim0{sim}']
        pieces.extend(_group_mean_std(subset, xeva))

    grouped_model_feat = pd.concat(pieces, axis=1)
    grouped_model_feat.columns = [f'train_{xeva}',f'train_{xeva}_std',f'val_{xeva}',f'val_{xeva}_std',
              f'test_{xeva}',f'test_{xeva}_std', f'ext_{xeva}',f'ext_{xeva}_std'] + \
            ['rmSim_0.6','rmSim_0.6_std','rmSim_0.7','rmSim_0.7_std',
             'rmSim_0.8','rmSim_0.8_std', 'rmSim_0.9','rmSim_0.9_std']

    # Make the function usable on its own, without a prior mkdir cell.
    os.makedirs('grouped_metric', exist_ok=True)
    # Round-trip through CSV so the group keys come back as ordinary columns.
    grouped_model_feat.to_csv(f'grouped_metric/grouped_{xeva}.csv')
    grouped_model_feat = pd.read_csv(f'grouped_metric/grouped_{xeva}.csv')
    grouped_sort=grouped_model_feat.sort_values(by=[f'ext_{xeva}',f'test_{xeva}',],ascending=False)
    grouped_sort.to_csv(f'grouped_metric/grouped_sort_{xeva}.csv',index=False)
    return grouped_sort

In [12]:
#df.columns.to_list()
#['col_names', 'roc', 'prc', 'Recall', 'Precision', 'f1', 'BA', 'accuracy', 'TN', 'FP', 'FN', 'TP', 'SP', 'SE',
# 'NPV', 'MCC', 'EF_0.01', 'EF_0.02', 'EF_0.05', 'p_stat', 'cohen_kappa']
#grouped_sort_roc = group_model_feat_for_one_performance_sort(xeva = 'roc')

In [18]:
# Build (and write to disk) the grouped summary table of every metric column;
# the sorted tables are collected in `grouped`, in the order listed below.
grouped = []
if not os.path.exists('grouped_metric'):
    os.mkdir('grouped_metric')
metric_names = [
    'roc', 'prc', 'Recall', 'Precision', 'f1', 'BA', 'accuracy',
    'TN', 'FP', 'FN', 'TP', 'SP', 'SE', 'NPV', 'MCC',
    'EF_0.01', 'EF_0.02', 'EF_0.05', 'p_stat', 'cohen_kappa',
]
for x_metric in metric_names:
    grouped_table = group_model_feat_for_one_performance_sort(xeva=x_metric)
    grouped.append(grouped_table)

In [19]:
"""
test_df=df[df['data']=='test']
group_test = test_df.groupby(['model','feat']).mean()['roc']
group_test_std = test_df.groupby(['model','feat']).std()['roc']
ext_df=df[df['data']=='Ext']
group_ext = ext_df.groupby(['model','feat']).mean()['roc']
#group_ext.to_csv('group_model_feat_Ext_roc.csv')
group_ext_std = ext_df.groupby(['model','feat']).std()['roc']

train_df=df[df['data']=='train']
group_train = train_df.groupby(['model','feat']).mean()['roc']
group_train_std = train_df.groupby(['model','feat']).std()['roc']

val_df=df[df['data']=='train']
group_val = val_df.groupby(['model','feat']).mean()['roc']
group_val_std = val_df.groupby(['model','feat']).std()['roc']
"""


In [24]:
"""
dfl = [pd.DataFrame() for i in [6,7,8,9]] 
groupl = [group_val for i in [6,7,8,9]] 
group_stdl = [group_val_std for i in [6,7,8,9]] 
for i,sim in enumerate([6,7,8,9]):
    #dfl[0]=df[df['data']=='train']
    dfl[i]=df[df['data']==f'ExtRmSim0{sim}']
    groupl[i] = dfl[i].groupby(['model','feat']).mean()['roc']
    group_stdl[i] = dfl[i].groupby(['model','feat']).std()['roc']
    
grouped_model_feat = pd.concat([group_train,group_train_std,group_val,group_val_std,
                group_test,group_test_std,group_ext,group_ext_std]+tmp_list,axis=1)
grouped_model_feat.columns = ['train_roc','train_roc_std','val_roc','val_roc_std',
              'test_roc','test_roc_std', 'ext_roc','ext_roc_std'] + \
            ['rmSim_0.6','rmSim_0.6_std','rmSim_0.7','rmSim_0.7_std',
             'rmSim_0.8','rmSim_0.8_std', 'rmSim_0.9','rmSim_0.9_std']
grouped_model_feat.to_csv('grouped_model_feat.csv')
grouped_model_feat = pd.read_csv('grouped_model_feat.csv')
print(grouped_model_feat)
"""