In [1]:
from sklearn.ensemble import RandomForestClassifier
#import deepchem as dc
import numpy as np
import pandas as pd
import tempfile
#import chemprop
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from tensorflow.keras import layers
from tensorflow.keras import regularizers
import os
from tensorflow.keras.callbacks import EarlyStopping
import joblib

In [2]:
def ROC_AUC(y_true, y_score):
	auc = roc_auc_score(y_true, y_score)
	return auc

In [3]:
fingerprint_list = ['MorganFP', 'RDkitFP', 'AtomPairFP', 'TorsionFP', 'AvalonFP', 
'EstateFP', 'MACCSFP', 'PharmacoErGFP', 'PharmacoPFP', 'PubChemFP', 'MHFP6', 'MAP4']
descriptor_list = ['Property', 'Constitution', 'Autocorr', 'Fragment', 'Charge', 
'Estate', 'MOE', 'Connectivity', 'Topology', 'Kappa', 'Path', 'Matrix', 'InfoContent']

In [4]:
# free GPU memory
from numba import cuda 
device = cuda.get_current_device()
device.reset()
cuda.close()

In [5]:
def fetch_y_train_val_test_Ext(data_type,fold):
    i = fold; b = data_type
    #if b=='Ext': true_file = 'Ext.csv'
    if b[:3]=='Ext': true_file = f'{b}.csv'
    else: true_file = f'rand_/fold_{i}/{b}_full.csv' 
    print('true_file = ',true_file)
    df = pd.read_csv(true_file)
    y = df.iloc[:,1].astype('int').values
    return y

In [6]:
def fetch_X_train_val_test_Ext(data_type,fold):
    #i = 0;b = data_type = 'train'
    i = fold; b = data_type
    y_file = f'rand_/fold_{i}/{b}_full.csv'    
    X_train_files = []
    X_train_dfl = []
    for model,feature in zip(models,features):
        pred_file = f'{model}/p_{data_type}_{feature}_fold{i}.csv'
        X_train_files.append(pred_file)
        if model=='DMPNN':X_train_dfl.append(pd.read_csv(pred_file).iloc[:,1])
        else:X_train_dfl.append(pd.read_csv(pred_file))
    X_train_df = pd.concat(X_train_dfl,axis=1)
    return X_train_df.values

In [7]:
def fetch_X_train_val_test_Ext_topN(data_type,fold,topN):
    #i = 0;b = data_type = 'train'
    i = fold; b = data_type
    y_file = f'rand_/fold_{i}/{b}_full.csv'    
    X_train_files = []
    X_train_dfl = []
    for model,feature in zip(models[:topN],features[:topN]):
        pred_file = f'{model}/p_{data_type}_{feature}_fold{i}.csv'
        X_train_files.append(pred_file)
        if model=='DMPNN':X_train_dfl.append(pd.read_csv(pred_file).iloc[:,1])
        else:X_train_dfl.append(pd.read_csv(pred_file))
    #print('X_train_files = ',X_train_files)
    X_train_df = pd.concat(X_train_dfl,axis=1)
    return X_train_df.values

In [8]:
df = pd.read_csv('grouped_sort_roc.csv')
features = df['feat'].fillna('').values  ## change 'nan' to ''
models = df['model'].values

# Train fcnn ensemble model

test  n_Dense, seem  increase before 600, after that litte grow

In [10]:
# seem the keras model can only run once inside a function, otherwise will get GPU errors.
def run_one_keras_fcnn(n_Dense,i):
    model = Sequential()
    model.add(Dense(n_Dense, input_shape=(n_feats,), activation='relu'),)
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    #print(model.summary)
    #get X and Y (main set and external set)
    y_train_2dim=y_train[:,np.newaxis]  ### change shape from(2769,) to (2769, 1)
    y_val_2dim = y_val[:,np.newaxis]
    print(y_train_2dim.shape)
    #model.fit(X_T, y_T_2dim,batch_size=4096,verbose=0, epochs=100)
    ### using early stop by monitor val_loss of X_valid, y_valid_2dim
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    model.fit(X_train, y_train_2dim, batch_size=4096, verbose=0, epochs=200, 
              validation_data=(X_val, y_val_2dim), callbacks=[early_stopping])    
    ## prediction
    y_prob_train = model.predict(X_train) 
    train_roc_auc = ROC_AUC(y_train,y_prob_train[:, 0])
    print('train_roc_auc = ', train_roc_auc )
    y_prob_val=model.predict(X_val)  ## no need other parameters here
    val_roc_auc = ROC_AUC(y_val,y_prob_val[:, 0])
    y_prob_test=model.predict(X_test)  
    test_roc_auc = ROC_AUC(y_test,y_prob_test[:, 0])
    print('test_roc_auc, val_roc_auc = ', test_roc_auc, val_roc_auc)
    y_prob_Ext=model.predict(X_Ext)  ## no need other parameters here
    Ext_roc_auc = ROC_AUC(y_Ext,y_prob_Ext[:, 0])
    print('Ext_roc_auc = ', Ext_roc_auc)    
    ## save predition
    if not os.path.exists(f'esb/{n_Dense}'):os.mkdir(f'esb/{n_Dense}')
    pd.DataFrame(y_prob_train[:,0],columns=['prob']).to_csv(f'esb/{n_Dense}/p_train_Top{n_feats}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_val[:, 0],columns=['prob']).to_csv(f'esb/{n_Dense}/p_val_Top{n_feats}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_test[:,0],columns=['prob']).to_csv(f'esb/{n_Dense}/p_test_Top{n_feats}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_Ext[:,0],columns=['prob']).to_csv(f'esb/{n_Dense}/p_Ext_Top{n_feats}_fold{i}.csv',index=False)   
    ## save model
    model_save_name = f'esb/{n_Dense}/model_esbTop{n_feats}_fold{i}.h5'
    model.save(model_save_name)    
    model = ''
    return train_roc_auc,val_roc_auc,test_roc_auc,Ext_roc_auc

In [11]:
if not os.path.exists('esb'):os.mkdir('esb')
performance = []
num_folds = 5
#n_Dense = 800
for n_Dense in [200,400,600,800,1000,1200]:
    for i in range(num_folds): 
        print(f'processing n_Dense:{n_Dense}, i: {i}')
        X_train = fetch_X_train_val_test_Ext('train',i)
        y_train = fetch_y_train_val_test_Ext('train',i)
        X_val = fetch_X_train_val_test_Ext('val',i)
        y_val = fetch_y_train_val_test_Ext('val',i)
        X_test = fetch_X_train_val_test_Ext('test',i)
        y_test = fetch_y_train_val_test_Ext('test',i)
        X_Ext = fetch_X_train_val_test_Ext('Ext',i)
        y_Ext = fetch_y_train_val_test_Ext('Ext',i)
        n_feats = X_train.shape[1]
        train_roc_auc,val_roc_auc,test_roc_auc,Ext_roc_auc = run_one_keras_fcnn(n_Dense,i)
        results = {'fold':i,'Top_n_feats':n_feats,'n_Dense':n_Dense,'train_roc_auc':train_roc_auc,
        'val_roc_auc':val_roc_auc,"test_roc_auc":test_roc_auc,'Ext_roc_auc':Ext_roc_auc}
        print('results = ',results)
        performance.append(results)
        pd.DataFrame([results]).to_csv('esb/esb_append.csv',mode='a')
pd.DataFrame(performance).to_csv('esb/esb.csv',mode='a')    

processing n_Dense:200, i: 0
true_file =  rand_/fold_0/train_full.csv
true_file =  rand_/fold_0/val_full.csv
true_file =  rand_/fold_0/test_full.csv
true_file =  Ext.csv


2023-01-26 17:04:19.984735: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-26 17:04:22.636299: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8089 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:25:00.0, compute capability: 8.6
2023-01-26 17:04:22.638762: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 8089 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:61:00.0, compute capability: 8.6
2023-01-26 17:04:22.640102: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/de

(2769, 1)


2023-01-26 17:04:25.714659: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


train_roc_auc =  0.9999800933773729
test_roc_auc, val_roc_auc =  0.9728431372549018 0.9345446075005824
Ext_roc_auc =  0.9312624141187198
results =  {'fold': 0, 'Top_n_feats': 151, 'n_Dense': 200, 'train_roc_auc': 0.9999800933773729, 'val_roc_auc': 0.9345446075005824, 'test_roc_auc': 0.9728431372549018, 'Ext_roc_auc': 0.9312624141187198}
processing n_Dense:200, i: 1
true_file =  rand_/fold_1/train_full.csv
true_file =  rand_/fold_1/val_full.csv
true_file =  rand_/fold_1/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9996919679706942
test_roc_auc, val_roc_auc =  0.9117647058823529 0.9226448403145562
Ext_roc_auc =  0.9393702582136694
results =  {'fold': 1, 'Top_n_feats': 151, 'n_Dense': 200, 'train_roc_auc': 0.9996919679706942, 'val_roc_auc': 0.9226448403145562, 'test_roc_auc': 0.9117647058823529, 'Ext_roc_auc': 0.9393702582136694}
processing n_Dense:200, i: 2
true_file =  rand_/fold_2/train_full.csv
true_file =  rand_/fold_2/val_full.csv
true_file =  rand_/fold_2/test_fu

test_roc_auc, val_roc_auc =  0.9678691678691678 0.9551868044515104
Ext_roc_auc =  0.9472827325713914
results =  {'fold': 2, 'Top_n_feats': 151, 'n_Dense': 1000, 'train_roc_auc': 0.9999764059036995, 'val_roc_auc': 0.9551868044515104, 'test_roc_auc': 0.9678691678691678, 'Ext_roc_auc': 0.9472827325713914}
processing n_Dense:1000, i: 3
true_file =  rand_/fold_3/train_full.csv
true_file =  rand_/fold_3/val_full.csv
true_file =  rand_/fold_3/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9999525303614277
test_roc_auc, val_roc_auc =  0.9303370786516854 0.9389376218323587
Ext_roc_auc =  0.9493666764351534
results =  {'fold': 3, 'Top_n_feats': 151, 'n_Dense': 1000, 'train_roc_auc': 0.9999525303614277, 'val_roc_auc': 0.9389376218323587, 'test_roc_auc': 0.9303370786516854, 'Ext_roc_auc': 0.9493666764351534}
processing n_Dense:1000, i: 4
true_file =  rand_/fold_4/train_full.csv
true_file =  rand_/fold_4/val_full.csv
true_file =  rand_/fold_4/test_full.csv
true_file =  Ext.csv
(276

# load model and pred

In [28]:
# seem the keras model can only run once inside a function, otherwise will get GPU errors.
def load_one_keras_fcnn_pred_ext_files(n_Dense,i,ext_files):
    model_save_name = f'esb/{n_Dense}/model_esbTop{n_feats}_fold{i}.h5'
    model = tf.keras.models.load_model(model_save_name)
    rocs_dic = {}
    for ext_file in ext_files:
        ext_base = ext_file.replace('.csv','')
        print(f'processing n_Dense:{n_Dense}, i: {i}, ext_base:{ext_base}')
        X_e = fetch_X_train_val_test_Ext_topN(ext_base,i,topN)
        #fetch_X_train_val_test_Ext(ext_base,i)                
        y_e= fetch_y_train_val_test_Ext(ext_base,i)
        #print('n_feats,X_e.shape = ',n_feats, X_e.shape)
        assert n_feats == X_e.shape[1]
        y_prob_e=model.predict(X_e)  ## no need other parameters here
        e_roc_auc = ROC_AUC(y_e,y_prob_e[:, 0])
        print(f'{ext_base}_roc_auc = ', e_roc_auc) 
        rocs_dic.update({f'{ext_base}_roc_auc':e_roc_auc})
        ## save predition
        if not os.path.exists(f'esb/{n_Dense}'):os.mkdir(f'esb/{n_Dense}')
        pd.DataFrame(y_prob_e[:,0],columns=['prob']).to_csv(f'esb/{n_Dense}/p_{ext_base}_Top{n_feats}_fold{i}.csv',index=False)   
    return rocs_dic

In [20]:
if not os.path.exists('esb'):os.mkdir('esb')
performance = []
num_folds = 5
n_Dense = 800
n_feats = topN = 151
ext_files = ['Ext_rm_sim_0_6.csv','Ext_rm_sim_0_7.csv','Ext_rm_sim_0_8.csv','Ext_rm_sim_0_9.csv']
#for n_Dense in [200,400,600,800,1200]:
for i in range(num_folds): 
    results = {'fold':i,'Top_n_feats':n_feats,'n_Dense':n_Dense}
    rocs_dic = load_one_keras_fcnn_pred_ext_files(n_Dense,i,ext_files)
    results.update(rocs_dic)
    performance.append(results)
    pd.DataFrame([results]).to_csv('esb/esb_append.csv',mode='a')
pd.DataFrame(performance).to_csv('esb/esb.csv',mode='a')    

processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.03703703703703705
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.7789473684210526
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8561502347417841
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9234276409525748
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.0740740740740741
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.7221052631578948
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8547417840375586
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.

selecting baseline models from 140 to 10

In [21]:
if not os.path.exists('esb'):os.mkdir('esb')
performance = []
num_folds = 5
n_Dense = 800
for topN in [140,130,120,100,90,80,70,60,50,40,30,25,20,15,10]:
    for i in range(num_folds): 
        print(f'processing topN:{topN}, i: {i}')
        X_train = fetch_X_train_val_test_Ext_topN('train',i,topN)
        y_train = fetch_y_train_val_test_Ext('train',i)
        X_val = fetch_X_train_val_test_Ext_topN('val',i,topN)
        y_val = fetch_y_train_val_test_Ext('val',i)
        X_test = fetch_X_train_val_test_Ext_topN('test',i,topN)
        y_test = fetch_y_train_val_test_Ext('test',i)
        X_Ext = fetch_X_train_val_test_Ext_topN('Ext',i,topN)
        y_Ext = fetch_y_train_val_test_Ext('Ext',i)
        n_feats = X_train.shape[1]
        assert n_feats==topN
        train_roc_auc,val_roc_auc,test_roc_auc,Ext_roc_auc = run_one_keras_fcnn(n_Dense,i)
        results = {'fold':i,'Top_n_feats':n_feats,'n_Dense':n_Dense,'train_roc_auc':train_roc_auc,
        'val_roc_auc':val_roc_auc,"test_roc_auc":test_roc_auc,'Ext_roc_auc':Ext_roc_auc}
        print('results = ',results)
        performance.append(results)
        pd.DataFrame([results]).to_csv('esb/esb_append_topN.csv',mode='a')
pd.DataFrame(performance).to_csv('esb/esb_topN.csv',mode='a')    

processing topN:140, i: 0
true_file =  rand_/fold_0/train_full.csv
true_file =  rand_/fold_0/val_full.csv
true_file =  rand_/fold_0/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9999946405246773
test_roc_auc, val_roc_auc =  0.974264705882353 0.9313766596785465
Ext_roc_auc =  0.9328253720165414
results =  {'fold': 0, 'Top_n_feats': 140, 'n_Dense': 800, 'train_roc_auc': 0.9999946405246773, 'val_roc_auc': 0.9313766596785465, 'test_roc_auc': 0.974264705882353, 'Ext_roc_auc': 0.9328253720165414}
processing topN:140, i: 1
true_file =  rand_/fold_1/train_full.csv
true_file =  rand_/fold_1/val_full.csv
true_file =  rand_/fold_1/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9998275321889705
test_roc_auc, val_roc_auc =  0.9104411764705882 0.9269780131600064
Ext_roc_auc =  0.9278434437172348
results =  {'fold': 1, 'Top_n_feats': 140, 'n_Dense': 800, 'train_roc_auc': 0.9998275321889705, 'val_roc_auc': 0.9269780131600064, 'test_roc_auc': 0.9104411764705882, 'Ext_

(2769, 1)
train_roc_auc =  0.9998938080290168
test_roc_auc, val_roc_auc =  0.9099999999999999 0.9244102070293693
Ext_roc_auc =  0.9334440428510956
results =  {'fold': 1, 'Top_n_feats': 120, 'n_Dense': 800, 'train_roc_auc': 0.9998938080290168, 'val_roc_auc': 0.9244102070293693, 'test_roc_auc': 0.9099999999999999, 'Ext_roc_auc': 0.9334440428510956}
processing topN:120, i: 2
true_file =  rand_/fold_2/train_full.csv
true_file =  rand_/fold_2/val_full.csv
true_file =  rand_/fold_2/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9999741226040576
test_roc_auc, val_roc_auc =  0.9662337662337662 0.954391891891892
Ext_roc_auc =  0.9386213408876299
results =  {'fold': 2, 'Top_n_feats': 120, 'n_Dense': 800, 'train_roc_auc': 0.9999741226040576, 'val_roc_auc': 0.954391891891892, 'test_roc_auc': 0.9662337662337662, 'Ext_roc_auc': 0.9386213408876299}
processing topN:120, i: 3
true_file =  rand_/fold_3/train_full.csv
true_file =  rand_/fold_3/val_full.csv
true_file =  rand_/fold_3/test_

Ext_roc_auc =  0.9343557682914917
results =  {'fold': 3, 'Top_n_feats': 70, 'n_Dense': 800, 'train_roc_auc': 0.9999609523940776, 'val_roc_auc': 0.9381578947368421, 'test_roc_auc': 0.9291900749063671, 'Ext_roc_auc': 0.9343557682914917}
processing topN:70, i: 4
true_file =  rand_/fold_4/train_full.csv
true_file =  rand_/fold_4/val_full.csv
true_file =  rand_/fold_4/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9999869371446134
test_roc_auc, val_roc_auc =  0.9517019735650913 0.9329397293972939
Ext_roc_auc =  0.9202891472110969
results =  {'fold': 4, 'Top_n_feats': 70, 'n_Dense': 800, 'train_roc_auc': 0.9999869371446134, 'val_roc_auc': 0.9329397293972939, 'test_roc_auc': 0.9517019735650913, 'Ext_roc_auc': 0.9202891472110969}
processing topN:60, i: 0
true_file =  rand_/fold_0/train_full.csv
true_file =  rand_/fold_0/val_full.csv
true_file =  rand_/fold_0/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9999869841313592
test_roc_auc, val_roc_auc =  0.97348039

true_file =  rand_/fold_0/train_full.csv
true_file =  rand_/fold_0/val_full.csv
true_file =  rand_/fold_0/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9999647805907367
test_roc_auc, val_roc_auc =  0.9750490196078431 0.9300722105753553
Ext_roc_auc =  0.9383282862817883
results =  {'fold': 0, 'Top_n_feats': 40, 'n_Dense': 800, 'train_roc_auc': 0.9999647805907367, 'val_roc_auc': 0.9300722105753553, 'test_roc_auc': 0.9750490196078431, 'Ext_roc_auc': 0.9383282862817883}
processing topN:40, i: 1
true_file =  rand_/fold_1/train_full.csv
true_file =  rand_/fold_1/val_full.csv
true_file =  rand_/fold_1/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9999389961017756
test_roc_auc, val_roc_auc =  0.9375980392156863 0.9482159096988177
Ext_roc_auc =  0.9405424766370356
results =  {'fold': 1, 'Top_n_feats': 40, 'n_Dense': 800, 'train_roc_auc': 0.9999389961017756, 'val_roc_auc': 0.9482159096988177, 'test_roc_auc': 0.9375980392156863, 'Ext_roc_auc': 0.940542476637035

(2769, 1)
train_roc_auc =  0.9998689545889995
test_roc_auc, val_roc_auc =  0.9362254901960785 0.947948429893543
Ext_roc_auc =  0.9484875126176289
results =  {'fold': 1, 'Top_n_feats': 25, 'n_Dense': 800, 'train_roc_auc': 0.9998689545889995, 'val_roc_auc': 0.947948429893543, 'test_roc_auc': 0.9362254901960785, 'Ext_roc_auc': 0.9484875126176289}
processing topN:25, i: 2
true_file =  rand_/fold_2/train_full.csv
true_file =  rand_/fold_2/val_full.csv
true_file =  rand_/fold_2/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9999330232105019
test_roc_auc, val_roc_auc =  0.9669552669552669 0.9711347376788553
Ext_roc_auc =  0.9473478558171339
results =  {'fold': 2, 'Top_n_feats': 25, 'n_Dense': 800, 'train_roc_auc': 0.9999330232105019, 'val_roc_auc': 0.9711347376788553, 'test_roc_auc': 0.9669552669552669, 'Ext_roc_auc': 0.9473478558171339}
processing topN:25, i: 3
true_file =  rand_/fold_3/train_full.csv
true_file =  rand_/fold_3/val_full.csv
true_file =  rand_/fold_3/test_full

above shows n_feat(ie topN) set 20 or 25 was better than others

# load model and pred ext, set n_Dense=800, topN = n_feats = 20, 

In [29]:
if not os.path.exists('esb'):os.mkdir('esb')
performance = []
num_folds = 5
n_Dense = 800
n_feats = topN = 20
ext_files = ['Ext_rm_sim_0_6.csv','Ext_rm_sim_0_7.csv','Ext_rm_sim_0_8.csv','Ext_rm_sim_0_9.csv']
#for n_Dense in [200,400,600,800,1200]:
for i in range(num_folds): 
    results = {'fold':i,'Top_n_feats':n_feats,'n_Dense':n_Dense}
    rocs_dic = load_one_keras_fcnn_pred_ext_files(n_Dense,i,ext_files)
    results.update(rocs_dic)
    performance.append(results)
    pd.DataFrame([results]).to_csv('esb/esb_append.csv',mode='a')
pd.DataFrame(performance).to_csv('esb/esb.csv',mode='a') 

processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.4074074074074074
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.8147368421052632
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8821596244131455
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9337675554650926
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.4444444444444444
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.8252631578947369
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.9010328638497653
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.c

 can also include train, val, test. Ext these base. 

In [32]:
if not os.path.exists('esb'):os.mkdir('esb')
performance = []
num_folds = 5
n_Dense = 800
#n_feats = topN = 20
ext_files = ['train','val','test','Ext','Ext_rm_sim_0_6.csv','Ext_rm_sim_0_7.csv',
             'Ext_rm_sim_0_8.csv','Ext_rm_sim_0_9.csv']
## also can include train, val, test. Ext these base. 
#for n_Dense in [200,400,600,800,1200]:
for topN in [140,130,120,100,90,80,70,60,50,40,30,25,20,15,10]:
    n_feats = topN
    for i in range(num_folds): 
        results = {'fold':i,'Top_n_feats':n_feats,'n_Dense':n_Dense}
        rocs_dic = load_one_keras_fcnn_pred_ext_files(n_Dense,i,ext_files)
        results.update(rocs_dic)
        performance.append(results)
        pd.DataFrame([results]).to_csv('esb/esb_rmSim_append.csv',mode='a')
pd.DataFrame(performance).to_csv('esb/esb_rmSim.csv',mode='a') 

processing n_Dense:800, i: 0, ext_base:train
true_file =  rand_/fold_0/train_full.csv
train_roc_auc =  0.9999946405246773
processing n_Dense:800, i: 0, ext_base:val
true_file =  rand_/fold_0/val_full.csv
val_roc_auc =  0.9313766596785465
processing n_Dense:800, i: 0, ext_base:test
true_file =  rand_/fold_0/test_full.csv
test_roc_auc =  0.974264705882353
processing n_Dense:800, i: 0, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9328253720165414
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.03703703703703705
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.7926315789473684
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8549295774647887
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9218400162833299
processing n_Dense:80

true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9248931406472624
processing n_Dense:800, i: 1, ext_base:train
true_file =  rand_/fold_1/train_full.csv
train_roc_auc =  0.9998139757671428
processing n_Dense:800, i: 1, ext_base:val
true_file =  rand_/fold_1/val_full.csv
val_roc_auc =  0.927245492965281
processing n_Dense:800, i: 1, ext_base:test
true_file =  rand_/fold_1/test_full.csv
test_roc_auc =  0.9092156862745098
processing n_Dense:800, i: 1, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.940705284751392
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.1111111111111111
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.7821052631578947
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8896713615023474
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim

true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8630046948356807
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9253409322206392
processing n_Dense:800, i: 2, ext_base:train
true_file =  rand_/fold_2/train_full.csv
train_roc_auc =  0.9999741226040576
processing n_Dense:800, i: 2, ext_base:val
true_file =  rand_/fold_2/val_full.csv
val_roc_auc =  0.954391891891892
processing n_Dense:800, i: 2, ext_base:test
true_file =  rand_/fold_2/test_full.csv
test_roc_auc =  0.9662337662337662
processing n_Dense:800, i: 2, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9386213408876299
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.0
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.7905263157894736
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_r

Ext_rm_sim_0_7_roc_auc =  0.7789473684210527
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8710798122065728
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9307551394260127
processing n_Dense:800, i: 3, ext_base:train
true_file =  rand_/fold_3/train_full.csv
train_roc_auc =  0.9999724369840547
processing n_Dense:800, i: 3, ext_base:val
true_file =  rand_/fold_3/val_full.csv
val_roc_auc =  0.9345029239766082
processing n_Dense:800, i: 3, ext_base:test
true_file =  rand_/fold_3/test_full.csv
test_roc_auc =  0.9262172284644195
processing n_Dense:800, i: 3, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9430171599752531
processing n_Dense:800, i: 3, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.0740740740740741
processing n_Dense:800, i: 3, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_au

Ext_roc_auc =  0.9125720425906028
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.0
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.736842105263158
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8297652582159625
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9083248524323225
processing n_Dense:800, i: 0, ext_base:train
true_file =  rand_/fold_0/train_full.csv
train_roc_auc =  0.9999969374426728
processing n_Dense:800, i: 0, ext_base:val
true_file =  rand_/fold_0/val_full.csv
val_roc_auc =  0.9328674586536222
processing n_Dense:800, i: 0, ext_base:test
true_file =  rand_/fold_0/test_full.csv
test_roc_auc =  0.972156862745098
processing n_Dense:800, i: 0, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.929699456220898
proce

test_roc_auc =  0.9734803921568627
processing n_Dense:800, i: 0, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9346488228973332
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.03703703703703705
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.7736842105263159
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8659154929577465
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9235904742519845
processing n_Dense:800, i: 1, ext_base:train
true_file =  rand_/fold_1/train_full.csv
train_roc_auc =  0.9999405023708675
processing n_Dense:800, i: 1, ext_base:val
true_file =  rand_/fold_1/val_full.csv
val_roc_auc =  0.9474134702829937
processing n_Dense:800, i: 1, ext_base:test
true_file =  rand_/fold_1/test_full.csv
test_roc_auc =  0.93

train_roc_auc =  0.9999718393044156
processing n_Dense:800, i: 2, ext_base:val
true_file =  rand_/fold_2/val_full.csv
val_roc_auc =  0.9680544515103338
processing n_Dense:800, i: 2, ext_base:test
true_file =  rand_/fold_2/test_full.csv
test_roc_auc =  0.9712842712842713
processing n_Dense:800, i: 2, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9420728729119858
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.11111111111111116
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.8021052631578948
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8754929577464788
processing n_Dense:800, i: 2, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9321799307958477
processing n_Dense:800, i: 3, ext_base:train
true_file =  rand_/fold_3/train_full.csv
train_roc_auc =  0.9

Ext_rm_sim_0_9_roc_auc =  0.9379605129248931
processing n_Dense:800, i: 3, ext_base:train
true_file =  rand_/fold_3/train_full.csv
train_roc_auc =  0.9997894491837519
processing n_Dense:800, i: 3, ext_base:val
true_file =  rand_/fold_3/val_full.csv
val_roc_auc =  0.9376218323586745
processing n_Dense:800, i: 3, ext_base:test
true_file =  rand_/fold_3/test_full.csv
test_roc_auc =  0.9312734082397004
processing n_Dense:800, i: 3, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9459151444107974
processing n_Dense:800, i: 3, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.14814814814814817
processing n_Dense:800, i: 3, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.7852631578947369
processing n_Dense:800, i: 3, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.883849765258216
processing n_Dense:800, i: 3, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_au

Ext_rm_sim_0_8_roc_auc =  0.9046009389671361
processing n_Dense:800, i: 3, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9466720944433137
processing n_Dense:800, i: 4, ext_base:train
true_file =  rand_/fold_4/train_full.csv
train_roc_auc =  0.9998340248962655
processing n_Dense:800, i: 4, ext_base:val
true_file =  rand_/fold_4/val_full.csv
val_roc_auc =  0.9389421894218942
processing n_Dense:800, i: 4, ext_base:test
true_file =  rand_/fold_4/test_full.csv
test_roc_auc =  0.9577675176534493
processing n_Dense:800, i: 4, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9371886294812933
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.29629629629629634
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.8010526315789475
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_a

Ext_rm_sim_0_7_roc_auc =  0.8200000000000001
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.9012206572769953
processing n_Dense:800, i: 4, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9429269285568899
processing n_Dense:800, i: 0, ext_base:train
true_file =  rand_/fold_0/train_full.csv
train_roc_auc =  0.9994717088610503
processing n_Dense:800, i: 0, ext_base:val
true_file =  rand_/fold_0/val_full.csv
val_roc_auc =  0.929140461215933
processing n_Dense:800, i: 0, ext_base:test
true_file =  rand_/fold_0/test_full.csv
test_roc_auc =  0.9729901960784314
processing n_Dense:800, i: 0, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9480967731431735
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_auc =  0.37037037037037035
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_au

Ext_rm_sim_0_6_roc_auc =  0.22222222222222224
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_7
true_file =  Ext_rm_sim_0_7.csv
Ext_rm_sim_0_7_roc_auc =  0.7105263157894737
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_8
true_file =  Ext_rm_sim_0_8.csv
Ext_rm_sim_0_8_roc_auc =  0.8636619718309859
processing n_Dense:800, i: 0, ext_base:Ext_rm_sim_0_9
true_file =  Ext_rm_sim_0_9.csv
Ext_rm_sim_0_9_roc_auc =  0.9222878078567067
processing n_Dense:800, i: 1, ext_base:train
true_file =  rand_/fold_1/train_full.csv
train_roc_auc =  0.9998358166689763
processing n_Dense:800, i: 1, ext_base:val
true_file =  rand_/fold_1/val_full.csv
val_roc_auc =  0.9401915155405768
processing n_Dense:800, i: 1, ext_base:test
true_file =  rand_/fold_1/test_full.csv
test_roc_auc =  0.928578431372549
processing n_Dense:800, i: 1, ext_base:Ext
true_file =  Ext.csv
Ext_roc_auc =  0.9353651786005014
processing n_Dense:800, i: 1, ext_base:Ext_rm_sim_0_6
true_file =  Ext_rm_sim_0_6.csv
Ext_rm_sim_0_6_roc_au

## voting

In [33]:
def voting_np_array(arr):
    results=[]
    m,n=arr.shape
    half_n = n/2
    for i in range(m):
        if len(arr[i][arr[i]<0.5])>half_n: 
            results.append(0)
        else:
            results.append(1)
    return results

In [46]:
# save prob and return roc_auc
def voting_one_base_one_fold(i,ext_base):
    ## prediction
    y_prob_e = voting_np_array(X_e) 
    e_roc_auc = ROC_AUC(y_e,y_prob_e)
    pd.DataFrame(y_prob_e,columns=['prob']).to_csv(f'voting/p_{ext_base}_Top{n_feats}_fold{i}.csv',index=False)
    return e_roc_auc

In [47]:
if not os.path.exists('voting'):os.mkdir('voting')
ext_bases = ['train','val','test','Ext','Ext_rm_sim_0_6','Ext_rm_sim_0_7',
             'Ext_rm_sim_0_8','Ext_rm_sim_0_9']
performance = []
num_folds = 5
for topN in [140,130,120,100,90,80,70,60,50,40,30,25,20,15,10]:
    n_feats = topN
    for i in range(num_folds): 
        results = {'fold':i,'Top_n_feats':n_feats,}
        for ext_base in ext_bases:
            print(f'processing topN:{topN}, i: {i}')
            X_e = fetch_X_train_val_test_Ext_topN(ext_base,i,topN)
            y_e = fetch_y_train_val_test_Ext(ext_base,i)
            print('n_feats, X_e.shape = ', n_feats, X_e.shape)
            n_feats = X_e.shape[1]
            assert n_feats==topN
            e_roc_auc = voting_one_base_one_fold(i,ext_base)
            results.update({ext_base:e_roc_auc})
        performance.append(results)
        pd.DataFrame([results]).to_csv('voting/vot_append_topN.csv',mode='a')
pd.DataFrame(performance).to_csv('voting/vot_topN.csv',mode='a')    

processing topN:140, i: 0
true_file =  rand_/fold_0/train_full.csv
n_feats, X_e.shape =  140 (2769, 140)
processing topN:140, i: 0
true_file =  rand_/fold_0/val_full.csv
n_feats, X_e.shape =  140 (346, 140)
processing topN:140, i: 0
true_file =  rand_/fold_0/test_full.csv
n_feats, X_e.shape =  140 (347, 140)
processing topN:140, i: 0
true_file =  Ext.csv
n_feats, X_e.shape =  140 (440, 140)
processing topN:140, i: 0
true_file =  Ext_rm_sim_0_6.csv
n_feats, X_e.shape =  140 (12, 140)
processing topN:140, i: 0
true_file =  Ext_rm_sim_0_7.csv
n_feats, X_e.shape =  140 (63, 140)
processing topN:140, i: 0
true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  140 (221, 140)
processing topN:140, i: 0
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  140 (374, 140)
processing topN:140, i: 1
true_file =  rand_/fold_1/train_full.csv
n_feats, X_e.shape =  140 (2769, 140)
processing topN:140, i: 1
true_file =  rand_/fold_1/val_full.csv
n_feats, X_e.shape =  140 (346, 140)
processing topN:140, 

true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  120 (221, 120)
processing topN:120, i: 0
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  120 (374, 120)
processing topN:120, i: 1
true_file =  rand_/fold_1/train_full.csv
n_feats, X_e.shape =  120 (2769, 120)
processing topN:120, i: 1
true_file =  rand_/fold_1/val_full.csv
n_feats, X_e.shape =  120 (346, 120)
processing topN:120, i: 1
true_file =  rand_/fold_1/test_full.csv
n_feats, X_e.shape =  120 (347, 120)
processing topN:120, i: 1
true_file =  Ext.csv
n_feats, X_e.shape =  120 (440, 120)
processing topN:120, i: 1
true_file =  Ext_rm_sim_0_6.csv
n_feats, X_e.shape =  120 (12, 120)
processing topN:120, i: 1
true_file =  Ext_rm_sim_0_7.csv
n_feats, X_e.shape =  120 (63, 120)
processing topN:120, i: 1
true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  120 (221, 120)
processing topN:120, i: 1
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  120 (374, 120)
processing topN:120, i: 2
true_file =  rand_/fold_2/train_full.c

true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  90 (221, 90)
processing topN:90, i: 1
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  90 (374, 90)
processing topN:90, i: 2
true_file =  rand_/fold_2/train_full.csv
n_feats, X_e.shape =  90 (2769, 90)
processing topN:90, i: 2
true_file =  rand_/fold_2/val_full.csv
n_feats, X_e.shape =  90 (346, 90)
processing topN:90, i: 2
true_file =  rand_/fold_2/test_full.csv
n_feats, X_e.shape =  90 (347, 90)
processing topN:90, i: 2
true_file =  Ext.csv
n_feats, X_e.shape =  90 (440, 90)
processing topN:90, i: 2
true_file =  Ext_rm_sim_0_6.csv
n_feats, X_e.shape =  90 (12, 90)
processing topN:90, i: 2
true_file =  Ext_rm_sim_0_7.csv
n_feats, X_e.shape =  90 (63, 90)
processing topN:90, i: 2
true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  90 (221, 90)
processing topN:90, i: 2
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  90 (374, 90)
processing topN:90, i: 3
true_file =  rand_/fold_3/train_full.csv
n_feats, X_e.shape =  90 (2

true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  70 (374, 70)
processing topN:70, i: 3
true_file =  rand_/fold_3/train_full.csv
n_feats, X_e.shape =  70 (2769, 70)
processing topN:70, i: 3
true_file =  rand_/fold_3/val_full.csv
n_feats, X_e.shape =  70 (346, 70)
processing topN:70, i: 3
true_file =  rand_/fold_3/test_full.csv
n_feats, X_e.shape =  70 (347, 70)
processing topN:70, i: 3
true_file =  Ext.csv
n_feats, X_e.shape =  70 (440, 70)
processing topN:70, i: 3
true_file =  Ext_rm_sim_0_6.csv
n_feats, X_e.shape =  70 (12, 70)
processing topN:70, i: 3
true_file =  Ext_rm_sim_0_7.csv
n_feats, X_e.shape =  70 (63, 70)
processing topN:70, i: 3
true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  70 (221, 70)
processing topN:70, i: 3
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  70 (374, 70)
processing topN:70, i: 4
true_file =  rand_/fold_4/train_full.csv
n_feats, X_e.shape =  70 (2769, 70)
processing topN:70, i: 4
true_file =  rand_/fold_4/val_full.csv
n_feats, X_e.shape 

true_file =  rand_/fold_4/train_full.csv
n_feats, X_e.shape =  50 (2769, 50)
processing topN:50, i: 4
true_file =  rand_/fold_4/val_full.csv
n_feats, X_e.shape =  50 (346, 50)
processing topN:50, i: 4
true_file =  rand_/fold_4/test_full.csv
n_feats, X_e.shape =  50 (347, 50)
processing topN:50, i: 4
true_file =  Ext.csv
n_feats, X_e.shape =  50 (440, 50)
processing topN:50, i: 4
true_file =  Ext_rm_sim_0_6.csv
n_feats, X_e.shape =  50 (12, 50)
processing topN:50, i: 4
true_file =  Ext_rm_sim_0_7.csv
n_feats, X_e.shape =  50 (63, 50)
processing topN:50, i: 4
true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  50 (221, 50)
processing topN:50, i: 4
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  50 (374, 50)
processing topN:40, i: 0
true_file =  rand_/fold_0/train_full.csv
n_feats, X_e.shape =  40 (2769, 40)
processing topN:40, i: 0
true_file =  rand_/fold_0/val_full.csv
n_feats, X_e.shape =  40 (346, 40)
processing topN:40, i: 0
true_file =  rand_/fold_0/test_full.csv
n_feats, X_

true_file =  rand_/fold_0/val_full.csv
n_feats, X_e.shape =  25 (346, 25)
processing topN:25, i: 0
true_file =  rand_/fold_0/test_full.csv
n_feats, X_e.shape =  25 (347, 25)
processing topN:25, i: 0
true_file =  Ext.csv
n_feats, X_e.shape =  25 (440, 25)
processing topN:25, i: 0
true_file =  Ext_rm_sim_0_6.csv
n_feats, X_e.shape =  25 (12, 25)
processing topN:25, i: 0
true_file =  Ext_rm_sim_0_7.csv
n_feats, X_e.shape =  25 (63, 25)
processing topN:25, i: 0
true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  25 (221, 25)
processing topN:25, i: 0
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  25 (374, 25)
processing topN:25, i: 1
true_file =  rand_/fold_1/train_full.csv
n_feats, X_e.shape =  25 (2769, 25)
processing topN:25, i: 1
true_file =  rand_/fold_1/val_full.csv
n_feats, X_e.shape =  25 (346, 25)
processing topN:25, i: 1
true_file =  rand_/fold_1/test_full.csv
n_feats, X_e.shape =  25 (347, 25)
processing topN:25, i: 1
true_file =  Ext.csv
n_feats, X_e.shape =  25 (440, 2

true_file =  rand_/fold_1/test_full.csv
n_feats, X_e.shape =  15 (347, 15)
processing topN:15, i: 1
true_file =  Ext.csv
n_feats, X_e.shape =  15 (440, 15)
processing topN:15, i: 1
true_file =  Ext_rm_sim_0_6.csv
n_feats, X_e.shape =  15 (12, 15)
processing topN:15, i: 1
true_file =  Ext_rm_sim_0_7.csv
n_feats, X_e.shape =  15 (63, 15)
processing topN:15, i: 1
true_file =  Ext_rm_sim_0_8.csv
n_feats, X_e.shape =  15 (221, 15)
processing topN:15, i: 1
true_file =  Ext_rm_sim_0_9.csv
n_feats, X_e.shape =  15 (374, 15)
processing topN:15, i: 2
true_file =  rand_/fold_2/train_full.csv
n_feats, X_e.shape =  15 (2769, 15)
processing topN:15, i: 2
true_file =  rand_/fold_2/val_full.csv
n_feats, X_e.shape =  15 (346, 15)
processing topN:15, i: 2
true_file =  rand_/fold_2/test_full.csv
n_feats, X_e.shape =  15 (347, 15)
processing topN:15, i: 2
true_file =  Ext.csv
n_feats, X_e.shape =  15 (440, 15)
processing topN:15, i: 2
true_file =  Ext_rm_sim_0_6.csv
n_feats, X_e.shape =  15 (12, 15)
proce

# binarize prob in layer 2

In [48]:
arr = np.array([[0.9,0.8,0.1],[0.6,0.4,0.3]])
print(arr)
arr1=np.ones_like(arr)
arr1[arr<0.5]=0
arr1=arr1.astype(int)
print(arr1)

[[0.9 0.8 0.1]
 [0.6 0.4 0.3]]
[[1 1 0]
 [1 0 0]]


In [49]:
def fetch_X_train_val_test_Ext_topN_prob_to_0_or_1(data_type,fold,topN,threshold=0.5):
    #i = 0;b = data_type = 'train'
    i = fold; b = data_type
    y_file = f'rand_/fold_{i}/{b}_full.csv'    
    X_train_files = []
    X_train_dfl = []
    for model,feature in zip(models[:topN],features[:topN]):
        pred_file = f'{model}/p_{data_type}_{feature}_fold{i}.csv'
        X_train_files.append(pred_file)
        if model=='DMPNN':X_train_dfl.append(pd.read_csv(pred_file).iloc[:,1])
        else:X_train_dfl.append(pd.read_csv(pred_file))
    #print('X_train_files = ',X_train_files)
    X_train_df = pd.concat(X_train_dfl,axis=1)
    X_train = X_train_df.values
    arr1=np.ones_like(X_train)
    #arr1[X_train<0.5]=0
    arr1[X_train<threshold]=0
    arr1=arr1.astype(int)
    #print(arr1)
    return arr1

In [50]:
# seem the keras model can only run once inside a function, otherwise will get GPU errors.
def run_one_keras_fcnn_for_01(n_Dense,i):
    model = Sequential()
    model.add(Dense(n_Dense, input_shape=(n_feats,), activation='relu'),)
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    #print(model.summary)
    #get X and Y (main set and external set)
    y_train_2dim=y_train[:,np.newaxis]  ### change shape from(2769,) to (2769, 1)
    y_val_2dim = y_val[:,np.newaxis]
    print(y_train_2dim.shape)
    #model.fit(X_T, y_T_2dim,batch_size=4096,verbose=0, epochs=100)
    ### using early stop by monitor val_loss of X_valid, y_valid_2dim
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    model.fit(X_train, y_train_2dim, batch_size=4096, verbose=0, epochs=200, 
              validation_data=(X_val, y_val_2dim), callbacks=[early_stopping])    
    ## prediction
    y_prob_train = model.predict(X_train) 
    train_roc_auc = ROC_AUC(y_train,y_prob_train[:, 0])
    print('train_roc_auc = ', train_roc_auc )
    y_prob_val=model.predict(X_val)  ## no need other parameters here
    val_roc_auc = ROC_AUC(y_val,y_prob_val[:, 0])
    y_prob_test=model.predict(X_test)  
    test_roc_auc = ROC_AUC(y_test,y_prob_test[:, 0])
    print('test_roc_auc, val_roc_auc = ', test_roc_auc, val_roc_auc)
    y_prob_Ext=model.predict(X_Ext)  ## no need other parameters here
    Ext_roc_auc = ROC_AUC(y_Ext,y_prob_Ext[:, 0])
    print('Ext_roc_auc = ', Ext_roc_auc)    
    ## save predition
    if not os.path.exists(f'esb_01/{n_Dense}'):os.mkdir(f'esb_01/{n_Dense}')
    pd.DataFrame(y_prob_train[:,0],columns=['prob']).to_csv(f'esb_01/{n_Dense}/p_train_Top{n_feats}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_val[:, 0],columns=['prob']).to_csv(f'esb_01/{n_Dense}/p_val_Top{n_feats}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_test[:,0],columns=['prob']).to_csv(f'esb_01/{n_Dense}/p_test_Top{n_feats}_fold{i}.csv',index=False)
    pd.DataFrame(y_prob_Ext[:,0],columns=['prob']).to_csv(f'esb_01/{n_Dense}/p_Ext_Top{n_feats}_fold{i}.csv',index=False)   
    ## save model
    model_save_name = f'esb_01/{n_Dense}/model_esbTop{n_feats}_fold{i}.h5'
    model.save(model_save_name)    
    model = ''
    return train_roc_auc,val_roc_auc,test_roc_auc,Ext_roc_auc

In [51]:
if not os.path.exists('esb_01'):os.mkdir('esb_01')
performance = []
num_folds = 5
n_Dense = 800
#for topN in [50,40,30,25,20,15,10]:
topN = 20
thresholds = [0.3,0.4,0.5,0.6,0.7] 
for threshold in thresholds:
    for i in range(num_folds): 
        print(f'processing topN:{topN}, i: {i}')
        X_train = fetch_X_train_val_test_Ext_topN_prob_to_0_or_1('train',i,topN)
        y_train = fetch_y_train_val_test_Ext('train',i)
        X_val = fetch_X_train_val_test_Ext_topN_prob_to_0_or_1('val',i,topN)
        y_val = fetch_y_train_val_test_Ext('val',i)
        X_test = fetch_X_train_val_test_Ext_topN_prob_to_0_or_1('test',i,topN)
        y_test = fetch_y_train_val_test_Ext('test',i)
        X_Ext = fetch_X_train_val_test_Ext_topN_prob_to_0_or_1('Ext',i,topN)
        y_Ext = fetch_y_train_val_test_Ext('Ext',i)
        n_feats = X_train.shape[1]
        assert n_feats==topN
        train_roc_auc,val_roc_auc,test_roc_auc,Ext_roc_auc = run_one_keras_fcnn_for_01(n_Dense,i)
        results = {'threshold':threshold,'fold':i,'Top_n_feats':n_feats,'n_Dense':n_Dense,'train_roc_auc':train_roc_auc,
        'val_roc_auc':val_roc_auc,"test_roc_auc":test_roc_auc,'Ext_roc_auc':Ext_roc_auc}
        print('results = ',results)
        performance.append(results)
        pd.DataFrame([results]).to_csv('esb_01/esb_append_topN.csv',mode='a')
pd.DataFrame(performance).to_csv('esb_01/esb_topN.csv',mode='a')   

processing topN:20, i: 0
true_file =  rand_/fold_0/train_full.csv
true_file =  rand_/fold_0/val_full.csv
true_file =  rand_/fold_0/test_full.csv
true_file =  Ext.csv


2023-01-27 01:18:26.631159: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-27 01:18:28.812118: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8089 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:25:00.0, compute capability: 8.6
2023-01-27 01:18:28.813588: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 8089 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:61:00.0, compute capability: 8.6
2023-01-27 01:18:28.814862: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/de

(2769, 1)


2023-01-27 01:18:30.174038: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


train_roc_auc =  0.9998246685930152
test_roc_auc, val_roc_auc =  0.9563235294117647 0.918169112508735
Ext_roc_auc =  0.924294226824265
results =  {'threshold': 0.3, 'fold': 0, 'Top_n_feats': 20, 'n_Dense': 800, 'train_roc_auc': 0.9998246685930152, 'val_roc_auc': 0.918169112508735, 'test_roc_auc': 0.9563235294117647, 'Ext_roc_auc': 0.924294226824265}
processing topN:20, i: 1
true_file =  rand_/fold_1/train_full.csv
true_file =  rand_/fold_1/val_full.csv
true_file =  rand_/fold_1/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9989907997083864
test_roc_auc, val_roc_auc =  0.8992401960784314 0.9151286577863371
Ext_roc_auc =  0.9347790693888184
results =  {'threshold': 0.3, 'fold': 1, 'Top_n_feats': 20, 'n_Dense': 800, 'train_roc_auc': 0.9989907997083864, 'val_roc_auc': 0.9151286577863371, 'test_roc_auc': 0.8992401960784314, 'Ext_roc_auc': 0.9347790693888184}
processing topN:20, i: 2
true_file =  rand_/fold_2/train_full.csv
true_file =  rand_/fold_2/val_full.csv
true_file =

(2769, 1)
train_roc_auc =  0.9990005904574841
test_roc_auc, val_roc_auc =  0.8991421568627451 0.914326218370513
Ext_roc_auc =  0.9347790693888184
results =  {'threshold': 0.5, 'fold': 1, 'Top_n_feats': 20, 'n_Dense': 800, 'train_roc_auc': 0.9990005904574841, 'val_roc_auc': 0.914326218370513, 'test_roc_auc': 0.8991421568627451, 'Ext_roc_auc': 0.9347790693888184}
processing topN:20, i: 2
true_file =  rand_/fold_2/train_full.csv
true_file =  rand_/fold_2/val_full.csv
true_file =  rand_/fold_2/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.9988758554762658
test_roc_auc, val_roc_auc =  0.9571669071669073 0.9476599761526231
Ext_roc_auc =  0.9342418026114422
results =  {'threshold': 0.5, 'fold': 2, 'Top_n_feats': 20, 'n_Dense': 800, 'train_roc_auc': 0.9988758554762658, 'val_roc_auc': 0.9476599761526231, 'test_roc_auc': 0.9571669071669073, 'Ext_roc_auc': 0.9342418026114422}
processing topN:20, i: 3
true_file =  rand_/fold_3/train_full.csv
true_file =  rand_/fold_3/val_full.csv

train_roc_auc =  0.9988933607735211
test_roc_auc, val_roc_auc =  0.9568783068783069 0.9481071144674086
Ext_roc_auc =  0.9321252971248087
results =  {'threshold': 0.7, 'fold': 2, 'Top_n_feats': 20, 'n_Dense': 800, 'train_roc_auc': 0.9988933607735211, 'val_roc_auc': 0.9481071144674086, 'test_roc_auc': 0.9568783068783069, 'Ext_roc_auc': 0.9321252971248087}
processing topN:20, i: 3
true_file =  rand_/fold_3/train_full.csv
true_file =  rand_/fold_3/val_full.csv
true_file =  rand_/fold_3/test_full.csv
true_file =  Ext.csv
(2769, 1)
train_roc_auc =  0.998892885526201
test_roc_auc, val_roc_auc =  0.907443820224719 0.9024122807017544
Ext_roc_auc =  0.9516948324704503
results =  {'threshold': 0.7, 'fold': 3, 'Top_n_feats': 20, 'n_Dense': 800, 'train_roc_auc': 0.998892885526201, 'val_roc_auc': 0.9024122807017544, 'test_roc_auc': 0.907443820224719, 'Ext_roc_auc': 0.9516948324704503}
processing topN:20, i: 4
true_file =  rand_/fold_4/train_full.csv
true_file =  rand_/fold_4/val_full.csv
true_file =

In [52]:
# free GPU memory
from numba import cuda 
device = cuda.get_current_device()
device.reset()
cuda.close()