# Importing Library

In [84]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras
from keras import optimizers
from keras import backend as K
from keras.layers import Input, Dense, Dropout, Input, Activation, BatchNormalization
from keras.callbacks import EarlyStopping
from keras.models import Model, load_model, Sequential 



# Helper libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import tree
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
import os
from os import listdir

np.random.seed(777)

print(tf.__version__)

1.5.1


##  Functions library

In [73]:
def Stacking(model,train,y,test,n_fold):
    folds=StratifiedKFold(n_splits=n_fold,random_state=1)
    test_pred=np.empty((test.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val=train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]
        model.fit(x_train,y_train)
        
        train_pred=np.append(train_pred,model.predict(x_val))
        test_pred=np.append(test_pred,model.predict(test))
    return test_pred.reshape(-1,1),train_pred

# 1. Preparation: import & preprocessing data

## Input path & name of raw data for ensemble

In [86]:
# change model_path & each model_name.

types = ["OV_six_fold_Annotation3000_400", 
         "OV_six_fold_CV_400", 
         "OV_six_fold_Var_400", 
         "OV_six_fold_new_Diff_400",
         "OV_six_fold_Clin", 
         "OV_six_fold_SNV_400" 
         ]

input_path = "../TC_six_fold_subsamples/"
save_model_path = "../models/"
save_prediction_path = "../predictions/"
save_result_path = "../result/"

## Import Data

In [152]:
file_1 = path+types[0]+".csv"
file_2 = path+types[1]+".csv"
file_3 = path+types[2]+".csv"
file_4 = path+types[3]+".csv"
file_5 = path+types[4]+".csv"
file_6 = path+types[5]+".csv"

idx_col = 0

full_data_1 = pd.read_csv(file_1,index_col=idx_col)
full_data_2 = pd.read_csv(file_2,index_col=idx_col)
full_data_3 = pd.read_csv(file_3,index_col=idx_col)
full_data_4 = pd.read_csv(file_4,index_col=idx_col)
full_data_5 = pd.read_csv(file_5,index_col=idx_col)
full_data_6 = pd.read_csv(file_6,index_col=idx_col)

inter_data_1 = full_data_1.iloc[list(full_data_1.iloc[:,-1]!=6)]
inter_data_2 = full_data_2.iloc[list(full_data_2.iloc[:,-1]!=6)]
inter_data_3 = full_data_3.iloc[list(full_data_3.iloc[:,-1]!=6)]
inter_data_4 = full_data_4.iloc[list(full_data_4.iloc[:,-1]!=6)]
inter_data_5 = full_data_5.iloc[list(full_data_5.iloc[:,-1]!=6)]
inter_data_6 = full_data_6.iloc[list(full_data_6.iloc[:,-1]!=6)]

full_ds_list = [full_data_1, full_data_2, full_data_3, full_data_4, full_data_5, full_data_6]
inter_ds_list = [inter_data_1, inter_data_2, inter_data_3, inter_data_4, inter_data_5, inter_data_6]

# Split Train Test Data & Make full & inter dataset
full_datasets=[]
inter_datasets=[]

for ts_i in range(1,6):
    full_dataset = {"types":[], "tr_data":[], "ts_data":[], "tr_y_val":[], "tr_x_val":[], "ts_y_val":[], "ts_x_val":[]}
    inter_dataset = {"types":[], "tr_data":[], "ts_data":[], "tr_y_val":[], "tr_x_val":[], "ts_y_val":[], "ts_x_val":[]}
    print("################################## test index is "+str(ts_i)+" ##################################")
    for t in range(len(types)):    
        full_tr_data, full_ts_data, full_tr_y_val, full_tr_x_val, full_ts_y_val, full_ts_x_val = data_split(raw_data = full_ds_list[t], index_col = -1, test_index = ts_i)
        print("["+str(t)+"]: "+types[t]+".\nfull tr & ts: "+str(full_tr_x_val.shape)+", "+str(full_ts_x_val.shape))
        full_dataset['types'].append(types[t])
        full_dataset['tr_data'].append(full_tr_data)
        full_dataset['ts_data'].append(full_ts_data)
        full_dataset['tr_x_val'].append(full_tr_x_val)
        full_dataset['tr_y_val'].append(full_tr_y_val)
        full_dataset['ts_x_val'].append(full_ts_x_val)
        full_dataset['ts_y_val'].append(full_ts_y_val)
        inter_tr_data, inter_ts_data, inter_tr_y_val, inter_tr_x_val, inter_ts_y_val, inter_ts_x_val = data_split(raw_data = inter_ds_list[t], index_col = -1, test_index = ts_i)
        print("inter tr & ts: "+str(inter_tr_x_val.shape)+", "+str(inter_ts_x_val.shape)+"\n")
        inter_dataset['types'].append(types[t])
        inter_dataset['tr_data'].append(inter_tr_data)
        inter_dataset['ts_data'].append(inter_ts_data)
        inter_dataset['tr_x_val'].append(inter_tr_x_val)
        inter_dataset['tr_y_val'].append(inter_tr_y_val)
        inter_dataset['ts_x_val'].append(inter_ts_x_val)
        inter_dataset['ts_y_val'].append(inter_ts_y_val)  
    full_datasets.append(full_dataset)
    inter_datasets.append(inter_dataset)
    

#  


################################## test index is 1 ##################################
[0]: OV_six_fold_Annotation3000_400.
full tr & ts: (186, 400), (31, 400)
inter tr & ts: (122, 400), (31, 400)

[1]: OV_six_fold_CV_400.
full tr & ts: (186, 400), (31, 400)
inter tr & ts: (122, 400), (31, 400)

[2]: OV_six_fold_Var_400.
full tr & ts: (186, 400), (31, 400)
inter tr & ts: (122, 400), (31, 400)

[3]: OV_six_fold_new_Diff_400.
full tr & ts: (186, 400), (31, 400)
inter tr & ts: (122, 400), (31, 400)

[4]: OV_six_fold_Clin.
full tr & ts: (256, 35), (31, 35)
inter tr & ts: (122, 35), (31, 35)

[5]: OV_six_fold_SNV_400.
full tr & ts: (182, 402), (31, 402)
inter tr & ts: (122, 402), (31, 402)

################################## test index is 2 ##################################
[0]: OV_six_fold_Annotation3000_400.
full tr & ts: (186, 400), (31, 400)
inter tr & ts: (122, 400), (31, 400)

[1]: OV_six_fold_CV_400.
full tr & ts: (186, 400), (31, 400)
inter tr & ts: (122, 400), (31, 400)

[2]: OV_si

In [164]:
# datasets description
print("Each datasets variables (full_datasets & inter_datasets) have 5 splited data by each test index.\n-> "+str(len(full_datasets)))
print("There are 6 data types in one dataset that devided by one test index. Each data type consist of 7 list.\n-> "+str(len(full_datasets[0])))
print("-> "+str(full_datasets[0].keys()))
print("Each list has real values to train, test, and others.\n-> "+str(full_datasets[0]['tr_x_val'].shape))

Each datasets variables (full_datasets & inter_datasets) have 5 splited data by each test index.
-> 5
There are 6 data types in one dataset that devided by one test index. Each data type consist of 7 list.
-> 7
-> dict_keys(['ts_x_val', 'types', 'tr_x_val', 'ts_y_val', 'tr_y_val', 'tr_data', 'ts_data'])


AttributeError: 'list' object has no attribute 'shape'

In [157]:
full_datasets[0].keys()

dict_keys(['ts_x_val', 'types', 'tr_x_val', 'ts_y_val', 'tr_y_val', 'tr_data', 'ts_data'])

In [151]:
full_datasets[0]['tr_x_val'][2].shape

KeyError: 0

In [141]:
ts_i = 0
t = 0

In [144]:
print(full_datasets[ts_i]['types'][t])
print(full_datasets[ts_i]['tr_x_val'][t].shape)

OV_six_fold_Annotation3000_400
(186, 400)


In [97]:
for i in range(1,6):
    print(i)

1
2
3
4
5


In [24]:
inter_newDiff_dataset = {"tr_data":[], "ts_data":[], "tr_y_val":[], "tr_x_val":[], "ts_y_val":[], "ts_x_val":[]}
inter_tr_data, inter_ts_data, inter_tr_y_val, inter_tr_x_val, inter_ts_y_val, inter_ts_x_val = data_split(raw_data = inter_ds_list[3], index_col = -1, test_index = ts_i)
inter_newDiff_dataset['tr_data']= inter_tr_data
inter_newDiff_dataset['ts_data']= inter_ts_data
inter_newDiff_dataset['tr_x_val']= inter_tr_x_val
inter_newDiff_dataset['tr_y_val']= inter_tr_y_val
inter_newDiff_dataset['ts_x_val']= inter_ts_x_val
inter_newDiff_dataset['ts_y_val']= inter_ts_y_val    
#inter_new_Diff_dataset = inter_dataset[1]

## Import separate models & evaluation

In [25]:
# model load & evaluation. <model_n_l> is full-layer model, <model_n_l_new> is without-sigmoid-layer model.
'''
Each model's tr_accuracy can be differ to original model, but ts_accuracy should be same to original tested models.
Because we using full-size data(about 200 patients data used Transcriptome, Clinical, SNV models.) for train each models.
In contrast, in this code, we using ensemble-input data(intersected 153 patients).
For-training-patients may be different in ensemble data and whole size data, but for-test-patients are the same.
'''

model_list = []
model_output_list = {"tr_accuracy":[], "tr_sensitivity":[], "tr_specificity":[], "tr_predictions":[],
                 "labeled_tr_predictions":[], "tr_predictions_flat":[], "roc_auc_tr":[], 
                 "ts_accuracy":[], "ts_sensitivity":[], "ts_specificity":[], "ts_predictions":[],
                 "labeled_ts_predictions":[], "ts_predictions_flat":[], "roc_auc_ts":[], 
                 "roc_auc_total":[], "tr_result":[], "ts_result":[]}
tr_predictions = []
ts_predictions = []

for m in range(len(model_names)):
    
    model_l = load_model(input_model_path+model_names[m]+".h5")
    model_list.append(model_l)
    output_list = output_list = model_performance(
        information = False, using_model=model_l,Input_Prediction_Passively = False, 
        tr_x_val=inter_dataset['tr_x_val'][m], tr_y_val=inter_dataset['tr_y_val'][m], ts_x_val=inter_dataset['ts_x_val'][m], ts_y_val=inter_dataset['ts_y_val'][m],
        output_list=["tr_accuracy", "tr_sensitivity", "tr_specificity", "tr_predictions",
                     "labeled_tr_predictions", "tr_predictions_flat", "roc_auc_tr", 
                     "ts_accuracy", "ts_sensitivity", "ts_specificity", "ts_predictions",
                     "labeled_ts_predictions", "ts_predictions_flat", "roc_auc_ts", 
                     "roc_auc_total"])
    m_tr_accuracy, m_tr_sensitivity, m_tr_specificity, m_tr_predictions, m_labeled_tr_predictions, m_tr_predictions_flat, m_roc_auc_tr, m_ts_accuracy, m_ts_sensitivity, m_ts_specificity, m_ts_predictions,m_labeled_ts_predictions, m_ts_predictions_flat, m_roc_auc_ts, m_roc_auc_total = output_list
    print("\nmodel: "+model_names[m])
    print("tr & ts for inter data: "+str(m_tr_accuracy)+", "+str(m_ts_accuracy)+"\n")
    
    model_l_new = Model(inputs = model_l.input, outputs=model_l.get_layer(model_l.layers[-2].name).output)
    m_tr_result = model_l_new.predict([inter_dataset['tr_x_val'][m]])
    m_ts_result = model_l_new.predict([inter_dataset['ts_x_val'][m]])
    
    model_output_list["tr_accuracy"].append(m_tr_accuracy)
    model_output_list["tr_sensitivity"].append(m_tr_sensitivity)
    model_output_list["tr_specificity"].append(m_tr_specificity)
    model_output_list["ts_accuracy"].append(m_ts_accuracy)
    model_output_list["ts_sensitivity"].append(m_ts_sensitivity)
    model_output_list["ts_specificity"].append(m_ts_specificity)
    model_output_list["tr_result"].append(m_tr_result)
    
    model_output_list["tr_predictions"].append(m_tr_predictions)
    model_output_list["labeled_tr_predictions"].append(m_labeled_tr_predictions)
    model_output_list["tr_predictions_flat"].append(m_tr_predictions_flat)
    model_output_list["roc_auc_tr"].append(m_roc_auc_tr)
    model_output_list["ts_predictions"].append(m_ts_predictions)
    model_output_list["labeled_ts_predictions"].append(m_labeled_ts_predictions)
    model_output_list["ts_predictions_flat"].append(m_ts_predictions_flat)
    model_output_list["roc_auc_ts"].append(m_roc_auc_ts)
    model_output_list["ts_result"].append(m_ts_result)
    
    model_output_list["roc_auc_total"].append(m_roc_auc_total)  


model: m_0-1_49
tr & ts for inter data: 1.0, 0.774193525314331


model: m_0-1_62
tr & ts for inter data: 1.0, 0.7419354915618896


model: m_1-1_31
tr & ts for inter data: 0.9918032728257726, 0.6451612710952759


model: m_1-1_43
tr & ts for inter data: 1.0, 0.6451612710952759


model: m_1-1_6
tr & ts for inter data: 0.959016387579871, 0.6129032373428345


model: m_2-1_57
tr & ts for inter data: 1.0, 0.8387096524238586


model: m_2-1_89
tr & ts for inter data: 1.0, 0.8387096524238586


model: m_3-1_15
tr & ts for inter data: 0.9180327878623712, 0.8709677457809448


model: m_3-1_18
tr & ts for inter data: 0.9836065573770492, 0.8709677457809448


model: m_4-1_101
tr & ts for inter data: 0.9918032786885246, 0.774193525314331


model: m_4-1_40
tr & ts for inter data: 0.9754098370426991, 0.7419354915618896


model: m_5-1_2
tr & ts for inter data: 0.8360655747476171, 0.774193525314331


model: m_5-1_40
tr & ts for inter data: 0.8032786836389636, 0.774193525314331



### Evaluating seperate model's performance

In [26]:
for m in range(len(model_names)):
    print("#### ["+str(m+1)+"] "+model_names[m]+" ####")
    print("types: "+types[model_index[m]])
    print("tr: "+str(model_output_list["tr_accuracy"][m])+", ts: "+str(model_output_list["ts_accuracy"][m])+"\n")

select = []
while 1:
    ch = input("input numbers for selection(1 ~ "+str(len(model_names))+". q for quit.: ")
    if(ch == 'q'):
        break
    else:
        select.append(int(ch))

#### [1] m_0-1_49 ####
types: OV_six_fold_Annotation3000_400
tr: 1.0, ts: 0.774193525314331

#### [2] m_0-1_62 ####
types: OV_six_fold_Annotation3000_400
tr: 1.0, ts: 0.7419354915618896

#### [3] m_1-1_31 ####
types: OV_six_fold_CV_400
tr: 0.9918032728257726, ts: 0.6451612710952759

#### [4] m_1-1_43 ####
types: OV_six_fold_CV_400
tr: 1.0, ts: 0.6451612710952759

#### [5] m_1-1_6 ####
types: OV_six_fold_CV_400
tr: 0.959016387579871, ts: 0.6129032373428345

#### [6] m_2-1_57 ####
types: OV_six_fold_Var_400
tr: 1.0, ts: 0.8387096524238586

#### [7] m_2-1_89 ####
types: OV_six_fold_Var_400
tr: 1.0, ts: 0.8387096524238586

#### [8] m_3-1_15 ####
types: OV_six_fold_new_Diff_400
tr: 0.9180327878623712, ts: 0.8709677457809448

#### [9] m_3-1_18 ####
types: OV_six_fold_new_Diff_400
tr: 0.9836065573770492, ts: 0.8709677457809448

#### [10] m_4-1_101 ####
types: OV_six_fold_Clin
tr: 0.9918032786885246, ts: 0.774193525314331

#### [11] m_4-1_40 ####
types: OV_six_fold_Clin
tr: 0.9754098370426991,

# 2. Modeling Ensemble model

In [74]:
model1 = load_model(input_model_path+model_names[0]+".h5")
tr_x=full_dataset['tr_x_val'][0]
tr_y=full_dataset['tr_y_val'][0]
ts_x=full_dataset['ts_x_val'][0]
ts_y=inter_dataset['ts_y_val'][0]

test_pred1 ,train_pred1=Stacking(model=model1,n_fold=10, train=tr_x,test=ts_x,y=tr_y)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [80]:
print(train_pred1.shape)
print(test_pred1.shape)

(186,)
(341, 1)


In [82]:
print(tr_y.shape)
print(ts_y.shape)

(186,)
(31,)


In [83]:
test_pred

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.84154972e-01,
       2.12075129e-01, 4.37520474e-01, 2.81035215e-01, 1.80363525e-02,
       5.06485207e-03, 2.21067175e-01, 9.67483222e-01, 3.53960134e-03,
       7.11818099e-01, 3.52693931e-03, 2.41143152e-01, 6.77669048e-01,
       2.55317688e-01, 5.76208472e-01, 6.41319335e-01, 9.93684471e-01,
       2.76848599e-02, 6.85885176e-02, 7.77335048e-01, 3.57893878e-04,
       9.32377756e-01, 9.47473288e-01, 8.20726693e-01, 4.40062024e-02,
      

### Select models

In [None]:
e_models_select = ensemble_coverage(model_list,inter_dataset["tr_x_val"],inter_dataset["tr_y_val"][0])
print("model numbers: "+str(len(e_models_select)))

## 1) DNN-Combiner Ensmeble

### Ensemble Input listup

In [None]:
m_tr_predictions_select = []
m_ts_predictions_select = []   

for i in range(len(select)):
    m_tr_predictions_select.append(model_output_list["tr_predictions"][select[i]-1])
    m_ts_predictions_select.append(model_output_list["ts_predictions"][select[i]-1])
    #print(m_tr_predictions[select[i]-1].shape)
    
em_tr_x_val = np.concatenate(m_tr_predictions_select, axis=1)
em_ts_x_val = np.concatenate(m_ts_predictions_select, axis=1)

tr_y_val = inter_dataset["tr_y_val"][0]
ts_y_val = inter_dataset["ts_y_val"][0]

In [None]:
print(em_tr_x_val.shape)
print(em_ts_x_val.shape)

In [None]:
print("################################## DNN em ##################################")
print("select: "+str(select))
for select_i in select:
    print("\n"+types[model_index[select_i-1]])
    print(model_names[select_i-1])

    
print("#############################################################################################")

# 1) parameter setting
em_adam = optimizers.Adam(lr=0.05)                                   
em_input_drop_out = 0.3
em_drop_out = 0
em_batch_size = 5
em_BN = True                           

em_layers = [10]
em_tr_loss_best = 100 # for saving best loss value 
em_best_model=[] #for saving best model
count=0 # for early stopping

# 2) model build
em_input = Input(shape=(len(select),))
em_dp = Dropout(em_input_drop_out)(em_input)
for l in em_layers:
    if em_BN == True:
        em_m = Dense(l)(em_dp)
        em_bn = BatchNormalization(axis=1, epsilon=0.001, center=True, scale=True, beta_initializer='zeros', gamma_initializer='ones')(em_m)
        em_dp = Activation("relu")(em_bn)
    else:
        em_m = Dense(l,activation='relu')(em_dp)
        em_dp = Dropout(drop_out_m)(em_m)

em_final = em_dp
em_output = Dense(1, activation="sigmoid")(em_final)
em_model = Model(inputs=em_input,outputs=em_output)
em_model.compile(optimizer=em_adam, 
                loss='binary_crossentropy',
                metrics=['accuracy'])

# 3) Training: if no increase of tr_loss three times, stop training.
while 1:
    em_model.fit(em_tr_x_val, tr_y_val, batch_size=em_batch_size, nb_epoch=1, verbose = 0)
    em_tr_loss=em_model.evaluate( em_tr_x_val, tr_y_val)[0]
    if em_tr_loss < em_tr_loss_best: # new best model. count reset.
        em_tr_loss_best = em_tr_loss
        count=0
        em_best_model = em_model
    if count>10: # no increase three time. stop.
        em_model = em_best_model
        break
    else: count=count+1
print("Model em" +"-"+str(ts_i)+" trained.")

# 4) save model
em_model.save(save_model_path+"/m_em-"+str(ts_i)+".h5")

### Evaluating _DNN Combiner_ ensemble model

In [None]:
em_output_list = model_performance(
    information = False, using_model=em_model,Input_Prediction_Passively = False, 
    tr_x_val=em_tr_x_val, tr_y_val=tr_y_val, ts_x_val=em_ts_x_val, ts_y_val=ts_y_val,
    output_list=["tr_loss", "tr_accuracy", "tr_sensitivity", "tr_specificity", "tr_predictions",
                 "labeled_tr_predictions", "tr_predictions_flat", "roc_auc_tr", 
                 "ts_loss", "ts_accuracy", "ts_sensitivity", "ts_specificity", "ts_predictions",
                 "labeled_ts_predictions", "ts_predictions_flat", "roc_auc_ts", 
                 "roc_auc_total"])

em_tr_loss, em_tr_accuracy, em_tr_sensitivity, em_tr_specificity, em_tr_predictions, em_labeled_tr_predictions, em_tr_predictions_flat, em_roc_auc_tr, em_ts_loss, em_ts_accuracy, em_ts_sensitivity, em_ts_specificity, em_ts_predictions,em_labeled_ts_predictions, em_ts_predictions_flat, em_roc_auc_ts, em_roc_auc_total = em_output_list

print("Overall AUC: ", em_roc_auc_total)
print("Train AUC: ", em_roc_auc_tr)
print("Test AUC: ", em_roc_auc_ts)

print("Train Accuracy: {}".format(em_tr_accuracy))
print("Train Sensitivities & Specificities : "+str(em_tr_sensitivity)+", "+str(em_tr_specificity))
print("Test Accuracy: {}".format(em_ts_accuracy))
print("Test Sensitivities & Specificities : "+str(em_ts_sensitivity)+", "+str(em_ts_specificity))

In [None]:
# save prediction result.

tr_df_em = pd.DataFrame(data={"patient":list(inter_dataset["tr_data"][0].index), "hypothesis 1": list(em_tr_predictions_flat), 
                        "prediction":list(em_labeled_tr_predictions), "Platinum_Status":list(tr_y_val)})
tr_df_em.to_csv(save_prediction_path+"m_em-"+str(ts_i)+"_tr.csv", index=False, header=True, columns = ["patient", "hypothesis 1", "prediction", "Platinum_Status"])

ts_df_em = pd.DataFrame(data={"patient":list(inter_dataset["ts_data"][0].index), "hypothesis 1": list(em_ts_predictions_flat), 
                        "prediction":list(em_labeled_ts_predictions), "Platinum_Status":list(ts_y_val)})
ts_df_em.to_csv(save_prediction_path+"m_em-"+str(ts_i)+"_ts.csv", index=False, header=True, columns = ["patient", "hypothesis 1", "prediction", "Platinum_Status"])


## 2) Mean Ensemble

### Evaluating _mean_ ensemble model

In [None]:
mean_em_tr_predictions=sum(m_tr_predictions_select)/len(select)
mean_em_ts_predictions=sum(m_ts_predictions_select)/len(select)

mean_em_output_list = model_performance(
    information = False, using_model=None,Input_Prediction_Passively = True, 
    tr_predictions=mean_em_tr_predictions, ts_predictions=mean_em_ts_predictions, 
    tr_x_val=em_tr_x_val, tr_y_val=tr_y_val, ts_x_val=em_ts_x_val, ts_y_val=ts_y_val,
    output_list=["tr_sensitivity", "tr_specificity",
                 "labeled_tr_predictions", "tr_predictions_flat", "roc_auc_tr", 
                 "ts_sensitivity", "ts_specificity",
                 "labeled_ts_predictions", "ts_predictions_flat", "roc_auc_ts", 
                 "roc_auc_total"])
mean_em_tr_sensitivity, mean_em_tr_specificity,  mean_em_labeled_tr_predictions, mean_em_tr_predictions_flat, mean_em_roc_auc_tr, mean_em_ts_sensitivity, mean_em_ts_specificity, mean_em_labeled_ts_predictions, mean_em_ts_predictions_flat, mean_em_roc_auc_ts, mean_em_roc_auc_total = mean_em_output_list

mean_em_tr_accuracy = sum(mean_em_labeled_tr_predictions==tr_y_val.values)/len(tr_y_val)
mean_em_ts_accuracy = sum(mean_em_labeled_ts_predictions==ts_y_val.values)/len(ts_y_val)

print("Overall AUC: ", mean_em_roc_auc_total)
print("Train AUC: ", mean_em_roc_auc_tr)
print("Test AUC: ", mean_em_roc_auc_ts)

print("Train Accuracy: {}".format(mean_em_tr_accuracy))
print("Train Sensitivities & Specificities : "+str(mean_em_tr_sensitivity)+", "+str(mean_em_tr_specificity))
print("Test Accuracy: {}".format(mean_em_ts_accuracy))
print("Test Sensitivities & Specificities : "+str(mean_em_ts_sensitivity)+", "+str(mean_em_ts_specificity))

In [None]:
# save prediction result.

tr_df_mean = pd.DataFrame(data={"patient":list(inter_dataset["tr_data"][0].index), "hypothesis 1": list(mean_em_tr_predictions_flat), 
                        "prediction":list(mean_em_labeled_tr_predictions), "Platinum_Status":list(tr_y_val)})
tr_df_mean.to_csv(save_prediction_path+"m_mean-"+str(ts_i)+"_tr.csv", index=False, header=True, columns = ["patient", "hypothesis 1", "prediction", "Platinum_Status"])

ts_df_mean = pd.DataFrame(data={"patient":list(inter_dataset["ts_data"][0].index), "hypothesis 1": list(mean_em_ts_predictions_flat), 
                        "prediction":list(mean_em_labeled_ts_predictions), "Platinum_Status":list(ts_y_val)})
ts_df_mean.to_csv(save_prediction_path+"m_mean-"+str(ts_i)+"_ts.csv", index=False, header=True, columns = ["patient", "hypothesis 1", "prediction", "Platinum_Status"])

## 3) Transferred Ensemble Modeling 

### Making new input data for t-ensemble

In [None]:
m_tr_result_select = []
m_ts_result_select = []

for i in range(len(select)):
    m_tr_result_select.append(model_output_list["tr_result"][select[i]-1])
    m_ts_result_select.append(model_output_list["ts_result"][select[i]-1])

t_em_tr_x_val = np.concatenate(m_tr_result_select, axis=1)
t_em_ts_x_val = np.concatenate(m_ts_result_select, axis=1)
print("\n############################################### t-em x val merged. ###############################################\n")
print(t_em_tr_x_val.shape)
print(t_em_ts_x_val.shape)

### Modeling t-ensemble  

In [None]:
print("################################## Transferred em ##################################")
print("select: "+str(select))
for select_i in select:
    print("\n"+types[model_index[select_i-1]])
    print(model_names[select_i-1])

    
print("#############################################################################################")

# 1) parameter setting
t_em_adam = optimizers.Adam(lr=0.05)                                   
t_em_input_drop_out = 0.3
t_em_drop_out = 0
t_em_batch_size = 5
t_em_BN = True                           

t_em_layers = [100]
t_em_tr_loss_best = 100 # for saving best loss value 
t_em_best_model=[] #for saving best model
count=0 # for early stopping

# 2) model build
t_em_input = Input(shape=(t_em_ts_x_val.shape[1],))
t_em_dp = Dropout(t_em_input_drop_out)(t_em_input)
for l in t_em_layers:
    if t_em_BN == True:
        t_em_m = Dense(l)(t_em_dp)
        t_em_bn = BatchNormalization(axis=1, epsilon=0.001, center=True, scale=True, beta_initializer='zeros', gamma_initializer='ones')(t_em_m)
        t_em_dp = Activation("relu")(t_em_bn)
    else:
        t_em_m = Dense(l,activation='relu')(t_em_dp)
        t_em_dp = Dropout(drop_out_m)(t_em_m)

t_em_final = t_em_dp
t_em_output = Dense(1, activation="sigmoid")(t_em_final)
t_em_model = Model(inputs=t_em_input,outputs=t_em_output)
t_em_model.compile(optimizer=t_em_adam, 
                loss='binary_crossentropy',
                metrics=['accuracy'])

# 3) Training: if no increase of tr_loss three times, stop training.
while 1:
    t_em_model.fit(t_em_tr_x_val, tr_y_val, batch_size=t_em_batch_size, nb_epoch=1, verbose = 0)
    t_em_tr_loss=t_em_model.evaluate( t_em_tr_x_val, tr_y_val)[0]
    if t_em_tr_loss < t_em_tr_loss_best: # new best model. count reset.
        t_em_tr_loss_best = t_em_tr_loss
        count=0
        t_em_best_model = t_em_model
    if count>10: # no increase three time. stop.
        t_em_model = t_em_best_model
        break
    else: count=count+1
        
print("Model t-em" +"-"+str(ts_i)+" trained.")

# 4) save model
em_model.save(save_model_path+"/m_t-em-"+str(ts_i)+".h5")

### Evaluating t-ensemble

In [None]:
t_em_output_list = model_performance(
    information = False, using_model=t_em_model,Input_Prediction_Passively = False, 
    tr_x_val=t_em_tr_x_val, tr_y_val=tr_y_val, ts_x_val=t_em_ts_x_val, ts_y_val=ts_y_val,
    output_list=["tr_loss", "tr_accuracy", "tr_sensitivity", "tr_specificity", "tr_predictions",
                 "labeled_tr_predictions", "tr_predictions_flat", "roc_auc_tr", 
                 "ts_loss", "ts_accuracy", "ts_sensitivity", "ts_specificity", "ts_predictions",
                 "labeled_ts_predictions", "ts_predictions_flat", "roc_auc_ts", 
                 "roc_auc_total"])

t_em_tr_loss, t_em_tr_accuracy, t_em_tr_sensitivity, t_em_tr_specificity, t_em_tr_predictions, t_em_labeled_tr_predictions, t_em_tr_predictions_flat, t_em_roc_auc_tr, t_em_ts_loss, t_em_ts_accuracy, t_em_ts_sensitivity, t_em_ts_specificity, t_em_ts_predictions,t_em_labeled_ts_predictions, t_em_ts_predictions_flat, t_em_roc_auc_ts, t_em_roc_auc_total = t_em_output_list

print("Overall AUC: ", t_em_roc_auc_total)
print("Train AUC: ", t_em_roc_auc_tr)
print("Test AUC: ", t_em_roc_auc_ts)

print("Train Accuracy: {}".format(t_em_tr_accuracy))
print("Train Sensitivities & Specificities : "+str(t_em_tr_sensitivity)+", "+str(t_em_tr_specificity))
print("Test Accuracy: {}".format(t_em_ts_accuracy))
print("Test Sensitivities & Specificities : "+str(t_em_ts_sensitivity)+", "+str(t_em_ts_specificity))

In [None]:
# save prediction result.

tr_df_t_em = pd.DataFrame(data={"patient":list(inter_dataset["tr_data"][0].index), "hypothesis 1": list(t_em_tr_predictions_flat), 
                        "prediction":list(t_em_labeled_tr_predictions), "Platinum_Status":list(tr_y_val)})
tr_df_t_em.to_csv(save_prediction_path+"m_t-em-"+str(ts_i)+"_tr.csv", index=False, header=True, columns = ["patient", "hypothesis 1", "prediction", "Platinum_Status"])

ts_df_t_em = pd.DataFrame(data={"patient":list(inter_dataset["ts_data"][0].index), "hypothesis 1": list(t_em_ts_predictions_flat), 
                        "prediction":list(t_em_labeled_ts_predictions), "Platinum_Status":list(ts_y_val)})
ts_df_t_em.to_csv(save_prediction_path+"m_t-em-"+str(ts_i)+"_ts.csv", index=False, header=True, columns = ["patient", "hypothesis 1", "prediction", "Platinum_Status"])

## Transferred Ensemble(Modified)

### mo_transferred ensemble input dataset

In [None]:
# dataset : raw data + prediction results
mo_em_tr_x_val = np.concatenate([inter_newDiff_dataset["tr_x_val"], em_tr_x_val], axis = 1)
mo_em_ts_x_val = np.concatenate([inter_newDiff_dataset["ts_x_val"], em_ts_x_val], axis = 1)

In [None]:
print(mo_em_tr_x_val.shape)
print(mo_em_ts_x_val.shape)

In [None]:
#full_em_matrix = np.concatenate([full_em_ts_x_val, full_em_tr_x_val], axis = 0)
#df_full_dataset = pd.DataFrame(full_em_matrix)
#df_full_dataset.to_csv(index=False, )

In [None]:
'''
df_full_dataset = pd.DataFrame(full_em_matrix)
df_full_dataset.to_csv("C:/test/merge_newDiff_400_with_predictions.csv",index=False)

'''
#df_full_dataset.loc[df_full_dataset.shape[1]] = patient

In [None]:
print("modified t-ensemble model")

# 1) parameter setting
mo_em_adam = optimizers.Adam(lr=0.05)                                   
mo_em_input_drop_out = 0.3
mo_em_drop_out = 0
mo_em_batch_size = 5
mo_em_BN = True                           

mo_em_layers = [100]
mo_em_tr_loss_best = 100 # for saving best loss value 
mo_em_best_model=[] #for saving best model
count=0 # for early stopping

# 2) model build
mo_em_input = Input(shape=(mo_em_ts_x_val.shape[1],))
mo_em_dp = Dropout(mo_em_input_drop_out)(mo_em_input)
for l in mo_em_layers:
    if mo_em_BN == True:
        mo_em_m = Dense(l)(mo_em_dp)
        mo_em_bn = BatchNormalization(axis=1, epsilon=0.001, center=True, scale=True, beta_initializer='zeros', gamma_initializer='ones')(mo_em_m)
        mo_em_dp = Activation("relu")(mo_em_bn)
    else:
        mo_em_m = Dense(l,activation='relu')(mo_em_dp)
        mo_em_dp = Dropout(drop_out_m)(mo_em_m)

mo_em_final = mo_em_dp
mo_em_output = Dense(1, activation="sigmoid")(mo_em_final)
mo_em_model = Model(inputs=mo_em_input,outputs=mo_em_output)
mo_em_model.compile(optimizer=mo_em_adam, 
                loss='binary_crossentropy',
                metrics=['accuracy'])

# 3) Training: if no increase of tr_loss three times, stop training.
while 1:
    mo_em_model.fit(mo_em_tr_x_val, tr_y_val, batch_size=mo_em_batch_size, nb_epoch=1, verbose = 0)
    mo_em_tr_loss=mo_em_model.evaluate( mo_em_tr_x_val, tr_y_val)[0]
    if mo_em_tr_loss < mo_em_tr_loss_best: # new best model. count reset.
        mo_em_tr_loss_best = mo_em_tr_loss
        count=0
        mo_em_best_model = mo_em_model
    if count>10: # no increase three time. stop.
        mo_em_model = mo_em_best_model
        break
    else: count=count+1
        
print("Model mo-em" +"-"+str(ts_i)+" trained.")

# 4) save model
em_model.save(save_model_path+"/m_mo-em-"+str(ts_i)+".h5")

# 5) evaluate model
mo_em_output_list = model_performance(
    information = False, using_model=mo_em_model,Input_Prediction_Passively = False, 
    tr_x_val=mo_em_tr_x_val, tr_y_val=tr_y_val, ts_x_val=mo_em_ts_x_val, ts_y_val=ts_y_val,
    output_list=["tr_loss", "tr_accuracy", "tr_sensitivity", "tr_specificity", "tr_predictions",
                 "labeled_tr_predictions", "tr_predictions_flat", "roc_auc_tr", 
                 "ts_loss", "ts_accuracy", "ts_sensitivity", "ts_specificity", "ts_predictions",
                 "labeled_ts_predictions", "ts_predictions_flat", "roc_auc_ts", 
                 "roc_auc_total"])

mo_em_tr_loss, mo_em_tr_accuracy, mo_em_tr_sensitivity, mo_em_tr_specificity, mo_em_tr_predictions, mo_em_labeled_tr_predictions, mo_em_tr_predictions_flat, mo_em_roc_auc_tr, mo_em_ts_loss, mo_em_ts_accuracy, mo_em_ts_sensitivity, mo_em_ts_specificity, mo_em_ts_predictions,mo_em_labeled_ts_predictions, mo_em_ts_predictions_flat, mo_em_roc_auc_ts, mo_em_roc_auc_total = mo_em_output_list

print("Overall AUC: ", mo_em_roc_auc_total)
print("Train AUC: ", mo_em_roc_auc_tr)
print("Test AUC: ", mo_em_roc_auc_ts)

print("Train Accuracy: {}".format(mo_em_tr_accuracy))
print("Train Sensitivities & Specificities : "+str(mo_em_tr_sensitivity)+", "+str(mo_em_tr_specificity))
print("Test Accuracy: {}".format(mo_em_ts_accuracy))
print("Test Sensitivities & Specificities : "+str(mo_em_ts_sensitivity)+", "+str(mo_em_ts_specificity))

In [None]:
# save prediction result.

tr_df_mo_em = pd.DataFrame(data={"patient":list(inter_dataset["tr_data"][0].index), "hypothesis 1": list(mo_em_tr_predictions_flat), 
                        "prediction":list(mo_em_labeled_tr_predictions), "Platinum_Status":list(tr_y_val)})
tr_df_mo_em.to_csv(save_prediction_path+"m_mo-em-"+str(ts_i)+"_tr.csv", index=False, header=True, columns = ["patient", "hypothesis 1", "prediction", "Platinum_Status"])

ts_df_mo_em = pd.DataFrame(data={"patient":list(inter_dataset["ts_data"][0].index), "hypothesis 1": list(mo_em_ts_predictions_flat), 
                        "prediction":list(mo_em_labeled_ts_predictions), "Platinum_Status":list(ts_y_val)})
ts_df_mo_em.to_csv(save_prediction_path+"m_mo-em-"+str(ts_i)+"_ts.csv", index=False, header=True, columns = ["patient", "hypothesis 1", "prediction", "Platinum_Status"])

## Performance Comparison

In [None]:
select_model_index = str(model_index[select[0]-1]) 
select_model_names = str(model_names[select[0]-1]) 
for select_i in select[1:]:
    print(types[model_index[select_i-1]])
    print(model_names[select_i-1]+"\n")
    select_model_index = select_model_index+" & "+str(model_index[select_i-1])
    select_model_names = select_model_names+" & "+str(model_names[select_i-1]) 

print(select_model_index)
print(select_model_names)

In [None]:
tr_accuracy_list = [em_tr_accuracy, mean_em_tr_accuracy, t_em_tr_accuracy, mo_em_tr_accuracy]
ts_accuracy_list = [em_ts_accuracy, mean_em_ts_accuracy, t_em_ts_accuracy, mo_em_ts_accuracy]
tr_sensitivity_list = [em_tr_sensitivity, mean_em_tr_sensitivity, t_em_tr_sensitivity, mo_em_tr_sensitivity]
ts_sensitivity_list = [em_ts_sensitivity, mean_em_ts_sensitivity, t_em_ts_sensitivity, mo_em_ts_sensitivity]
tr_specificity_list = [em_tr_specificity, mean_em_tr_specificity, t_em_tr_specificity, mo_em_tr_specificity]
ts_specificity_list = [em_ts_specificity, mean_em_ts_specificity, t_em_ts_specificity, mo_em_ts_specificity]
tr_roc_list = [em_roc_auc_tr, mean_em_roc_auc_tr, t_em_roc_auc_tr, mo_em_roc_auc_tr]
ts_roc_list = [em_roc_auc_ts, mean_em_roc_auc_ts, t_em_roc_auc_ts, mo_em_roc_auc_ts]
total_roc_list = [em_roc_auc_total, mean_em_roc_auc_total, t_em_roc_auc_total, mo_em_roc_auc_total]

for type_index in range(4):
    em_output_list["em_type"].append(em_type[type_index])
    em_output_list["test_index"].append(ts_i)
    em_output_list["ensemble_comb"].append(select_model_index)
    em_output_list["ensemble_names"].append(select_model_names)
    em_output_list["tr_accuracy"].append(tr_accuracy_list[type_index] )
    em_output_list["ts_accuracy"].append(ts_accuracy_list[type_index] )
    em_output_list["tr_sensitivity"].append(tr_sensitivity_list[type_index] )
    em_output_list["ts_sensitivity"].append(ts_sensitivity_list[type_index] )
    em_output_list["tr_specificity"].append(tr_specificity_list[type_index] )
    em_output_list["ts_specificity"].append(ts_specificity_list[type_index] )
    em_output_list["roc_auc_tr"].append(tr_roc_list[type_index] )
    em_output_list["roc_auc_ts"].append(ts_roc_list[type_index] )

In [None]:
for type_index in range(4):
    df_sum = pd.DataFrame(data=em_output_list)
    df_sum.to_csv(save_result_path+em_type[type_index]+"_result_"+str(cycle_index)+".csv", index=False, header=True,
              columns = ["ensemble_comb", "ensemble_names","test_index",
                         "tr_accuracy", "tr_sensitivity", "tr_specificity", 
                         "ts_accuracy", "ts_sensitivity", "ts_specificity",
                         "roc_auc_total", "roc_auc_tr", "roc_auc_ts"])
cycle_index = cycle_index + 1

In [None]:
print(len(em_output_list['test_index']))