# 0. Import Packages & Helper Functions

## 1) Import packages

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import optimizers
from keras import backend as K
from keras.layers import Input, Dense, Dropout, Input, Activation, BatchNormalization
from keras.callbacks import EarlyStopping
from keras.models import Model, load_model, Sequential 
import sklearn
from sklearn import metrics
import math

np.random.seed(777)

print(tf.__version__)

1.5.1


## 2) Helper functions

In [6]:
# Calculate sensitivity & specificity using Predicted Y & Real Y
def check_correct(predict, y):
    result = {}
    result['resistant-correct'] = 0
    result['resistant-wrong'] = 0
    result['sensitive-correct'] = 0
    result['sensitive-wrong'] = 0

    for i in range(len(predict)) :
        if predict[i] == y[i] :
            if y[i] == 0 :
                result['sensitive-correct'] += 1
            else :
                result['resistant-correct'] += 1
        else :
            if y[i] == 0 :
                result['sensitive-wrong'] += 1
            else :
                result['resistant-wrong'] += 1

    #for result_k, result_v in result.items():
    #    print(result_k +" : "+ str(result_v))
    sensitivity=result['resistant-correct']/(result['resistant-correct']+result['resistant-wrong'])
    specificity=result['sensitive-correct']/(result['sensitive-correct']+result['sensitive-wrong'])
    #print("Sensitivity :", sensitivity)
    #print("Specificity :", specificity)
    return sensitivity, specificity

In [7]:
# devide raw data into train / test & x_val / y_val
def data_split(raw_data, index_col, test_index):
    
    train_data = raw_data.iloc[list(raw_data.iloc[:,index_col]!=test_index)]
    test_data = raw_data.iloc[list(raw_data.iloc[:,index_col]==test_index)]
    
    y_val = train_data.Platinum_Status
    x_val = train_data.drop(["Platinum_Status","index"],axis=1)
    test_y_val = test_data.Platinum_Status
    test_x_val = test_data.drop(["Platinum_Status","index"],axis=1)
    
    return train_data, test_data, y_val, x_val, test_y_val, test_x_val

    # raw_data: have gene_expressions(maybe multiple columns), index column, Platinum_Status column.


In [8]:
# calculate all of model performance 
# - predictions(probability) / labeled predictions(0/1) / Loss / Accuracy / Sensitivity / Specificity / AUC values of Train / Test dataset.
# using trained models, or you can put predictions(probability) passively(in this case, Loss & Accuracy do not provided.)
def model_performance(information=False, Input_Prediction_Passively=False, using_model=None, tr_predictions=None, ts_predictions=None, tr_x_val=None, tr_y_val=None, ts_x_val=None, ts_y_val=None, output_list=None):
    
    if information == True:            
        print("options model_performance:\n1) using_model: keras models that you want to check performance. \"Input_Prediction_Passive\" option for input prediction list instead using models.\n3) tr_predictions & ts_predictions: prediction input passively. put this data only when not using keras model.\n4) tr_x_val & ts_x_val: input samples of train/test samples.\n4) tr_y_val & ts_y_val: results of train/test samples.\n5) output_list: return values that you want to recieve.\n CAUTION: Essential variable.\n\t tr_loss, tr_accuracy, tr_sensitivity, tr_specificity, tr_predictions, labeled_tr_predictions, tr_predictions_flat, roc_auc_tr,\nts_loss, ts_accuracy, ts_sensitivity, ts_specificity, ts_predictions, labeled_ts_predictions, ts_predictions_flat, roc_auc_ts,\nroc_auc_total\n\n* CAUTION: if 'None' value is returned, please check your input tr inputs(None value for tr outputs) or ts inputs(None value for ts outputs).") 
        return 0
    elif information != False:
        print("for using information options, please set 'information' variable for 'True'")
        return -1
    
    if using_model is None:
        if Input_Prediction_Passively == False:
            print("ERROR: There are no models for using.\nusing \"model_performance(information = True)\" for getting informations of this function.") 
            return -1
        elif (tr_predictions is None) and (ts_predictions is None): # No model/prediction input. no performance should be calculated.
                print("ERROR: Input prediction list instead using saved model.")
                return -1
        else: # No model input, but Input_Prediction_Passively is True & input prediction is valid.
            tr_loss,tr_accuracy= None, None
            ts_loss,ts_accuracy= None, None
            
    elif Input_Prediction_Passively == True: # both of model/prediction putted, could cause confusing.
        ch = input("You put both model and prediction. Select one method:\n'p' for using prediction only, 'm' using models only, 'n' for quit the function.")
        while 1:
            if ch == 'p':
                using_model = None
                break
            elif ch == 'm':
                tr_predictions = None
                ts_predictions = None
                break
            elif ch == 'e':
                return 0
            else:
                print("you put worng option: "+str(ch))
            ch = input("Select one method:\n'p' for using prediction only, 'm' using models only, 'n' for quit the function.")
                
    if output_list is None:
        print("ERROR: There are no output_list for return.\nusing \"model_performance(information = True)\" for getting informations of this function.")
        return -1
    
    if not(tr_x_val is None) and not(tr_y_val is None):
        # predict tr result only when no tr_prediction input
        if tr_predictions is None:
            tr_loss,tr_accuracy= using_model.evaluate(tr_x_val,tr_y_val,verbose=0)
            tr_predictions = using_model.predict(tr_x_val,verbose=0)
        # tr sensitivity / specificity
        labeled_tr_predictions = np.where(tr_predictions > 0.5, 1, 0).flatten()
        tr_sensitivity, tr_specificity = check_correct(labeled_tr_predictions, tr_y_val)
        tr_predictions_flat = tr_predictions[:,0]   
        # roc(tr)
        fpr_tr, tpr_tr, threshold_tr = metrics.roc_curve(tr_y_val, tr_predictions)
        roc_auc_tr = metrics.auc(fpr_tr, tpr_tr)
    
    if not(ts_x_val is None) and not(ts_y_val is None):
        # predict ts result only when no ts_prediction input
        if ts_predictions is None:
            ts_loss,ts_accuracy= using_model.evaluate(ts_x_val,ts_y_val,verbose=0)
            ts_predictions = using_model.predict(ts_x_val,verbose=0)
        labeled_ts_predictions = np.where(ts_predictions > 0.5, 1, 0).flatten()
        ts_sensitivity, ts_specificity = check_correct(labeled_ts_predictions, ts_y_val)
        ts_predictions_flat = ts_predictions[:,0]   
        # roc(ts)
        fpr_ts, tpr_ts, threshold_ts = metrics.roc_curve(ts_y_val, ts_predictions)
        roc_auc_ts = metrics.auc(fpr_ts, tpr_ts)    
    
    if (not(tr_x_val is None) and not(tr_y_val is None)) and (not(ts_x_val is None) and not(ts_y_val is None)):
        y_true = np.append(tr_y_val, ts_y_val)
        y_pred = np.append(tr_predictions, ts_predictions)
        fpr_total, tpr_total, threshold_total = metrics.roc_curve(y_true, y_pred)
        roc_auc_total = metrics.auc(fpr_total, tpr_total)
        
        
    return_list = []
    
    for output in output_list:
        
        if(output == "tr_loss"):
            return_list.append(tr_loss)
                               
        elif(output == "tr_accuracy"):
            return_list.append(tr_accuracy)
                               
        elif(output == "tr_sensitivity"):
            return_list.append(tr_sensitivity)
                               
        elif(output == "tr_specificity"):
            return_list.append(tr_specificity)
                               
        elif(output == "tr_predictions"):
            return_list.append(tr_predictions)
                               
        elif(output == "labeled_tr_predictions"):
            return_list.append(labeled_tr_predictions)
                               
        elif(output == "tr_predictions_flat"):
            return_list.append(tr_predictions_flat)
            
        elif(output == "roc_auc_tr"):
            return_list.append(roc_auc_tr)

        elif(output == "ts_loss"):
            return_list.append(ts_loss)
                               
        elif(output == "ts_accuracy"):
            return_list.append(ts_accuracy)
                               
        elif(output == "ts_sensitivity"):
            return_list.append(ts_sensitivity)
                               
        elif(output == "ts_specificity"):
            return_list.append(ts_specificity)
                               
        elif(output == "ts_predictions"):
            return_list.append(ts_predictions)
                               
        elif(output == "labeled_ts_predictions"):
            return_list.append(labeled_ts_predictions)
                               
        elif(output == "ts_predictions_flat"):
            return_list.append(ts_predictions_flat)
        
        elif(output == "roc_auc_ts"):
            return_list.append(roc_auc_ts)
            
        elif(output == "roc_auc_total"):
            return_list.append(roc_auc_total)
                               
        else:
            print("There are no options <"+str(output)+">. Please refer these output options:\ntr_loss, tr_accuracy, tr_sensitivity, tr_specificity, tr_predictions, labeled_tr_predictions, tr_predictions_flat, roc_auc_tr,\nts_loss, ts_accuracy, ts_sensitivity, ts_specificity, ts_predictions, labeled_ts_predictions, ts_predictions_flat, roc_auc_ts,\nroc_auc_total")
            break
    
    if len(return_list)==1:
        return_list = return_list[0]
    
    return return_list

In [9]:
early_stopping = EarlyStopping(patience=10)

In [59]:
# Training one NN model using train X & Y values
# Returns trained NN model
def train_NN_model(tr_x_val, tr_y_val, val_x_val, val_y_val, n_epoch):
    
    # 1) parameter setting
    lr=0.01
    input_drop_out = 0
    drop_out = 0.5
    layers = [10]
    BN = True
    batch_size = 5
    m_tr_loss_best = 100
    
    m_adam = optimizers.Adam(lr=lr)
    # 2) model build
    #m_input = Input(shape=(input_dim[1],))
    m_input = Input(shape=(tr_x_val.shape[1],))
    m_dp = Dropout(input_drop_out)(m_input)
    if BN == True:
        for i in layers:
            m_h = Dense(i)(m_dp)
            m_bn = BatchNormalization(axis=1, epsilon=0.001, center=True, scale=True, beta_initializer='zeros', gamma_initializer='ones')(m_h)
            m_dp = Activation("relu")(m_bn)
    else:        
        for i in m_layers:
            m_h = Dense(i,activation='relu')(m_dp)
            m_dp = Dropout(drop_out)(m_h)
    m_final = m_dp
    m_output = Dense(1, activation="sigmoid")(m_final)
    m_model = Model(inputs=m_input,outputs=m_output)
    m_model.compile(optimizer=m_adam, 
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

    # Training method that maximize train accuracy does not fit into AdatBoost: because it makes too strong classifier
#    while 1:
#        m_model.fit(tr_x_val, tr_y_val, batch_size=batch_size, epochs=1, verbose = 0)
#        m_tr_loss=m_model.evaluate(tr_x_val, tr_y_val, verbose = 0)[0]
#        if m_tr_loss < m_tr_loss_best: # new best model. count reset.
#            m_tr_loss_best = m_tr_loss
#            count=0
#            best_m_model = m_model
#        if count>3: # no increase three time. stop.
#            m_model = best_m_model
#            break
#        else: count=count+1
    m_model.fit(tr_x_val, tr_y_val, batch_size=batch_size, epochs=n_epoch, verbose = 0, validation_data=(val_x_val, val_y_val), callbacks=[early_stopping])
    
    return m_model

# 1. Input Data & Preprocessing

## 1) Declaration of path name & type

In [12]:
types = ["OV_six_fold_Annotation3000_400", 
         "OV_six_fold_CV_400", 
         "OV_six_fold_Var_400", "OV_six_fold_new_Diff_400",
         "OV_six_fold_Clin", 
         "OV_six_fold_SNV" 
         ]

path = "../../../TC_six_fold_subsamples/"
save_model_path = "../best_models/model/"
save_prediction_path = "../best_models/predictions/"
save_result_path = "../best_models/results/"

## 2) Split data into train/test set

In [41]:
file_1 = path+types[0]+".csv"
file_2 = path+types[1]+".csv"
file_3 = path+types[2]+".csv"
file_4 = path+types[3]+".csv"
file_5 = path+types[4]+".csv"
file_6 = path+types[5]+".csv"

idx_col = 0

full_data_1 = pd.read_csv(file_1,index_col=idx_col)
full_data_2 = pd.read_csv(file_2,index_col=idx_col)
full_data_3 = pd.read_csv(file_3,index_col=idx_col)
full_data_4 = pd.read_csv(file_4,index_col=idx_col)
full_data_5 = pd.read_csv(file_5,index_col=idx_col)
full_data_6 = pd.read_csv(file_6,index_col=idx_col)

inter_data_1 = full_data_1.iloc[list(full_data_1.iloc[:,-1]!=6)]
inter_data_2 = full_data_2.iloc[list(full_data_2.iloc[:,-1]!=6)]
inter_data_3 = full_data_3.iloc[list(full_data_3.iloc[:,-1]!=6)]
inter_data_4 = full_data_4.iloc[list(full_data_4.iloc[:,-1]!=6)]
inter_data_5 = full_data_5.iloc[list(full_data_5.iloc[:,-1]!=6)]
inter_data_6 = full_data_6.iloc[list(full_data_6.iloc[:,-1]!=6)]

print("[1] file_name: ", types[0], "\nsample : {}  \nfeatures : {}".format(full_data_1.shape[0],full_data_1.shape[1]-2))
print("[2] file_name: ", types[1], "\nsample : {}  \nfeatures : {}".format(full_data_2.shape[0],full_data_2.shape[1]-2))
print("[3] file_name: ", types[2], "\nsample : {}  \nfeatures : {}".format(full_data_3.shape[0],full_data_3.shape[1]-2))
print("[4] file_name: ", types[3], "\nsample : {}  \nfeatures : {}".format(full_data_4.shape[0],full_data_4.shape[1]-2))
print("[5] file_name: ", types[4], "\nsample : {}  \nfeatures : {}".format(full_data_5.shape[0],full_data_5.shape[1]-2))
print("[6] file_name: ", types[5], "\nsample : {}  \nfeatures : {}".format(full_data_6.shape[0],full_data_6.shape[1]-2))


# Split Train Test Data

tr_data_1, ts_data_1, tr_y_1, tr_x_1, ts_y_1, ts_x_1 = data_split(raw_data = full_data_1, index_col = -1, test_index = 1)
tr_data_2, ts_data_2, tr_y_2, tr_x_2, ts_y_2, ts_x_2 = data_split(raw_data = full_data_2, index_col = -1, test_index = 1)
tr_data_3, ts_data_3, tr_y_3, tr_x_3, ts_y_3, ts_x_3 = data_split(raw_data = full_data_3, index_col = -1, test_index = 1)
tr_data_4, ts_data_4, tr_y_4, tr_x_4, ts_y_4, ts_x_4 = data_split(raw_data = full_data_4, index_col = -1, test_index = 1)
tr_data_5, ts_data_5, tr_y_5, tr_x_5, ts_y_5, ts_x_5 = data_split(raw_data = full_data_5, index_col = -1, test_index = 1)
tr_data_6, ts_data_6, tr_y_6, tr_x_6, ts_y_6, ts_x_6 = data_split(raw_data = full_data_6, index_col = -1, test_index = 1)

[1] file_name:  OV_six_fold_Annotation3000_400 
sample : 217  
features : 400
[2] file_name:  OV_six_fold_CV_400 
sample : 217  
features : 400
[3] file_name:  OV_six_fold_Var_400 
sample : 217  
features : 400
[4] file_name:  OV_six_fold_new_Diff_400 
sample : 217  
features : 400
[5] file_name:  OV_six_fold_Clin 
sample : 287  
features : 35
[6] file_name:  OV_six_fold_SNV 
sample : 213  
features : 13814


In [42]:
val_x_1 = tr_x_1.iloc[:int(tr_x_1.shape[0]/10),:]
tr_x_1 = tr_x_1.iloc[int(tr_x_1.shape[0]/10):,:]
val_y_1 = tr_y_1.iloc[:int(tr_y_1.shape[0]/10)]
tr_y_1 = tr_y_1.iloc[int(tr_y_1.shape[0]/10):]

val_x_2 = tr_x_2.iloc[:int(tr_x_2.shape[0]/10),:]
tr_x_2 = tr_x_2.iloc[int(tr_x_2.shape[0]/10):,:]
val_y_2 = tr_y_2.iloc[:int(tr_y_2.shape[0]/10)]
tr_y_2 = tr_y_2.iloc[int(tr_y_2.shape[0]/10):]

val_x_3 = tr_x_3.iloc[:int(tr_x_3.shape[0]/10),:]
tr_x_3 = tr_x_3.iloc[int(tr_x_3.shape[0]/10):,:]
val_y_3 = tr_y_3.iloc[:int(tr_y_3.shape[0]/10)]
tr_y_3 = tr_y_3.iloc[int(tr_y_3.shape[0]/10):]

val_x_4 = tr_x_4.iloc[:int(tr_x_4.shape[0]/10),:]
tr_x_4 = tr_x_4.iloc[int(tr_x_4.shape[0]/10):,:]
val_y_4 = tr_y_4.iloc[:int(tr_y_4.shape[0]/10)]
tr_y_4 = tr_y_4.iloc[int(tr_y_4.shape[0]/10):]

In [43]:
tr_x_list = [tr_x_1, tr_x_2, tr_x_3, tr_x_4]
tr_y_list = [tr_y_1, tr_y_2, tr_y_3, tr_y_4]
ts_x_list = [ts_x_1, ts_x_2, ts_x_3, ts_x_4]
ts_y_list = [ts_y_1, ts_y_2, ts_y_3, ts_y_4]
val_x_list = [val_x_1, val_x_2, val_x_3, val_x_4]
val_y_list = [val_y_1, val_y_2, val_y_3, val_y_4]

# 2. Training Models & Ensemble

## 1) Training models N stage

In [53]:
max_model_num = 5

In [54]:
sample_weight = np.array([1/tr_x_1.shape[0]]*tr_x_1.shape[0])
model_list = []
alpha_list = []
type_num_list = []
sample_weight_list = []
error_list = []

for step in range(max_model_num):
    print("# "+str(step+1)+"/"+str(max_model_num)+"th step\n")
    best_model = 0
    best_weighted_errors = 0
    best_weighted_error_sum = -1
    best_alpha = 0
    best_type_num = 0
    best_tr_acc = 0
    best_error = 0
    
    for t in range(4):
        
        tr_x = tr_x_list[t]
        tr_y = tr_y_list[t]
        val_x = val_x_list[t]
        val_y = val_y_list[t]
        #print(tr_x.iloc[:2][:2])

        model_t = train_NN_model(tr_x_val=tr_x, tr_y_val=tr_y, val_x_val=val_x, val_y_val=val_y, n_epoch=50)
        pred_Y = model_performance(using_model = model_t, tr_x_val=tr_x, tr_y_val=tr_y, output_list=["labeled_tr_predictions"])
        #if pred_Y is 0 or 1, all weighted predict Y of 0 samples will be just 0
        pred_Y_proc = pred_Y*2-1
        Y_proc = np.array(tr_y)*2-1
        error = abs(Y_proc - pred_Y_proc)/2
        error_sum = np.sum(error)
        weighted_errors = sample_weight*error.T
        weighted_error_sum = np.sum(weighted_errors)
        print("Weighted error of "+types[t]+":\n\t"+str(weighted_error_sum.round(3))+" (error: "+str((error_sum/tr_x_1.shape[0]).round(3))+")")
        
        if best_weighted_error_sum == -1 or best_weighted_error_sum > weighted_error_sum:
            #print(error)
            #print(weighted_errors)
            
            best_model = model_t
            best_error = error
            best_weighted_errors = weighted_errors
            best_weighted_error_sum = np.sum(weighted_errors)
            best_alpha = math.log((1-min(best_weighted_error_sum, (1-math.exp(-16))))/max(best_weighted_error_sum, math.exp(-16)))/2
            best_type_num = t
            best_tr_acc = model_t.evaluate(tr_x, tr_y, verbose = 0)[1]
            best_val_acc = model_t.evaluate(val_x, val_y, verbose = 0)[1]


    print("\n\t-> Selected: "+types[best_type_num])
    print("\t   tr_acc: "+str(best_tr_acc.round(3)))
    print("\t   val_acc: "+str(best_val_acc.round(3)))
    print("\t   Weighted error: "+str(best_weighted_error_sum.round(3))+", Alpha: "+str(np.float64(best_alpha).round(3))+"\n\n")
    error_term = (best_error*2)-1
    updated_weight = sample_weight*np.exp((-1)*best_alpha*error_term).T
    sample_weight = updated_weight / np.sum(updated_weight)
    
    model_list.append(best_model)
    alpha_list.append(best_alpha)
    type_num_list.append(best_type_num)
    sample_weight_list.append(sample_weight)
    error_list.append(best_error)



# 1/5th step

Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Weighted error of OV_six_fold_Annotation3000_400:
	0.018 (error: 0.018)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Weighted error of OV_six_fold_CV_400:
	0.0 (error: 0.0)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Weighted error of OV_six_fold_Var_400:
	0.054 (error: 0.054)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Weighted error of OV_six_fold_new_Diff_400:
	0.0 (error: 0.0)

	-

Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Weighted error of OV_six_fold_Annotation3000_400:
	0.131 (error: 0.131)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Weighted error of OV_six_fold_CV_400:
	0.0 (error: 0.0)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Weighted error of OV_six_fold_Var_400:
	0.012 (error: 0.012)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Weighted error of OV_six_fold_new_Diff_400:
	0.0 (error: 0.0)

	-> Selected: OV_six_fold_CV_400
	   tr_acc: 1.0
	   val_acc: 0.556
	   Weighted error: 0.0, Alpha: 8.0


# 3/5th step

Train on 168 samples, valida

Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Weighted error of OV_six_fold_CV_400:
	0.006 (error: 0.006)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Weighted error of OV_six_fold_Var_400:
	0.0 (error: 0.0)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Weighted error of OV_six_fold_new_Diff_400:
	0.0 (error: 0.0)

	-> Selected: OV_six_fold_Var_400
	   tr_acc: 1.0
	   val_acc: 0.611
	   Weighted error: 0.0, Alpha: 8.0


# 4/5th step

Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Weighted error of OV_six_fold_Annotation3000_400:
	0.036 (

Weighted error of OV_six_fold_CV_400:
	0.0 (error: 0.0)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Weighted error of OV_six_fold_Var_400:
	0.0 (error: 0.0)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Weighted error of OV_six_fold_new_Diff_400:
	0.0 (error: 0.0)

	-> Selected: OV_six_fold_CV_400
	   tr_acc: 1.0
	   val_acc: 0.556
	   Weighted error: 0.0, Alpha: 8.0


# 5/5th step

Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Weighted error of OV_six_fold_Annotation3000_400:
	0.113 (error: 0.113)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Ep

Epoch 11/50
Epoch 12/50
Weighted error of OV_six_fold_Var_400:
	0.012 (error: 0.012)
Train on 168 samples, validate on 18 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Weighted error of OV_six_fold_new_Diff_400:
	0.0 (error: 0.0)

	-> Selected: OV_six_fold_CV_400
	   tr_acc: 1.0
	   val_acc: 0.5
	   Weighted error: 0.0, Alpha: 8.0




## *AdaBoost Using Weight of Validation Samples

In [60]:
sample_weight = np.array([1/val_x_1.shape[0]]*val_x_1.shape[0])
model_list = []
alpha_list = []
type_num_list = []
sample_weight_list = []
error_list = []

for step in range(max_model_num):
    print("# "+str(step+1)+"/"+str(max_model_num)+"th step\n")
    best_model = 0
    best_weighted_errors = 0
    best_weighted_error_sum = -1
    best_alpha = 0
    best_type_num = 0
    best_tr_acc = 0
    best_val_acc = 0
    best_error = 0
    
    for t in range(4):
        
        tr_x = tr_x_list[t]
        tr_y = tr_y_list[t]
        val_x = val_x_list[t]
        val_y = val_y_list[t]
        #print(tr_x.iloc[:2][:2])

        model_t = train_NN_model(tr_x_val=tr_x, tr_y_val=tr_y, val_x_val=val_x, val_y_val=val_y, n_epoch=50)
        pred_Y = model_performance(using_model = model_t, tr_x_val=val_x, tr_y_val=val_y, output_list=["labeled_tr_predictions"])
        #if pred_Y is 0 or 1, all weighted predict Y of 0 samples will be just 0
        pred_Y_proc = pred_Y*2-1
        Y_proc = np.array(val_y)*2-1
        error = abs(Y_proc - pred_Y_proc)/2
        error_sum = np.sum(error)
        weighted_errors = sample_weight*error.T
        weighted_error_sum = np.sum(weighted_errors)
        print("Weighted error of "+types[t]+":\n\t"+str(weighted_error_sum.round(3))+" (error: "+str((error_sum/val_x_1.shape[0]).round(3))+")")
        
        if best_weighted_error_sum == -1 or best_weighted_error_sum > weighted_error_sum:
            #print(error)
            #print(weighted_errors)
            
            best_model = model_t
            best_error = error
            best_weighted_errors = weighted_errors
            best_weighted_error_sum = np.sum(weighted_errors)
            best_alpha = math.log((1-min(best_weighted_error_sum, (1-math.exp(-16))))/max(best_weighted_error_sum, math.exp(-16)))/2
            best_type_num = t
            best_tr_acc = model_t.evaluate(tr_x, tr_y, verbose = 0)[1]
            best_val_acc = model_t.evaluate(val_x, val_y, verbose = 0)[1]


    print("\n\t-> Selected: "+types[best_type_num])
    print("\t   tr_acc: "+str(best_tr_acc.round(3)))
    print("\t   val_acc: "+str(best_val_acc.round(3)))
    print("\t   Weighted error: "+str(best_weighted_error_sum.round(3))+", Alpha: "+str(np.float64(best_alpha).round(3))+"\n\n")
    error_term = (best_error*2)-1
    updated_weight = sample_weight*np.exp((-1)*best_alpha*error_term).T
    sample_weight = updated_weight / np.sum(updated_weight)
    
    model_list.append(best_model)
    alpha_list.append(best_alpha)
    type_num_list.append(best_type_num)
    sample_weight_list.append(sample_weight)
    error_list.append(best_error)



# 1/5th step

Weighted error of OV_six_fold_Annotation3000_400:
	0.389 (error: 0.389)
Weighted error of OV_six_fold_CV_400:
	0.444 (error: 0.444)
Weighted error of OV_six_fold_Var_400:
	0.444 (error: 0.444)
Weighted error of OV_six_fold_new_Diff_400:
	0.389 (error: 0.389)

	-> Selected: OV_six_fold_Annotation3000_400
	   tr_acc: 0.952
	   val_acc: 0.611
	   Weighted error: 0.389, Alpha: 0.226


# 2/5th step

Weighted error of OV_six_fold_Annotation3000_400:
	0.312 (error: 0.389)
Weighted error of OV_six_fold_CV_400:
	0.359 (error: 0.389)


Exception ignored in: <bound method ScopedTFStatus.__del__ of <tensorflow.python.framework.c_api_util.ScopedTFStatus object at 0x0000020A4BF116A0>>
Traceback (most recent call last):
  File "c:\users\hgh97\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\c_api_util.py", line 37, in __del__
    c_api.TF_DeleteStatus(self.status)
AttributeError: 'ScopedTFStatus' object has no attribute 'status'


Weighted error of OV_six_fold_Var_400:
	0.441 (error: 0.5)
Weighted error of OV_six_fold_new_Diff_400:
	0.441 (error: 0.5)

	-> Selected: OV_six_fold_Annotation3000_400
	   tr_acc: 0.982
	   val_acc: 0.611
	   Weighted error: 0.312, Alpha: 0.396


# 3/5th step

Weighted error of OV_six_fold_Annotation3000_400:
	0.263 (error: 0.444)
Weighted error of OV_six_fold_CV_400:
	0.374 (error: 0.444)
Weighted error of OV_six_fold_Var_400:
	0.58 (error: 0.611)
Weighted error of OV_six_fold_new_Diff_400:
	0.474 (error: 0.556)

	-> Selected: OV_six_fold_Annotation3000_400
	   tr_acc: 0.869
	   val_acc: 0.556
	   Weighted error: 0.263, Alpha: 0.516


# 4/5th step

Weighted error of OV_six_fold_Annotation3000_400:
	0.103 (error: 0.389)
Weighted error of OV_six_fold_CV_400:
	0.341 (error: 0.444)
Weighted error of OV_six_fold_Var_400:
	0.521 (error: 0.667)
Weighted error of OV_six_fold_new_Diff_400:
	0.508 (error: 0.5)

	-> Selected: OV_six_fold_Annotation3000_400
	   tr_acc: 0.911
	   val_acc: 0.611
	

In [64]:
for k in range(4):
    print(types[k]+": "+str(type_num_list.count(k)))

OV_six_fold_Annotation3000_400: 4
OV_six_fold_CV_400: 0
OV_six_fold_Var_400: 1
OV_six_fold_new_Diff_400: 0


In [65]:
tr_sum = 0
ts_sum = 0
alpha_sum = 0

for m in range(len(model_list)):
    b = type_num_list[m]
    best_type = types[b]
    print(str(m+1)+"th best model is "+best_type)
    [tr_x, tr_y, ts_x, ts_y] = [tr_x_list[b], tr_y_list[b], ts_x_list[b], ts_y_list[b]]
    print("# "+str(m+1)+" th model: "+best_type)
    tr_pred_Y = np.array(model_performance(using_model = model_list[m], tr_x_val=tr_x, tr_y_val=tr_y, ts_x_val=ts_x, ts_y_val=ts_y, output_list=["labeled_tr_predictions"]))
    ts_pred_Y = np.array(model_performance(using_model = model_list[m], tr_x_val=tr_x, tr_y_val=tr_y, ts_x_val=ts_x, ts_y_val=ts_y, output_list=["labeled_ts_predictions"]))
    tr_acc, ts_acc = np.array(model_performance(using_model = model_list[m], tr_x_val=tr_x, tr_y_val=tr_y, ts_x_val=ts_x, ts_y_val=ts_y, output_list=["tr_accuracy", "ts_accuracy"]))
    best_val_acc = model_t.evaluate(val_x, val_y, verbose = 0)[1]
    
    print("tr_acc: "+str(tr_acc))
    print("ts_acc: "+str(ts_acc))

    tr_pred_Y_proc = tr_pred_Y*2 -1 
    ts_pred_Y_proc = ts_pred_Y*2 -1
    
    #print(tr_pred_Y)
    #print(tr_pred_Y_proc)
    
    tr_sum = tr_sum + alpha_list[m]*tr_pred_Y_proc
    ts_sum = ts_sum + alpha_list[m]*ts_pred_Y_proc
    alpha_sum = alpha_sum + alpha_list[m]
    
    tr_sum_tot = np.where(tr_sum / alpha_sum > 0, 1, 0).flatten()
    ts_sum_tot = np.where(ts_sum / alpha_sum > 0, 1, 0).flatten()
    
    tr_acc_tot = 1 - np.sum(np.abs(tr_sum_tot - np.asarray(tr_y))) / tr_sum_tot.shape[0]
    ts_acc_tot = 1 - np.sum(np.abs(ts_sum_tot - np.asarray(ts_y))) / ts_sum_tot.shape[0]

    print("tr_acc_tot: "+str(tr_acc_tot))
    print("ts_acc_tot: "+str(ts_acc_tot)+"\n")
    
    
print("####################### Final acc: "+str(tr_acc_tot)+", "+str(ts_acc_tot))

1th best model is OV_six_fold_Annotation3000_400
# 1 th model: OV_six_fold_Annotation3000_400
tr_acc: 0.9523809523809523
ts_acc: 0.6774193644523621
tr_acc_tot: 0.9523809523809523
ts_acc_tot: 0.6774193548387097

2th best model is OV_six_fold_Annotation3000_400
# 2 th model: OV_six_fold_Annotation3000_400
tr_acc: 0.9821428571428571
ts_acc: 0.6774193644523621
tr_acc_tot: 0.9821428571428571
ts_acc_tot: 0.6774193548387097

3th best model is OV_six_fold_Annotation3000_400
# 3 th model: OV_six_fold_Annotation3000_400
tr_acc: 0.8690476190476191
ts_acc: 0.6774193644523621
tr_acc_tot: 0.9583333333333334
ts_acc_tot: 0.6774193548387097

4th best model is OV_six_fold_Annotation3000_400
# 4 th model: OV_six_fold_Annotation3000_400
tr_acc: 0.9107142857142857
ts_acc: 0.6774193644523621
tr_acc_tot: 0.9464285714285714
ts_acc_tot: 0.6451612903225806

5th best model is OV_six_fold_Var_400
# 5 th model: OV_six_fold_Var_400
tr_acc: 0.9880952380952381
ts_acc: 0.4838709533214569
tr_acc_tot: 0.9523809523809523