In [1]:
import pandas as pd 
import numpy as np
from doepy import build
import seaborn as sns
import matplotlib.pyplot as plt
import operator
import random
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
from IISEPaper import categorise
from IISEPaper import get_labeled_index
from IISEPaper import get_feature_label
from IISEPaper import get_labeled_set
from IISEPaper import get_training_Set
from IISEPaper import get_unlabeled_set
from IISEPaper import c_prediction
from sklearn.gaussian_process import GaussianProcessClassifier
from IISEPaper import predicted_region_divide
from IISEPaper import get_labeled_feasible_sample
from IISEPaper import get_feasible_rep
from IISEPaper import get_div_term
from IISEPaper import find_indices
from IISEPaper import get_select_index
from IISEPaper import GP_predict
from IISEPaper import labeled_fea_sample
from IISEPaper import remove_label
from IISEPaper import get_index_div

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [4]:
from scipy.stats import norm

In [5]:
def remove_label(full_list, remove_list):
    """Remove every occurrence of each value in ``remove_list`` from ``full_list``.

    The list is modified in place and the very same list object is returned,
    so any other name bound to ``full_list`` observes the removal as well.

    Parameters
    ----------
    full_list : list
        List to filter; mutated in place.
    remove_list : iterable
        Values to drop from ``full_list``.

    Returns
    -------
    list
        ``full_list`` itself, holding the surviving items in their original order.
    """
    blocked = list(remove_list)
    try:
        # Hash lookup turns the repeated `in` / `remove` scans into one linear pass.
        lookup = set(blocked)
        survivors = [item for item in full_list if item not in lookup]
    except TypeError:
        # Unhashable values: fall back to equality-based membership.
        survivors = [item for item in full_list if item not in blocked]
    # Slice assignment keeps the caller's list object (in-place contract).
    full_list[:] = survivors
    return full_list

In [6]:
def get_index_div(div, unlabeled_index):
    """Return, as a one-element list, the unlabeled index with the largest ``div`` score.

    Parameters
    ----------
    div : sequence of float
        One score per entry of ``unlabeled_index``.
    unlabeled_index : sequence
        Candidate indices, aligned position-by-position with ``div``.

    Returns
    -------
    list
        ``[unlabeled_index[p]]`` where ``p`` is the first position holding the
        maximum of ``div``.
    """
    peak = max(div)
    # list.index returns the first position equal to the peak, so ties go to
    # the earliest candidate.
    position = list(div).index(peak)
    return [unlabeled_index[position]]

In [7]:
def get_div_norm(unlabeled_feature, labeled_feature):
    """Min-max normalised mean distance from each unlabeled row to the labeled rows.

    NOTE: this notebook contains two definitions of ``get_div_norm``; whichever
    cell runs last is the one in effect. Keep them in sync or delete one.

    Parameters
    ----------
    unlabeled_feature : array-like, one row per unlabeled sample
    labeled_feature : array-like, one row per labeled sample (converted with ``np.array``)

    Returns
    -------
    numpy.ndarray
        One value in ``[0, 1]`` per unlabeled row. When every row has the same
        mean distance (including the single-row case) the result is all zeros
        instead of the NaN a 0/0 division would produce.
    """
    dis_div = euclidean_distances(unlabeled_feature, np.array(labeled_feature))
    div = dis_div.mean(axis=1)
    lowest = div.min()
    span = div.max() - lowest
    if span == 0:
        # Degenerate range: avoid 0/0, which would silently yield NaN because
        # warnings are suppressed in this notebook.
        return np.zeros_like(div)
    return (div - lowest) / span

In [8]:
def get_div_term(unlabeled_feature,labeled_feature):
    """Distance from each unlabeled row to its nearest labeled row.

    Parameters
    ----------
    unlabeled_feature : array-like, one row per unlabeled sample
    labeled_feature : array-like, one row per labeled sample (converted with ``np.array``)

    Returns
    -------
    numpy.ndarray
        Row-wise minimum of the pairwise Euclidean distances. The values are
        returned raw; no min-max normalisation is applied.
    """
    labeled_points = np.array(labeled_feature)
    pairwise = euclidean_distances(unlabeled_feature, labeled_points)
    nearest = pairwise.min(axis=1)
    return nearest

In [9]:
def get_index_uncertainty(y_c_uncertainty, unlabeled_index, n):
    """Pick the ``n`` unlabeled indices with the highest scores.

    Parameters
    ----------
    y_c_uncertainty : array-like of float
        One score per entry of ``unlabeled_index``.
    unlabeled_index : sequence
        Candidate indices, aligned position-by-position with the scores.
    n : int
        Number of candidates to select.

    Returns
    -------
    list
        Selected entries of ``unlabeled_index`` in ascending score order (the
        best candidate is last). Empty when ``n <= 0``; all candidates when
        ``n`` exceeds their number.
    """
    if n <= 0:
        # `[-0:]` would slice the whole array, so handle this explicitly.
        return []
    scores = np.asarray(y_c_uncertainty)
    top_positions = scores.argsort()[-n:]
    # Plain indexing always yields a list, including for a single selection
    # (operator.itemgetter returns a bare scalar in that case).
    return [unlabeled_index[position] for position in top_positions]

In [10]:
from sklearn.metrics.pairwise import euclidean_distances
def get_feasible_rep(c_pool, initial_feasible_sample):
    """Score each pool row by closeness (on average) to the given feasible samples.

    The mean Euclidean distance of every ``c_pool`` row to the rows of
    ``initial_feasible_sample`` is min-max normalised and inverted, so the
    closest row scores 1 and the farthest scores 0.

    Parameters
    ----------
    c_pool : array-like, one row per candidate
    initial_feasible_sample : array-like, one row per reference sample

    Returns
    -------
    numpy.ndarray
        One score in ``[0, 1]`` per candidate. When all mean distances are
        equal (including a single candidate) every score is 1 rather than the
        NaN a 0/0 division would produce.
    """
    feasible_rep = euclidean_distances(c_pool, initial_feasible_sample)
    rep_new = feasible_rep.mean(axis=1)
    lowest = rep_new.min()
    span = rep_new.max() - lowest
    if span == 0:
        # Degenerate range: the normalised distance is taken as 0 for all rows.
        return np.ones_like(rep_new)
    return 1 - (rep_new - lowest) / span

In [11]:
def get_rep_term(unlabeled_feature, labeled_feasible_sample):
    """Score each unlabeled row by closeness (on average) to the labeled feasible samples.

    The mean Euclidean distance of every ``unlabeled_feature`` row to the rows
    of ``labeled_feasible_sample`` is min-max normalised and inverted, so the
    closest row scores 1 and the farthest scores 0.

    Parameters
    ----------
    unlabeled_feature : array-like, one row per unlabeled sample
    labeled_feasible_sample : array-like, one row per reference sample

    Returns
    -------
    numpy.ndarray
        One score in ``[0, 1]`` per unlabeled row. When all mean distances are
        equal (including a single row) every score is 1 rather than the NaN a
        0/0 division would produce.
    """
    feasible_rep = euclidean_distances(unlabeled_feature, labeled_feasible_sample)
    rep_new = feasible_rep.mean(axis=1)
    lowest = rep_new.min()
    span = rep_new.max() - lowest
    if span == 0:
        # Degenerate range: the normalised distance is taken as 0 for all rows.
        return np.ones_like(rep_new)
    return 1 - (rep_new - lowest) / span

In [12]:
def labeled_fea_sample(x_labeled,y_labeled,y_r_labeled):
    """Split the labeled samples by their class label (0 versus 1).

    Parameters
    ----------
    x_labeled : numpy.ndarray
        Feature rows of the labeled samples.
    y_labeled : numpy.ndarray
        Class label per labeled sample; positions equal to 0 and to 1 are used.
    y_r_labeled : numpy.ndarray
        Response value per labeled sample.

    Returns
    -------
    tuple
        ``(rows of x_labeled with label 0, rows of x_labeled with label 1,
        entries of y_r_labeled with label 0)``.
    """
    zero_rows, one_rows = (np.where(y_labeled == label)[0] for label in (0, 1))
    features_zero = x_labeled[zero_rows]
    features_one = x_labeled[one_rows]
    responses_zero = y_r_labeled[zero_rows]
    return features_zero, features_one, responses_zero

In [13]:
def predicted_fea_region(y_pred,unlabeled_feature):
    """Return the rows of ``unlabeled_feature`` whose predicted label is 0.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predicted label per unlabeled sample.
    unlabeled_feature : numpy.ndarray
        Feature rows aligned with ``y_pred``.
    """
    is_zero = y_pred == 0
    selected_rows = np.where(is_zero)[0]
    return unlabeled_feature[selected_rows]

In [14]:
def get_div_norm(unlabeled_feature, labeled_feature):
    """Min-max normalised mean distance from each unlabeled row to the labeled rows.

    NOTE: this notebook contains two definitions of ``get_div_norm``; whichever
    cell runs last is the one in effect. Keep them in sync or delete one.

    Parameters
    ----------
    unlabeled_feature : array-like, one row per unlabeled sample
    labeled_feature : array-like, one row per labeled sample (converted with ``np.array``)

    Returns
    -------
    numpy.ndarray
        One value in ``[0, 1]`` per unlabeled row. When every row has the same
        mean distance (including the single-row case) the result is all zeros
        instead of the NaN a 0/0 division would produce.
    """
    dis_div = euclidean_distances(unlabeled_feature, np.array(labeled_feature))
    div = dis_div.mean(axis=1)
    lowest = div.min()
    span = div.max() - lowest
    if span == 0:
        # Degenerate range: avoid 0/0, which would silently yield NaN because
        # warnings are suppressed in this notebook.
        return np.zeros_like(div)
    return (div - lowest) / span

In [15]:
def find_rows(source, target):
    """Return the indices of the rows of ``source`` that equal ``target`` element-wise.

    Parameters
    ----------
    source : numpy.ndarray
        2-D array whose rows are compared.
    target : array-like
        Row to look for; compared to ``source`` with ``==``.

    Returns
    -------
    numpy.ndarray
        Integer positions of the rows where every element matches.
    """
    elementwise_equal = source == target
    row_matches = elementwise_equal.all(axis=1)
    return np.where(row_matches)[0]

In [16]:
def get_rep_unlabeled(unlabeled_feature):
    """Score each unlabeled row by how close it is, on average, to the other rows.

    For every row the mean Euclidean distance to all *other* rows (the
    self-distance on the diagonal is excluded) is min-max normalised and
    inverted, so the most central row scores 1 and the most remote scores 0.

    Parameters
    ----------
    unlabeled_feature : array-like, one row per unlabeled sample

    Returns
    -------
    numpy.ndarray
        One score in ``[0, 1]`` per row. With fewer than two rows, or when all
        mean distances are equal, every score is 1 rather than NaN.
    """
    A = euclidean_distances(unlabeled_feature, unlabeled_feature)
    m = A.shape[0]
    if m < 2:
        # No "other" rows to average over.
        return np.ones(m)
    # Boolean mask drops the diagonal and keeps row-major order: the same
    # elements the previous as_strided view selected, without depending on
    # the array's memory layout.
    off_diagonal = A[~np.eye(m, dtype=bool)].reshape(m, m - 1)
    rep = off_diagonal.mean(axis=1)
    lowest = rep.min()
    span = rep.max() - lowest
    if span == 0:
        # Degenerate range: avoid 0/0 -> NaN.
        return np.ones_like(rep)
    return 1 - (rep - lowest) / span

In [17]:
def EI(mean, std, y_best):
    """Expected improvement below ``y_best`` under a normal predictive distribution.

    Computes ``(y_best - mean) * Phi(z) + std * phi(z)`` with
    ``z = (y_best - mean) / std``, where ``Phi`` / ``phi`` are the standard
    normal CDF / PDF.

    Parameters
    ----------
    mean : float or array-like
        Predictive mean(s).
    std : float or array-like
        Predictive standard deviation(s), broadcastable with ``mean``.
    y_best : float or array-like
        Incumbent value to improve upon (smaller is better).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Expected improvement, scalar for scalar inputs. Where ``std == 0`` the
        deterministic limit ``max(y_best - mean, 0)`` is returned instead of
        the NaN produced by 0/0 (NaN would sort as the largest value in an
        argsort-based selection).
    """
    mean = np.asarray(mean, dtype=float)
    std = np.asarray(std, dtype=float)
    improvement = y_best - mean
    degenerate = std == 0
    # Substitute a dummy divisor where std is zero; those entries are
    # overwritten with the deterministic limit below.
    safe_std = np.where(degenerate, 1.0, std)
    z = improvement / safe_std
    ei = improvement * norm.cdf(z) + std * norm.pdf(z)
    ei = np.where(degenerate, np.maximum(improvement, 0.0), ei)
    # Unwrap 0-d results so scalar inputs still yield a scalar.
    return ei[()] if ei.ndim == 0 else ei

In [18]:
def drop_na(x):
    """Return the entries of ``x`` that are not NaN.

    Parameters
    ----------
    x : numpy.ndarray
        Numeric array; filtered with a boolean mask built from ``np.isnan``.
    """
    is_missing = np.isnan(x)
    keep = ~is_missing
    return x[keep]

In [19]:
from sklearn import preprocessing
# Standard-score scaler; fitted on the feature matrix further down in this cell.
s_scaler = preprocessing.StandardScaler()
# The dataset is read from a CSV given by a relative path (resolved against the
# kernel's working directory).
data = pd.read_csv('dataset1101.csv')
# get_feature_label is imported from the project-local IISEPaper module.
# Judging by how the results are indexed later in this notebook, X_feature,
# y_ground and y_r are positional-indexable arrays aligned row-by-row with
# `data` -- TODO confirm against IISEPaper.
ds,X_feature,y_ground,y_r = get_feature_label(data)
# NOTE(review): X_norm is computed here, but the active-learning loop visible
# in this notebook indexes X_feature (the unscaled features), not X_norm.
# Confirm which one is intended.
X_norm = s_scaler.fit_transform(X_feature)

In [20]:
# Constraint classifier with scikit-learn's default settings; the same object
# is passed to c_prediction on every iteration of the loop below.
model = GaussianProcessClassifier()
# Initial labeled-index sets, one entry per experiment round.
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code; only load this file from a trusted source.
index_file = np.load('data1_10_index.npy',allow_pickle = True)
# All sample positions; this name is rebuilt at the start of every round in
# the loop cell.
full_index = list(range(y_ground.shape[0]))

In [21]:
# Active-learning experiment: for every initial labeled-index set in
# `index_file`, run 20 selection iterations that each move 3 samples from the
# unlabeled pool to the labeled set. Per round, `result` collects the trace `r`
# and `acc` collects the per-iteration accuracy values.
result=[]
acc=[]
for n in range(len(index_file)):
    labeled_index = list(index_file[n])
    # Rebuilt every round because remove_label (defined in this notebook)
    # strips entries from the list it is given, in place.
    full_index = list(np.arange(len(data)))
    x_labeled = X_feature[labeled_index]

    
    y_labeled = y_ground[labeled_index]
    # NOTE(review): y_r_labeled is assigned but never read in this cell.
    y_r_labeled = y_r[labeled_index]
    # remove_label returns its first argument, so unlabeled_index and
    # full_index are the same list object from here on.
    unlabeled_index = remove_label(full_index,labeled_index)
    unlabeled_feature = X_feature[unlabeled_index]

    # Response values of all currently labeled samples.
    r_selected = y_r[labeled_index]

    # Despite the name, f1 stores accuracy_score values (one per iteration).
    f1=[]
    # Iteration counter for the while loop below.
    l=0

    # labeled_fea_sample splits by class label: rows with label 0 (named
    # "feasible" here), rows with label 1 ("infeasible"), and the responses of
    # the label-0 rows.
    labeled_feasible_sample, labeled_infeasible_sample,labeled_r = labeled_fea_sample(x_labeled,y_labeled,r_selected)
    # Trace starts with the smallest label-0 response, shifted by -1.
    # NOTE(review): the reason for the -1 offset is not visible here; min()
    # raises ValueError if there is no label-0 sample.
    r=[min(labeled_r)-1]
   

    # Earlier stopping rule, kept for reference: while min(r) > 0 and len(r)<50:
    while l <20:
        # Constraint (classification) model prediction on the unlabeled pool.
        # c_prediction comes from IISEPaper; presumably it fits `model` on the
        # labeled data and returns predicted labels, class probabilities and an
        # uncertainty score for the pool -- TODO confirm.
        print(x_labeled.shape)
        y_c_pred , y_prob , y_c_uncertainty = c_prediction(x_labeled, y_labeled, unlabeled_feature,model)
        
        # Accuracy over the whole dataset: labeled positions keep their
        # ground-truth label (so they always count as correct); only the
        # unlabeled positions carry predictions.
        y_c = y_ground.copy()
        y_c[unlabeled_index] = y_c_pred
        f1.append(accuracy_score(y_ground, y_c))

        # Build the regression training set: start from the responses of all
        # labeled samples, then overwrite the entries whose label is 1 with
        # predictions for those rows.
        # GP_predict comes from IISEPaper; presumably it fits a regressor on
        # (first arg, second arg) and returns mean and std at the third
        # argument -- TODO confirm.
        training_x = x_labeled
        training_y = r_selected.copy()
        labeled_feasible_sample, labeled_infeasible_sample,labeled_r = labeled_fea_sample(x_labeled,y_labeled,r_selected)
        y_pred_labeled_sudo,sigma_labeled_sudo = GP_predict(labeled_feasible_sample,labeled_r,labeled_infeasible_sample)
        training_y[np.where(y_labeled == 1)[0]]=y_pred_labeled_sudo


        # Predict the response over the unlabeled pool.
        y_pred_pool,sigma_pool = GP_predict(training_x,training_y,unlabeled_feature)
        # NOTE(review): the incumbent is the minimum over ALL labeled responses,
        # including label-1 rows, whereas the trace `r` uses only label-0
        # responses (labeled_r). Confirm this is intended.
        y_best = min(r_selected)
        
        EIP = EI(y_pred_pool,sigma_pool, y_best)

        # Acquisition score: expected improvement weighted by the largest class
        # probability and by (1 - predicted label). Assuming labels are 0/1,
        # pool points predicted as 1 get a score of 0 -- TODO confirm.
        constrain_EIP = (1-y_c_pred) * (np.amax(y_prob, axis=1)) *EIP
        

        # Pick the 3 pool indices with the largest acquisition score.
        selection_index = get_index_uncertainty(constrain_EIP, unlabeled_index,3)
        
        # Move the selected indices into the labeled set.
        for i in selection_index:
            labeled_index.append(i)

        x_labeled = X_feature[labeled_index]
       
        # full_index already lacks the earlier labeled indices; this call
        # removes the newly selected ones from the same list.
        unlabeled_index = remove_label(full_index,labeled_index)
        unlabeled_feature = X_feature[unlabeled_index]
       
        y_labeled = y_ground[labeled_index]
        r_selected = y_r[labeled_index]
        # NOTE(review): unused in this cell.
        y_r_labeled = y_r[labeled_index]
       
        # NOTE(review): repeats the r_selected assignment made three lines above.
        r_selected = y_r[labeled_index]
        labeled_feasible_sample, labeled_infeasible_sample,labeled_r = labeled_fea_sample(x_labeled,y_labeled,r_selected)
        # Record the best label-0 response after this iteration (same -1 shift).
        r.append(min(labeled_r)-1)
        
        l=l+1
    result.append(r)
    acc.append(f1)
    print('Round {} finished'.format(n))

(164, 2)
(167, 2)
(170, 2)
(173, 2)
(176, 2)
(179, 2)
(182, 2)
(185, 2)
(188, 2)
(191, 2)
(194, 2)
(197, 2)
(200, 2)
(203, 2)
(206, 2)
(209, 2)
(212, 2)
(215, 2)
(218, 2)
(221, 2)
Round 0 finished
(165, 2)
(168, 2)
(171, 2)
(174, 2)
(177, 2)
(180, 2)
(183, 2)
(186, 2)
(189, 2)
(192, 2)
(195, 2)
(198, 2)
(201, 2)
(204, 2)
(207, 2)
(210, 2)
(213, 2)
(216, 2)
(219, 2)
(222, 2)
Round 1 finished
(167, 2)
(170, 2)
(173, 2)
(176, 2)
(179, 2)
(182, 2)
(185, 2)
(188, 2)
(191, 2)
(194, 2)
(197, 2)
(200, 2)
(203, 2)
(206, 2)
(209, 2)
(212, 2)
(215, 2)
(218, 2)
(221, 2)
(224, 2)
Round 2 finished
(165, 2)
(168, 2)
(171, 2)
(174, 2)
(177, 2)
(180, 2)
(183, 2)
(186, 2)
(189, 2)
(192, 2)
(195, 2)
(198, 2)
(201, 2)
(204, 2)
(207, 2)
(210, 2)
(213, 2)
(216, 2)
(219, 2)
(222, 2)
Round 3 finished
(166, 2)
(169, 2)
(172, 2)
(175, 2)
(178, 2)
(181, 2)
(184, 2)
(187, 2)
(190, 2)
(193, 2)
(196, 2)
(199, 2)
(202, 2)
(205, 2)
(208, 2)
(211, 2)
(214, 2)
(217, 2)
(220, 2)
(223, 2)
Round 4 finished
(159, 2)
(162, 

In [None]:
from itertools import zip_longest

# Stack the per-round `r` traces into a 2-D array of shape (rounds, positions);
# zip_longest pads shorter traces with 0.
# NOTE(review): with the fixed 20-iteration loop every trace has 21 entries, so
# no padding occurs. If a variable-length stopping rule is restored, padding
# with 0 would bias the mean/std computed from this array.
temp = np.array(list(zip_longest(*result, fillvalue=0))).T

In [None]:
# Mean and standard deviation of the `r` traces across rounds, one value per
# trace position (axis 0 of `temp` indexes rounds).
mean = temp.mean(axis=0)
std = temp.std(axis=0)

In [22]:
# Mean and standard deviation of the accuracy values across rounds, one value
# per iteration (`acc` holds one list of per-iteration accuracies per round).
mean_al=np.array(acc).mean(axis=0)
std_al = np.array(acc).std(axis=0)

In [23]:
# Display the per-iteration mean accuracy across rounds.
mean_al

array([0.74584289, 0.75306649, 0.75312314, 0.75504943, 0.7557293 ,
       0.75536104, 0.7541996 , 0.75366137, 0.75210334, 0.75060197,
       0.75388799, 0.7562392 , 0.75768392, 0.75944024, 0.7603184 ,
       0.76176312, 0.76572902, 0.76796691, 0.76963825, 0.7704031 ])

In [24]:
# Display the per-iteration standard deviation of accuracy across rounds.
std_al

array([0.03487051, 0.0375502 , 0.03651823, 0.03439806, 0.03451474,
       0.03213837, 0.0324404 , 0.03238908, 0.03239077, 0.03200702,
       0.03098902, 0.03086902, 0.03034399, 0.03064579, 0.03034375,
       0.02858193, 0.02765723, 0.02967016, 0.02873323, 0.0298728 ])