In [1]:
import math
import random
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import GradientBoostingClassifier



In [None]:
# Predict the class-membership probabilities of the "unlabeled" pool
def proba_func(x_init,y_init,x_choice,y_choice):
    """Train a three-member committee on the labeled set and score the unlabeled pool.

    The labeled rows are shuffled with ``random.shuffle`` and cut into three
    disjoint parts at positions ``int(n/3)`` and ``int(n*2/3)``. One
    ``GradientBoostingClassifier`` (default settings) is fitted on each part.

    Parameters
    ----------
    x_init, y_init
        Labeled features and labels; rows are selected positionally via ``.iloc``.
    x_choice
        Unlabeled features handed to ``predict_proba`` of every committee member.
    y_choice
        Not used by this function.

    Returns
    -------
    tuple
        ``(proba1, proba2, proba3)``: the ``predict_proba`` output of the three
        members for ``x_choice``, in the order the parts were cut.
    """
    n_labeled = len(x_init)

    # Random permutation of the labeled row positions
    order = list(range(n_labeled))
    random.shuffle(order)

    # Cut the permutation into three parts
    cut1 = int(n_labeled/3)
    cut2 = int(n_labeled*2/3)
    parts = (order[:cut1], order[cut1:cut2], order[cut2:])

    # Create and train one classifier per part (fitted in part order)
    committee = []
    for part in parts:
        member = GradientBoostingClassifier()
        member.fit(x_init.iloc[part], y_init.iloc[part])
        committee.append(member)

    # Class probabilities of the unlabeled pool, one array per member
    return tuple(member.predict_proba(x_choice) for member in committee)

In [None]:
# Scoring function
def scores_func(proba):
    """Return the natural-log entropy of every probability row in ``proba``.

    ``proba`` is an iterable of rows such as ``[[0.1,0.9],[0.4,0.6],[0.7,0.3]]``.
    For each row the result is the sum of ``-p*log(p)`` over its entries;
    entries equal to 0 or 1 contribute nothing.

    Returns a list with one score per row, in input order.
    """
    def _row_entropy(row):
        total = 0
        for prob in row:
            # Skipping exact 0 and 1 avoids log(0); both terms are zero anyway
            if prob not in (0, 1):
                total += -prob*math.log(prob,math.e)
        return total

    return [_row_entropy(row) for row in proba]

In [None]:
# Build the updated sample sets: labeled features/labels, then unlabeled features/labels
def new_dataset(x_init,y_init,x_choice,y_choice,scores_sort,each_size=-30):
    """Move the highest-scoring unlabeled samples into the labeled set.

    ``scores_sort`` holds one score per row of ``x_choice``. The rows are ranked
    by ascending score with ``np.argsort``; the slice ``[each_size:]`` of that
    ranking is appended to the labeled set and the slice ``[:each_size]`` stays
    in the unlabeled pool. With the default negative ``each_size`` this means
    the ``-each_size`` largest scores are moved.

    Returns ``(x_init, y_init, x_choice, y_choice)`` as new objects; rows keep
    their original index labels.
    """
    # Positions of the pool rows ordered from lowest to highest score
    ranking = np.argsort(np.array(scores_sort))

    picked = ranking[each_size:]    # tail of the ranking: largest scores
    remaining = ranking[:each_size]  # everything before the tail

    grown_x = pd.concat([x_init, x_choice.iloc[picked]])
    grown_y = pd.concat([y_init, y_choice.iloc[picked]])

    return grown_x, grown_y, x_choice.iloc[remaining], y_choice.iloc[remaining]

In [None]:
# Basic QBC (query by committee)
def choice_tra(x_init,y_init,x_choice,y_choice):
    """Run one basic QBC selection round.

    Each unlabeled sample is scored with the largest of the three committee
    members' entropies (``scores_func`` applied to each ``proba_func`` output);
    ``new_dataset`` then moves the top-scoring samples into the labeled set.

    Returns the updated ``(x_init, y_init, x_choice, y_choice)``.
    """
    # (learning engine) class probabilities of the unlabeled pool per committee member
    committee_proba = proba_func(x_init,y_init,x_choice,y_choice)

    # (selection engine) per-member entropy of every unlabeled sample
    entropy_a, entropy_b, entropy_c = (scores_func(p) for p in committee_proba)

    # (selection engine) the sample's score is the maximum entropy over the committee
    scores_sort = [max(triple) for triple in zip(entropy_a, entropy_b, entropy_c)]

    # Selected samples go to the expert for labeling: they join the labeled set
    # and are removed from the unlabeled pool
    return new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

In [None]:
# Committee-weighted QBC
def choice_committee_weighting(x_init,y_init,x_choice,y_choice):
    """Run one QBC round in which each member's entropy is weighted by its accuracy.

    Three ``GradientBoostingClassifier`` members are trained on a random
    three-way split of the labeled set. Each member's weight is its
    ``score`` on the complete labeled set (which includes its own training
    part). A sample's score is the maximum of ``entropy * weight`` over the
    members; ``new_dataset`` moves the top-scoring samples.

    Returns the updated ``(x_init, y_init, x_choice, y_choice)``.
    """
    n_labeled = len(x_init)

    # Shuffle the labeled row positions and cut them into three parts
    order = list(range(n_labeled))
    random.shuffle(order)
    cut1 = int(n_labeled/3)
    cut2 = int(n_labeled*2/3)

    committee = []
    for part in (order[:cut1], order[cut1:cut2], order[cut2:]):
        member = GradientBoostingClassifier()
        member.fit(x_init.iloc[part], y_init.iloc[part])
        committee.append(member)

    # Accuracy of each member on the whole labeled set
    weights = [member.score(x_init,y_init) for member in committee]

    # Entropy of every unlabeled sample, per member
    entropies = [scores_func(member.predict_proba(x_choice)) for member in committee]

    # Key step: every entropy is multiplied by its member's accuracy (committee weighting)
    scores_sort = [
        max(e1*weights[0], e2*weights[1], e3*weights[2])
        for e1, e2, e3 in zip(*entropies)
    ]

    return new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

In [None]:
# Diversity QBC
def choice_diversity(x_init,y_init,x_choice,y_choice):
    """Run one QBC round whose score is committee entropy times a diversity factor.

    The diversity factor of an unlabeled sample is its cosine distance to its
    nearest other sample in the union of labeled and unlabeled features.

    Returns the updated ``(x_init, y_init, x_choice, y_choice)``.
    """
    committee_proba = proba_func(x_init,y_init,x_choice,y_choice)
    entropy_a, entropy_b, entropy_c = (scores_func(p) for p in committee_proba)

    # Union of the labeled and unlabeled feature matrices
    x_all = pd.concat([x_init,x_choice])

    # Index of the nearest samples (default NearestNeighbors metric) in the union
    neigh = NearestNeighbors()
    neigh.fit(x_all)
    pool_rows = [x_choice.iloc[pos] for pos in range(len(x_choice))]
    # Two neighbours are requested because the index was fitted on x_all, so the
    # closest hit is expected to be the query sample itself
    neighbour_idx = neigh.kneighbors(pool_rows, 2, return_distance=False)

    scores_sort = []
    for pos, committee_entropy in enumerate(zip(entropy_a, entropy_b, entropy_c)):
        # Diversity: cosine distance to the second hit (the nearest other sample)
        nearest_other = x_all.iloc[neighbour_idx[pos][1:2]]
        diversity = pairwise_distances([pool_rows[pos]],nearest_other,metric="cosine").sum()
        scores_sort.append(max(committee_entropy)*diversity)

    return new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

In [None]:
# Density QBC
def choice_density(x_init,y_init,x_choice,y_choice,density_scope=11):
    """Run one QBC round whose score is committee entropy times a density factor.

    For every unlabeled sample the ``density_scope`` nearest samples in the
    union of labeled and unlabeled features are looked up; the first hit is
    dropped and the density is
    ``((density_scope-1) - sum of cosine distances to the rest) / density_scope``.

    Returns the updated ``(x_init, y_init, x_choice, y_choice)``.
    """
    committee_proba = proba_func(x_init,y_init,x_choice,y_choice)
    entropy_a, entropy_b, entropy_c = (scores_func(p) for p in committee_proba)

    # Union of the labeled and unlabeled feature matrices
    x_all = pd.concat([x_init,x_choice])

    # Indices of the density_scope nearest samples of every unlabeled sample
    neigh = NearestNeighbors()
    neigh.fit(x_all)
    pool_rows = [x_choice.iloc[pos] for pos in range(len(x_choice))]
    neighbour_idx = neigh.kneighbors(pool_rows, density_scope, return_distance=False)

    scores_sort = []
    for pos, committee_entropy in enumerate(zip(entropy_a, entropy_b, entropy_c)):
        # Density: accumulated cosine similarity to the neighbours after the first hit.
        # Note the divisor is density_scope, not the neighbour count; this is a
        # constant factor shared by all samples.
        neighbours = x_all.iloc[neighbour_idx[pos][1:]]
        distance_sum = pairwise_distances([pool_rows[pos]],neighbours,metric="cosine").sum()
        density = ((density_scope-1)-distance_sum)/density_scope
        scores_sort.append(max(committee_entropy)*density)

    return new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

In [None]:
# Balance QBC
def choice_balance(x_init,y_init,x_choice,y_choice,bal_scale=0.35,bigger_par=0.6):
    """Run one QBC round that down-weights samples when the labeled set is imbalanced.

    The share of label ``2`` among labels ``1`` and ``2`` in ``y_init`` is
    compared with ``bal_scale``. When the share is below ``bal_scale``, every
    sample whose mean first-column probability over the committee is not below
    0.5 has its score multiplied by ``bigger_par``; otherwise the plain maximum
    committee entropy is used.

    Returns the updated ``(x_init, y_init, x_choice, y_choice)``.
    """
    proba1,proba2,proba3 = proba_func(x_init,y_init,x_choice,y_choice)
    entropy_a, entropy_b, entropy_c = (scores_func(p) for p in (proba1,proba2,proba3))

    # Balance of the labeled set: count of label 2 over the count of labels 1 and 2
    # (variable names suggest 1 = normal, 2 = suspicious -- confirm against the data)
    label_counts = y_init.value_counts()
    n_normal = label_counts[1]
    n_suspicious = label_counts[2]
    share_suspicious = n_suspicious/(n_normal+n_suspicious)
    rebalance = not (share_suspicious >= bal_scale)

    scores_sort = []
    for pos, committee_entropy in enumerate(zip(entropy_a, entropy_b, entropy_c)):
        top_entropy = max(committee_entropy)
        # Only when the labeled set is less balanced than requested: samples the
        # committee on average assigns to the first class get a reduced score
        if rebalance and not ((proba1[pos][0]+proba2[pos][0]+proba3[pos][0])/3 < 0.5):
            top_entropy = top_entropy*bigger_par
        scores_sort.append(top_entropy)

    return new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

In [None]:
# Density + balance QBC: the two factors are combined only in the selection score
def choice_density_balance(x_init,y_init,x_choice,y_choice,bal_scale=0.35,bigger_par=0.6,density_scope=11):
    """Run one QBC round scored by committee entropy, density and class balance.

    Parameters
    ----------
    x_init, y_init : labeled features / labels.
    x_choice, y_choice : unlabeled features / labels.
    bal_scale : minimum share of label ``2`` (among labels ``1`` and ``2`` of
        ``y_init``) for the labeled set to count as balanced.
    bigger_par : factor applied to the score of samples whose mean first-column
        committee probability is not below 0.5, when the set is not balanced.
    density_scope : number of nearest neighbours requested per sample; the
        first hit (expected to be the sample itself) is dropped, so
        ``density_scope-1`` neighbours enter the density.

    Returns
    -------
    The updated ``(x_init, y_init, x_choice, y_choice)`` from ``new_dataset``.

    Raises
    ------
    ValueError
        If ``density_scope`` is smaller than 2 (no neighbour would remain).
    """
    if density_scope < 2:
        raise ValueError("density_scope must be at least 2")
    # Neighbours that remain after dropping the first hit; replaces the former
    # hard-coded 10 so that the density follows density_scope
    n_neighbors = density_scope-1

    proba1,proba2,proba3 = proba_func(x_init,y_init,x_choice,y_choice)

    scores1_sort = scores_func(proba1)
    scores2_sort = scores_func(proba2)
    scores3_sort = scores_func(proba3)

    # Balance of the labeled set (names suggest 1 = normal, 2 = suspicious -- confirm)
    label_counts = y_init.value_counts()
    distribution_normal = label_counts[1]
    distribution_suspicious = label_counts[2]
    distribution = distribution_suspicious/(distribution_normal+distribution_suspicious)
    rebalance = not (distribution >= bal_scale)

    # Union of labeled and unlabeled features; built locally so the function
    # does not depend on a notebook-level variable
    x_all = pd.concat([x_init,x_choice])

    neigh = NearestNeighbors()
    neigh.fit(x_all)
    # Query with the frame itself (same columns as the fitted data)
    distance_number = neigh.kneighbors(x_choice, n_neighbors=density_scope, return_distance=False)

    scores_sort = []
    for i in range(len(scores1_sort)):
        # Density: mean cosine similarity to the neighbours after the first hit
        neighbours = x_all.iloc[distance_number[i][1:]]
        density = (n_neighbors-pairwise_distances([x_choice.iloc[i]],neighbours,metric="cosine").sum())/n_neighbors
        col = max(scores1_sort[i],scores2_sort[i],scores3_sort[i])*density
        # Imbalanced labeled set: reduce the score of samples the committee on
        # average assigns to the first class
        if rebalance and not ((proba1[i][0]+proba2[i][0]+proba3[i][0])/3 < 0.5):
            col = col*bigger_par
        scores_sort.append(col)

    x_init,y_init,x_choice,y_choice = new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

    return x_init,y_init,x_choice,y_choice

In [None]:
# Diversity + density QBC
def choice_diversity_density(x_init,y_init,x_choice,y_choice,density_scope=11):
    """Run one QBC round scored by committee entropy, diversity and density.

    Parameters
    ----------
    x_init, y_init : labeled features / labels.
    x_choice, y_choice : unlabeled features / labels.
    density_scope : number of nearest neighbours requested per sample; the
        first hit (expected to be the sample itself) is dropped, so
        ``density_scope-1`` neighbours enter the density.

    Returns
    -------
    The updated ``(x_init, y_init, x_choice, y_choice)`` from ``new_dataset``.

    Raises
    ------
    ValueError
        If ``density_scope`` is smaller than 2 (no neighbour would remain).
    """
    if density_scope < 2:
        raise ValueError("density_scope must be at least 2")
    # Neighbours that remain after dropping the first hit; replaces the former
    # hard-coded 10 so that the density follows density_scope
    n_neighbors = density_scope-1

    proba1,proba2,proba3 = proba_func(x_init,y_init,x_choice,y_choice)

    scores1_sort = scores_func(proba1)
    scores2_sort = scores_func(proba2)
    scores3_sort = scores_func(proba3)

    # Union of the labeled and unlabeled feature matrices
    x_all = pd.concat([x_init,x_choice])

    neigh = NearestNeighbors()
    neigh.fit(x_all)
    # Query with the frame itself (same columns as the fitted data)
    distance_number = neigh.kneighbors(x_choice, n_neighbors=density_scope, return_distance=False)

    scores_sort = []
    for i in range(len(scores1_sort)):
        sample = [x_choice.iloc[i]]
        # Diversity: cosine distance to the nearest other sample (second hit)
        diversity = pairwise_distances(sample,x_all.iloc[distance_number[i][1:2]],metric="cosine").sum()
        # Density: mean cosine similarity to the neighbours after the first hit
        density = (n_neighbors-pairwise_distances(sample,x_all.iloc[distance_number[i][1:]],metric="cosine").sum())/n_neighbors
        col = max(scores1_sort[i],scores2_sort[i],scores3_sort[i])*diversity*density
        scores_sort.append(col)

    x_init,y_init,x_choice,y_choice = new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

    return x_init,y_init,x_choice,y_choice

In [None]:
# Density + balance + committee-weighted QBC (factors are combined in the selection score)
def choice_density_balance_committee_weighting(x_init,y_init,x_choice,y_choice,bal_scale=0.35,bigger_par=0.6,density_scope=11):
    """Run one QBC round scored by accuracy-weighted entropy, density and class balance.

    Parameters
    ----------
    x_init, y_init : labeled features / labels.
    x_choice, y_choice : unlabeled features / labels.
    bal_scale : minimum share of label ``2`` (among labels ``1`` and ``2`` of
        ``y_init``) for the labeled set to count as balanced.
    bigger_par : factor applied to the score of samples whose mean first-column
        committee probability is not below 0.5, when the set is not balanced.
    density_scope : number of nearest neighbours requested per sample; the
        first hit (expected to be the sample itself) is dropped, so
        ``density_scope-1`` neighbours enter the density.

    Returns
    -------
    The updated ``(x_init, y_init, x_choice, y_choice)`` from ``new_dataset``.

    Raises
    ------
    ValueError
        If ``density_scope`` is smaller than 2 (no neighbour would remain).
    """
    if density_scope < 2:
        raise ValueError("density_scope must be at least 2")
    # Neighbours that remain after dropping the first hit; replaces the former
    # hard-coded 10 so that the density follows density_scope
    n_neighbors = density_scope-1

    # Shuffle the labeled row positions and cut them into three parts
    choice_list = list(range(len(x_init)))
    random.shuffle(choice_list)

    num1 = int(len(x_init)/3)
    num2 = int(len(x_init)*2/3)

    # Train one classifier per part; its weight is its accuracy on the whole labeled set
    committee = []
    weights = []
    for part in (choice_list[:num1], choice_list[num1:num2], choice_list[num2:]):
        member = GradientBoostingClassifier()
        member.fit(x_init.iloc[part],y_init.iloc[part])
        committee.append(member)
        weights.append(member.score(x_init,y_init))
    score_weight1,score_weight2,score_weight3 = weights

    proba1,proba2,proba3 = [member.predict_proba(x_choice) for member in committee]

    scores1_sort = scores_func(proba1)
    scores2_sort = scores_func(proba2)
    scores3_sort = scores_func(proba3)

    # Balance of the labeled set (names suggest 1 = normal, 2 = suspicious -- confirm)
    label_counts = y_init.value_counts()
    distribution_normal = label_counts[1]
    distribution_suspicious = label_counts[2]
    distribution = distribution_suspicious/(distribution_normal+distribution_suspicious)
    rebalance = not (distribution >= bal_scale)

    # Union of labeled and unlabeled features; built locally so the function
    # does not depend on a notebook-level variable
    x_all = pd.concat([x_init,x_choice])

    neigh = NearestNeighbors()
    neigh.fit(x_all)
    # Query with the frame itself (same columns as the fitted data)
    distance_number = neigh.kneighbors(x_choice, n_neighbors=density_scope, return_distance=False)

    scores_sort = []
    for i in range(len(scores1_sort)):
        # Density: mean cosine similarity to the neighbours after the first hit
        neighbours = x_all.iloc[distance_number[i][1:]]
        density = (n_neighbors-pairwise_distances([x_choice.iloc[i]],neighbours,metric="cosine").sum())/n_neighbors
        col = max(scores1_sort[i]*score_weight1,scores2_sort[i]*score_weight2,scores3_sort[i]*score_weight3)*density
        # Imbalanced labeled set: reduce the score of samples the committee on
        # average assigns to the first class
        if rebalance and not ((proba1[i][0]+proba2[i][0]+proba3[i][0])/3 < 0.5):
            col = col*bigger_par
        scores_sort.append(col)

    x_init,y_init,x_choice,y_choice = new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

    return x_init,y_init,x_choice,y_choice

In [None]:
# Diversity + density + balance + committee-weighted QBC
def choice_diversity_density_balance_committee_weighting(x_init,y_init,x_choice,y_choice,bal_scale=0.35,bigger_par=0.6,density_scope=11):
    """Run one QBC round scored by accuracy-weighted entropy, diversity, density and balance.

    Parameters
    ----------
    x_init, y_init : labeled features / labels.
    x_choice, y_choice : unlabeled features / labels.
    bal_scale : minimum share of label ``2`` (among labels ``1`` and ``2`` of
        ``y_init``) for the labeled set to count as balanced.
    bigger_par : factor applied to the score of samples whose mean first-column
        committee probability is not below 0.5, when the set is not balanced.
    density_scope : number of nearest neighbours requested per sample; the
        first hit (expected to be the sample itself) is dropped, so
        ``density_scope-1`` neighbours enter the density.

    Returns
    -------
    The updated ``(x_init, y_init, x_choice, y_choice)`` from ``new_dataset``.

    Raises
    ------
    ValueError
        If ``density_scope`` is smaller than 2 (no neighbour would remain).
    """
    if density_scope < 2:
        raise ValueError("density_scope must be at least 2")
    # Neighbours that remain after dropping the first hit; replaces the former
    # hard-coded 10 so that the density follows density_scope
    n_neighbors = density_scope-1

    # Shuffle the labeled row positions and cut them into three parts
    choice_list = list(range(len(x_init)))
    random.shuffle(choice_list)

    num1 = int(len(x_init)/3)
    num2 = int(len(x_init)*2/3)

    # Train one classifier per part; its weight is its accuracy on the whole labeled set
    committee = []
    weights = []
    for part in (choice_list[:num1], choice_list[num1:num2], choice_list[num2:]):
        member = GradientBoostingClassifier()
        member.fit(x_init.iloc[part],y_init.iloc[part])
        committee.append(member)
        weights.append(member.score(x_init,y_init))
    score_weight1,score_weight2,score_weight3 = weights

    proba1,proba2,proba3 = [member.predict_proba(x_choice) for member in committee]

    scores1_sort = scores_func(proba1)
    scores2_sort = scores_func(proba2)
    scores3_sort = scores_func(proba3)

    # Key step: balance of the labeled set
    # (names suggest 1 = normal, 2 = suspicious -- confirm)
    label_counts = y_init.value_counts()
    distribution_normal = label_counts[1]
    distribution_suspicious = label_counts[2]
    distribution = distribution_suspicious/(distribution_normal+distribution_suspicious)
    rebalance = not (distribution >= bal_scale)

    # Union of labeled and unlabeled features; built locally so the function
    # does not depend on a notebook-level variable
    x_all = pd.concat([x_init,x_choice])

    neigh = NearestNeighbors()
    neigh.fit(x_all)
    # Query with the frame itself (same columns as the fitted data)
    distance_number = neigh.kneighbors(x_choice, n_neighbors=density_scope, return_distance=False)

    scores_sort = []
    for i in range(len(scores1_sort)):
        sample = [x_choice.iloc[i]]
        # Diversity: cosine distance to the nearest other sample (second hit)
        diversity = pairwise_distances(sample,x_all.iloc[distance_number[i][1:2]],metric="cosine").sum()
        # Density: mean cosine similarity to the neighbours after the first hit
        density = (n_neighbors-pairwise_distances(sample,x_all.iloc[distance_number[i][1:]],metric="cosine").sum())/n_neighbors
        col = max(scores1_sort[i]*score_weight1,scores2_sort[i]*score_weight2,scores3_sort[i]*score_weight3)*diversity*density
        # Imbalanced labeled set: reduce the score of samples the committee on
        # average assigns to the first class
        if rebalance and not ((proba1[i][0]+proba2[i][0]+proba3[i][0])/3 < 0.5):
            col = col*bigger_par
        scores_sort.append(col)

    x_init,y_init,x_choice,y_choice = new_dataset(x_init,y_init,x_choice,y_choice,scores_sort)

    return x_init,y_init,x_choice,y_choice