In [14]:
"""
执行代码，输入数据集csv文件路径，准确度与时间开销结果输出在末尾
"""
import numpy as np
import pandas as pd
import time
import os
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from mlxtend.feature_selection import ColumnSelector
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import itertools
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

In [2]:
# one-hot 编码
def getOneHot(y, category):
    #将y转换成one-hot编码
    num_class = category
    ohy = np.zeros((len(y), num_class))
    #ohy变为n1*p维向量 
    ohy[range(len(y)), y.ravel()] = 1
    return ohy

#过滤掉小于阈值的类别
def filter_by_class_size(X, y, threshold):
    X = pd.DataFrame(X)
    y = pd.DataFrame(y)
    df = pd.concat([X, y], axis=1)
    class_counts = df.iloc[:, -1].value_counts()
    minority_classes = class_counts[class_counts < threshold].index
    filtered_df = df[~df.iloc[:, -1].isin(minority_classes)]
    return filtered_df.iloc[:, :-1].values, filtered_df.iloc[:, -1].values

# 数据概率化--[0~1]
def normalize(data):
    return abs(data) / np.sum(abs(data),axis=1).reshape(-1,1)

In [3]:
#基分类器分类准确度
def get_base_predict(X, y, model_set):
    Accscore = []
    y_pred = []
    for model in model_set:
        y_pred.append(model.predict_proba(X))
        Accscore.append(accuracy_score(model.predict(X), y))
    return Accscore

In [4]:
#欧氏距离计算
def Euclidean(X_train_weight,y_train_weight,X_test,y_test,model_Set):
    m = 3   #分类器个数
    res_all_set = []
    train_weight_prediction_proba = []
    # 训练集上的预测概率
    train_weight_prediction_proba = [pd.DataFrame(model.predict_proba(X_train_weight)) for model in model_Set]
    y_train_weight_oh = getOneHot(y_train_weight, category)
    # 测试集上的预测概率
    test_prediction_proba = [pd.DataFrame(model.predict_proba(X_test)) for model in model_Set]
    y_test_oh = getOneHot(y_test, category)
       
    train_weight_prediction_proba_matrix = np.stack(train_weight_prediction_proba, axis=0) # shape: (M, N, P)
    y_train_weight_oh_matrix = np.stack(y_train_weight_oh, axis=0) # shape: (M, N, P)
    ## 数据准备阶段 完毕  
    ## ----------此处开始计算时间----------
    start_time = time.perf_counter()
    
    res_all_set = []
    for i in range(m):
        a_ij = np.sum(train_weight_prediction_proba_matrix * train_weight_prediction_proba_matrix[i], axis=(1,2)) # shape: (M,)
        b_i = np.sum(y_train_weight_oh_matrix * train_weight_prediction_proba_matrix[i], axis=(0,1)) # shape: (M,)
        res_all_set.append((a_ij, b_i))
        
    a = np.stack([res_all_set[i][0] for i in range(m)], axis=1) # shape: (M, M)
    b = np.stack([res_all_set[i][1] for i in range(m)], axis=0) # shape: (M, 1)
    w = LinearRegression(fit_intercept=False).fit(a,b).coef_
    # 权重归一
    ww = w / sum(w)
    
    addw_pred_test = np.sum(
        [ww[0]*test_prediction_proba[0], ww[1]*test_prediction_proba[1],
         ww[2]*test_prediction_proba[2]],axis=0)
    
    y_hat_test = np.argmax(addw_pred_test, axis=1)
    
    test_score = accuracy_score(y_test,y_hat_test)
    
    ## ----------此处结束计算时间----------
    end_time = time.perf_counter()
    timekeeping_E.append(end_time - start_time)
    
    return test_score

#平均权重计算
def averageWeight(X_train_weight,y_train_weight,X_test,y_test,model_Set):
    w = [1/3.0,1/3.0,1/3.0]
     # 训练集上的预测概率
    train_weight_prediction_proba = [pd.DataFrame(model.predict_proba(X_train_weight)) for model in model_Set]
    test_prediction_proba = [pd.DataFrame(model.predict_proba(X_test)) for model in model_Set]
    
    addw_pred_train = np.sum(
        [w[0]*train_weight_prediction_proba[0], w[1]*train_weight_prediction_proba[1],
         w[2]*train_weight_prediction_proba[2]],axis=0)
    addw_pred_test = np.sum(
        [w[0]*test_prediction_proba[0], w[1]*test_prediction_proba[1],
         w[2]*test_prediction_proba[2]],axis=0)
    
    y_hat_train = np.argmax(addw_pred_train, axis=1)  
    y_hat_test = np.argmax(addw_pred_test, axis=1)
    
    train_score = accuracy_score(y_train_weight,y_hat_train)
    test_score = accuracy_score(y_test,y_hat_test)
    
    return train_score,test_score


#简单权重表计算
def simple_computer_weight(X_train_weight,y_train_weight,X_test,y_test,model_Set): 
    train_weight_prediction_proba = []
    test_prediction_proba = []
    
    for model in model_Set:
        train_weight_prediction_proba.append(pd.DataFrame(model.predict_proba(X_train_weight)))
        test_prediction_proba.append(pd.DataFrame(model.predict_proba(X_test)))
    
    y_train_weight_oh = getOneHot(y_train_weight, category).reshape(-1,1)
    y_test_oh = getOneHot(y_test, category).reshape(-1,1)
    
    train_weight_stack = np.hstack([proba.values.reshape(-1, 1) for proba in train_weight_prediction_proba])
    test_stack = np.hstack([proba.values.reshape(-1, 1) for proba in test_prediction_proba]) 
    
    lr_final = LinearRegression(fit_intercept=False).fit(train_weight_stack, y_train_weight_oh)
    coef = lr_final.coef_[0]
    intercept = lr_final.intercept_
    # 权重归一
    ww = np.zeros(3)
    ww[0] = coef[0]/sum(coef)
    ww[1] = coef[1]/sum(coef)
    ww[2] = coef[2]/sum(coef)
    
    train_score,addw_pred_train = calcSW_accuracy(ww, intercept, train_weight_stack, y_train_weight)
    test_score,addw_pred_test = calcSW_accuracy(ww, intercept, test_stack, y_test)
    return train_score,test_score

def calcSW_accuracy(coef, intercept, prova, y):
    arr = prova.dot(coef) + intercept
    tmparr = np.split(arr, len(y))
    # list 转为numpy数组
    tmparr = np.array(tmparr)
    # 概率化处理
    tmparr = normalize(tmparr)
    max_index = np.argmax(tmparr, axis=1)
    return accuracy_score(max_index, y),tmparr

In [5]:
# oneVsOthers StackingC
def StackingC(X, y, X_test,y_test):   
    n_classifiers = 3
    n_features = X.shape[1]
    n_classes = int(n_features/n_classifiers)
    cols = np.array(range(n_features)).reshape(n_classifiers, n_classes).T
    XL = []
    XLT = []
    # for each column index in cols, create a ColumnSelector and fit_transform X
    XL = [ColumnSelector(cols=c).fit_transform(X) for c in cols]
    XLT = [ColumnSelector(cols=c).fit_transform(X_test) for c in cols]
    coef = np.zeros((n_classes, 3))  # 分类器个数==coef个数
    intercept = np.zeros(n_classes)   
    
    ## ----------此处开始计算时间----------
    start_time = time.perf_counter()
    
    for i in range(n_classes):
        # create a new label vector where the current class is 1 and the rest are 0
        y_bin = np.where(y == i, 1, 0)
        lr = LinearRegression()
        lr.fit(XL[i], y_bin)        
        # store the coefficients and intercepts of the model in the arrays
        coef[i] = lr.coef_
        intercept[i] = lr.intercept_
    n_samples, n_features = X_test.shape    
    # create an array to store the probabilities of each class for each sample
    prob = np.zeros((n_samples, n_classes))    
    for i in range(n_classes):
        z = XLT[i].dot(coef[i]) + intercept[i]
        prob[:, i] = z     
    ans = np.argmax(prob, axis=1)
    y_pred_test = accuracy_score(ans, y_test)    
    ## ----------此处结束计算时间----------
    end_time = time.perf_counter()
    
    timekeeping_SC.append(end_time - start_time)
    return y_pred_test

In [6]:
# oneVsOthers Stacking
def Stacking(X, y, X_test,y_test):
    n_classifiers = 3
    n_features = X.shape[1]
    n_classes = int(n_features/n_classifiers)
    coef = np.zeros((n_classes, n_features))
    intercept = np.zeros(n_classes)
    
    ## ----------此处开始计算时间----------
    start_time = time.perf_counter()
    
    for i in range(n_classes):
        y_bin = np.where(y == i, 1, 0)
        lr = LinearRegression()
        lr.fit(X, y_bin)
        coef[i] = lr.coef_
        intercept[i] = lr.intercept_
    n_samples, n_features = X_test.shape    
    prob = np.zeros((n_samples, n_classes))    
    for i in range(n_classes):
        z = X_test.dot(coef[i]) + intercept[i]
        prob[:, i] = z
        
    ans = np.argmax(prob, axis=1)
    y_pred_test = accuracy_score(ans, y_test)
    ## ----------此处结束计算时间----------
    end_time = time.perf_counter()
    
    timekeeping_S.append(end_time - start_time)
    return y_pred_test

In [7]:
def Stacking_predict(X_train,y_train,X_test,y_test,clf):
    b1_train = clf[0].predict_proba(X_train)
    b2_train = clf[1].predict_proba(X_train)
    b3_train = clf[2].predict_proba(X_train)
    b_train = np.concatenate((b1_train, b2_train, b3_train), axis=1)

    b1_test = clf[0].predict_proba(X_test)
    b2_test = clf[1].predict_proba(X_test)
    b3_test = clf[2].predict_proba(X_test)
    b_test =  np.concatenate((b1_test, b2_test, b3_test), axis=1)

    # 返回预测结果
    y_pred_test = Stacking(b_train,y_train,b_test,y_test)  
    
    return (y_pred_test)

def StackingC_predict(X_train,y_train,X_test,y_test,clf):
    b1_train = clf[0].predict_proba(X_train)
    b2_train = clf[1].predict_proba(X_train)
    b3_train = clf[2].predict_proba(X_train)
    b_train = np.concatenate((b1_train, b2_train, b3_train), axis=1)

    b1_test = clf[0].predict_proba(X_test)
    b2_test = clf[1].predict_proba(X_test)
    b3_test = clf[2].predict_proba(X_test)
    b_test =  np.concatenate((b1_test, b2_test, b3_test), axis=1)

    # 返回预测结果
    yC_pred_test = StackingC(b_train,y_train,b_test,y_test)
    
    return (yC_pred_test)

In [19]:
#数据录入
try:
    url = input("请输入一个以CSV为后缀的文件路径：")  #此处为输入数据集的路径url
    if not url.endswith(".csv"):
        raise ValueError("输入的文件名不是CSV格式")

    if not os.path.isfile(url):
        raise FileNotFoundError("找不到该文件")
             
    df = pd.read_csv(url,header=None)

    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values

    #过滤掉过少的类别
    X, y = filter_by_class_size(X, y, 2)
    #对y重新编号
    y = pd.factorize(y)[0]

    category = np.unique(y).size   #y的类别数
    sample_num = y.shape[0]  # 样本总数
    # pd.DataFrame(y).value_counts()

except (ValueError, FileNotFoundError) as e:
    print("出现错误：", e)

请输入一个以CSV为后缀的文件路径：soybean.csv


In [20]:
timekeeping_E = []
timekeeping_S = []
timekeeping_SC = []
# loop20
loop20_base_predict_test = []
loop20_Euclidean_predict = []
loop20_simple_weight_predict = []
loop20_averageWeight_predict = []
loop20_S_predict = []
loop20_SC_predict = []

In [21]:
for i in range(0,20):
    # 数据5折交叉划分  
    '''
        input: sample_num->样本总数
               X->特征矩阵
               y->标签向量
        output:train_for_model->训练基模型集
               train_for_weight->训练集成模型集
               test_set->测试集
            '''
    np.random.seed(i+1) # 设置随机种子
    index = np.arange(sample_num) # 生成索引数组
    np.random.shuffle(index) # 打乱索引数组
    X = X[index] # 按照打乱后的索引重新排列特征矩阵
    y = y[index] # 按照打乱后的索引重新排列标签向量

    # 将数据集平均分成5份
    X_subsets = np.array_split(X, 5) # 得到一个列表，包含5个子矩阵
    y_subsets = np.array_split(y, 5) # 得到一个列表，包含5个子向量

    train_for_model = []
    train_for_weight = []
    test_set = []
    # 循环5次，每次选择一个子集作为测试集，其余4个子集作为训练集
    for i in range(5):
        X_test = X_subsets[i] # 第i个子矩阵作为测试特征矩阵
        y_test = y_subsets[i] # 第i个子向量作为测试标签向量
        X_train = np.concatenate(X_subsets[:i] + X_subsets[i+1:]) # 其余4个子矩阵合并作为训练特征矩阵
        y_train = np.concatenate(y_subsets[:i] + y_subsets[i+1:]) # 其余4个子向量合并作为训练标签向量

        # 对每个训练集使用itertools.combinations函数生成所有可能的两两组合，并将它们存储在一个列表中
        combinations = itertools.combinations(range(4), 2) # 得到一个迭代器
        comb_list = list(combinations) # 将迭代器转换为列表

        X_subsets_train = np.array_split(X_train, 4) # 得到一个列表
        y_subsets_train = np.array_split(y_train, 4) # 得到一个列表
        # 从这个列表中选择6种不同的组合，并根据这些组合从训练特征矩阵和训练标签向量中提取相应的子集，
        # 并将它们合并作为X_train_for_model和X_train_for_weight
        for j in range(6):
            comb = comb_list[j] # 根据索引获取对应的组合
            comb_r = comb_list[5-j] # 根据索引获取对应的组合
            # 根据组合从训练特征矩阵和训练标签向量中提取相应的子集，并将它们合并作为trainformodel和trainforweight
            X_train_1 = X_subsets_train[comb[0]] # 第comb[0]部分作为第一个训练特征
            y_train_1 = y_subsets_train[comb[0]] # 第comb[0]部分作为第一个训练标签
            X_train_2 = X_subsets_train[comb[1]] # 第comb[1]部分作为第二个训练特征
            y_train_2 = y_subsets_train[comb[1]] # 第comb[1]部分作为第二个训练标签

            X_train_3 = X_subsets_train[comb_r[0]] # 第comb[0]部分作为第三个训练特征
            y_train_3 = y_subsets_train[comb_r[0]] # 第comb[0]部分作为第三个训练标签
            X_train_4 = X_subsets_train[comb_r[1]] # 第comb[1]部分作为第四个训练特征
            y_train_4 = y_subsets_train[comb_r[1]] # 第comb[1]部分作为第四个训练标签

            # 将它们合并
            X_train_model = np.concatenate([X_train_1 ,X_train_2]) # trainformodel特征矩阵
            y_train_model = np.concatenate([y_train_1 ,y_train_2]) # trainformodel标签向量

            X_train_weight = np.concatenate([X_train_3 ,X_train_4]) # trainforweight特征矩阵
            y_train_weight = np.concatenate([y_train_3 ,y_train_4]) # trainforweight标签向量

            train_for_model.append((X_train_model,y_train_model))
            train_for_weight.append((X_train_weight,y_train_weight))
            test_set.append((X_test,y_test))

    #--模型训练--
    '''
        input:train_for_model
        output:model_set->模型集
    '''
    model_Set = []
    clf1 = DecisionTreeClassifier(min_samples_leaf=5)
    clf2 = SVC(probability=True)
    clf3 = LogisticRegression()
    for turn in range(30):
        X_train_model = train_for_model[turn][0]
        y_train_model = train_for_model[turn][1]
        for clf in (clf1, clf2, clf3):
            clf.fit(X_train_model,y_train_model)
        model_Set.append([clf1,clf2,clf3])

    # --模型评估--
    base_predict_test = []
    Euclidean_predict = []
    simple_weight_predict = []
    averageWeight_predict = []
    S_predict = []
    SC_predict = []

    for turn in range(30):
        X_train_weight = train_for_weight[turn][0]
        y_train_weight = train_for_weight[turn][1]
        X_test = test_set[turn][0]
        y_test = test_set[turn][1]
        model_set = model_Set[turn]

        # 基分类器测试结果
        base_predict_test.append(get_base_predict(X_test,y_test,model_set))
        # Euvlidean训练效果,# Euvlidean测试效果
        Euclidean_predict.append(Euclidean(X_train_weight,y_train_weight,X_test,y_test,model_set))
        # 权重表训练效果,# 权重表测试效果
        simple_weight_predict.append(simple_computer_weight(X_train_weight,y_train_weight,X_test,y_test,model_set))
        # 平均权重训练效果,# 平均权重测试效果
        averageWeight_predict.append(averageWeight(X_train_weight,y_train_weight,X_test,y_test,model_set))
        # Stacking && StackingC 预测结果
        S_predict.append(Stacking_predict(X_train_weight,y_train_weight,X_test,y_test,model_set))
        SC_predict.append(StackingC_predict(X_train_weight,y_train_weight,X_test,y_test,model_set))

    loop20_base_predict_test.append(np.mean(np.array(base_predict_test),axis=0))
    loop20_Euclidean_predict.append(np.mean(np.array(Euclidean_predict),axis=0))
    loop20_simple_weight_predict.append(np.mean(np.array(simple_weight_predict),axis=0))
    loop20_averageWeight_predict.append(np.mean(np.array(averageWeight_predict),axis=0))
    loop20_S_predict.append(np.mean(np.array(S_predict),axis=0))
    loop20_SC_predict.append(np.mean(np.array(SC_predict),axis=0))

In [22]:
# 分类准确率比较

# 基分类器预测准确度度量
ansloop20_base_predict_test = np.mean(np.array(loop20_base_predict_test),axis=0)
# Euclidean加权预测准确度度量
ans_loop20_Euclidean_predict = np.mean(np.array(loop20_Euclidean_predict),axis=0)
# 权重表预测准确度度量
ans_loop20_simple_weight_predict = np.mean(np.array(loop20_simple_weight_predict),axis=0)[1]
# 平均权重预测准确度度量
ans_loop20_averageWeight_predict = np.mean(np.array(loop20_averageWeight_predict),axis=0)[1]
# Stacking预测准确度度量
ans_loop20_S_predict = np.mean(np.array(loop20_S_predict),axis=0)
# StackingC预测准确度度量
ans_loop20_SC_predict = np.mean(np.array(loop20_SC_predict),axis=0)
print("基分类器预测准确度度量:",ansloop20_base_predict_test,"\nEuclidean加权预测准确度度量:",ans_loop20_Euclidean_predict,
     "\n权重表预测准确度度量:",ans_loop20_simple_weight_predict,"\n平均权重预测准确度度量:",ans_loop20_averageWeight_predict,
     "\nStacking预测准确度度量:",ans_loop20_S_predict,"\nStackingC预测准确度度量",ans_loop20_SC_predict)

基分类器预测准确度度量: [0.8049839  0.87769    0.94035745] 
Euclidean加权预测准确度度量: 0.9399804994990697 
权重表预测准确度度量: 0.9399804994990697 
平均权重预测准确度度量: 0.9311329969944181 
Stacking预测准确度度量: 0.9419784957778733 
StackingC预测准确度度量 0.9435117897523974


In [23]:
#时间开销比较

# Euclidean加权时间开销
ans_timekeeping_E = np.mean(timekeeping_E)
# Stacking加权时间开销
ans_timekeeping_S = np.mean(timekeeping_S)
# StackingC加权时间开销
ans_timekeeping_SC = np.mean(timekeeping_SC)
print("Euclidean加权时间开销:",ans_timekeeping_E,"\nStacking加权时间开销:",ans_timekeeping_S,
      "\nStackingC加权时间开销:",ans_timekeeping_SC)

Euclidean加权时间开销: 0.0007453941666680445 
Stacking加权时间开销: 0.013613947833332153 
StackingC加权时间开销: 0.003884173666665447
