In [1]:
"""
执行代码，输入数据集csv文件路径，结果以名为correlation_comparison的Excel表方式保存在同目录
"""
import numpy as np
import pandas as pd
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from scipy.stats import spearmanr
import math
import random
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mean Euclidean (L2-norm) distance between predicted probabilities and one-hot labels
def Euclidean(y_pred_prob, y_test, category):
    """Return the average per-sample L2 distance to the one-hot ground truth.

    Parameters
    ----------
    y_pred_prob : array of predicted class probabilities, one row per sample.
    y_test : integer class labels, converted to one-hot via ``getOneHot``.
    category : number of classes (width of the one-hot matrix).

    Returns
    -------
    The sum of row-wise L2 norms of ``y_pred_prob - one_hot(y_test)`` divided
    by the number of samples.
    """
    target = getOneHot(y_test, category)
    # One distance per sample (norm taken along the class axis).
    row_distances = np.linalg.norm(y_pred_prob - target, axis=1)
    n_samples = target.shape[0]
    return sum(row_distances) / n_samples

# Drop every class whose sample count is below the threshold
def filter_by_class_size(X, y, threshold):
    """Remove the samples of classes that occur fewer than ``threshold`` times.

    Parameters
    ----------
    X : feature matrix (anything ``pd.DataFrame`` accepts), one row per sample.
    y : labels (anything ``pd.DataFrame`` accepts), one entry per sample.
    threshold : minimum number of samples a class needs in order to be kept.

    Returns
    -------
    (X_kept, y_kept) : two NumPy arrays holding the surviving rows' features
    and labels, in their original order.
    """
    # Features and labels side by side; the label is the last column.
    table = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
    labels = table.iloc[:, -1]

    counts = labels.value_counts()
    rare_labels = counts.index[(counts < threshold).to_numpy()]

    keep_rows = ~labels.isin(rare_labels)
    kept = table.loc[keep_rows]

    features = kept.iloc[:, :-1]
    targets = kept.iloc[:, -1]
    return features.values, targets.values

# One-hot encoding
def getOneHot(y, category):
    """Encode integer class labels as a one-hot matrix.

    Parameters
    ----------
    y : array-like of int
        Class indices in ``[0, category)``. Lists, NumPy arrays (1-D or a
        single column) and pandas Series are accepted; the values are
        flattened before encoding.
    category : int
        Number of classes, i.e. the number of columns of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(y), category)`` with a 1 at
        ``[i, y[i]]`` and 0 elsewhere.

    Raises
    ------
    IndexError
        If a label is outside ``[-category, category)`` or not an integer.
    """
    # np.asarray lets plain lists / Series through; the original called
    # y.ravel() directly and therefore only worked on NumPy arrays.
    labels = np.asarray(y).ravel()
    ohy = np.zeros((len(y), category))
    if labels.size == 0:
        # Nothing to mark; an empty float array cannot be used as an index.
        return ohy
    # Row i gets a 1 in the column given by its label.
    ohy[np.arange(len(y)), labels] = 1
    return ohy

In [3]:
# Data loading
# Reads a header-less CSV whose first column is the class label and whose
# remaining columns are the features.
# NOTE: on a validation error only a message is printed and execution goes on;
# X, y, category and sample_num are then not (re)defined, so the following
# cells cannot run until this cell has succeeded.
try:
    # Path of the data set; surrounding whitespace from copy/paste is dropped.
    url = input("请输入一个以CSV为后缀的文件路径：").strip()
    # Case-insensitive so that e.g. "DATA.CSV" is accepted as well.
    if not url.lower().endswith(".csv"):
        raise ValueError("输入的文件名不是CSV格式")

    if not os.path.isfile(url):
        raise FileNotFoundError("找不到该文件")

    df = pd.read_csv(url, header=None)

    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values

    # Drop classes that have fewer than 2 samples
    X, y = filter_by_class_size(X, y, 2)
    # Re-number the remaining labels as consecutive integers 0..n_classes-1
    y = pd.factorize(y)[0]

    category = np.unique(y).size   # number of classes
    sample_num = y.shape[0]        # total number of samples

except (ValueError, FileNotFoundError) as e:
    print("出现错误：", e)

请输入一个以CSV为后缀的文件路径：leaf.csv


In [4]:
# ---------------------------------------------------------------------------
# Experiment settings
# ---------------------------------------------------------------------------
loop_times = 20        # repetitions (different shuffles) per hyper-parameter grid
n_folds = 5            # number of StratifiedKFold splits
n_model_pairs = 20     # each pair = one decision tree + one SVM on the same feature subset
metric_names = ['accuracy', 'precision', 'recall', 'f1score', 'eucli']

# Decision-tree hyper-parameter grid used for the "large data set" run.
# NOTE(review): the original cell also sampled min_samples_split ([0.1, 0.2])
# here but never passed it to the tree, so it is left out; confirm that was
# intended.
dt_grid_large = {
    'max_depth': [13, 14, 15],
    'min_samples_leaf': [2, 3, 4, 5, 6, 7],
    'max_leaf_nodes': [13, 14, 15],
    'criterion': ['gini', 'entropy'],
}

# Decision-tree hyper-parameter grid used for the "small data set" run.
dt_grid_small = {
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_split': [0.3, 0.4, 0.5, 0.6, 0.7],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7],
    'max_leaf_nodes': [2, 3, 4, 5],
}


def _train_base_models(X_train, y_train, dt_params, n_pairs):
    """Fit ``n_pairs`` (decision tree, SVM) pairs on random feature subsets.

    Every pair is trained on a freshly drawn subset containing half of the
    feature columns (rounded up). Returns two parallel lists: the fitted
    models and the column indices each model was trained on.
    """
    model_set = []
    sub_feature_set = []
    n_features = X_train.shape[1]
    n_selected = math.ceil(n_features * 0.5)
    for _ in range(n_pairs):
        sub_feature = np.random.choice(n_features, size=n_selected, replace=False)
        X_train_sub = X_train[:, sub_feature]
        model_dt = DecisionTreeClassifier(**dt_params).fit(X_train_sub, y_train)
        model_svm = SVC(probability=True).fit(X_train_sub, y_train)
        # Tree first, SVM second; both share the same feature subset.
        model_set.extend([model_dt, model_svm])
        sub_feature_set.extend([sub_feature, sub_feature])
    return model_set, sub_feature_set


def _evaluate_models(model_set, sub_feature_set, test_set, n_classes):
    """Score every model on every test fold and average over the folds.

    Returns an array of shape (n_models, 5) whose columns are accuracy,
    macro precision, macro recall, macro F1 and the mean Euclidean distance
    between predicted probabilities and one-hot labels.
    """
    per_fold = []
    for X_test, y_test in test_set:
        fold_rows = []
        for model, sub_feature in zip(model_set, sub_feature_set):
            X_test_sub = X_test[:, sub_feature]
            y_pred = model.predict(X_test_sub)
            y_pred_prob = model.predict_proba(X_test_sub)
            fold_rows.append(np.array([
                accuracy_score(y_test, y_pred),
                precision_score(y_test, y_pred, average='macro'),
                recall_score(y_test, y_pred, average='macro'),
                f1_score(y_test, y_pred, average='macro'),
                Euclidean(y_pred_prob, y_test, n_classes),
            ]))
        per_fold.append(np.array(fold_rows))
    return np.mean(per_fold, axis=0)


def _correlation_matrices(eva_table):
    """Return the stacked Spearman, Kendall and Pearson matrices of the metric columns."""
    spearman, _p_values = spearmanr(eva_table)
    eva_table_df = pd.DataFrame(eva_table)
    kendall = eva_table_df.corr(method='kendall')
    pearson = eva_table_df.corr(method='pearson')
    return np.stack([np.asarray(spearman), kendall.to_numpy(), pearson.to_numpy()])


def _run_correlation_experiment(X_all, y_all, n_classes, dt_grid, n_repeats, n_splits, n_pairs):
    """Repeat shuffle -> split -> train -> evaluate -> correlate ``n_repeats`` times.

    Returns an array of shape (n_repeats, 3, n_metrics, n_metrics) holding,
    per repetition, the Spearman, Kendall and Pearson correlation matrices
    between the evaluation metrics.
    """
    n_metrics = len(metric_names)
    frames = np.zeros((n_repeats, 3, n_metrics, n_metrics))
    for k in range(n_repeats):
        # Seed BOTH generators: the hyper-parameters are drawn with the
        # stdlib `random` module, which np.random.seed does not affect.
        random.seed(k)
        np.random.seed(k)

        # Shuffle a copy of the data. The inputs are not overwritten, so the
        # cell gives the same result when it is run again.
        index = np.arange(y_all.shape[0])
        np.random.shuffle(index)
        X_k = X_all[index]
        y_k = y_all[index]

        # Stratified K-fold split
        kf = StratifiedKFold(n_splits=n_splits)
        train_set = []
        test_set = []
        for train_idx, test_idx in kf.split(X_k, y_k):
            train_set.append((X_k[train_idx], y_k[train_idx]))
            test_set.append((X_k[test_idx], y_k[test_idx]))

        # NOTE(review): as in the original, the base models are fitted on the
        # training part of fold 0 only, yet scored on the test part of every
        # fold. The test samples of folds 1..n belong to fold 0's training
        # data, so those scores are not out-of-sample. Confirm whether
        # per-fold retraining was intended.
        X_train, y_train = train_set[0]

        # One random value per hyper-parameter, shared by all trees of this repetition.
        dt_params = {name: random.choice(values) for name, values in dt_grid.items()}

        # Fixed seed so the feature subsets are the same in every repetition.
        np.random.seed(0)
        model_set, sub_feature_set = _train_base_models(X_train, y_train, dt_params, n_pairs)

        eva_table = _evaluate_models(model_set, sub_feature_set, test_set, n_classes)
        frames[k] = _correlation_matrices(eva_table)
    return frames


def _mean_correlation_table(frames):
    """Average over repetitions and stack the three 5x5 matrices into one 15x5 table."""
    mean_matrices = np.mean(frames, axis=0)
    return pd.DataFrame(mean_matrices.reshape(-1, mean_matrices.shape[-1]))


# Run with the grid meant for large data sets
frames = _run_correlation_experiment(X, y, category, dt_grid_large, loop_times, n_folds, n_model_pairs)
result = _mean_correlation_table(frames)

# Run with the grid meant for small data sets
frames = _run_correlation_experiment(X, y, category, dt_grid_small, loop_times, n_folds, n_model_pairs)
result2 = _mean_correlation_table(frames)

# Keep the run whose Spearman correlation between accuracy (row 0) and
# eucli (column 4) has the larger absolute value.
result = result if abs(result.iloc[0, 4]) > abs(result2.iloc[0, 4]) else result2

# Row and column labels
result.columns = ['accuracy', 'precision', 'recall','f1score','eucli']
result.index = ['Spearman: accuracy', 'precision', 'recall','f1score','eucli',
             'Kendall:accuracy', 'precision', 'recall','f1score','eucli',
             'Pearson:accuracy', 'precision', 'recall','f1score','eucli']

In [5]:
# Write the correlation table to correlation_comparison.xlsx (sheet "Sheet1").
# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() no longer exists in pandas >= 2.0.
with pd.ExcelWriter('correlation_comparison.xlsx') as writer:
    result.to_excel(writer, sheet_name='Sheet1')