### 1.训练集和测试集性能对比图（ROC，校准曲线，平行线图，DCA曲线）

In [None]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import lightgbm
import time
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import label_binarize
import shap
import scipy.stats as stats
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
import warnings
from sklearn.neighbors import NearestNeighbors
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Global matplotlib defaults: Times New Roman as the sans-serif face, and the
# unicode-minus option turned off to avoid minus-sign rendering problems.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
})

In [None]:
# Bootstrap helper used to compute confidence intervals for the evaluation metrics
def bootstrap_ci(data, n_bootstrap=1000, ci=95, random_state=None):
    """
    Percentile-bootstrap confidence interval for the mean of ``data``.

    :param data: one-dimensional data (list or np.array)
    :param n_bootstrap: number of bootstrap resamples, 1000 by default
    :param ci: confidence level in percent, 95 by default
    :param random_state: optional integer seed. When given, a private
        ``np.random.RandomState`` is used so the interval is reproducible.
        When None (default) the global NumPy random state is used.
    :return: (lower, upper), both rounded to 3 decimals
    :raises ValueError: if ``data`` is empty
    """
    samples = np.asarray(data, dtype=float)
    if samples.size == 0:
        # An empty metric list would otherwise yield a silent (nan, nan).
        raise ValueError("bootstrap_ci requires at least one observation")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Resample with replacement and record the mean of every resample.
    boot_means = [
        rng.choice(samples, size=samples.size, replace=True).mean()
        for _ in range(n_bootstrap)
    ]

    # Symmetric percentile interval around the requested confidence level.
    tail = (100 - ci) / 2
    lower = np.percentile(boot_means, tail)
    upper = np.percentile(boot_means, 100 - tail)
    return np.round(lower, 3), np.round(upper, 3)

In [None]:
# Confidence interval of the sample mean based on Student's t distribution
def calculate_ci_stats(data, confidence=0.95):
    """
    Two-sided t-interval for the mean of ``data``.

    :param data: one-dimensional sequence of numbers
    :param confidence: confidence level as a fraction, 0.95 by default
    :return: ``np.ndarray`` ``[lower, upper]`` rounded to 3 decimals;
        ``[nan, nan]`` when fewer than two observations are given
    """
    values = np.asarray(data, dtype=float)
    n = values.size
    if n < 2:
        # The standard error is undefined for fewer than two observations.
        return np.array([np.nan, np.nan])

    mean = np.mean(values)
    se = stats.sem(values)
    if se == 0:
        # All observations are identical: the interval collapses to the mean.
        # scipy would return NaN here because a zero scale is not accepted.
        return np.round(np.array([mean, mean]), 3)

    ci = stats.t.interval(confidence, df=n - 1, loc=mean, scale=se)
    return np.round(ci, 3)

In [None]:
# Stratified k-fold cross-validation on the training set
def Stratified_kfold_algorithm(model, x_train, y_train, ks):
    """
    Run stratified k-fold cross-validation with SMOTE applied to each training fold.

    For every fold the training part is oversampled with SMOTE, ``model.fit``
    is called on the oversampled data, and the untouched held-out part is
    scored. The same estimator object is fitted again on every fold.

    :param model: estimator exposing ``fit``, ``predict`` and ``predict_proba``
    :param x_train: feature table, indexed positionally through ``.iloc``
    :param y_train: labels, indexed positionally through ``.iloc``
    :param ks: number of folds
    :return: a 15-tuple, in this order: mean accuracy, mean macro recall,
        mean macro precision, mean macro F1, mean macro one-vs-rest AUC,
        mean specificity, mean Cohen's kappa, classifier name,
        per-fold metric rows (one list of 7 values per fold),
        bootstrap confidence intervals of the 7 metrics,
        t-interval confidence intervals of the 7 metrics,
        stacked binarized held-out labels, stacked held-out probabilities,
        list of per-fold FPR arrays, list of per-fold TPR arrays
    """
    kf = StratifiedKFold(n_splits=ks, shuffle=True, random_state=42)
    t_acc, t_re, t_pre, t_f1, t_auc, t_spec, t_kappa = [], [], [], [], [], [], []
    y_test_fold_all, y_scores_fold_all = [], []
    fpr_list, tpr_list = [], []  # micro-averaged ROC points of every fold

    # Loop invariants: label set used for one-hot encoding and the display name.
    class_labels = np.unique(y_train)
    classifier = str(model).split('(')[0]

    # SMOTE is fitted inside the loop on the training part of each fold only,
    # so the held-out part never contains synthetic samples.
    smote = SMOTE(random_state=42)
    for k, (train, test) in enumerate(kf.split(x_train, y_train)):
        x_train_fold_raw, y_train_fold_raw = x_train.iloc[train], y_train.iloc[train]
        x_test_fold, y_test_fold = x_train.iloc[test], y_train.iloc[test]
        x_train_fold, y_train_fold = smote.fit_resample(x_train_fold_raw, y_train_fold_raw)

        # One-hot encode the held-out labels
        y_test_fold_binarized = label_binarize(y_test_fold, classes=class_labels)

        start_time = time.time()
        clf = model.fit(x_train_fold, y_train_fold)
        y_pred_model = clf.predict(x_test_fold)
        y_scores_fold = clf.predict_proba(x_test_fold)

        # Micro-averaged ROC of this fold (all class columns flattened together)
        fpr_k, tpr_k, _ = roc_curve(y_test_fold_binarized.ravel(), y_scores_fold.ravel())
        fpr_list.append(fpr_k)
        tpr_list.append(tpr_k)

        # Keep the held-out labels and probabilities of every fold
        y_test_fold_all.append(y_test_fold_binarized)
        y_scores_fold_all.append(y_scores_fold)

        cm = confusion_matrix(y_test_fold, y_pred_model)
        print(f'混淆矩阵 (Fold {k + 1}):\n{cm}')

        # Per-class specificity TN / (TN + FP), computed for all classes at once:
        # FP = column sum minus diagonal, TN = total - row sum - column sum + diagonal.
        diagonal = np.diag(cm)
        false_pos = cm.sum(axis=0) - diagonal
        true_neg = cm.sum() - cm.sum(axis=1) - cm.sum(axis=0) + diagonal
        # The fold-level specificity is the unweighted mean over classes.
        t_spec.append(np.mean(true_neg / (true_neg + false_pos)))

        score = metrics.accuracy_score(y_test_fold, y_pred_model)
        print('The accuracy score of {0} is: {1}%'.format(classifier, round(score * 100, 2)))
        end_time = time.time()
        use_time = end_time - start_time
        print('The time of using {0} is :{1}'.format(classifier, round(use_time, 2)))

        t_kappa.append(metrics.cohen_kappa_score(y_test_fold, y_pred_model))
        t_acc.append(score)
        t_re.append(recall_score(y_test_fold, y_pred_model, average='macro'))
        t_pre.append(precision_score(y_test_fold, y_pred_model, average='macro'))
        t_f1.append(fbeta_score(y_test_fold, y_pred_model, average='macro', beta=1))
        # The multi-class AUC is computed from class probabilities, not from
        # predicted labels, hence y_scores_fold.
        t_auc.append(roc_auc_score(y_test_fold, y_scores_fold, average='macro', multi_class='ovr'))

    # One row per fold: [acc, recall, precision, f1, auc, specificity, kappa]
    ws_each = [list(x) for x in zip(t_acc, t_re, t_pre, t_f1, t_auc, t_spec, t_kappa)]

    # Stack the held-out data of all folds
    y_test_fold_all = np.concatenate(y_test_fold_all, axis=0)
    y_scores_fold_all = np.concatenate(y_scores_fold_all, axis=0)

    # Confidence intervals of every metric across folds, by both methods
    per_metric = [t_acc, t_re, t_pre, t_f1, t_auc, t_spec, t_kappa]
    strap_list = [bootstrap_ci(values) for values in per_metric]
    strap_list_stats = [calculate_ci_stats(values) for values in per_metric]

    return np.mean(t_acc), np.mean(t_re), np.mean(t_pre), np.mean(t_f1), np.mean(t_auc), np.mean(
        t_spec), np.mean(t_kappa), classifier, ws_each, strap_list, strap_list_stats, y_test_fold_all, y_scores_fold_all, fpr_list, tpr_list

In [None]:
# Bootstrap mean and 95% CI of the micro-averaged AUC
def compute_auc_with_ci_multiclass(y_true_binarized, y_probs, n_bootstraps=1000, random_seed=42):
    """
    Bootstrap the AUC of a multi-class problem and report its 95% interval.

    Parameters:
    - y_true_binarized: binarized true labels; flattened with ``ravel()``
    - y_probs: predicted probabilities; flattened with ``ravel()``
    - n_bootstraps: number of resamples
    - random_seed: seed of the private random generator

    Returns:
    - auc_mean: mean AUC over the accepted resamples
    - ci_lower: 2.5th percentile of the resampled AUCs
    - ci_upper: 97.5th percentile of the resampled AUCs

    NOTE(review): resampling is done over the flattened (sample, class) cells,
    not over whole samples -- confirm this is the intended bootstrap unit.
    """
    sampler = np.random.RandomState(random_seed)

    labels = y_true_binarized.ravel()
    scores = y_probs.ravel()
    total = len(labels)

    boot_aucs = []
    for _ in range(n_bootstraps):
        picked = sampler.choice(total, size=total, replace=True)
        drawn_labels = labels[picked]

        # A ROC curve needs both positives and negatives in the resample.
        if len(np.unique(drawn_labels)) >= 2:
            fpr, tpr, _ = roc_curve(drawn_labels, scores[picked])
            boot_aucs.append(auc(fpr, tpr))

    return (
        np.mean(boot_aucs),
        np.percentile(boot_aucs, 2.5),
        np.percentile(boot_aucs, 97.5),
    )

In [None]:
# Bootstrap mean and 95% CI of the Brier score
def compute_brier_score_with_ci(y_true_binarized, y_probs, n_bootstraps=1000, random_seed=42):
    """
    Bootstrap the Brier score (mean squared difference between probability
    and label) and report its 95% interval.

    Parameters:
    - y_true_binarized: binarized true labels; flattened with ``ravel()``
    - y_probs: predicted probabilities; flattened with ``ravel()``
    - n_bootstraps: number of resamples
    - random_seed: seed of the private random generator

    Returns:
    - brier_score_mean: mean of the resampled Brier scores
    - brier_score_ci_lower: 2.5th percentile of the resampled scores
    - brier_score_ci_upper: 97.5th percentile of the resampled scores
    """
    sampler = np.random.RandomState(random_seed)

    truth = y_true_binarized.ravel()
    probs = y_probs.ravel()
    total = len(truth)

    def _resampled_score():
        # Draw cell indices with replacement and score that resample.
        picked = sampler.choice(total, size=total, replace=True)
        return np.mean((probs[picked] - truth[picked]) ** 2)

    resampled = [_resampled_score() for _ in range(n_bootstraps)]

    return (
        np.mean(resampled),
        np.percentile(resampled, 2.5),
        np.percentile(resampled, 97.5),
    )

In [None]:
# Calibration curves on the training set
def CalibratedClassify_training_set(classifiers,x_train,y_train):
    """
    Draw one calibration curve per classifier, evaluated on the training data,
    and save the figure under ``Result/Calibration Curve/``.

    Each classifier is wrapped in an isotonic ``CalibratedClassifierCV`` with
    10 folds, fitted on ``(x_train, y_train)`` and then asked for probabilities
    on the same ``x_train``. The legend shows the bootstrap Brier score and
    its interval from ``compute_brier_score_with_ci``.
    """
    sns.set_palette("Set2")  # alternatives: "muted", "bright"
    fig2, ax2 = plt.subplots(figsize=(8, 7))

    for classifier in classifiers:
        # Fit the calibrated wrapper and predict on the training data itself
        calibrated_clf = CalibratedClassifierCV(classifier, method='isotonic', cv=10)
        calibrated_clf.fit(x_train, y_train)
        train_probs = calibrated_clf.predict_proba(x_train)

        onehot = label_binarize(y_train, classes=np.unique(y_train))

        # Brier score with its bootstrap interval, for the legend
        brier, brier_lo, brier_hi = compute_brier_score_with_ci(onehot, train_probs)

        # The curve is computed on all (sample, class) cells flattened together
        frac_pos, mean_pred = calibration_curve(
            onehot.ravel(), train_probs.ravel(), n_bins=10, strategy='uniform')

        legend_text = f'{classifier.__class__.__name__}(Brier: {brier:.3f}, CI: [{brier_lo:.3f}, {brier_hi:.3f}])'
        ax2.plot(mean_pred, frac_pos, marker='o', label=legend_text)

    # Diagonal reference line and axes cosmetics
    ax2.plot([0, 1], [0, 1], linestyle='--', color='black', label='Perfect Calibration')
    ax2.set(
        xlim=(-0.05, 1.05),
        ylim=(-0.05, 1.05),
        xlabel="Mean Predicted Probability",
        ylabel="Fraction of Positives",
    )
    ax2.legend(loc="upper left")
    ax2.grid(True)
    ax2.set_title("Training set Calibration Curve")

    # Save the figure
    fig2.tight_layout()
    fig2.savefig("Result/Calibration Curve/calibration_curve_training_set.png", dpi=300)

In [None]:
# Calibration curves on the test set
def CalibratedClassify_test_set(classifiers, x_train, y_train, x_test, y_test):
    """
    Draw one calibration curve per classifier, evaluated on the test data,
    save the figure under ``Result/Calibration Curve/`` and show it.

    Each classifier is wrapped in an isotonic ``CalibratedClassifierCV`` with
    10 folds and fitted on ``(x_train, y_train)``; probabilities are then
    predicted for ``x_test``. The legend shows the bootstrap Brier score and
    its interval from ``compute_brier_score_with_ci``.

    :param classifiers: iterable of estimators to calibrate
    :param x_train: training features
    :param y_train: training labels
    :param x_test: test features
    :param y_test: test labels
    """
    sns.set_palette("Set2")  # alternatives: "muted", "bright"

    fig, ax_cal = plt.subplots(figsize=(8, 7))
    for classifier in classifiers:
        # Fit the calibrated wrapper on the training data
        calibrated_clf = CalibratedClassifierCV(classifier, method='isotonic', cv=10)
        calibrated_clf.fit(x_train, y_train)
        calibrated_probabilities = calibrated_clf.predict_proba(x_test)

        # One-hot encode against the classes the model was fitted on, so the
        # label columns line up with the probability columns even when a class
        # is missing from the test set.
        y_test_binarized = label_binarize(y_test, classes=calibrated_clf.classes_)

        # Brier score with its bootstrap interval, for the legend
        brier_score_mean, brier_score_ci_lower, brier_score_ci_upper = compute_brier_score_with_ci(
            y_test_binarized, calibrated_probabilities)

        # The curve is computed on all (sample, class) cells flattened together
        calibrated_fraction_of_positives, calibrated_mean_predicted_value = calibration_curve(
            y_test_binarized.ravel(), calibrated_probabilities.ravel(), n_bins=10, strategy='uniform')

        ax_cal.plot(calibrated_mean_predicted_value, calibrated_fraction_of_positives, marker='o',
                    label=f'{classifier.__class__.__name__}(Brier: {brier_score_mean:.3f}, CI: [{brier_score_ci_lower:.3f}, {brier_score_ci_upper:.3f}])')

    # Diagonal reference line for perfect calibration
    ax_cal.plot([0, 1], [0, 1], linestyle='--', color='black', label='Perfect Calibration')

    # Legend and axes
    ax_cal.set(xlim=(-0.05, 1.05), ylim=(-0.05, 1.05), xlabel='Mean Predicted Probability', ylabel='Fraction of Positives')
    ax_cal.legend(loc="upper left")
    ax_cal.grid(True)
    ax_cal.set_title("Test set Calibration Curve")
    fig.tight_layout()
    fig.savefig("Result/Calibration Curve/calibration_curve_test_set.png", dpi=500)
    plt.show()

In [None]:
# Print, plot and save the test-set confusion matrix of the best model
def save_confusion_matrix(y_test, y_pred_test, className,classifier_name):
    """
    Compute the confusion matrix of ``y_test`` against ``y_pred_test``, print
    it, draw it as an annotated heatmap labelled with ``className``, save the
    image under ``Result/Confusion Matrix/`` and show it.
    """
    cm_test = confusion_matrix(y_test, y_pred_test)
    print(f'测试集混淆矩阵:\n{cm_test}')

    tick_size = 15   # cell annotations and tick labels
    label_size = 18  # axis labels and title

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm_test,
        annot=True,
        fmt='d',
        cmap='YlGnBu',
        xticklabels=className,
        yticklabels=className,
        annot_kws={"size": tick_size},
    )
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.xlabel('Predicted Labels', fontsize=label_size)
    plt.ylabel('True Labels', fontsize=label_size)
    plt.title(f'   {classifier_name} Confusion Matrix (Test Set)', fontsize=label_size)
    plt.savefig('Result/Confusion Matrix/Test_Confusion_Matrix.png')
    plt.show()

In [None]:
# Overall test-set metrics of the best model under three averaging schemes
def get_validation(test_y, predicted_y,save_path = "Result/test_result.csv"):
    """
    Compute accuracy, recall, precision, F1, F2 and specificity for the
    'micro', 'macro' and 'weighted' averaging schemes and write them to CSV.

    :param test_y: true labels
    :param predicted_y: predicted labels
    :param save_path: destination CSV file
    :return: None; one row per averaging scheme is written to ``save_path``
    """
    averages = ['micro', 'macro', 'weighted']
    results = {"Average":[],"Accuracy":[], "Recall":[], "Precision":[], "F1":[],"F2":[],"Specificity":[]}
    accuracy = round(accuracy_score(test_y, predicted_y),3)

    # Specificity has no sklearn scorer, so it is derived from the confusion
    # matrix: FP = column sum minus diagonal, TN = total - row sum - FP.
    cm = confusion_matrix(test_y, predicted_y)
    diagonal = np.diag(cm)
    fp = cm.sum(axis=0) - diagonal
    tn = cm.sum() - cm.sum(axis=1) - fp
    negatives = tn + fp
    per_class = np.divide(tn, negatives, out=np.zeros(len(diagonal), dtype=float), where=negatives > 0)

    total_negatives = negatives.sum()
    specificity_by_average = {
        # Micro: pool TN and FP over all classes before dividing.
        'micro': tn.sum() / total_negatives if total_negatives > 0 else 0,
        # Macro: unweighted mean of the per-class specificities.
        'macro': per_class.mean(),
        # Weighted: per-class specificities weighted by class support
        # (row sums of the confusion matrix).
        'weighted': np.average(per_class, weights=cm.sum(axis=1)),
    }

    for average in averages:
        results["Average"].append(average)
        results["Accuracy"].append(accuracy)
        results["Recall"].append(round(recall_score(test_y, predicted_y, average=average),3))
        results["Precision"].append(round(precision_score(test_y, predicted_y, average=average),3))
        results["F1"].append(round(fbeta_score(test_y, predicted_y, average=average,beta=1),3))
        results["F2"].append(round(fbeta_score(test_y, predicted_y, average=average,beta=2),3))
        results["Specificity"].append(round(specificity_by_average[average],3))

    # Convert to a DataFrame and save
    df_results = pd.DataFrame(results)
    df_results.to_csv(save_path, index=False)
    print(f"结果已保存为{save_path}")

In [None]:
# Per-class (one-vs-rest) test-set metrics of the best model
def get_validation_class(test_y, predicted_y,save_path = "Result/test_result_for_each_class.csv", class_name=None):
    """
    Compute one-vs-rest accuracy, recall, precision, F1, F2, specificity and
    Cohen's kappa for every class present in ``test_y`` and write them to CSV.

    :param test_y: true labels
    :param predicted_y: predicted labels
    :param save_path: destination CSV file
    :param class_name: display names indexed by label value; defaults to
        ``['WSS', 'LFBU', 'PFSI', 'WFAI', 'KED']``
    :return: None; one row per class is written to ``save_path``
    """
    if class_name is None:
        class_name = ['WSS','LFBU','PFSI','WFAI','KED']

    # Plain arrays keep the element-wise comparisons positional.
    true_labels = np.asarray(test_y)
    pred_labels = np.asarray(predicted_y)
    classes = np.unique(true_labels)

    results = {"Class":[],"Accuracy":[], "Recall":[], "Precision":[], "F1_score":[],"F2_score":[],"Specificity":[],"Kappa":[]}
    for i in classes:
        # Reduce to a binary task: class i is positive, everything else negative.
        cls_test_y = (true_labels == i).astype(int)
        cls_predicted_y = (pred_labels == i).astype(int)

        results["Class"].append(class_name[i])
        results["Accuracy"].append(round(accuracy_score(cls_test_y, cls_predicted_y),3))
        results["Recall"].append(round(recall_score(cls_test_y,cls_predicted_y,average='binary'),3))
        results["Precision"].append(round(precision_score(cls_test_y,cls_predicted_y,average='binary'),3))
        results["F1_score"].append(round(fbeta_score(cls_test_y,cls_predicted_y,beta=1),3))
        results["F2_score"].append(round(fbeta_score(cls_test_y,cls_predicted_y,beta=2),3))
        results["Kappa"].append(round(metrics.cohen_kappa_score(cls_test_y, cls_predicted_y),3))

        # Specificity from the binary vectors, so it does not depend on the
        # label value being a valid confusion-matrix position.
        is_negative = cls_test_y == 0
        TN = int(np.sum(is_negative & (cls_predicted_y == 0)))
        FP = int(np.sum(is_negative & (cls_predicted_y == 1)))
        specificity = round(TN / (TN + FP), 3) if (TN + FP) > 0 else 0
        results["Specificity"].append(specificity)

    print(results)

    # Convert to a DataFrame and save
    df_results = pd.DataFrame(results)
    df_results.to_csv(save_path, index=False)
    print(f"结果已保存为{save_path}")

In [None]:
# Test-set ROC curves of the best model (per class, micro and macro average)
def plot_roc_curve(y_test, y_score, new_classes,classifier_name):
    """
    Plot per-class, micro-average and macro-average ROC curves, save the
    figure under ``Result/ROC Curve/`` and show it.

    ``y_test`` is one-hot encoded against its own unique values; column ``i``
    of ``y_score`` is scored against column ``i`` of that encoding.
    ``new_classes`` supplies the legend name of each class and
    ``classifier_name`` is used in the title and the file name.
    """
    onehot = label_binarize(y_test, classes=np.unique(y_test))
    n_classes = onehot.shape[1]
    class_ids = range(n_classes)

    fpr, tpr, roc_auc = {}, {}, {}

    # One-vs-rest curve of every class
    for cls in class_ids:
        fpr[cls], tpr[cls], _ = roc_curve(onehot[:, cls], y_score[:, cls], drop_intermediate=False)
        roc_auc[cls] = auc(fpr[cls], tpr[cls])

    # Micro average: every (sample, class) cell treated as one binary decision
    fpr["micro"], tpr["micro"], _ = roc_curve(onehot.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro average: interpolate each class curve onto the union of all FPR
    # points, then average the TPR values across classes
    grid = np.unique(np.concatenate([fpr[cls] for cls in class_ids]))
    tpr_total = np.zeros_like(grid)
    for cls in class_ids:
        tpr_total += np.interp(grid, fpr[cls], tpr[cls])
    fpr["macro"] = grid
    tpr["macro"] = tpr_total / n_classes
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    plt.figure(figsize=(8,6))

    # Both averaged curves share one colour and differ by line style
    ave_color = '#FFB6C1'
    for key, style in (("micro", ':'), ("macro", '--')):
        plt.plot(fpr[key], tpr[key],
                 label=f'{key}-average (area = {roc_auc[key]:0.2f})',
                 color=ave_color, linestyle=style)

    # Class curves cycle through four line styles
    linetype = ['-','--',':','-.']
    for cls in class_ids:
        plt.plot(fpr[cls], tpr[cls], linestyle=linetype[cls % 4], linewidth=2,
                 label=f'class {new_classes[cls]} (area = {roc_auc[cls]:0.2f})')

    label_num = 18
    under_num = 15
    plt.xlim([-0.05, 1.05])
    plt.ylim([0.78, 1.02])  # only the upper part of the TPR axis is shown
    plt.xticks([0.0, 0.5, 1.0], fontsize=under_num)
    plt.yticks([0.8, 0.9, 1.0], fontsize=under_num)
    plt.xlabel('False Positive Rate', fontsize=label_num)
    plt.ylabel('True Positive Rate', fontsize=label_num)
    plt.legend(loc="lower right", fontsize=under_num)
    plt.title(f"     {classifier_name} ROC Curve (Test set)",fontsize=label_num)
    plt.grid(True)

    # Save the image, then display it
    plt.savefig(f'Result/ROC Curve/{classifier_name}_Test_set.png', dpi=500)
    plt.show()

In [None]:
# Model selection
def model_select(classifiers,x_train,y_train):
    """
    Cross-validate every candidate classifier and return the most accurate one.

    Each classifier is passed to ``Stratified_kfold_algorithm`` with 10 folds.
    For every classifier the function draws a mean ROC curve on a shared
    figure, builds a per-fold metric table (with mean, standard deviation and
    two kinds of confidence interval appended as extra rows) and writes it to
    ``Result/TenFold/``. After the loop it saves the ROC figure, three summary
    CSV files and the best model (``Result/Machine_best_model.pkl``).

    :param classifiers: iterable of estimators to compare
    :param x_train: training features
    :param y_train: training labels
    :return: ``(best_model, best_name)`` -- the classifier with the highest
        mean cross-validated accuracy and its name

    NOTE(review): ``best_model`` is the same object that was handed to
    ``Stratified_kfold_algorithm``; it is not fitted again in this function
    before being dumped.
    """
    ws = []
    best_score = 0
    best_model = None
    best_name = None

    strap_all = []
    strap_all_stats = []
    # Shared figure for the training-set ROC curve of every model
    fig1,ax1 = plt.subplots(figsize=(8,7))

    for classifier in classifiers:
        # score..score6 are, in order: mean accuracy, recall, precision, F1,
        # AUC, specificity and kappa (see the column names used below)
        score, score1, score2, score3, score4, score5, score6, clf_name, ws_each, strap_list, strap_list_stats,y_test_fold_all,y_scores_fold_all,fpr_list,tpr_list = Stratified_kfold_algorithm(
            classifier, x_train, y_train, 10)
        # Overall FPR / TPR over all folds (disabled)
        # fpr_micro, tpr_micro, _ = roc_curve(y_test_fold_all.ravel(), y_scores_fold_all.ravel())
        print(f"\n模型{classifier}每一折在训练集上的表现如下：")
        print("....................................................")
        print(f"AUC_value:{score4},AUC_ci:{strap_list[4]}")

        # Interpolate every fold's ROC curve onto a common FPR grid so the
        # curves can be averaged point by point
        mean_fpr = np.linspace(0, 1, 100)
        tprs_interp = []

        for fpr_k, tpr_k in zip(fpr_list, tpr_list):
            tpr_interp = np.interp(mean_fpr, fpr_k, tpr_k)
            tpr_interp[0] = 0.0  # force the curve to start at the origin
            tprs_interp.append(tpr_interp)

        tprs_interp = np.array(tprs_interp)
        mean_tpr = np.mean(tprs_interp, axis=0)
        std_tpr = np.std(tprs_interp, axis=0)
        # Band of mean +/- 1.96 standard deviations, clipped to [0, 1]; only
        # used by the disabled fill_between call below
        tpr_upper = np.clip(mean_tpr + 1.96 * std_tpr, 0, 1)
        tpr_lower = np.clip(mean_tpr - 1.96 * std_tpr, 0, 1)

        line, = ax1.plot(mean_fpr, mean_tpr,
                         lw=2,
                         label=f'{classifier.__class__.__name__} (AUC: {score4:.2f}, CI: [{strap_list[4][0]:.2f}, {strap_list[4][1]:.2f}])')
        # color = line.get_color()
        # ax1.fill_between(mean_fpr, tpr_lower, tpr_upper,
        #                  color=color, alpha=0.2)
        # Alternative: draw one overall curve instead of the fold average
        # line, = ax1.plot(fpr_micro,tpr_micro,label=f'{classifier.__class__.__name__} (AUC:{score4:.2f}, CI:[{strap_list[4][0]:.2f}, {strap_list[4][1]:.2f}])')

        # Turn the per-fold rows into a DataFrame
        df = pd.DataFrame(data=ws_each,
                          columns=['Acc_score', 'Recall', 'Precision_score', 'F1_score', 'AUC', 'Specificity', 'Kappa'])
        # Standard deviation of every metric column, taken before the summary
        # rows are appended
        df_std = df.std()
        # Prepend the fold number, then append the summary rows; len(df) is
        # the index of the next free row
        df.insert(0, 'fold', df.index + 1)
        df.loc[len(df)] = ['平均值', str(round(100 * score, 2)) + '%', score1, score2, score3, score4, score5, score6]
        df.loc[len(df)] = ['标准差'] + df_std.values.tolist()
        df.loc[len(df)] = ['置信区间（自助法）'] + strap_list
        df.loc[len(df)] = ['置信区间（正态法）'] + strap_list_stats
        # Show the table
        print(df)
        # Save it as CSV
        # NOTE(review): the file name embeds str(classifier), i.e. the full
        # estimator repr, not clf_name -- confirm this is intended
        df.to_csv(f'Result/TenFold/{classifier}每一折在训练集上的表现.csv', encoding='gbk', index=False)
        # ws collects the cross-validated mean metrics of every model; it is
        # written out once after the loop
        ws.append([clf_name, np.round(score,3), score1, score2, score3, score4, score5, score6])
        print(f"\n模型 {clf_name} 在训练集上的表现：十折平均准确度{score}")
        strap_all.append([clf_name] + [i for i in strap_list])
        strap_all_stats.append([clf_name] + [i for i in strap_list_stats])

        print(f'.......................................{clf_name}..............................................')
        # Keep the model with the highest mean accuracy
        if score > best_score:
            best_score = score
            best_model = classifier
            best_name = clf_name

    ax1.set(xlim=(-0.05,1.05),ylim=(-0.05,1.05),xlabel="False Positive Rate",ylabel="True Positive Rate")
    ax1.plot([0,1],[0,1],linestyle='--',color='red',label='Random Classifier')
    ax1.plot([0,0,1],[0,1,1],linestyle=':',color='green',label='Perfect Classifier')

    # Figure cosmetics
    ax1.legend(loc="lower right")
    ax1.grid(False)
    ax1.set_title("Training set")

    # Save the ROC figure
    fig1.tight_layout()
    fig1.savefig(f"Result/ROC Curve/roc_show_ci_training_set.png", dpi=500)

    # Summary tables over all models
    df1 = pd.DataFrame(data=strap_all,
                       columns=['Classifier_type', 'Acc_score', 'Recall', 'Precision_score', 'F1_score', 'AUC',
                                'Specificity', 'Kappa'])

    df3 = pd.DataFrame(data=strap_all_stats,
                       columns=['Classifier_type', 'Acc_score', 'Recall', 'Precision_score', 'F1_score', 'AUC',
                                'Specificity', 'Kappa'])

    print(f'\n模型在训练集上的平均表现如下：')
    df1.to_csv(f'Result/TenFold/所有模型在训练集上的平均表现（置信区间(自助法））.csv', encoding='gbk', index=False)
    df3.to_csv(f'Result/TenFold/所有模型在训练集上的平均表现（置信区间(正态法)）.csv', encoding='gbk', index=False)

    df2 = pd.DataFrame(data=ws,
                       columns=['Classifier_type', 'Acc_score', 'Recall', 'Precision_score', 'F1_score', 'AUC',
                                'Specificity', 'Kappa'])
    # Round to three decimals (accuracy was already rounded when appended)
    df2[['Recall', 'Precision_score', 'F1_score', 'AUC', 'Specificity', 'Kappa']] = df2[
        ['Recall', 'Precision_score', 'F1_score', 'AUC', 'Specificity', 'Kappa']].round(3)

    df2.to_csv(f'Result/TenFold/所有模型在训练集上的平均表现（平均值）.csv', encoding='gbk', index=False)

    # Persist the best model
    joblib.dump(best_model, 'Result/Machine_best_model.pkl')
    print(f"最佳模型是{best_name}\n最佳模型的十折平均准确度为{best_score}")
    return best_model,best_name

In [None]:
#测试集上各模型的ROC曲线图
# from sklearn.utils import resample
# from sklearn.metrics import roc_curve, auc

# def compute_bootstrap_roc(y_true, y_proba, n_bootstraps=1000):
#     boot_fprs = []
#     boot_tprs = []
#     mean_fpr = np.linspace(0, 1, 100)

#     for _ in range(n_bootstraps):
#         # Bootstrap 抽样
#         indices = resample(np.arange(len(y_true)),
#                            replace=True,
#                            n_samples=len(y_true),
#                            random_state=42)
#         y_true_boot = y_true[indices]
#         y_proba_boot = y_proba[indices]

#         # 计算 ROC
#         fpr, tpr, _ = roc_curve(y_true_boot.ravel(), y_proba_boot.ravel())
#         tpr_interp = np.interp(mean_fpr, fpr, tpr)
#         tpr_interp[0] = 0.0  # 强制从原点开始
#         boot_fprs.append(fpr)
#         boot_tprs.append(tpr_interp)

#     boot_tprs = np.array(boot_tprs)
#     mean_tpr = np.mean(boot_tprs, axis=0)
#     std_tpr = np.std(boot_tprs, axis=0)
#     ci_upper = np.clip(mean_tpr + 1.96 * std_tpr, 0, 1)
#     ci_lower = np.clip(mean_tpr - 1.96 * std_tpr, 0, 1)

#     return mean_fpr, mean_tpr, ci_lower, ci_upper, boot_tprs

def test_evaluate_df(classifiers,x_test,y_test):
    """
    Evaluate already-fitted classifiers on the test set.

    For every classifier the function draws its micro-averaged ROC curve on a
    shared figure (the legend shows the bootstrap AUC and interval from
    ``compute_auc_with_ci_multiclass``) and collects accuracy, macro recall,
    macro precision, macro F1, macro one-vs-rest AUC, mean specificity and
    Cohen's kappa. The metric table is written to
    ``Result/model_outer_test_evalu.csv`` and the figure to
    ``Result/ROC Curve/roc_show_ci_outer_dataset.png``.

    :param classifiers: iterable of fitted estimators exposing ``predict``
        and ``predict_proba``
    :param x_test: test features
    :param y_test: test labels
    :return: None
    """
    test_result = []
    fig2, ax2 = plt.subplots(figsize=(8, 7))
    for classifier in classifiers:
        # Predict once; the probabilities are reused for every metric below.
        y_pred_test = classifier.predict(x_test)
        y_pred_probs_test = classifier.predict_proba(x_test)

        # One-hot encode against the classes the model was fitted on, so the
        # label columns line up with the probability columns. Fall back to
        # the labels seen in y_test when the estimator has no ``classes_``.
        class_labels = getattr(classifier, 'classes_', None)
        if class_labels is None:
            class_labels = np.unique(y_test)
        y_test_binarized_raw = label_binarize(y_test, classes=class_labels)

        # Micro-average ROC curve
        fpr_micro_test, tpr_micro_test, _ = roc_curve(y_test_binarized_raw.ravel(), y_pred_probs_test.ravel())

        # Bootstrap AUC and its 95% interval, shown in the legend
        auc_mean, ci_lower, ci_upper = compute_auc_with_ci_multiclass(y_test_binarized_raw, y_pred_probs_test)
        ax2.plot(fpr_micro_test, tpr_micro_test,
                 label=f'{classifier.__class__.__name__} (AUC:{auc_mean:.2f}, CI:[{ci_lower:.2f},{ci_upper:.2f}])')

        # Scalar metrics
        acc = accuracy_score(y_test, y_pred_test)
        rec = recall_score(y_test, y_pred_test, average='macro')
        pre = precision_score(y_test, y_pred_test, average='macro')
        f1 = fbeta_score(y_test, y_pred_test, average='macro', beta=1)
        kap = metrics.cohen_kappa_score(y_test, y_pred_test)
        auc_value = roc_auc_score(y_test, y_pred_probs_test, average='macro', multi_class='ovr')
        cm = confusion_matrix(y_test, y_pred_test)
        print(f'测试集上的混淆矩阵形状：{cm.shape}')

        # Per-class specificity TN / (TN + FP), computed for all classes at once:
        # FP = column sum minus diagonal, TN = total - row sum - column sum + diagonal.
        diagonal = np.diag(cm)
        false_pos = cm.sum(axis=0) - diagonal
        true_neg = cm.sum() - cm.sum(axis=1) - cm.sum(axis=0) + diagonal
        spe = np.mean(true_neg / (true_neg + false_pos))

        test_result.append({'Model': type(classifier).__name__,
            'Accuracy': acc,
            'Recall': rec,
            'Precision': pre,
            'F1 Score': f1,
            'AUC': auc_value,
            'Specificity': spe,
            'Kappa': kap
            })

    test_result_df = pd.DataFrame(test_result)
    test_result_df.to_csv('Result/model_outer_test_evalu.csv', index=False, float_format='%.3f')

    ax2.set(xlim=(-0.05,1.05),ylim=(-0.05,1.05),xlabel="False Positive Rate",ylabel="True Positive Rate")
    ax2.plot([0,1],[0,1],linestyle='--',color='red',label='Random Classifier')
    ax2.plot([0,0,1],[0,1,1],linestyle=':',color='green',label='Perfect Classifier')
    ax2.legend(loc="lower right")

    ax2.grid(False)
    ax2.set_title("Test set (outer)")
    # Save the ROC figure
    fig2.tight_layout()
    fig2.savefig("Result/ROC Curve/roc_show_ci_outer_dataset.png", dpi=500)

In [None]:
# Helpers for drawing decision-curve-analysis (DCA) plots
def calculate_net_benefit_model_multiclass(thresh_group, y_pred_score, y_label, num_classes):
    """
    Class-averaged net benefit of a model at every probability threshold.

    Each class ``i`` in ``range(num_classes)`` is treated one-vs-rest: a
    sample is predicted positive when column ``i`` of ``y_pred_score``
    exceeds the threshold. The net benefit of a class is
    ``TP / n - FP / n * (thresh / (1 - thresh))`` and the value reported for
    a threshold is the mean over classes.

    :param thresh_group: iterable of thresholds (each must be below 1)
    :param y_pred_score: predicted probabilities, one column per class
    :param y_label: true labels, compared against ``0 .. num_classes - 1``
    :param num_classes: number of classes
    :return: ``np.ndarray`` with one net-benefit value per threshold
    """
    scores = np.asarray(y_pred_score)[:, :num_classes]
    labels = np.asarray(y_label)
    n = len(labels)  # total number of samples

    # Threshold-independent one-vs-rest truth table, one column per class.
    is_class = np.stack([labels == i for i in range(num_classes)], axis=1)
    is_other = ~is_class

    net_benefit_model = []
    for thresh in thresh_group:
        predicted_positive = scores > thresh

        # True / false positives of every class at this threshold
        tp = np.sum(predicted_positive & is_class, axis=0)
        fp = np.sum(predicted_positive & is_other, axis=0)

        net_benefit_class = (tp / n) - (fp / n) * (thresh / (1 - thresh))
        net_benefit_model.append(net_benefit_class.mean())

    return np.array(net_benefit_model)


def calculate_net_benefit_all_multiclass(thresh_group, y_label, num_classes):
    """
    Class-averaged net benefit of the "treat all" strategy at every threshold.

    For class ``i`` every sample is predicted positive, so the true positives
    are the samples labelled ``i`` and the false positives are all others.
    The per-class value is ``TP / total - FP / total * (thresh / (1 - thresh))``
    and the value reported for a threshold is the mean over classes.
    Returns an ``np.ndarray`` with one entry per threshold.
    """
    curve = []

    for cutoff in thresh_group:
        per_class = np.zeros(num_classes)  # net benefit of each class

        for cls in range(num_classes):
            positives = np.sum(y_label == cls)  # samples that really are cls
            negatives = np.sum(y_label != cls)  # every other sample
            population = positives + negatives
            benefit = positives / population
            harm = (negatives / population) * (cutoff / (1 - cutoff))
            per_class[cls] = benefit - harm

        curve.append(per_class.mean())

    return np.array(curve)


def plot_DCA_multiclass(ax, thresh_group, net_benefit_model_dict, net_benefit_all, title, ylim=(-0.05, 0.25)):
    """Draw decision-curve-analysis curves for several models on ``ax``.

    :param ax: matplotlib Axes to draw on.
    :param thresh_group: threshold probabilities used as x values.
    :param net_benefit_model_dict: mapping of model name -> net-benefit values
        (one value per threshold); one curve is drawn per entry.
    :param net_benefit_all: net-benefit values of the "treat all" baseline.
    :param title: text appended to ``'DCA Curve on '`` in the axes title.
    :param ylim: y-axis limits. The default matches the fixed range the
        function has always ended up with.
    :return: the Axes that was passed in.
    """
    # Cycle through the four line styles so neighbouring curves stay distinguishable.
    line_styles = ['-', '--', ':', '-.']
    for idx, (model_name, net_benefit_model) in enumerate(net_benefit_model_dict.items()):
        ax.plot(thresh_group, net_benefit_model, label=f'{model_name}',
                linestyle=line_styles[idx % len(line_styles)])

    # Reference strategies: "treat all" and "treat none".
    ax.plot(thresh_group, net_benefit_all, color='black', label='Treat all')
    ax.plot((0, 1), (0, 0), color='black', linestyle=':', label='Treat none')

    # Axes cosmetics. The y range is fixed rather than data-driven: the earlier
    # data-driven limits were always overwritten by this fixed range.
    ax.set_xlim(0, 1)
    ax.set_title(f'DCA Curve on {title}')
    ax.set(ylim=ylim)
    ax.set_xlabel('Threshold Probability', fontdict={'family': 'Times New Roman'})
    ax.set_ylabel('Net Benefit', fontdict={'family': 'Times New Roman'})
    ax.grid(True)
    ax.spines['right'].set_color((0.8, 0.8, 0.8))
    ax.spines['top'].set_color((0.8, 0.8, 0.8))
    ax.legend(loc='lower left')
    return ax

In [None]:
#测试集DCA
def DCA_select_test_set(classifiers,x_train,x_test,y_train,y_test):
    """Fit every classifier on the training data and plot its DCA curve on the test set.

    Each classifier is fitted in place on ``(x_train, y_train)``; its predicted
    probabilities on ``x_test`` are turned into a net-benefit curve keyed by the
    classifier's class name. The figure is saved to
    ``Result/DCA Curve/DCA_test_set.png`` and shown.
    """
    n_classes = 5
    thresholds = np.arange(0, 1, 0.001)

    def _fit_and_score(model):
        # Fit on the training split, evaluate the net benefit on the test split.
        model.fit(x_train, y_train)
        return calculate_net_benefit_model_multiclass(
            thresholds, model.predict_proba(x_test), y_test, num_classes=n_classes)

    model_curves = {type(model).__name__: _fit_and_score(model) for model in classifiers}

    # "Treat all" baseline on the same labels.
    treat_all_curve = calculate_net_benefit_all_multiclass(thresholds, y_test, num_classes=n_classes)

    fig, ax = plt.subplots(figsize=(8, 7))
    ax = plot_DCA_multiclass(ax, thresholds, model_curves, treat_all_curve, title='Test set')
    fig.patch.set_edgecolor('black')
    for side in ('top', 'right'):
        ax.spines[side].set_color('black')

    fig.savefig('Result/DCA Curve/DCA_test_set.png', dpi=300)
    plt.show()

In [None]:
#训练集DCA
def DCA_select_training_set(classifiers,x_train,y_train):
    """Fit every classifier on the training data and plot its DCA curve on that same data.

    Each classifier is fitted in place on ``(x_train, y_train)``; its predicted
    probabilities on ``x_train`` are turned into a net-benefit curve keyed by
    the classifier's class name. The figure is saved to
    ``Result/DCA Curve/DCA_training_set.png`` and shown.
    """
    n_classes = 5
    thresholds = np.arange(0, 1, 0.001)

    def _fit_and_score(model):
        # Fit and evaluate on the training split itself.
        model.fit(x_train, y_train)
        return calculate_net_benefit_model_multiclass(
            thresholds, model.predict_proba(x_train), y_train, num_classes=n_classes)

    model_curves = {type(model).__name__: _fit_and_score(model) for model in classifiers}

    # "Treat all" baseline on the same labels.
    treat_all_curve = calculate_net_benefit_all_multiclass(thresholds, y_train, num_classes=n_classes)

    fig, ax = plt.subplots(figsize=(8, 7))
    ax = plot_DCA_multiclass(ax, thresholds, model_curves, treat_all_curve, title='Training set')
    fig.patch.set_edgecolor('black')
    for side in ('top', 'right'):
        ax.spines[side].set_color('black')

    fig.savefig('Result/DCA Curve/DCA_training_set.png', dpi=300)
    plt.show()

In [None]:
def resample_dataset(x_train,y_train):
    """Oversample the training data with SMOTE and persist the result.

    Prints the class distribution before and after resampling, writes the
    resampled features plus a ``Classification`` column to
    ``data/data_resampled.csv`` and returns both parts as DataFrames.

    :param x_train: feature DataFrame (its column names are reused).
    :param y_train: class labels.
    :return: ``(x_resampled_df, y_resampled_df)``, both pandas DataFrames.
    """
    print(f"原始数据集类别分布: {Counter(y_train)}")

    # SMOTE with a fixed seed so the synthetic samples are reproducible.
    oversampler = SMOTE(random_state=42)
    x_balanced, y_balanced = oversampler.fit_resample(x_train, y_train)
    print(f"过采样后数据集类别分布: {Counter(y_balanced)}")

    # Wrap the resampled data in DataFrames with explicit column names.
    x_resampled_df = pd.DataFrame(x_balanced, columns=x_train.columns)
    y_resampled_df = pd.DataFrame(y_balanced, columns=['Classification'])

    # Features and label side by side (column-wise concatenation).
    pd.concat([x_resampled_df, y_resampled_df], axis=1).to_csv('data/data_resampled.csv', index=False)

    return x_resampled_df, y_resampled_df

In [None]:
def data_process_fill(data,data_type):
    """Convert a raw data frame to numeric features and labels.

    Steps: coerce every column to numeric (non-numeric cells become NaN and are
    then filled with 0), cast the questionnaire/duration columns to float,
    min-max scale the questionnaire and age columns to [0, 1], and split the
    frame into features (all columns but the last) and label (last column).

    NOTE(review): the scaler is fitted on whichever frame is passed in, so a
    test set is scaled with its own min/max and not with the training-set
    statistics. Confirm this is intended.
    NOTE(review): 'Course of disease months' is cast to float but not scaled,
    while 'Age' is scaled but not explicitly cast. Confirm this is intended.

    :param data: raw DataFrame; the last column is taken as the label.
    :param data_type: when equal to ``'train_set'`` the processed frame is also
        written to ``data/after_normalize_dataset.csv``.
    :return: ``(x, y)``: feature DataFrame and label Series.
    """
    float_columns = ['Tinnitus Evaluation Questionnaire', 'Tinnitus Handicap Inventory',
                     'Visual Analog Scale for Tinnitus Distress', 'Course of disease months']
    scaled_columns = ['Tinnitus Evaluation Questionnaire', 'Tinnitus Handicap Inventory',
                      'Visual Analog Scale for Tinnitus Distress', 'Age']

    # Global numeric conversion; this builds a new frame, so the caller's
    # DataFrame is not modified.
    data = data.apply(pd.to_numeric, errors='coerce').fillna(0)

    for col in float_columns:
        data[col] = data[col].astype(float)

    # Min-max normalisation of the selected columns.
    scaler = MinMaxScaler()
    data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

    # Features are every column except the last; the last column is the label.
    x = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    if data_type == 'train_set':
        data.to_csv('data/after_normalize_dataset.csv', index=False)

    return x, y

In [None]:
# Load the training set and convert it to numeric, scaled features/labels.
# Passing 'train_set' also makes data_process_fill write the processed frame to disk.
data = pd.read_csv('data/train_set.csv')
x_train,y_train = data_process_fill(data,'train_set')

In [None]:
# Candidate machine-learning models, all with default hyper-parameters.
# SVC needs probability=True so that predict_proba is available.
classifiers = [
    SVC(probability=True),
    MLPClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    MultinomialNB(),
    LogisticRegression(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    lightgbm.LGBMClassifier()
]

# Load the training data (repeats the loading step of the previous cell).
data = pd.read_csv('data/train_set.csv')
x_train,y_train = data_process_fill(data,'train_set')

# Optional SMOTE oversampling of the training data (currently disabled).
# x_resampled,y_resampled= resample_dataset(x_train,y_train)

In [None]:
# Model selection on the (non-resampled) training data.
# model_select is defined elsewhere in this file; it returns the chosen model and its name.
# best_model,best_name = model_select(classifiers,x_resampled,y_resampled)
best_model,best_name = model_select(classifiers,x_train,y_train)

In [None]:
# Load the external test set and run it through the same preprocessing function.
# With 'test_set' the processed frame is not written to disk.
data2 = pd.read_csv('data/test_set.csv')
x_test,y_test= data_process_fill(data2,'test_set')


In [None]:

# ROC curves of every model on the test set (test_evaluate_df is defined elsewhere in this file).
test_evaluate_df(classifiers,x_test,y_test)

In [None]:

# Calibration curves on the training set
CalibratedClassify_training_set(classifiers, x_train, y_train)

# DCA curves on the training set (refits every classifier on x_train/y_train)
DCA_select_training_set(classifiers,x_train,y_train)

# Calibration curves on the test set
CalibratedClassify_test_set(classifiers,x_train,y_train,x_test, y_test)

# DCA curves on the test set (refits every classifier on x_train/y_train)
DCA_select_test_set(classifiers,x_train,x_test,y_train,y_test)

# Evaluate the selected best model on the test set: hard predictions and class probabilities
y_pred_test = best_model.predict(x_test)
y_scores_test = best_model.predict_proba(x_test)
classifier_name = best_name
print(y_test.shape,y_pred_test.shape)
# print(".................................")
# print(y_test)
# print("....................................")
# print(y_pred_test)
# Performance of the best model on the test set (overall and per class)
get_validation(y_test, y_pred_test)
get_validation_class(y_test,y_pred_test)

# Confusion matrix on the test set; className lists the display names of the five classes
className = ['WSS','LFBU','PFSI','WFAI','KED']
save_confusion_matrix(y_test,y_pred_test,className,classifier_name)

# ROC curve of the best model on the test set
plot_roc_curve(y_test, y_scores_test,className,classifier_name)

In [None]:
#训练集
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Line plot comparing the evaluation metrics of every model on the training set.
plt.rcParams['font.family'] = "Times New Roman"
df = pd.read_csv('Result/TenFold/所有模型在训练集上的平均表现（平均值）.csv')

# plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap works on old and new versions.
cmap = plt.get_cmap('rainbow')  # rainbow colour scale
plt.figure(figsize=(9,7))

# One line per row (model): the first column holds the model name, the rest are metrics.
# The colour position is the row index divided by 9.
for i,model in df.iterrows():
    plt.plot(df.columns[1:],model.iloc[1:],marker='o',label=model['Classifier_type'],color=cmap(i/9))

plt.legend(loc='lower right')

plt.xlabel('Evaluation Metrics')
plt.ylabel('Score')
plt.title('Model Evaluation Metrics Comparison(Training set)')
plt.tight_layout()
plt.grid(True)
plt.savefig(f'Result/modelEvaluation_train_set.png',dpi=300)
plt.show()

In [None]:
#测试集
import matplotlib.pyplot as plt
import pandas as pd

# Line plot comparing the evaluation metrics of every model on the external test set.
plt.rcParams['font.family'] = "Times New Roman"
df = pd.read_csv(f"Result/model_outer_test_evalu.csv")

# plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap works on old and new versions.
cmap = plt.get_cmap('rainbow')  # rainbow colour scale
plt.figure(figsize=(9,7))

# One line per row (model): the first column holds the model name, the rest are metrics.
# The colour position is the row index divided by 9.
for i,model in df.iterrows():
    plt.plot(df.columns[1:],model.iloc[1:],marker='o',label=model['Model'],color=cmap(i/9))

plt.legend(loc='lower right')

plt.xlabel('Evaluation Metrics')
plt.ylabel('Score')
plt.title('Model Evaluation Metrics Comparison(Test set)')
plt.tight_layout()
plt.grid(True)
plt.savefig(f'Result/modelEvaluation_Test_set.png',dpi=500)
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Line plot comparing the evaluation metrics of each class on the test set.
plt.rcParams['font.family'] = 'Times New Roman'
# plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap works on old and new versions.
cmap = plt.get_cmap('rainbow')
plt.figure(figsize=(9, 7))

file_path = 'Result/test_result_for_each_class.csv'
df = pd.read_csv(file_path)

# One line per row (class): the first column holds the class name, the rest are metrics.
for i,model in df.iterrows():
    plt.plot(df.columns[1:],model.iloc[1:],marker='o',label=model['Class'],color=cmap(i/len(df)))

plt.legend(loc='lower right')
plt.xlabel('Evaluation Metrics')
plt.ylabel('Score')
plt.title('Comparison of different classes on test dataset')
plt.grid(True)
plt.tight_layout()
plt.savefig('Result/Classes comparison on test dataset.png',dpi=300)
plt.show()


### 2.SHAP条形图（前20重要性特征）

In [None]:
# SHAP values are computed on the test set after training, so that the explanation
# reflects feature contributions on data the model has not seen.
# TreeExplainer is the explainer for tree models (commonly used with LGBMClassifier).
explainer = shap.TreeExplainer(best_model)
shap_values = explainer(x_test)
# Export one CSV per slice of the last axis of the SHAP array (one slice per class).
for i in range(shap_values.values.shape[2]):  # iterate over the classes
    pd.DataFrame(shap_values.values[:, :, i], 
                columns=x_test.columns).to_csv(
        f'Result/SHAP/shap_values_class_{i}.csv',
        index=False
    )
print(shap_values.values.shape)

In [None]:
# Load the SHAP JavaScript library, then render the force plot
# of all samples for the class at index 0.
shap.initjs()
shap.plots.force(shap_values[:,:,0])

In [None]:
# Save one HTML force plot (all samples) per class.
for c in range(shap_values.values.shape[2]):
    print(f"All sample - Class {c}")
    shap.save_html(f"Result/SHAP/force_plot_class_{c}.html", shap.plots.force(shap_values[:, :, c]))

In [None]:
# Dependence plot of every feature for every class.
# dependence_plot is given the raw SHAP matrix (samples x features) of one class,
# the same way the interaction plot later in this notebook calls it.
for i in range(shap_values.values.shape[2]):
    class_shap_matrix = shap_values.values[:, :, i]
    for name in x_test.columns:
        shap.dependence_plot(name, class_shap_matrix, x_test)

In [None]:
shap_values_matrix = shap_values.values
final_shap_values = np.abs(shap_values_matrix).sum(axis=0)
print(final_shap_values.shape)# sum of |SHAP| over the samples, for every feature and class

# Mean absolute SHAP value of every feature within every class
feature_importance_per_class = np.mean(np.abs(shap_values_matrix), axis=0)
# Averaging over axis 0 (the sample axis) gives the importance of each feature per class.
print(feature_importance_per_class.shape)

# Feature and class names
feature_names = x_test.columns
class_names = ['WSS','LFBU','PFSI','WFAI','KED']

# Overall feature importance: per-class mean |SHAP| summed over the classes
total_feature_importance = np.sum(feature_importance_per_class, axis=1)
print(total_feature_importance.shape)

# NOTE(review): total_feature_importance has one dimension fewer than final_shap_values
# (it is reduced over axis=1 as well), so this comparison looks like it can never be True.
# Confirm what the check is meant to verify.
are_equal = np.array_equal(total_feature_importance,final_shap_values)
print(are_equal)

# Indices of the 20 features with the largest overall importance (ascending order)
top_indices = np.argsort(total_feature_importance)[-20:]  # keep the 20 largest

# Per-class importance and names restricted to those 20 features
feature_importance_per_class_top = feature_importance_per_class[top_indices]
feature_names_top = x_test.columns[top_indices]  # real feature names

In [None]:
import shap
import matplotlib.pyplot as plt

# Plot style (optional). Note that plt.style.use changes the global style,
# so it also affects figures drawn by later cells.
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (12, 8)

# -------------------------------
# SHAP summary plot over all classes (default mode);
# class_names is passed so that the classes can be told apart in the plot.
# -------------------------------
shap.summary_plot(
    shap_values=shap_values,
    features=x_test,
    feature_names=feature_names,
    class_names=class_names,
    show=False
)

# Title and axis labels
plt.title("SHAP Summary Plot for Test Set", fontsize=14)
plt.xlabel("SHAP Value (Impact on Model Output)", fontsize=12)
plt.ylabel("Features", fontsize=12)

# Layout; saving is currently disabled
plt.tight_layout()
# plt.savefig("Result/SHAP/shap_summary_all_features_all_classes.png", dpi=300, bbox_inches='tight')
plt.show()

# Clear the current figure
plt.clf()

In [None]:
# Stacked horizontal bar chart: mean |SHAP| of the top-20 features, one segment per class.
plt.figure(figsize=(8, 7))

#colors = ['#F4E7DE','#D1DBDC','#D3E2B7','#B786BC','#92D3D9']
colors = ['#1F77B4','#8CBF3F','#D62728','#9467BD','#FFC107']
# Bar positions, spread out by a factor of 1.2
y_pos = 1.2 * np.arange(len(feature_names_top))

# White plot background
plt.gca().set_facecolor('white')

# Running left edge of the next segment of every bar
bottom = np.zeros(len(feature_names_top))

# One segment per class (five colours, five classes)
for i, (segment_color, segment_label) in enumerate(zip(colors, class_names)):
    segment_width = feature_importance_per_class_top[:, i]
    plt.barh(y_pos, segment_width, left=bottom, color=segment_color, label=segment_label)
    bottom += segment_width  # the next class starts where this one ended

# Thin black frame around the plot
for spine in plt.gca().spines.values():
    spine.set_visible(True)
    spine.set_color('black')
    spine.set_linewidth(1)

plt.grid(False)
plt.xticks(fontsize=10)
# Feature names as y tick labels
plt.yticks(y_pos, feature_names_top, fontsize=10)
plt.xlabel('Mean SHAP Value', fontsize=12)
plt.title('Top 20 Features by SHAP Value', fontsize=12)
plt.legend(loc='best', fontsize=10)

plt.tight_layout()
plt.savefig('Result/SHAP/最佳20特性的SHAP图（集合）', dpi=500)
plt.show()

### SHAP全局图

### 3.SHAP散点图，条形图，环形图，力图，瀑布图

In [None]:
# Donut charts: for every class, the relative importance (mean |SHAP|, in percent)
# of its 10 most important features.
classes = ['WSS','LFBU','PFSI','WFAI','KED']
feature_names = x_test.columns
ls = []  # top feature names of every class, most important first
for i in range(len(classes)):
    shap_values_for_class = shap_values.values[:,:,i]
    mean_shap_values = np.abs(shap_values_for_class).mean(axis=0)
    # Indices of the 10 largest mean |SHAP| values, in descending order
    top_idx = np.argsort(mean_shap_values)[-10:][::-1]

    shap_values_for_class_top = mean_shap_values[top_idx]
    # A dedicated name for the comprehension variable keeps it apart from the class index i.
    sorted_features = [feature_names[feature_idx] for feature_idx in top_idx]
    # Normalise the importances to percentages of the displayed total
    importance_percent_for_class = 100 * shap_values_for_class_top / np.sum(shap_values_for_class_top)

    ls.append(sorted_features)

    print(sorted_features,shap_values_for_class_top,importance_percent_for_class)
    # Slice colours
    colors = plt.cm.tab20(np.linspace(0,1,len(sorted_features)))

    # Pull the three largest slices slightly out of the ring. The list is sized to the
    # number of slices so that the pie call also works with fewer than 10 features.
    n_slices = len(sorted_features)
    explode = ([0.03, 0.02, 0.01] + [0] * n_slices)[:n_slices]

    # Draw the donut
    fig,ax = plt.subplots(figsize=(15,8), subplot_kw={'aspect':'equal'},facecolor='w')
    wedges, texts, autotexts = ax.pie(
        importance_percent_for_class,
        explode=explode,
        autopct='%.1f%%',
        startangle=90,
        colors=colors,
        pctdistance=0.75,  # distance of the percentage labels from the centre
        wedgeprops=dict(width=0.5,edgecolor='white',linewidth=2)  # width < 1 turns the pie into a ring
    )

    # Text styling, title and class name in the centre of the ring
    plt.setp(autotexts,size=15,weight='bold')
    ax.set_title(f'Features Importance(SHAP) for Class {classes[i]}',fontsize=18)
    ax.text(0, 0, f'{classes[i]}', ha='center', va='center', fontsize=30, color='black')
    # Legend anchored to the right of the ring
    ax.legend(sorted_features,bbox_to_anchor=(1.4,0.7),loc='upper right',frameon=False)
    plt.tight_layout()
    plt.savefig(f'Result/SHAP/SHAP_new/Pie_for_class{i}_legend.png',dpi=300)
    plt.show()

# Optional: features that appear in the top list of every class
# common_elements = set(ls[0])
# for row in ls[1:]:
#     common_elements &= set(row)
# print('each appear:',common_elements)

In [None]:
# Combined SHAP plot per class: dot (beeswarm) plot of the top-10 features
# with a translucent bar plot of the mean |SHAP| overlaid on a twin x axis.

classes = ['WSS','LFBU','PFSI','WFAI','KED']

font = 15
for i in range(5):
    fig, ax1 = plt.subplots(figsize=(15, 8))
    plt.title(f"SHAP Summary Plot for Class {classes[i]}",fontsize=font)
    plt.grid(False)

    shap_values_for_class = shap_values.values[:, :, i]
    mean_shap_values = np.abs(shap_values_for_class).mean(axis=0)
    top_idx = np.argsort(mean_shap_values)[-10:]  # indices of the 10 largest mean |SHAP| values (ascending)

    # Restrict the SHAP values and x_test to those 10 features
    shap_values_for_class_top = shap_values_for_class[:, top_idx]
    x_test_top = x_test.iloc[:, top_idx]

    # Insert a line break after every `words_per_line` words (default 6)
    def wrap_feature_names(names, words_per_line=6):
        """Return the names with a newline inserted after every `words_per_line` words."""
        wrapped_names = []
        for name in names:
            words = name.split()
            wrapped_name = '\n'.join([' '.join(words[i:i + words_per_line]) for i in range(0, len(words), words_per_line)])
            wrapped_names.append(wrapped_name)
        return wrapped_names


    # Dot plot; show=False keeps the figure open for the overlay below
    shap.summary_plot(
        shap_values_for_class_top,
        x_test_top,
        feature_names=x_test_top.columns,
        plot_type="dot",
        show=False,
        color_bar=True,
        cmap='coolwarm'
    )

    # Twin axes sharing the y axis, with an independent x axis for the bar plot.
    # NOTE(review): this relies on pyplot's current-axes state — presumably
    # plt.gca() is the axes created above and the bar plot lands on ax2; confirm.
    ax0 = plt.gca()
    ax2 = ax0.twiny()

    # Bar plot of the mean |SHAP| values
    shap.summary_plot(
        shap_values_for_class_top,
        x_test_top,
        plot_type="bar",
        show=False,
        color='#F59F9F'
    )

    # Make the bars translucent so the dots stay visible
    bars = ax2.patches
    for bar in bars:
        bar.set_alpha(0.2)

    ax1.set_xlabel('Shapley Value Contribution', fontsize=font)
    ax2.set_xlabel('Mean Shapley Value (Feature Importance)', fontsize=font)
    ax1.set_ylabel('Features', fontsize=font)
    ax2.xaxis.set_label_position('top')
    ax2.xaxis.tick_top()

    # Uniform font size for all tick labels
    y_labels = ax1.get_yticklabels()
    for label in y_labels:
        label.set_fontsize(font)
    x_labels_1 = ax1.get_xticklabels()
    x_labels_2 = ax2.get_xticklabels()
    for label in x_labels_1:
        label.set_fontsize(font)
    for label in x_labels_2:
        label.set_fontsize(font)

    # Replace the y tick labels with the line-wrapped feature names
    # (iterating the DataFrame yields its column names)
    ax1.set_yticklabels(wrap_feature_names(x_test_top), fontsize=font)

    plt.grid(False)
    plt.tight_layout()
    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.4, hspace=0.6)
    plt.savefig(f'Result/SHAP/SHAP_new/SHAP_top10_features_for_Class_{classes[i]}.png', format='png', bbox_inches='tight',dpi=300)
    #plt.show()

In [None]:
# Combined SHAP dot + bar plot of the top-10 features per class,
# drawn with inverted x axes and the y axis moved to the right-hand side.

classes = ['WSS','LFBU','PFSI','WFAI','KED']

font = 15
for i in range(5):
    fig, ax1 = plt.subplots(figsize=(15, 8))
    fig.patch.set_facecolor('white')
    ax1.set_facecolor('white')
    plt.title(f"SHAP Summary Plot for Class {classes[i]}",fontsize=font)
    plt.grid(False)

    shap_values_for_class = shap_values.values[:, :, i]
    mean_shap_values = np.abs(shap_values_for_class).mean(axis=0)
    top_idx = np.argsort(mean_shap_values)[-10:]  # indices of the 10 largest mean |SHAP| values (ascending)

    # Restrict the SHAP values and x_test to those 10 features
    shap_values_for_class_top = shap_values_for_class[:, top_idx]
    x_test_top = x_test.iloc[:, top_idx]

    # Insert a line break after every `words_per_line` words (default 6)
    def wrap_feature_names(names, words_per_line=6):
        """Return the names with a newline inserted after every `words_per_line` words."""
        wrapped_names = []
        for name in names:
            words = name.split()
            wrapped_name = '\n'.join([' '.join(words[i:i + words_per_line]) for i in range(0, len(words), words_per_line)])
            wrapped_names.append(wrapped_name)
        return wrapped_names


    # Dot plot without colour bar; show=False keeps the figure open for the overlay below
    shap.summary_plot(
        shap_values_for_class_top,
        x_test_top,
        feature_names=x_test_top.columns,
        plot_type="dot",
        show=False,
        color_bar=False,
        cmap='coolwarm'
    )

    # Twin axes sharing the y axis, with an independent x axis for the bar plot.
    # NOTE(review): this relies on pyplot's current-axes state — presumably
    # plt.gca() is the axes created above and the bar plot lands on ax2; confirm.
    ax0 = plt.gca()
    ax2 = ax0.twiny()

    # Bar plot of the mean |SHAP| values
    shap.summary_plot(
        shap_values_for_class_top,
        x_test_top,
        plot_type="bar",
        show=False,
        color='#F59F9F'
    )

    # Make the bars translucent so the dots stay visible
    bars = ax2.patches
    for bar in bars:
        bar.set_alpha(0.2)

    ax1.set_xlabel('Shapley Value Contribution', fontsize=font)
    ax2.set_xlabel('Mean Shapley Value (Feature Importance)', fontsize=font)
    # ax1.set_ylabel('Features', fontsize=font)
    ax2.xaxis.set_label_position('top')
    ax2.xaxis.tick_top()

    # Uniform font size for all tick labels
    y_labels = ax1.get_yticklabels()
    for label in y_labels:
        label.set_fontsize(font)
    x_labels_1 = ax1.get_xticklabels()
    x_labels_2 = ax2.get_xticklabels()
    for label in x_labels_1:
        label.set_fontsize(font)
    for label in x_labels_2:
        label.set_fontsize(font)

    # Replace the y tick labels with the line-wrapped feature names
    # (iterating the DataFrame yields its column names)
    ax1.set_yticklabels(wrap_feature_names(x_test_top), fontsize=font)

    # Style the left and bottom spines individually
    ax1.spines['left'].set_visible(True)
    ax1.spines['left'].set_color('black')
    ax1.spines['left'].set_linewidth(0.5)
    
    ax1.spines['bottom'].set_visible(True)
    ax1.spines['bottom'].set_color('black')
    ax1.spines['bottom'].set_linewidth(2)
    
    # Invert both x axes and move the y axis (spine, ticks, label) to the right edge.
    # right_end_x is only needed by the commented-out data-coordinate variant below.
    ax1.invert_xaxis()
    ax2.invert_xaxis()
    right_end_x = ax1.get_xlim()[1]
    # ax1.spines['left'].set_position(('data',right_end_x))
    ax1.spines['left'].set_position(('axes',1))
    ax1.yaxis.tick_right()
    ax1.yaxis.set_label_position('right')
    ax1.yaxis.set_ticks([])  # remove the y ticks
    
    plt.grid(False)
    plt.tight_layout()
    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.4, hspace=0.6)
    plt.savefig(f'Result/SHAP/SHAP_new/SHAP_top10_features_for_Class_{classes[i]}_reversed.png', format='png', bbox_inches='tight',dpi=300)
    plt.show()

In [None]:
# List the misclassified test samples:
# position, true label, predicted label and the class probabilities (2 decimals).
true_labels = y_test.to_numpy()
for i in np.flatnonzero(true_labels != y_pred_test):
    formatted_scores = ','.join(f'{score:.2f}' for score in y_scores_test[i])
    print(f'{i} {true_labels[i]} {y_pred_test[i]} [{formatted_scores}]')

In [None]:
import csv
import numpy as np  # 用于处理概率排序

# Write every misclassified test sample to a CSV file, together with its
# class probabilities and the two most probable classes.
with open('Result/error_samples.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Index', 'True Value', 'Predicted Value', 'Prediction Probabilities', 'Top 2 Categories'])

    true_labels = y_test.to_numpy()
    for i in np.flatnonzero(true_labels != y_pred_test):
        sample_scores = y_scores_test[i]

        # Probabilities with two decimals, wrapped in square brackets
        formatted_scores = '[' + ','.join(f'{score:.2f}' for score in sample_scores) + ']'

        # The two classes with the highest probability, best first
        top_2_indices = np.argsort(sample_scores)[::-1][:2]
        top_2_info = ', '.join([f'{index} ({sample_scores[index]:.2f})' for index in top_2_indices])

        writer.writerow([i, true_labels[i], y_pred_test[i], formatted_scores, top_2_info])

In [None]:
# Force plots of one test sample for two classes (correct vs. wrong classification).
# NOTE(review): force() is called without show=False here, so the title and tick
# settings below are presumably applied after the plot has been rendered; the next
# cell does the same with show=False and saves the figure. Confirm this cell is still needed.
num = 176
ls = [0,4]
for i in ls:
    plt.figure(figsize=(15,8))
    shap.plots.force(shap_values[num,:,i], matplotlib=True)
    plt.title(f'SHAP force img (index {num}) for Class {className[i]}',y=1.5, fontsize=18)
    plt.gca().tick_params(labelsize=15)  # font size of the axis tick labels
    plt.close()
    # plt.show()
    # Saving of the force plot is disabled in this cell
    # plt.savefig(f'Result/SHAP/SHAP_force_{num}_img_Class_{className[i]}.png', format='png', bbox_inches='tight',dpi=300)

In [None]:
# Force plots of test sample 176 for the classes at index 0 and 4, saved as PNG.
num = 176
ls = [0,4]
for i in ls:
    plt.figure(figsize=(15,8))
    shap_values_for_class = shap_values[:, :, i]
    # Arguments: base value, SHAP values and feature values of the selected sample.
    # show=False keeps the figure open so that it can be titled and saved.
    shap.force_plot(shap_values_for_class[num].base_values,shap_values_for_class[num].values,x_test.iloc[num],matplotlib=True,show=False)
    plt.title(f'SHAP force img (index {num}) for Class {className[i]}',y=1.5, fontsize=18)
    plt.subplots_adjust(top=0.95)
    plt.gca().tick_params(labelsize=15)  # font size of the axis tick labels

    # Save the force plot
    plt.savefig(f'Result/SHAP/SHAP_force_{num}_img_Class_{className[i]}.png', format='png', bbox_inches='tight',dpi=300)

In [None]:
# Force plots of test sample 83 for the classes at index 2 and 4 (shown, not saved).
num = 83
ls = [2,4]
for i in ls:
    plt.figure(figsize=(15,8))
    shap_values_for_class = shap_values[:, :, i]
    # Arguments: base value, SHAP values and feature values of the selected sample.
    shap.force_plot(shap_values_for_class[num].base_values,shap_values_for_class[num].values,x_test.iloc[num],matplotlib=True,show=False)
    plt.title(f'SHAP force img (index {num}) for Class {className[i]}',y=1.5, fontsize=18)
    plt.subplots_adjust(top=0.95)
    plt.gca().tick_params(labelsize=15)  # font size of the axis tick labels
    plt.show()
    # Saving of the force plot is disabled in this cell
    # plt.savefig(f'Result/SHAP/SHAP_force_{num}_img_Class_{className[i]}.png', format='png', bbox_inches='tight',dpi=300)

In [None]:
# SHAP waterfall plot for one test sample and one class.
className = ['WSS','LFBU','PFSI','WFAI','KED']
inx = 83
# Despite its name, `feature` indexes the last (class) axis of shap_values and className.
feature = 4
# Explanation of sample `inx` for the selected class, showing at most 20 features
shap.plots.waterfall(shap_values[inx,:,feature],max_display=20, show=False)
plt.title(f'Waterfall plot for {className[feature]} (index({inx}))')
plt.show()
# plt.savefig(f'Result/SHAP/waterfall_{inx}_{feature}.png',dpi = 300)

In [None]:
# Candidate features: 'Age','Course of disease months','Tinnitus Evaluation Questionnaire','Tinnitus Handicap Inventory','Visual Analog Scale for Tinnitus Distress'
# SHAP dependence plots of feature1, coloured by feature2, one subplot per class.
className = ['WSS', 'LFBU', 'PFSI', 'WFAI', 'KED']
classNum = shap_values.shape[2]
feature1 = 'Course of disease months'
feature2 = 'Lower back pain and nocturnal emission'
figure,axes = plt.subplots(1,classNum,figsize = (8*classNum,7))
for i in range(classNum):
    # plt.figure(figsize=(8,7))
    axes[i].set_title(className[i])
    shap.dependence_plot(feature1, shap_values.values[:, :, i], x_test,interaction_index=feature2,show=False,ax=axes[i])# show=False is essential: without it only the first subplot is drawn and the others stay blank
figure.suptitle(f'SHAP Dependence Plot: {feature1} vs {feature2}',fontsize = 16)
plt.tight_layout()
plt.savefig(f'Result/SHAP/shap_dependence_plot_{feature1} vs {feature2}.png',dpi=300)
plt.show()

###  4.数据分布图

In [None]:
# 各类别数据分布
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the characteristics across the five classes:
# one star-marker series per class, one row per characteristic.
plt.rcParams['font.family'] = 'Times New Roman'

file_path = "../../data/modified_dataset_distribute.csv"
data = pd.read_csv(file_path)

classes=['WSS','LFBU','PFSI','WFAI','KED']
colors = sns.color_palette("hls",5)
categories = data["Characteristics"].tolist()
x_values = data.iloc[:,1:].values.tolist()
x_values_transposed = list(zip(*x_values)) # transpose: every data column (class) becomes one tuple
print(x_values_transposed)
plt.figure(figsize=(15,20),dpi=300)
for i ,col_values in enumerate(x_values_transposed):
    y = range(len(col_values))# y positions: the row index of every value in the column

    # Scatter markers
    plt.scatter(
        col_values, y,
        label=classes[i],
        s=200,
        marker="*",
        color=colors[i],
        edgecolors="black",
        linewidths=1,
        zorder=3
    )
    # Dashed line connecting the markers of one class
    plt.plot(
        col_values,y,
        linestyle="--",
        color=colors[i],
        alpha=0.7,
        zorder=2
    )
    # Value label slightly above every marker
    for j,(x_val,y_val) in enumerate(zip(col_values,y)):
        plt.text(
            x_val,y_val+0.1,str(x_val),ha='center',va='bottom',
            fontsize=14,color='black',bbox=dict(facecolor='white',edgecolor='none',alpha=0.5)
        )
# Custom y ticks:
# range(len(categories)) gives the tick positions 0 .. len(categories)-1, labelled with the characteristic names
plt.yticks(range(len(categories)),categories,fontsize=12)
plt.xlabel("Number of Features",fontsize=16)
plt.xticks(fontsize=14)
plt.ylabel("Feature",fontsize=15)
plt.title("Data Distribute",fontsize=20)
plt.grid(axis="x",linestyle="--",alpha=0.5)

# Legend
plt.legend(fontsize=12,loc='best',title="Classes",title_fontsize=15)
plt.tight_layout()
# NOTE(review): other cells save below 'Result/Data process/' (with a space); confirm this directory name.
plt.savefig(f"../../Result/Data_process/modified_dataset_distribute.png")
plt.show()

### 5.饼状图

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the training set
file_path = 'data/train_set.csv'
df = pd.read_csv(file_path)

# Count the samples of every class in the "Classification" column
classification_column = 'Classification'
class_counts = df[classification_column].value_counts()

# Label formatter for the pie chart
def func(pct, allvalues):
    """Format a pie-slice label as absolute count plus percentage.

    :param pct: percentage of the slice (0-100).
    :param allvalues: iterable with the values of all slices; their sum is the total.
    :return: text of the form ``"<count>\\n(<pct>%)"`` with the percentage
        rounded to one decimal.
    """
    # Round instead of truncating: the percentage is a float, so the
    # reconstructed count can fall marginally below the true integer
    # (e.g. 6.9999999 for a count of 7).
    absolute = int(round(pct / 100. * sum(allvalues)))
    # Count and percentage on separate lines
    return "{:d}\n({:.1f}%)".format(absolute, pct)

# Display names of the numeric class codes
category_mapping = {
    0.0: 'WSS',
    1.0: 'LFBU',
    2.0: 'PFSI',
    3.0: 'WFAI',
    4.0: 'KED',
}

# Translate the class codes to display names; unknown codes fall back to their string form
custom_labels = [category_mapping.get(x, str(x)) for x in class_counts.index]
#print(custom_labels)

custom_colors = ['#FDE089', '#F59F9F', '#CEE7E7', '#76B2ED', '#B29ABE']  # slice colours
# Draw the pie chart
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.figure(figsize=(8, 8))  # figure size
plt.pie(class_counts,
        labels=custom_labels,
        autopct=lambda pct: func(pct, class_counts),
        startangle=90,
        colors=custom_colors,
        textprops={'fontsize': 25})

plt.title('Class Distribution in Classification Column\n(Training set)',fontsize = 25)
plt.axis('equal')  # keep the pie circular

# savefig returns None, so its result is not bound to a name.
plt.savefig('Result/Data process/class_distribution_train.png',dpi=300)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.sans-serif'] = ['Times New Roman']  # font used for the figure

# Percentages of every ring, listed from the innermost ring outwards
percentages = [
    [57, 43],    # ring 1 (innermost)
    [65.4, 34.6],    # ring 2
    [40.6, 19.3, 16.1, 24.0],    # ring 3
    [48.4, 51.6],     # ring 4
    [45.1, 47.1, 7.8],    # ring 5
    [68.8, 31.3]    # ring 6 (outermost)
]

# Colours of every ring, in the same order as `percentages`
colors = [
    ['#d4e2fd', '#fde8cf'],  # ring 1
    ['#d4e2fd', '#fde8cf'],  # ring 2
    ['#d4e2fd', '#fde8cf', '#E6B2B6', '#fefcd6'],  # ring 3
    ['#d4e2fd', '#fde8cf'],   # ring 4
    ['#d4e2fd', '#fde8cf', '#fefcd6'],   # ring 5
    ['#d4e2fd', '#fde8cf']   # ring 6
]

# Create the figure
fig, ax = plt.subplots(figsize=(8,7))

# Every ring starts at the same angle
# start_angle = np.cumsum([0] + [sum(p) for p in percentages[:-1]]) * 360 / 100
start_angle = 90

# Draw the rings one by one
for i, (percents, color) in enumerate(zip(percentages, colors)):
    # Outer radius of this ring and distance of percentage labels from the centre
    radius = 1 + i * 0.3
    pctdistance = 0.7 + i * 0.05
    
    # Draw the ring (no autopct is passed, so pie returns only wedges and texts)
    wedges, texts = ax.pie(
        percents, 
        colors=color, 
        # startangle=start_angle[i],
        startangle=start_angle, 
        pctdistance=pctdistance, # distance of the percentage labels from the centre
        radius=radius,  # ring radius
        wedgeprops=dict(width=0.3, edgecolor='black', linewidth=0.5),  # ring width
        # autopct='%1.1f%%',  # would print the percentages with one decimal
        # autopct=None,
        textprops=dict(color="black")  # black text
    )
# Legend: one entry per wedge, named <ring>_<wedge>
legend_labels = [
    '1_1', '1_2',
    '2_1', '2_2',
    '3_1', '3_2', '3_3','3_4',
    '4_1', '4_2',
    '5_1', '5_2', '5_3',
    '6_1', '6_2'
]
# sum(colors, []) flattens the per-ring colour lists into one list
legend_handles = [plt.Rectangle((0,0),1,1, color=c) for c in sum(colors, [])]
ax.legend(legend_handles, legend_labels, loc='upper left', bbox_to_anchor=(1.6, 1.5))
# Figure styling
ax.set(aspect="equal")  # keep the rings circular
plt.tight_layout()
plt.savefig('../../Other Gram/pie_test_set.png',dpi=300)
# Show the figure
plt.show()

### 6.小提琴图

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot settings
plt.rcParams["font.family"] = "Times New Roman"

data = pd.read_csv('data/train_set.csv')

# Coerce every column to numeric; cells that cannot be parsed become 0
data = data.apply(pd.to_numeric, errors='coerce').fillna(0)

# Feature columns to plot
selected_columns = ['Age', 'Tinnitus Evaluation Questionnaire', 'Tinnitus Handicap Inventory', 'Visual Analog Scale for Tinnitus Distress']

# Reshape to long format (one row per feature/value pair) as expected by seaborn
melted_data = data[selected_columns].melt(var_name="Features", value_name="Values")

# Colours
colors = ["#2FBE8F", "#459DFF", "#FF5B9B", "#F7A24F"]

# Palette mapping every feature to its colour
palette = dict(zip(selected_columns, colors))

# Draw the violin plot
fig, ax = plt.subplots(figsize=(6, 4), facecolor="w")  # facecolor sets the figure background

sns.violinplot(data=melted_data, x="Features", y="Values", palette=palette, inner="quartile", ax=ax)

# Short tick labels, in the same order as selected_columns
x_label_view = ['Age', 'TEQ', 'THI', 'VAS']
ax.set_xticklabels(x_label_view)
ax.set_xlabel("Features")
ax.set_ylabel("Values")

# Save and show the figure
plt.tight_layout()
plt.savefig("Result/Data process/violin_plot_seaborn.png", dpi=300, bbox_inches='tight')
plt.show()

### 7.对应关系图

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_chord_diagram import chord_diagram
from matplotlib.patches import FancyArrowPatch

plt.rcParams['font.family'] = 'Times New Roman'
# plt.rcParams['font.size'] = 30

# Node names: the five classes followed by their five paired tones
names = ["WSS", "LFBU", "PFSI", "WFAI", "KED", "Gong", "Shang", "Jue", "Zhi", "Yu"]

# Symmetric connection matrix: class k is linked to tone k with strength 3
# (WSS-Gong, LFBU-Shang, PFSI-Jue, WFAI-Zhi, KED-Yu); all other entries are 0.
n_pairs = len(names) // 2
flux_data = np.zeros((len(names), len(names)))
for k in range(n_pairs):
    flux_data[k, k + n_pairs] = 3
    flux_data[k + n_pairs, k] = 3

custom_colors = [
    '#E0676C',
    '#FCAE18',
    '#74AED4',
    '#90B357',
    '#B786BC'
]
# Same colour for a class and its paired tone
extend_colors = custom_colors*2
# Create the figure
fig, ax = plt.subplots(figsize=(6, 6), facecolor="w")

# Draw the chord diagram. The return value is stored under its own name so that the
# imported chord_diagram function is not overwritten.
chord_result = chord_diagram(mat=flux_data, names=names, alpha=.9, use_gradient=False, ax=ax,colors=extend_colors)

plt.tight_layout()
fig.savefig('Result/Treatment/treat_strategy_with_arrows.png', dpi=300)
plt.show()

### 8.箱线对比图

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats 
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator

plt.rcParams['font.family'] = 'Times New Roman'
df = pd.read_csv("Result/Feature analysis/output_one_dimension.csv")

# Column roles in the long-format table read above.
x = "Performance Metrics"
y = "Value"
hue = "state"
# Box order inside every metric group; shared by the plot and the annotator
# so that both agree on which box is "Before" and which is "After".
hue_order = ["Before", "After"]

# Before-vs-After comparison inside each metric group.
pairs = [(("Accuracy", "Before"), ("Accuracy", "After")),
         (("Recall", "After"), ("Recall", "Before")),
         (("Precision", "After"), ("Precision", "Before")),
         (("F1 score", "After"), ("F1 score", "Before")),
         (("Specificity", "After"), ("Specificity", "Before"))]
colors = ["#459DFF", "#FF5B9B"]

fig, ax = plt.subplots(figsize=(8, 6), facecolor="w")
# Draw on the axes created above rather than on whichever axes is current.
sns.boxplot(data=df, x=x, y=y, hue=hue, hue_order=hue_order, palette=colors,
            saturation=1, width=.7, linewidth=1.2, ax=ax)
ax.set_xlabel("Performance Metrics")
ax.set_ylabel("Values")

# Significance brackets: independent-samples t-test, star notation, no
# multiple-comparison correction.
# NOTE(review): if Before/After are measured on the same folds/subjects a
# paired test may be more appropriate -- confirm with the study design.
annot = Annotator(ax, pairs, data=df, x=x, y=y, hue=hue, hue_order=hue_order)
annot.configure(test='t-test_ind', text_format='star', loc='inside',
                comparisons_correction=None, line_height=0.05, line_width=1, text_offset=2)
annot.apply_test().annotate(line_offset_to_group=0.2, line_offset=0.1)

# Put the legend outside the plotting area, to the right.
ax.legend(loc='upper left', bbox_to_anchor=(1.00, 1))
plt.tight_layout()
plt.savefig('Result/Feature analysis/Train_set_boxplot_performance_metrics2.png', dpi=300)
plt.show()

### 9.数据分布热图

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.patches as patches

plt.rcParams['font.family'] = 'Times New Roman'
df = pd.read_excel("../../data/heatmap_distribute_test.xlsx")

# RGB image buffer for the heatmap: one row per column of df, one column per
# row of df, three colour channels.
heatmap_data = np.zeros((df.shape[1], df.shape[0], 3))

# Custom colour-mapping scheme
def create_gradient_color_map(base_color, n=256):
    """Build an ``(n, 3)`` RGB ramp that fades from ``base_color`` to white.

    Row 0 equals the first three components of ``base_color``; row ``n - 1``
    is ``(1, 1, 1)``; rows in between are linearly interpolated per channel.
    """
    red, green, blue = base_color[0], base_color[1], base_color[2]
    ramps = [np.linspace(channel, 1, n) for channel in (red, green, blue)]
    return np.stack(ramps, axis=1)

# Continuous variables: one single-hue gradient per feature, keyed by column name.
continuous_color_maps = {
    feature: create_gradient_color_map(base_rgb)
    for feature, base_rgb in {
        'Age': [0.8, 0.4, 0.1],  # brown
        'Course of disease months': [0.1, 0.6, 0.8],  # blue
        'Tinnitus Evaluation Questionnaire': [0.8, 0.1, 0.3],  # red
        'Tinnitus Handicap Inventory': [0.9, 0.7, 0.2],  # yellow
        'Visual Analog Scale for Tinnitus Distress': [0.3, 0.8, 0.1],  # green
    }.items()
}

# Categorical variables: every feature maps to a two-colour list -- the shared
# light grey (0.9, 0.9, 0.9) first, then the feature's own accent colour.
categorical_colors = {
    feature: [(0.9, 0.9, 0.9), accent]
    for feature, accent in {
        'Sex': (0.2, 0.6, 0.8),
        'Fatigue': (0.7, 0.3, 0.5),
        'Noise': (0.4, 0.6, 0.9),
        'Headphone use': (0.8, 0.1, 0.4),
        'Exercise': (0.3, 0.7, 0.3),
        'Stress': (0.6, 0.2, 0.6),
        'Infection': (0.9, 0.3, 0.2),
        'Smoking': (0.4, 0.5, 0.2),
        'Alcohol consumption': (0.8, 0.7, 0.1),
        'Tea drinking': (0.1, 0.9, 0.5),
        'Coffee': (0.7, 0.5, 0.3),
        'Hypertension': (0.2, 0.8, 0.5),
        'Diabetes': (0.3, 0.7, 0.8),
        'Hyperlipidemia': (0.9, 0.5, 0.3),
        'Deafness': (0.6, 0.6, 0.1),
        'Continuous tinnitus': (0.4, 0.5, 0.7),
        'Cicada like tinnitus': (0.9, 0.3, 0.5),
        'Buzzing tinnitus': (0.7, 0.3, 0.8),
        'Roaring tinnitus': (0.8, 0.2, 0.7),
        'Whistling tinnitus': (0.4, 0.3, 0.9),
        'Clicking tinnitus': (0.5, 0.7, 0.4),
        'Pulsatile tinnitus': (0.6, 0.3, 0.6),
        'Hissing tinnitus': (0.3, 0.7, 0.3),
        'Electrical tinnitus': (0.7, 0.6, 0.2),
        'Left ear (tinnitus)': (0.8, 0.3, 0.5),
        'Right ear (tinnitus)': (0.6, 0.4, 0.2),
        'Bilateral ear (tinnitus)': (0.5, 0.7, 0.6),
        'Low to mid frequency (associated symptoms)': (0.9, 0.2, 0.3),
        'High frequency (associated symptoms)': (0.7, 0.5, 0.4),
        'Anxiety (associated symptoms)': (0.6, 0.5, 0.1),
        'Depression (associated symptoms)': (0.7, 0.6, 0.3),
        'Hearing loss (associated symptoms)': (0.6, 0.3, 0.8),
        'Ear fullness (associated symptoms)': (0.8, 0.7, 0.1),
        'Ear pressure (associated symptoms)': (0.4, 0.6, 0.8),
        'Ear pain (associated symptoms)': (0.3, 0.5, 0.6),
        'Ear itching (associated symptoms)': (0.8, 0.4, 0.7),
        'Head fullness (associated symptoms)': (0.4, 0.8, 0.6),
        'Dizziness (associated symptoms)': (0.8, 0.7, 0.4),
        'Visual rotation (associated symptoms)': (0.9, 0.6, 0.3),
        'Headache (associated symptoms)': (0.6, 0.5, 0.7),
        'Nausea (associated symptoms)': (0.7, 0.4, 0.9),
        'Vomiting (associated symptoms)': (0.9, 0.2, 0.5),
        'Ear discharge (associated symptoms)': (0.6, 0.3, 0.5),
        'Palpitations (associated symptoms)': (0.7, 0.5, 0.2),
        'Chest tightness (associated symptoms)': (0.9, 0.4, 0.6),
        'Tinnitus like cicada calls worsens at night': (0.7, 0.3, 0.8),
        'Low pitched tinnitus occurred within one month': (0.5, 0.6, 0.7),
        'History of cold or chronic rhinitis': (0.4, 0.6, 0.9),
        'Tinnitus sounds like roaring wind or tide': (0.7, 0.5, 0.3),
        'Restless insomnia with early morning awakening': (0.6, 0.3, 0.9),
        'Irritability insomnia and vivid dreams': (0.8, 0.3, 0.7),
        'Lower back pain and nocturnal emission': (0.7, 0.2, 0.6),
        'Impulsive and irritable personality': (0.4, 0.8, 0.5),
        'Heaviness in the head bitter or bland taste in the mouth': (0.6, 0.2, 0.8),
        'Headache and dry mouth': (0.7, 0.5, 0.4),
        'Sensation of emptiness in the ear': (0.6, 0.7, 0.2),
        'Ear fullness and blockage causing breathlessness': (0.4, 0.7, 0.5),
        'Worsens when standing up': (0.5, 0.6, 0.7),
        'Worsens after exertion': (0.7, 0.5, 0.4),
        'Bitter and dry mouth': (0.6, 0.4, 0.8),
        'Poor appetite': (0.8, 0.3, 0.9),
        'Difficulty falling asleep': (0.7, 0.6, 0.2),
        'Light sleep with frequent awakenings': (0.8, 0.7, 0.4),
        'Vivid dreams or excessive dreaming': (0.7, 0.4, 0.8),
        'Red tongue': (0.9, 0.3, 0.5),
        'Pale tongue': (0.8, 0.5, 0.3),
        'Dark tongue': (0.7, 0.6, 0.4),
        'Thin tongue coating': (0.9, 0.8, 0.3),
        'Greasy tongue coating': (0.5, 0.8, 0.4),
        'White tongue coating': (0.7, 0.6, 0.5),
        'Yellow tongue coating': (0.6, 0.3, 0.5),
        'Sparse tongue coating': (0.8, 0.6, 0.7),
        'Floating pulse': (0.9, 0.6, 0.3),
        'Deep pulse': (0.7, 0.4, 0.8),
        'Rapid pulse': (0.6, 0.2, 0.8),
        'Wiry pulse': (0.8, 0.3, 0.6),
        'Slippery pulse': (0.7, 0.6, 0.4),
        'Thin pulse': (0.9, 0.4, 0.8),
        'Weak pulse': (0.6, 0.7, 0.3),
        'Rough pulse': (0.4, 0.5, 0.6),
    }.items()
}

# Fill one heatmap row per column of df, choosing the colour scheme by feature type.
for i, col in enumerate(df.columns):
    if col in continuous_color_maps:
        # Continuous variable: min-max normalise, then look the colour up in the ramp.
        # NOTE(review): index 0 of the ramp is the saturated base colour, so the
        # smallest value is drawn darkest and the largest is white -- confirm this
        # is the direction shown in the separately drawn legend figure.
        cmap = continuous_color_maps[col]
        column = df[col]
        low = column.min()
        span = column.max() - low
        if span > 0:
            normed_values = (column - low) / span
        else:
            # Constant column: 0/0 would give NaN and int(NaN) raises, so map
            # every sample to the first ramp entry instead.
            normed_values = column * 0.0
        row_colors = np.array([cmap[int(val * (len(cmap) - 1))] for val in normed_values])
    elif col in categorical_colors:
        # Categorical variable: one colour per category. The categories are
        # sorted first so the smallest value (e.g. 0) always gets the first
        # (light) colour, independent of which value happens to occur first
        # in the data.
        unique_vals = sorted(df[col].unique())
        color_map = {val: categorical_colors[col][idx % len(categorical_colors[col])]
                     for idx, val in enumerate(unique_vals)}
        row_colors = np.array([color_map[val] for val in df[col]])
    else:
        # Feature without a configured scheme: uniform grey row.
        row_colors = np.ones((len(df), 3)) * 0.8
    heatmap_data[i] = row_colors

# Draw the heatmap: one image row per feature, one image column per sample.
fig, ax = plt.subplots(figsize=(40, 45))
ax.imshow(heatmap_data, aspect='auto')

# Label each row with its feature name; the sample axis carries no ticks.
ax.set_yticks(range(len(df.columns)))
ax.set_yticklabels(df.columns, fontsize=16)
ax.set_xticks([])

# Outline every feature row in white. imshow centres pixel (row i, column j)
# on (j, i), so the image covers x in [-0.5, len(df) - 0.5]; the outline must
# start at -0.5 to line up with the row instead of being shifted half a cell.
for i in range(len(df.columns)):
    ax.add_patch(patches.Rectangle(
        (-0.5, i - 0.5), len(df), 1, linewidth=2, edgecolor='white', facecolor='none'))

# No plt.legend() here: nothing in this figure carries a label, so a legend
# would have no entries.
plt.savefig("../../Result/Data_process/data_distribute.png", dpi=300)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np


# Use Times New Roman as the font family for figures created from here on.
plt.rcParams['font.family'] = 'Times New Roman'

# Continuous variables: for every feature, the 256 tick values spanning
# 0..upper bound of the legend axis, plus the base RGB colour of its gradient.
continuous_color_maps = {
    feature: (np.linspace(0, upper, 256), base_rgb)
    for feature, (upper, base_rgb) in {
        'Age': (90, [0.8, 0.4, 0.1]),
        'Course of disease months': (500, [0.1, 0.6, 0.8]),
        'Visual Analog Scale for Tinnitus Distress': (35, [0.3, 0.8, 0.1]),
        'Tinnitus Evaluation Questionnaire': (25, [0.8, 0.1, 0.3]),
        'Tinnitus Handicap Inventory': (100, [0.9, 0.7, 0.2]),
    }.items()
}

# Categorical variables: every feature maps category 0 to the shared light
# grey (0.9, 0.9, 0.9) and category 1 to the feature's own accent colour.
# NOTE(review): the key set differs from the dict of the same name in the
# heatmap cell ('Poor appetite' and 'Difficulty falling asleep' appear only
# there; 'Dizziness or anxiety caused by stress' and 'Headaches relieved by
# dark and quiet environment' appear only here) -- confirm which list matches
# the data.
categorical_colors = {
    feature: {0: (0.9, 0.9, 0.9), 1: accent}
    for feature, accent in {
        'Sex': (0.2, 0.6, 0.8),
        'Fatigue': (0.7, 0.3, 0.5),
        'Noise': (0.4, 0.6, 0.9),
        'Headphone use': (0.8, 0.1, 0.4),
        'Exercise': (0.3, 0.7, 0.3),
        'Stress': (0.6, 0.2, 0.6),
        'Infection': (0.9, 0.3, 0.2),
        'Smoking': (0.4, 0.5, 0.2),
        'Alcohol consumption': (0.8, 0.7, 0.1),
        'Tea drinking': (0.1, 0.9, 0.5),
        'Coffee': (0.7, 0.5, 0.3),
        'Hypertension': (0.2, 0.8, 0.5),
        'Diabetes': (0.3, 0.7, 0.8),
        'Hyperlipidemia': (0.9, 0.5, 0.3),
        'Deafness': (0.6, 0.6, 0.1),
        'Continuous tinnitus': (0.4, 0.5, 0.7),
        'Cicada like tinnitus': (0.9, 0.3, 0.5),
        'Buzzing tinnitus': (0.7, 0.3, 0.8),
        'Roaring tinnitus': (0.8, 0.2, 0.7),
        'Whistling tinnitus': (0.4, 0.3, 0.9),
        'Clicking tinnitus': (0.5, 0.7, 0.4),
        'Pulsatile tinnitus': (0.6, 0.3, 0.6),
        'Hissing tinnitus': (0.3, 0.7, 0.3),
        'Electrical tinnitus': (0.7, 0.6, 0.2),
        'Left ear (tinnitus)': (0.8, 0.3, 0.5),
        'Right ear (tinnitus)': (0.6, 0.4, 0.2),
        'Bilateral ear (tinnitus)': (0.5, 0.7, 0.6),
        'Low to mid frequency (associated symptoms)': (0.9, 0.2, 0.3),
        'High frequency (associated symptoms)': (0.7, 0.5, 0.4),
        'Anxiety (associated symptoms)': (0.6, 0.5, 0.1),
        'Depression (associated symptoms)': (0.7, 0.6, 0.3),
        'Hearing loss (associated symptoms)': (0.6, 0.3, 0.8),
        'Ear fullness (associated symptoms)': (0.8, 0.7, 0.1),
        'Ear pressure (associated symptoms)': (0.4, 0.6, 0.8),
        'Ear pain (associated symptoms)': (0.3, 0.5, 0.6),
        'Ear itching (associated symptoms)': (0.8, 0.4, 0.7),
        'Head fullness (associated symptoms)': (0.4, 0.8, 0.6),
        'Dizziness (associated symptoms)': (0.8, 0.7, 0.4),
        'Visual rotation (associated symptoms)': (0.9, 0.6, 0.3),
        'Headache (associated symptoms)': (0.6, 0.5, 0.7),
        'Nausea (associated symptoms)': (0.7, 0.4, 0.9),
        'Vomiting (associated symptoms)': (0.9, 0.2, 0.5),
        'Ear discharge (associated symptoms)': (0.6, 0.3, 0.5),
        'Palpitations (associated symptoms)': (0.7, 0.5, 0.2),
        'Chest tightness (associated symptoms)': (0.9, 0.4, 0.6),
        'Tinnitus like cicada calls worsens at night': (0.7, 0.3, 0.8),
        'Low pitched tinnitus occurred within one month': (0.5, 0.6, 0.7),
        'History of cold or chronic rhinitis': (0.4, 0.6, 0.9),
        'Tinnitus sounds like roaring wind or tide': (0.7, 0.5, 0.3),
        'Restless insomnia with early morning awakening': (0.6, 0.3, 0.9),
        'Irritability insomnia and vivid dreams': (0.8, 0.3, 0.7),
        'Lower back pain and nocturnal emission': (0.7, 0.2, 0.6),
        'Impulsive and irritable personality': (0.4, 0.8, 0.5),
        'Heaviness in the head bitter or bland taste in the mouth': (0.6, 0.2, 0.8),
        'Headache and dry mouth': (0.7, 0.5, 0.4),
        'Sensation of emptiness in the ear': (0.6, 0.7, 0.2),
        'Ear fullness and blockage causing breathlessness': (0.4, 0.7, 0.5),
        'Worsens when standing up': (0.5, 0.6, 0.7),
        'Worsens after exertion': (0.7, 0.5, 0.4),
        'Bitter and dry mouth': (0.6, 0.4, 0.8),
        'Dizziness or anxiety caused by stress': (0.5, 0.8, 0.6),
        'Headaches relieved by dark and quiet environment': (0.7, 0.6, 0.5),
        'Light sleep with frequent awakenings': (0.8, 0.7, 0.4),
        'Vivid dreams or excessive dreaming': (0.7, 0.4, 0.8),
        'Red tongue': (0.9, 0.3, 0.5),
        'Pale tongue': (0.8, 0.5, 0.3),
        'Dark tongue': (0.7, 0.6, 0.4),
        'Thin tongue coating': (0.9, 0.8, 0.3),
        'Greasy tongue coating': (0.5, 0.8, 0.4),
        'White tongue coating': (0.7, 0.6, 0.5),
        'Yellow tongue coating': (0.6, 0.3, 0.5),
        'Sparse tongue coating': (0.8, 0.6, 0.7),
        'Floating pulse': (0.9, 0.6, 0.3),
        'Deep pulse': (0.7, 0.4, 0.8),
        'Rapid pulse': (0.6, 0.2, 0.8),
        'Wiry pulse': (0.8, 0.3, 0.6),
        'Slippery pulse': (0.7, 0.6, 0.4),
        'Thin pulse': (0.9, 0.4, 0.8),
        'Weak pulse': (0.6, 0.7, 0.3),
        'Rough pulse': (0.4, 0.5, 0.6),
    }.items()
}

# Draw the legend panels for all features
def draw_legends(continuous_features, categorical_features, ncols=3,
                 save_path="../../Result/Data_process/Legend_heatmap_3.png"):
    """Draw a grid of small legend panels, one per feature, then save and show it.

    Continuous features get a vertical colour ramp with three tick labels
    (first, middle and last tick value); categorical features get one colour
    swatch per category with the category label next to it.

    :param continuous_features: keys of the module-level ``continuous_color_maps``;
        each value there is a ``(tick_values, base_rgb)`` pair.
    :param categorical_features: keys of the module-level ``categorical_colors``;
        each value there maps a category label to an RGB tuple.
    :param ncols: number of panels per grid row.
    :param save_path: file the figure is written to (300 dpi).
    """
    from matplotlib.colors import ListedColormap

    total_features = len(continuous_features) + len(categorical_features)
    # Ceiling division; keep at least one row so an empty feature list still yields a figure.
    nrows = max(1, (total_features + ncols - 1) // ncols)

    # squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid.
    fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 1.3, nrows * 4), squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, continuous_features):
        values, base_color = continuous_color_maps[feature]
        # 256-step ramp from the base colour to white, shown as a narrow vertical bar.
        ramp = np.array(
            [np.linspace(base_color[0], 1, 256),
             np.linspace(base_color[1], 1, 256),
             np.linspace(base_color[2], 1, 256)]
        ).T
        gradient = np.linspace(0, 1, 256).reshape(-1, 1)
        # NOTE(review): with imshow's default origin='upper', row 0 (the base
        # colour) is drawn at the top, next to the tick labelled values[-1];
        # the heatmap cell assigns ramp index 0 to the column minimum --
        # confirm the two directions are meant to agree.
        ax.imshow(gradient, aspect='auto', extent=(0, 0.12, 0, 4), cmap=ListedColormap(ramp))
        ax.set_yticks([0, 2, 4])
        ax.set_xticks([])
        ax.set_yticklabels([f"{values[0]:.1f}", f"{values[len(values) // 2]:.1f}", f"{values[-1]:.1f}"], fontsize=8)
        ax.set_title(feature, fontsize=9, loc='center', pad=10)
        ax.set_xlim(0, 0.3)
        ax.set_frame_on(False)

    for ax, feature in zip(axes[len(continuous_features):], categorical_features):
        categories = categorical_colors[feature]
        y_start = 0
        spacing = 0.5  # vertical step between consecutive swatches
        # Highest label first, so it ends up at the bottom of the stack.
        for label, color in sorted(categories.items(), reverse=True):
            # facecolor (not color=) so the black outline is actually drawn:
            # Patch's color= overrides both facecolor and edgecolor.
            ax.add_patch(plt.Rectangle((0, y_start), 0.43, 0.5, facecolor=color, edgecolor='black'))
            ax.text(0.6, y_start + 0.25, label, ha='left', va='center', fontsize=8)
            y_start += spacing
        ax.set_yticks([])
        ax.set_xticks([])
        ax.set_title(feature, fontsize=9, loc='center', pad=10, ha='center', va='center')
        ax.set_ylim(0, y_start)
        ax.set_frame_on(False)
        ax.set_aspect('equal')

    # Hide the unused panels at the end of the grid.
    for ax in axes[total_features:]:
        ax.axis('off')

    # Lay the panels out before saving so the file matches what is shown.
    fig.tight_layout()
    fig.savefig(save_path, dpi=300)
    plt.show()

# Every feature with an entry in one of the two colour tables gets a legend panel.
continuous_features = [*continuous_color_maps]
categorical_features = [*categorical_colors]

# Render the legend figure, three panels per row.
draw_legends(continuous_features, categorical_features, ncols=3)


### 10.相关性图（R语言，换内核）

In [None]:
library(corrplot)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)

In [None]:
library(grDevices)

# Show the font family names currently registered for the PDF device.
names(pdfFonts())

# Register "Times New Roman" under the family name "Times", which the plotting
# cells pass to png() and par(). (windowsFonts() exists on Windows builds of R.)
windowsFonts(Times = windowsFont("Times New Roman"))

In [None]:
# Correlation between every pair of columns of the normalised dataset
# (cor() with its default method).
data <- read.csv("data/after_normalize_dataset.csv")
cor_matrix <- cor(data)

# Write the coefficients out as a table, keeping the row names.
cor_data <- as.data.frame(cor_matrix)
write.csv(
    cor_data,
    file = "Result/Relation/New pearson/correlation_matrix_R.csv",
    row.names = TRUE
)

In [None]:
# Reload the exported coefficients as a numeric matrix: the first CSV column
# holds the row names, and the column names are kept exactly as written.
cor_matrix <- as.matrix(
    read.csv(
        "Result/Relation/New pearson/correlation_matrix_R.csv",
        row.names = 1,
        check.names = FALSE
    )
)

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_0_number.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Correlation matrix drawn as coefficients (method "number").
corrplot(
    cor_matrix,
    method = "number",
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5,
    number.cex = 0.3
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_1_number.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Correlation matrix drawn as coefficients (method "number"); same settings
# as the related_matrix_0_number figure, written to a second file.
corrplot(
    cor_matrix,
    method = "number",
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5,
    number.cex = 0.3
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_2_color.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Colour-tile correlation plot, variables ordered by hierarchical clustering.
# The argument list ends without a dangling comma, so no empty positional
# argument is handed to corrplot().
corrplot(
    cor_matrix,
    method = "color",
    order = "hclust",
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_3_circle_AOE.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Circle glyphs, variables in "AOE" order.
# The argument list ends without a dangling comma, so no empty positional
# argument is handed to corrplot().
corrplot(
    cor_matrix,
    method = "circle",
    order = "AOE",
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_4_shade.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Shaded tiles, variables in "AOE" order, diagonal left blank.
# The argument list ends without a dangling comma, so no empty positional
# argument is handed to corrplot().
corrplot(
    cor_matrix,
    method = "shade",
    order = "AOE",
    diag = FALSE,
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_5_square_FPC.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Square glyphs, "FPC" ordering, lower triangle only, diagonal left blank.
# The argument list ends without a dangling comma, so no empty positional
# argument is handed to corrplot().
corrplot(
    cor_matrix,
    method = "square",
    order = "FPC",
    type = "lower",
    diag = FALSE,
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_6_ellipse_AOE.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Ellipse glyphs, "AOE" ordering, upper triangle only, diagonal left blank.
# The argument list ends without a dangling comma, so no empty positional
# argument is handed to corrplot().
corrplot(
    cor_matrix,
    method = "ellipse",
    order = "AOE",
    type = "upper",
    diag = FALSE,
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_7_mixed_AOE.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Mixed plot with corrplot.mixed()'s default lower/upper styles, "AOE" order.
# The argument list ends without a dangling comma; previously the empty
# trailing argument was matched positionally to `lower`.
corrplot.mixed(
    cor_matrix,
    order = "AOE",
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5,
    number.cex = 0.3
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_8_mixed_hclust.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Mixed plot: shaded tiles below the diagonal, pies above it, variables
# ordered by hierarchical clustering.
# The argument list ends without a dangling comma, so no empty positional
# argument is handed to corrplot.mixed().
corrplot.mixed(
    cor_matrix,
    lower = "shade",
    upper = "pie",
    order = "hclust",
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5,
    number.cex = 0.3
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_9_pie_hclust.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Pie glyphs, hierarchical-clustering order, upper triangle only, diagonal
# left blank.
# The argument list ends without a dangling comma, so no empty positional
# argument is handed to corrplot().
corrplot(
    cor_matrix,
    method = "pie",
    order = "hclust",
    type = "upper",
    diag = FALSE,
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_10_AOE_col.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Default glyph method, "AOE" order, 85-step "RdBu" palette from COL2().
# The argument list ends without a dangling comma, so no empty positional
# argument is handed to corrplot().
corrplot(
    cor_matrix,
    order = "AOE",
    col = COL2("RdBu", 85),
    tl.col = "black", tl.cex = 0.5,
    cl.cex = 0.5
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_11_hclust_square.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Square glyphs, hierarchical-clustering order, 85-step "BrBG" palette,
# grey cell grid and a narrow colour legend with centred labels.
corrplot(
    cor_matrix,
    method = "square",
    order = "hclust",
    col = COL2("BrBG", 85),
    tl.col = "black", tl.cex = 0.5, tl.offset = 0.1,
    cl.ratio = 0.07, cl.cex = 0.5, cl.align.text = "c",
    addgrid.col = "gray"
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_12_lower_hclust.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Default glyph method, lower triangle only, hierarchical-clustering order,
# 85-step "PuOr" palette, text labels rotated 45 degrees.
corrplot(
    cor_matrix,
    order = "hclust",
    type = "lower",
    col = COL2("PuOr", 85),
    tl.col = "black", tl.cex = 0.5, tl.srt = 45, tl.offset = 0.1,
    cl.ratio = 0.07, cl.cex = 0.5, cl.align.text = "c",
    addgrid.col = "gray"
)
dev.off()

In [None]:
# Output device: 3300 x 3200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_13_lower_hclust.png",
    width = 3300, height = 3200, res = 300,
    family = "Times"
)
par(family = "Times")

# Square glyphs, lower triangle only, hierarchical-clustering order,
# 85-step "PuOr" palette, text labels rotated 45 degrees.
corrplot(
    cor_matrix,
    method = "square",
    order = "hclust",
    type = "lower",
    col = COL2("PuOr", 85),
    tl.col = "black", tl.cex = 0.5, tl.srt = 45, tl.offset = 0.1,
    cl.ratio = 0.07, cl.cex = 0.5, cl.align.text = "c",
    addgrid.col = "gray"
)
dev.off()

In [None]:
# Output device: 2300 x 2200 px PNG at 600 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_14_color_hclust.png",
    width = 2300, height = 2200, res = 600,
    family = "Times"
)
par(family = "Times")

# Colour tiles, hierarchical-clustering order, grey cell grid; label and
# legend text are scaled down for the higher resolution.
corrplot(
    cor_matrix,
    method = "color",
    order = "hclust",
    tl.col = "black", tl.cex = 0.2, tl.offset = 0.1,
    cl.ratio = 0.1, cl.cex = 0.3, cl.align.text = "c",
    addgrid.col = "gray"
)
dev.off()

In [None]:
# Output device: 2300 x 2200 px PNG at 600 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_15_color_hclust.png",
    width = 2300, height = 2200, res = 600,
    family = "Times"
)
par(family = "Times")

# Colour tiles, hierarchical-clustering order, grey cell grid; same settings
# as the related_matrix_14_color_hclust figure, written to a second file.
corrplot(
    cor_matrix,
    method = "color",
    order = "hclust",
    tl.col = "black", tl.cex = 0.2, tl.offset = 0.1,
    cl.ratio = 0.1, cl.cex = 0.3, cl.align.text = "c",
    addgrid.col = "gray"
)
dev.off()

In [None]:
# Recompute the correlation matrix directly from the normalised dataset.
data <- read.csv("data/after_normalize_dataset.csv")
cor_matrix <- cor(data)

# Output device: 2100 x 2000 px PNG at 500 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_16_color_hclust.png",
    width = 2100, height = 2000, res = 500,
    family = "Times"
)
par(family = "Times")

# Mixed plot: shaded tiles below the diagonal, pies above it, variables
# ordered by hierarchical clustering, grey cell grid.
corrplot.mixed(
    cor_matrix,
    lower = "shade",
    upper = "pie",
    order = "hclust",
    tl.col = "black", tl.cex = 0.2, tl.offset = 0.1,
    cl.ratio = 0.1, cl.cex = 0.3, cl.align.text = "c",
    addgrid.col = "gray"
)
dev.off()

In [None]:
# Correlation matrix plus the per-pair significance test results
# (cor.mtest at the 95% confidence level).
data <- read.csv("data/after_normalize_dataset.csv")
cor_matrix <- cor(data)
testRes <- cor.mtest(data, conf.level = 0.95)

# Output device: 1500 x 1200 px PNG at 300 ppi, text in the "Times" family.
png(
    filename = "Result/Relation/New pearson/related_matrix_17_color_hclust.png",
    width = 1500, height = 1200, res = 300,
    family = "Times"
)
par(family = "Times")

# Colour tiles in "AOE" order with the diagonal left blank; cells are marked
# with significance labels from the p-value matrix at the 0.001 / 0.01 / 0.05
# levels (insig = "label_sig").
corrplot(
    cor_matrix,
    p.mat = testRes$p,
    method = "color",
    order = "AOE",
    diag = FALSE,
    sig.level = c(0.001, 0.01, 0.05),
    insig = "label_sig",
    pch.cex = 0.9, pch.col = "grey20",
    tl.col = "black", tl.cex = 0.2, tl.offset = 0.1,
    cl.ratio = 0.1, cl.cex = 0.3, cl.align.text = "c",
    addgrid.col = "gray"
)
dev.off()

In [None]:
# Demo on the built-in mtcars dataset: coefficients drawn as coloured numbers.
M <- cor(mtcars)
corrplot(M, method = "number")