In [23]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive

# 挂载 Google 硬盘
drive.mount('/content/drive')

# 读取数据集
file_path = '/content/drive/MyDrive/Colab Notebooks/Processed_Dataset_of_Diabetes_Version3.csv'
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    print("错误：未找到数据集文件，请检查文件路径。")
else:
    # 划分特征和标签
    X = data.drop('CLASS', axis=1)
    y = data['CLASS']

    # 特征选择（这里简单假设保留重要特征，可根据实际情况调整）
    important_features = ['Urea', 'Cr', 'HbA1c', 'BMI']  # 可根据特征重要性分析调整
    X = X[important_features]

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 创建 XGBoost 分类器，调整参数
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        eval_metric=['merror', 'mlogloss'],
        n_estimators=100,  # 减少树的数量
        max_depth=3,  # 减少树的深度
        learning_rate=0.2,  # 增大学习率
        reg_alpha=0.01,  # 减少 L1 正则化强度
        reg_lambda=0.01  # 减少 L2 正则化强度
    )

    results = {
        'train': {'merror': [], 'mlogloss': []},
        'test': {'merror': [], 'mlogloss': []}
    }
    eval_set = [(X_train, y_train), (X_test, y_test)]

    for i in range(model.n_estimators):
        model.fit(X_train, y_train, xgb_model=model.get_booster() if i > 0 else None)

        # 计算训练集指标
        y_pred_train = model.predict(X_train)
        train_error = 1 - accuracy_score(y_train, y_pred_train)
        train_log_loss = -np.log(model.predict_proba(X_train))[np.arange(len(y_train)), y_train].mean()
        results['train']['merror'].append(train_error)
        results['train']['mlogloss'].append(train_log_loss)

        # 计算测试集指标
        y_pred_test = model.predict(X_test)
        test_error = 1 - accuracy_score(y_test, y_pred_test)
        test_log_loss = -np.log(model.predict_proba(X_test))[np.arange(len(y_test)), y_test].mean()
        results['test']['merror'].append(test_error)
        results['test']['mlogloss'].append(test_log_loss)

    # 进行预测
    y_pred = model.predict(X_test)

    # 评估模型
    accuracy = accuracy_score(y_test, y_pred)
    print(f"模型准确率: {accuracy:.2f}")

    # 分类报告
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))

    # 混淆矩阵
    print("\n混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    # 特征重要性
    feature_importance = model.feature_importances_
    feature_names = X.columns
    sorted_idx = np.argsort(feature_importance)
    print("\n特征重要性:")
    for index in sorted_idx:
        print(f"{feature_names[index]}: {feature_importance[index]:.4f}")

    # AUC - ROC 曲线
    y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
    y_score = model.predict_proba(X_test)
    auc_scores = []
    for i in range(3):
        auc = roc_auc_score(y_test_bin[:, i], y_score[:, i])
        auc_scores.append(auc)
        print(f"类别 {i} 的 AUC 值: {auc:.2f}")

    # 交叉验证结果
    cv_scores = cross_val_score(model, X, y, cv=5)
    print(f"\n交叉验证的平均准确率: {cv_scores.mean():.2f}，标准差: {cv_scores.std():.2f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
模型准确率: 0.96

分类报告:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       173
           1       0.85      0.81      0.83        21
           2       1.00      1.00      1.00         6

    accuracy                           0.96       200
   macro avg       0.94      0.93      0.94       200
weighted avg       0.96      0.96      0.96       200


混淆矩阵:
[[170   3   0]
 [  4  17   0]
 [  0   0   6]]

特征重要性:
Cr: 0.0101
Urea: 0.0238
HbA1c: 0.2451
BMI: 0.7209
类别 0 的 AUC 值: 1.00
类别 1 的 AUC 值: 0.99
类别 2 的 AUC 值: 1.00

交叉验证的平均准确率: 0.94，标准差: 0.06
