In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve, auc, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import random
import json
from datetime import datetime
import os

In [None]:
# XGBoost 分类模型 - 优化版本
# 保存每个fold的详细结果，包括准确率、混淆矩阵、F1等指标
# 最终绘制学习曲线、混淆矩阵、ROC曲线和SHAP分析

In [None]:
# Seed the random number generators
def set_seed(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Args:
        seed: Integer seed handed to both generators (default 42).
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Load the dataset from CSV
def load_data_from_csv(csv_path):
    """Load a labelled feature table from a CSV file.

    An optional ``filename`` column is dropped, the ``label`` column becomes
    the target vector, and every remaining column is treated as a feature.
    A short summary (sample count, feature count, class balance) is printed.

    Args:
        csv_path: Path of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        Tuple ``(X, y, feature_names)``: the feature values as a NumPy array,
        the label values as a NumPy array, and the feature column names as a
        list in column order.

    Raises:
        ValueError: If the CSV has no ``label`` column.
    """
    df = pd.read_csv(csv_path)

    print(f"Loaded data from: {csv_path}")
    print(f"Total samples: {len(df)}")

    # The filename column is dropped so it is not treated as a feature
    if 'filename' in df.columns:
        df = df.drop('filename', axis=1)

    if 'label' not in df.columns:
        raise ValueError("'label' column not found in CSV file")

    # Split labels from features; the feature frame is built once and reused
    y = df['label'].values
    features = df.drop('label', axis=1)
    X = features.values
    feature_names = features.columns.tolist()

    # Class balance; guard the percentages so a header-only CSV reports 0.00%
    # instead of dividing by zero
    n_total = len(y)
    n_pos = int(np.sum(y == 1))
    n_neg = int(np.sum(y == 0))
    pct_pos = n_pos / n_total * 100 if n_total else 0.0
    pct_neg = n_neg / n_total * 100 if n_total else 0.0

    print(f"Features: {len(feature_names)}")
    print(f"Positive samples: {n_pos} ({pct_pos:.2f}%)")
    print(f"Negative samples: {n_neg} ({pct_neg:.2f}%)")

    return X, y, feature_names

# Seed Python's and NumPy's global RNGs with 42 as soon as the helpers are defined
set_seed(42)

In [None]:
# XGBoost training and prediction helpers
def train_xgboost_optimized(X_train, y_train, X_val=None, y_val=None, seed=51,
                            num_boost_round=100):
    """Train an XGBoost binary classifier with the notebook's tuned parameters.

    When both ``X_val`` and ``y_val`` are given, the training and validation
    sets are passed to ``xgb.train`` as evaluation sets named ``'train'`` and
    ``'val'`` so that the log loss of every boosting round is recorded.

    Args:
        X_train: Training features, passed to ``xgb.DMatrix``.
        y_train: Training labels.
        X_val: Optional validation features.
        y_val: Optional validation labels; the validation set is used only
            when ``X_val`` is provided as well.
        seed: Value stored under the ``'seed'`` key of the parameter dict.
        num_boost_round: Number of boosting rounds. Defaults to 100, the value
            that used to be hard-coded.

    Returns:
        Tuple ``(model, evals_result)``. ``evals_result`` is the dict filled
        in by ``xgb.train`` when a validation set was supplied, otherwise
        ``None``.
    """
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 9,
        'learning_rate': 0.042,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'scale_pos_weight': 2.23,
        'min_child_weight': 1,
        'gamma': 0.2,
        'seed': seed
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Options shared by both code paths; the evaluation sets are added only
    # when a complete validation set is available
    train_kwargs = {'num_boost_round': num_boost_round, 'verbose_eval': False}
    evals_result = None
    if X_val is not None and y_val is not None:
        dval = xgb.DMatrix(X_val, label=y_val)
        evals_result = {}
        train_kwargs['evals'] = [(dtrain, 'train'), (dval, 'val')]
        train_kwargs['evals_result'] = evals_result

    model = xgb.train(params, dtrain, **train_kwargs)
    return model, evals_result

def predict_xgboost(model, X_test, threshold=0.5):
    """Predict class labels and scores with a trained XGBoost model.

    Args:
        model: Trained model exposing ``predict(DMatrix)``.
        X_test: Features to score, wrapped in an ``xgb.DMatrix``.
        threshold: Scores greater than or equal to this value are labelled 1.
            Defaults to 0.5, the cut-off that used to be hard-coded.

    Returns:
        Tuple ``(y_pred, y_pred_proba)``: the integer labels obtained by
        thresholding, and the raw output of ``model.predict``.
    """
    dtest = xgb.DMatrix(X_test)
    y_pred_proba = model.predict(dtest)
    y_pred = (y_pred_proba >= threshold).astype(int)
    return y_pred, y_pred_proba

In [None]:
# Cross-validation settings: split seed, model seed, number of folds
SPLIT_SEED, XGBOOST_MODEL_SEED, N_SPLITS = 43, 51, 5

# Load features, labels and feature names
csv_path = 'data/dataset/features_final_38_new.csv'
X, y, feature_names = load_data_from_csv(csv_path)

# Echo the run configuration between two 70-character rules
banner = '=' * 70
print("\n".join([
    f"\n{banner}",
    "XGBoost 交叉验证配置",
    f"数据集: {csv_path}",
    f"特征数量: {len(feature_names)}",
    f"数据集划分 Seed: {SPLIT_SEED}",
    f"XGBoost 模型 Seed: {XGBOOST_MODEL_SEED}",
    f"Fold数量: {N_SPLITS}",
    f"{banner}\n",
]))

In [None]:
# Stratified K-fold cross-validation, keeping detailed results for every fold
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SPLIT_SEED)

# Per-fold artefacts collected for the reporting cells
fold_results = []
all_models = []
all_learning_curves = []  # evals_result dict of every fold
all_X_train, all_y_train = [], []
all_X_test, all_y_test = [], []

print("开始交叉验证...")
print("-" * 70)

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    print(f"\nFold {fold+1}/{N_SPLITS}")

    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Keep the splits for the SHAP analysis
    for store, split in ((all_X_train, X_train), (all_y_train, y_train),
                         (all_X_test, X_test), (all_y_test, y_test)):
        store.append(split)

    # The test fold is also passed as the validation set so the learning curve
    # gets recorded; the amount of training data is unaffected
    model, evals_result = train_xgboost_optimized(
        X_train, y_train, X_test, y_test, XGBOOST_MODEL_SEED)
    all_models.append(model)
    all_learning_curves.append(evals_result)

    # Hard labels and scores on the held-out fold
    y_pred, y_pred_proba = predict_xgboost(model, X_test)

    # Headline metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    # Confusion-matrix cells
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    # Class 0 metrics; each ratio falls back to 0 when its denominator is 0
    predicted_neg, actual_neg = tn + fn, tn + fp
    precision_0 = tn / predicted_neg if predicted_neg > 0 else 0
    recall_0 = tn / actual_neg if actual_neg > 0 else 0
    pr_sum_0 = precision_0 + recall_0
    f1_0 = 2 * (precision_0 * recall_0) / pr_sum_0 if pr_sum_0 > 0 else 0

    # Class 1 metrics, same convention
    predicted_pos, actual_pos = tp + fp, tp + fn
    precision_1 = tp / predicted_pos if predicted_pos > 0 else 0
    recall_1 = tp / actual_pos if actual_pos > 0 else 0
    pr_sum_1 = precision_1 + recall_1
    f1_1 = 2 * (precision_1 * recall_1) / pr_sum_1 if pr_sum_1 > 0 else 0

    # ROC curve points for this fold
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

    # Store everything as plain Python types so it can be dumped to JSON
    fold_result = {
        'fold': fold + 1,
        'accuracy': float(accuracy),
        'auc': float(auc_score),
        'confusion_matrix': dict(tn=int(tn), fp=int(fp), fn=int(fn), tp=int(tp)),
        'class_0': dict(precision=float(precision_0), recall=float(recall_0), f1=float(f1_0)),
        'class_1': dict(precision=float(precision_1), recall=float(recall_1), f1=float(f1_1)),
        'roc_curve': dict(fpr=fpr.tolist(), tpr=tpr.tolist(), thresholds=thresholds.tolist()),
    }
    fold_results.append(fold_result)

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  AUC: {auc_score:.4f}")
    print(f"  Class 0 - Precision: {precision_0:.4f}, Recall: {recall_0:.4f}, F1: {f1_0:.4f}")
    print(f"  Class 1 - Precision: {precision_1:.4f}, Recall: {recall_1:.4f}, F1: {f1_1:.4f}")

print(f"\n{'='*70}")
print("交叉验证完成")
print(f"{'='*70}\n")

In [None]:
# Average the per-fold metrics (std is reported for accuracy and AUC only)
def _fold_values(*keys):
    """Collect one metric from every fold by following ``keys`` into each fold's result dict."""
    collected = []
    for result in fold_results:
        node = result
        for key in keys:
            node = node[key]
        collected.append(node)
    return collected


accuracy_values = _fold_values('accuracy')
auc_values = _fold_values('auc')

avg_metrics = {
    'accuracy': np.mean(accuracy_values),
    'accuracy_std': np.std(accuracy_values),
    'auc': np.mean(auc_values),
    'auc_std': np.std(auc_values),
}
for class_key in ('class_0', 'class_1'):
    avg_metrics[class_key] = {
        metric: np.mean(_fold_values(class_key, metric))
        for metric in ('precision', 'recall', 'f1')
    }

print("平均指标:")
print("-" * 70)
print(f"Accuracy: {avg_metrics['accuracy']:.4f} ± {avg_metrics['accuracy_std']:.4f}")
print(f"AUC: {avg_metrics['auc']:.4f} ± {avg_metrics['auc_std']:.4f}")
for class_key, heading in (('class_0', "Class 0 (Negative)"), ('class_1', "Class 1 (Positive)")):
    class_avg = avg_metrics[class_key]
    print(f"\n{heading}:")
    print(f"  Precision: {class_avg['precision']:.4f}")
    print(f"  Recall: {class_avg['recall']:.4f}")
    print(f"  F1: {class_avg['f1']:.4f}")
print(f"{'='*70}\n")

In [None]:
# Gain-based feature importance, per fold and averaged over folds
print("计算特征重要性...")

# Scores are looked up under positional keys 'f0', 'f1', ...; a feature with
# no entry in a fold's score dict counts as 0 for that fold
fold_feature_importances = []
for booster in all_models:
    gain_by_key = booster.get_score(importance_type='gain')
    fold_feature_importances.append({
        fname: float(gain_by_key.get(f'f{col}', 0))
        for col, fname in enumerate(feature_names)
    })

# Mean importance of each feature across folds
avg_feature_importance = {
    fname: float(np.mean([per_fold[fname] for per_fold in fold_feature_importances]))
    for fname in feature_names
}

# Rank by mean importance, highest first, and keep the top 10
sorted_features = sorted(avg_feature_importance.items(), key=lambda item: item[1], reverse=True)
top_10_features = sorted_features[:10]

print("\nTop 10 最重要的特征:")
print("-" * 70)
for rank, (feat, imp) in enumerate(top_10_features, 1):
    print(f"{rank:2d}. {feat:<40} {imp:.2f}")
print(f"{'='*70}\n")

In [None]:
# Write the results into a timestamped run directory
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = f'training_results/run_xgboost_{timestamp}'
os.makedirs(save_dir, exist_ok=True)

# Complete record of the run
results_data = {
    'timestamp': timestamp,
    'dataset': csv_path,
    'n_features': len(feature_names),
    'feature_names': feature_names,
    'config': dict(split_seed=SPLIT_SEED, model_seed=XGBOOST_MODEL_SEED, n_splits=N_SPLITS),
    'fold_results': fold_results,
    # Top-level averages are converted to float; the per-class dicts are kept as they are
    'average_metrics': {
        name: (value if isinstance(value, dict) else float(value))
        for name, value in avg_metrics.items()
    },
    'feature_importance': avg_feature_importance,
    'top_10_features': {feat: float(imp) for feat, imp in top_10_features},
}

# (file name, payload, extra json.dump options) for each output file
json_outputs = (
    ('complete_results.json', results_data, {'ensure_ascii': False}),
    ('average_metrics.json', avg_metrics, {}),
    ('feature_importance.json', avg_feature_importance, {}),
)
for out_name, payload, dump_options in json_outputs:
    with open(f'{save_dir}/{out_name}', 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4, **dump_options)

print(f"结果已保存到: {save_dir}")
print(f"  - complete_results.json: 完整结果")
print(f"  - average_metrics.json: 平均指标")
print(f"  - feature_importance.json: 特征重要性")
print(f"{'='*70}\n")

In [None]:
# Export every fold's learning curve to its own CSV file
print("保存学习曲线数据到CSV...")

for fold_no, evals_result in enumerate(all_learning_curves, start=1):
    train_curve = evals_result['train']['logloss']
    val_curve = evals_result['val']['logloss']

    # One row per recorded log-loss value, numbered from 1
    learning_curve_df = pd.DataFrame({
        'iteration': range(1, len(train_curve) + 1),
        'train_logloss': train_curve,
        'val_logloss': val_curve,
    })

    csv_filename = f'{save_dir}/learning_curve_fold_{fold_no}.csv'
    learning_curve_df.to_csv(csv_filename, index=False)
    print(f"  - Fold {fold_no} 学习曲线已保存: {csv_filename}")

print(f"{'='*70}\n")

In [None]:
# Export every fold's ROC curve points to its own CSV file
print("保存ROC曲线数据到CSV...")

for fold_no, fold_result in enumerate(fold_results, start=1):
    roc_data = fold_result['roc_curve']

    # Columns in fixed order: fpr, tpr, thresholds
    roc_df = pd.DataFrame({
        column: roc_data[column] for column in ('fpr', 'tpr', 'thresholds')
    })

    csv_filename = f'{save_dir}/roc_curve_fold_{fold_no}.csv'
    roc_df.to_csv(csv_filename, index=False)
    print(f"  - Fold {fold_no} ROC曲线已保存: {csv_filename}")

print(f"{'='*70}\n")

In [None]:
# Export the fold-averaged learning curve, with its ±1 std band, to CSV
print("保存平均学习曲线数据到CSV...")

# Stack the per-fold curves: one row per fold, one column per iteration
train_logloss_all = np.array([curves['train']['logloss'] for curves in all_learning_curves])
val_logloss_all = np.array([curves['val']['logloss'] for curves in all_learning_curves])

# Mean and standard deviation across folds
mean_train_logloss = train_logloss_all.mean(axis=0)
std_train_logloss = train_logloss_all.std(axis=0)
mean_val_logloss = val_logloss_all.mean(axis=0)
std_val_logloss = val_logloss_all.std(axis=0)

# The upper/lower columns are mean ± 1 std
mean_learning_curve_df = pd.DataFrame({
    'iteration': range(1, len(mean_train_logloss) + 1),
    'mean_train_logloss': mean_train_logloss,
    'mean_val_logloss': mean_val_logloss,
    'std_train_logloss': std_train_logloss,
    'std_val_logloss': std_val_logloss,
    'train_upper': mean_train_logloss + std_train_logloss,
    'train_lower': mean_train_logloss - std_train_logloss,
    'val_upper': mean_val_logloss + std_val_logloss,
    'val_lower': mean_val_logloss - std_val_logloss,
})

csv_filename = f'{save_dir}/mean_learning_curve.csv'
mean_learning_curve_df.to_csv(csv_filename, index=False)
print(f"  - 平均学习曲线已保存: {csv_filename}")
print(f"{'='*70}\n")

In [None]:
# Export the fold-averaged ROC curve, with its ±1 std band, to CSV
print("保存平均ROC曲线数据到CSV...")

# Interpolate every fold's curve onto one shared grid of 100 FPR points
mean_fpr = np.linspace(0, 1, 100)
tprs = []

for fold_result in fold_results:
    fpr = np.asarray(fold_result['roc_curve']['fpr'])
    tpr = np.asarray(fold_result['roc_curve']['tpr'])
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0  # pin the first point of each curve to TPR 0
    tprs.append(interp_tpr)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the last point of the mean curve to TPR 1
std_tpr = np.std(tprs, axis=0)

# ±1 std band, kept inside [0, 1]
tpr_upper = np.minimum(1, mean_tpr + std_tpr)
tpr_lower = np.maximum(0, mean_tpr - std_tpr)

# AUC of the mean curve; the spread is the std of the per-fold AUCs
mean_auc_value = auc(mean_fpr, mean_tpr)
std_auc_value = np.std([fold_result['auc'] for fold_result in fold_results])

mean_roc_df = pd.DataFrame({
    'fpr': mean_fpr,
    'mean_tpr': mean_tpr,
    'std_tpr': std_tpr,
    'tpr_upper': tpr_upper,
    'tpr_lower': tpr_lower,
})

csv_filename = f'{save_dir}/mean_roc_curve.csv'
mean_roc_df.to_csv(csv_filename, index=False)
print(f"  - 平均ROC曲线已保存: {csv_filename}")
print(f"  - Mean AUC: {mean_auc_value:.4f} ± {std_auc_value:.4f}")
print(f"{'='*70}\n")

In [None]:
# Plot the learning curves (log loss): one panel per fold plus an average panel
print("绘制学习曲线...")

# Size the grid from the number of stored folds (+1 panel for the average)
# instead of assuming exactly 5 folds; 5 folds still gives the 2x3, 18x12 layout
n_folds = len(all_learning_curves)
n_panels = n_folds + 1
n_cols = min(3, n_panels)
n_rows = -(-n_panels // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for fold_idx, evals_result in enumerate(all_learning_curves):
    ax = axes[fold_idx]

    iterations = range(1, len(evals_result['train']['logloss']) + 1)
    train_logloss = evals_result['train']['logloss']
    val_logloss = evals_result['val']['logloss']

    ax.plot(iterations, train_logloss, label='Train', color='#1f77b4', linewidth=2)
    ax.plot(iterations, val_logloss, label='Validation', color='#ff7f0e', linewidth=2)

    ax.set_xlabel('Iteration', fontsize=11)
    ax.set_ylabel('Log Loss', fontsize=11)
    ax.set_title(f'Fold {fold_idx+1} - Learning Curve', fontsize=12, fontweight='bold')
    ax.legend(loc='upper right', fontsize=10)
    ax.grid(True, alpha=0.3)

# The average panel goes right after the last fold's panel
ax = axes[n_folds]

# x-axis for the average panel, taken from the longest per-fold curve
max_iterations = max(len(evals_result['train']['logloss']) for evals_result in all_learning_curves)
iterations = range(1, max_iterations + 1)

# Mean and standard deviation across folds at every iteration
train_logloss_all = np.array([evals_result['train']['logloss'] for evals_result in all_learning_curves])
val_logloss_all = np.array([evals_result['val']['logloss'] for evals_result in all_learning_curves])

mean_train_logloss = np.mean(train_logloss_all, axis=0)
mean_val_logloss = np.mean(val_logloss_all, axis=0)
std_train_logloss = np.std(train_logloss_all, axis=0)
std_val_logloss = np.std(val_logloss_all, axis=0)

ax.plot(iterations, mean_train_logloss, label='Mean Train', color='#1f77b4', linewidth=3)
ax.plot(iterations, mean_val_logloss, label='Mean Validation', color='#ff7f0e', linewidth=3)

# Shade the ±1 std region around each mean curve
ax.fill_between(iterations,
                mean_train_logloss - std_train_logloss,
                mean_train_logloss + std_train_logloss,
                alpha=0.2, color='#1f77b4', label='Train ± 1 std')
ax.fill_between(iterations,
                mean_val_logloss - std_val_logloss,
                mean_val_logloss + std_val_logloss,
                alpha=0.2, color='#ff7f0e', label='Val ± 1 std')

ax.set_xlabel('Iteration', fontsize=11)
ax.set_ylabel('Log Loss', fontsize=11)
ax.set_title('Average Learning Curve', fontsize=12, fontweight='bold')
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)

# Hide grid cells left over when the panels do not fill the last row
for spare_ax in axes[n_panels:]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.savefig(f'{save_dir}/learning_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"学习曲线已保存: {save_dir}/learning_curves.png\n")

In [None]:
# Plot the confusion matrices: one heatmap per fold plus the fold average
print("绘制混淆矩阵...")


def _cm_array(result):
    """Rebuild the 2x2 matrix [[tn, fp], [fn, tp]] from a fold's stored counts."""
    counts = result['confusion_matrix']
    return np.array([[counts['tn'], counts['fp']],
                     [counts['fn'], counts['tp']]])


# Size the grid from the number of stored folds (+1 panel for the average)
# instead of assuming exactly 5 folds; 5 folds still gives the 2x3, 18x12 layout
n_folds = len(fold_results)
n_panels = n_folds + 1
n_cols = min(3, n_panels)
n_rows = -(-n_panels // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for fold_idx, fold_result in enumerate(fold_results):
    cm = _cm_array(fold_result)

    ax = axes[fold_idx]
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                cbar_kws={'label': 'Count'}, annot_kws={'fontsize': 12})
    ax.set_xlabel('Predicted Label', fontsize=11)
    ax.set_ylabel('True Label', fontsize=11)
    ax.set_title(f"Fold {fold_idx+1} - Acc: {fold_result['accuracy']:.4f}",
                 fontsize=12, fontweight='bold')
    ax.set_xticklabels(['Negative', 'Positive'])
    ax.set_yticklabels(['Negative', 'Positive'])

# Average matrix: sum the per-fold counts, then divide by the number of folds
# actually stored
avg_cm = np.zeros((2, 2))
for fold_result in fold_results:
    avg_cm += _cm_array(fold_result)
avg_cm = avg_cm / n_folds

# The average panel goes right after the last fold's panel
ax = axes[n_folds]
sns.heatmap(avg_cm, annot=True, fmt='.1f', cmap='Greens', ax=ax,
            cbar_kws={'label': 'Average Count'}, annot_kws={'fontsize': 12})
ax.set_xlabel('Predicted Label', fontsize=11)
ax.set_ylabel('True Label', fontsize=11)
ax.set_title(f"Average - Acc: {avg_metrics['accuracy']:.4f}",
             fontsize=12, fontweight='bold')
ax.set_xticklabels(['Negative', 'Positive'])
ax.set_yticklabels(['Negative', 'Positive'])

# Hide grid cells left over when the panels do not fill the last row
for spare_ax in axes[n_panels:]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.savefig(f'{save_dir}/confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"混淆矩阵已保存: {save_dir}/confusion_matrices.png\n")

In [None]:
# Plot the ROC curves: every fold, the interpolated mean curve and its ±1 std band
print("绘制ROC曲线...")

plt.figure(figsize=(10, 8))

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

# One curve per fold
for fold_idx, fold_result in enumerate(fold_results):
    fpr = np.array(fold_result['roc_curve']['fpr'])
    tpr = np.array(fold_result['roc_curve']['tpr'])
    roc_auc = fold_result['auc']

    # Cycle through the palette so a run with more folds than colours does
    # not raise IndexError
    plt.plot(fpr, tpr, color=colors[fold_idx % len(colors)], lw=2, alpha=0.8,
             label=f"Fold {fold_idx+1} (AUC = {roc_auc:.4f})")

# Mean ROC curve: interpolate every fold onto one shared grid of 100 FPR points
mean_fpr = np.linspace(0, 1, 100)
tprs = []

for fold_result in fold_results:
    fpr = np.array(fold_result['roc_curve']['fpr'])
    tpr = np.array(fold_result['roc_curve']['tpr'])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0  # pin the first point of each curve to TPR 0

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the last point of the mean curve to TPR 1
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std([f['auc'] for f in fold_results])

plt.plot(mean_fpr, mean_tpr, color='navy', lw=3, linestyle='--',
         label=f'Mean ROC (AUC = {mean_auc:.4f} ± {std_auc:.4f})')

# Shade the ±1 std region, kept inside [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2,
                 label='± 1 std. dev.')

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random (AUC = 0.5)')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=13)
plt.ylabel('True Positive Rate', fontsize=13)
plt.title('ROC Curves - XGBoost Cross Validation', fontsize=15, fontweight='bold')
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, alpha=0.3)

plt.savefig(f'{save_dir}/roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"ROC曲线已保存: {save_dir}/roc_curves.png\n")

In [None]:
# SHAP analysis using the last fold's model and data
print("进行SHAP分析...")
print("注意: SHAP分析可能需要较长时间...")

# Model and splits stored for the final fold
model, X_train_shap, X_test_shap = all_models[-1], all_X_train[-1], all_X_test[-1]

# Build the tree explainer, then compute SHAP values on that fold's test split
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test_shap)

print("SHAP分析完成\n")

In [None]:
# SHAP summary plot restricted to the top-10 features
print("绘制SHAP Summary Plot (Top 10 特征)...")

# Column positions of the top-10 features within the feature matrix
top_10_feature_names = [feat for feat, _ in top_10_features]
top_10_indices = [feature_names.index(fname) for fname in top_10_feature_names]

# Slice the SHAP values and the test features down to those columns
shap_values_top10 = shap_values[:, top_10_indices]
X_test_top10 = X_test_shap[:, top_10_indices]

# A DataFrame carries the feature names into the plot
X_test_top10_df = pd.DataFrame(X_test_top10, columns=top_10_feature_names)

plt.figure(figsize=(12, 8))
# plot_size=None leaves the 12x8 figure created above untouched; the default
# "auto" would resize the current figure and discard that figsize
shap.summary_plot(shap_values_top10, X_test_top10_df,
                  plot_type="dot", show=False,
                  max_display=10, plot_size=None)
plt.title('SHAP Summary Plot - Top 10 Features', fontsize=15, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig(f'{save_dir}/shap_summary_top10.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"SHAP Summary Plot已保存: {save_dir}/shap_summary_top10.png")
print(f"\n{'='*70}")
print("所有分析完成！")
print(f"{'='*70}")
print(f"\n结果保存位置: {save_dir}")
# List every artefact the notebook writes into save_dir, including the
# learning-curve figure and the CSV exports
print("包含文件:")
print("  1. complete_results.json - 完整的训练结果")
print("  2. average_metrics.json - 平均指标")
print("  3. feature_importance.json - 特征重要性")
print("  4. learning_curve_fold_*.csv, mean_learning_curve.csv - 学习曲线数据")
print("  5. roc_curve_fold_*.csv, mean_roc_curve.csv - ROC曲线数据")
print("  6. learning_curves.png - 学习曲线")
print("  7. confusion_matrices.png - 混淆矩阵可视化")
print("  8. roc_curves.png - ROC曲线")
print("  9. shap_summary_top10.png - SHAP分析（Top 10特征）")
print(f"{'='*70}\n")