# 电力系统稳定性分类模型

相比于train.ipynb多了最后的 12. 动态权重集成模型 训练数据也分为了train_1.csv和train_2.csv


## 1. 导入所需的库

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, 
                           roc_auc_score, roc_curve, precision_recall_curve, 
                           average_precision_score, f1_score)
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
import optuna
import time
import warnings
import os
import joblib

# 忽略警告
warnings.filterwarnings('ignore')

## 2. 设置输出目录与全局参数

In [None]:
# 基础目录设置
base_dir = '/data/jinming/ee_stable'



# 创建最终结果比较和SHAP分析的目录
comparison_dir = f'{base_dir}/comparison'
shap_dir = f'{base_dir}/shap'
os.makedirs(comparison_dir, exist_ok=True)
os.makedirs(shap_dir, exist_ok=True)
print(f"创建了最终比较结果目录: {comparison_dir}")
print(f"创建了SHAP分析目录: {shap_dir}")



# 设置统一的超参数优化trials数量和并行处理线程数
n_trials_optuna = 100  # 可以根据计算资源和时间调整
n_proc = 16          # 并行处理的线程数(CPU)
print(f"设置统一的Optuna超参数优化trials数量: {n_trials_optuna}")
print(f"设置统一的并行处理线程数: {n_proc}")

# 创建各个模型的输出目录
models = ['catboost', 'lightgbm', 'xgboost']
for model in models:
    model_dir = f'{base_dir}/{model}'
    plots_dir = f'{model_dir}/plots'
    models_dir = f'{model_dir}/models'
    results_dir = f'{model_dir}/results'
    
    for directory in [plots_dir, models_dir, results_dir]:
        os.makedirs(directory, exist_ok=True)
        
    print(f"为 {model} 创建了输出目录")

## 3. 数据加载与准备

In [None]:
# 加载数据
print("===== 加载数据 =====")
train_df = pd.read_csv(f'{base_dir}/data/train_1.csv')
test_df = pd.read_csv(f'{base_dir}/data/test.csv')
val_df = pd.read_csv(f'{base_dir}/data/val.csv')

# 数据准备
print("===== 准备数据 =====")
# 排除不需要的列
# drop_columns = ['stab', 'stabf_encoded', 'stabf', 'p1', 'p2', 'p3', 'p4']
drop_columns = ['stab', 'stabf_encoded', 'stabf']

X_train = train_df.drop(drop_columns, axis=1)
X_test = test_df.drop(drop_columns, axis=1)
X_val = val_df.drop(drop_columns, axis=1)

y_train = train_df['stabf_encoded']
y_test = test_df['stabf_encoded']
y_val = val_df['stabf_encoded']

print(f"数据集维度 - 训练集: {X_train.shape}, 验证集: {X_val.shape}, 测试集: {X_test.shape}")

## 4. 特征工程

In [None]:
def create_features(X_train, X_test, X_val):
    """为所有数据集创建特征"""
    # 深拷贝，避免修改原始数据
    X_train_new = X_train.copy()
    X_test_new = X_test.copy()
    X_val_new = X_val.copy()
    
    # 为所有数据集应用特征工程
    for df in [X_train_new, X_test_new, X_val_new]:
        # 基本交互特征
        df['tau1_g1'] = df['tau1'] * df['g1']
        df['tau2_g2'] = df['tau2'] * df['g2']
        df['tau3_g3'] = df['tau3'] * df['g3']
        df['tau4_g4'] = df['tau4'] * df['g4']
        
        # 延迟比率
        df['tau_ratio'] = df[['tau1', 'tau2', 'tau3', 'tau4']].max(axis=1) / df[['tau1', 'tau2', 'tau3', 'tau4']].min(axis=1).replace(0, 0.001)
        
        # 延迟-弹性比率：每个节点的响应灵敏度
        df['tau1_g1_ratio'] = df['tau1'] / df['g1'].replace(0, 0.001)
        df['tau2_g2_ratio'] = df['tau2'] / df['g2'].replace(0, 0.001)
        df['tau3_g3_ratio'] = df['tau3'] / df['g3'].replace(0, 0.001)
        df['tau4_g4_ratio'] = df['tau4'] / df['g4'].replace(0, 0.001)
        
        # 系统总体弹性
        df['total_elasticity'] = df['g1'] + df['g2'] + df['g3'] + df['g4']
        
        # 弹性分布的不均匀性
        df['elasticity_disparity'] = df[['g1', 'g2', 'g3', 'g4']].max(axis=1) / df[['g1', 'g2', 'g3', 'g4']].min(axis=1).replace(0, 0.001)
        
        # 非线性特征 - 二次项
        df['tau1_squared'] = df['tau1'] ** 2
        df['tau2_squared'] = df['tau2'] ** 2
        df['tau3_squared'] = df['tau3'] ** 2
        df['tau4_squared'] = df['tau4'] ** 2
        
        # 节点间关系特征
        df['tau_g_correlation'] = (
            (df['tau1'] * df['g1']) + 
            (df['tau2'] * df['g2']) + 
            (df['tau3'] * df['g3']) + 
            (df['tau4'] * df['g4'])
        ) / (df['tau1'] + df['tau2'] + df['tau3'] + df['tau4'] + 0.001)
        
        # 系统整体响应速度指标
        df['system_response_speed'] = 4 / (
            (1/df['tau1'].replace(0, 0.001)) + 
            (1/df['tau2'].replace(0, 0.001)) + 
            (1/df['tau3'].replace(0, 0.001)) + 
            (1/df['tau4'].replace(0, 0.001))
        )
    
    return X_train_new, X_test_new, X_val_new

# 应用特征工程
print("\n===== 执行特征工程 =====")
X_train_featured, X_test_featured, X_val_featured = create_features(X_train, X_test, X_val)
print(f"特征工程后特征数量: {X_train_featured.shape[1]}")

## 5. 特征选择

In [None]:
def select_features(X_train, X_test, X_val, y_train):
    """
    基于相关性和重要性的特征自动选择
    """
    print("\n===== 开始自动特征选择 =====")
    
    # 步骤1: 计算与目标变量的相关性
    print("步骤1: 计算每个特征与目标的相关性")
    
    # 特征与目标的相关性
    feature_target_corr = {}
    for col in X_train.columns:
        # 使用点二列相关系数计算相关性
        corr = abs(np.corrcoef(X_train[col], y_train)[0, 1])
        feature_target_corr[col] = corr
    
    feature_corr_df = pd.DataFrame({
        'Feature': list(feature_target_corr.keys()),
        'Target_Correlation': list(feature_target_corr.values())
    }).sort_values('Target_Correlation', ascending=False)
    
    # 可视化与目标的相关性
    plt.figure(figsize=(12, 10))
    sns.barplot(x='Target_Correlation', y='Feature', data=feature_corr_df.head(20))
    plt.title('Feature Correlation with Target')
    plt.tight_layout()
    plt.savefig(f'{base_dir}/catboost/plots/target_correlation.png')
    plt.close()
    
    print("Top 10 与目标高相关的特征:")
    print(feature_corr_df.head(10))
    
    # 步骤2: 智能去除高相关特征，保留与目标相关性更高的特征
    print("\n步骤2: 智能去除冗余特征")
    
    # 计算特征间相关性
    corr_matrix = X_train.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # 可视化相关性矩阵
    plt.figure(figsize=(16, 14))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, cmap='coolwarm', center=0, mask=mask,
                square=True, linewidths=.5, annot=False, fmt='.2f')
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig(f'{base_dir}/catboost/plots/feature_correlation.png')
    plt.close()
    
    # 高度相关的特征对
    correlation_threshold = 0.7
    to_drop = set()
    
    for i, row_name in enumerate(upper.index):
        for col_name in upper.columns[i:]:
            if upper.loc[row_name, col_name] > correlation_threshold:
                if feature_target_corr[row_name] > feature_target_corr[col_name]:
                    to_drop.add(col_name)
                else:
                    to_drop.add(row_name)
    
    print(f"移除 {len(to_drop)} 个高相关冗余特征:")
    print(", ".join(list(to_drop)))
    
    # 移除冗余特征
    X_train_filtered = X_train.drop(columns=list(to_drop))
    X_test_filtered = X_test.drop(columns=list(to_drop))
    X_val_filtered = X_val.drop(columns=list(to_drop))
    
    # 步骤3: 使用模型评估特征重要性
    print("\n步骤3: 基于模型特征重要性筛选")
    
    # 训练一个LightGBM模型用于特征重要性评估
    feature_selector = lgb.LGBMClassifier(
        objective='binary',
        metric='auc',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.1,
        n_estimators=100,
        random_state=42
    )
    
    feature_selector.fit(X_train_filtered, y_train)
    
    # 获取特征重要性
    importances = feature_selector.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': X_train_filtered.columns,
        'Importance': importances
    }).sort_values('Importance', ascending=False)
    
    # 可视化特征重要性
    plt.figure(figsize=(12, 10))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
    plt.title('LightGBM Feature Importance')
    plt.tight_layout()
    plt.savefig(f'{base_dir}/catboost/plots/feature_importance.png')
    plt.close()
    
    # 动态阈值设置
    mean_importance = feature_importance_df['Importance'].mean()
    importance_threshold = mean_importance * 0.5  # 使用平均值的50%作为阈值
    print(f"动态阈值: {importance_threshold:.2f} (平均重要性的50%)")
    selected_features = feature_importance_df[feature_importance_df['Importance'] > importance_threshold]['Feature'].tolist()
    
    # 如果筛选的特征太少，保留至少10个最重要的特征
    if len(selected_features) < 10:
        selected_features = feature_importance_df.head(10)['Feature'].tolist()
    
    print(f"\n最终选择了 {len(selected_features)}/{X_train.shape[1]} 个特征")
    print(f"选择的特征: {', '.join(selected_features)}")
    
    return X_train[selected_features], X_test[selected_features], X_val[selected_features], selected_features

def select_features_manual(X_train, X_test, X_val):
    """手动选择指定的特征集"""
    print("\n===== 使用手动指定的特征 =====")
    
   # 指定要保留的特征
    selected_features = [
        # 原始tau特征
        'tau1', 'tau2', 'tau3', 'tau4',
        
        # 原始g特征
        'g1', 'g2', 'g3', 'g4',
        
        # tau与g的交互项
        'tau1_g1', 'tau2_g2', 'tau3_g3', 'tau4_g4',
        
        # tau的比率特征
        'tau_ratio'
    ]



    # 验证所有指定的特征都存在
    missing_features = [f for f in selected_features if f not in X_train.columns]
    if missing_features:
        print(f"警告: 以下指定特征不存在: {', '.join(missing_features)}")
        # 过滤掉不存在的特征
        selected_features = [f for f in selected_features if f in X_train.columns]
    
    print(f"使用指定的 {len(selected_features)} 个特征:")
    print(f"选择的特征: {', '.join(selected_features)}")
    
    return X_train[selected_features], X_test[selected_features], X_val[selected_features], selected_features

# 可以选择自动特征选择或手动特征选择

# 使用自动特征选择: 根据feature_correlation 和 LightGBM 输出的 Feature Importance 来选择特征
# X_train_final, X_test_final, X_val_final, selected_features = select_features(
#     X_train_featured, X_test_featured, X_val_featured, y_train)

# 使用手动特征选择：自己根据上面的特征工程和分析选择了几个觉得更好的特征
X_train_final, X_test_final, X_val_final, selected_features = select_features_manual(
   X_train_featured, X_test_featured, X_val_featured)

print(f"特征选择后特征数量: {X_train_final.shape[1]}")

## 6. 模型训练和评估函数

In [None]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, model_name, is_xgb=False):
    """评估模型性能并生成可视化结果"""
    # 准备评估报告的字典
    results = {}
    
    # 如果是XGBoost模型，需要转换为DMatrix格式
    if is_xgb:
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        dtest = xgb.DMatrix(X_test, label=y_test)
        
        # 预测概率
        y_val_prob = model.predict(dval)
        y_test_prob = model.predict(dtest)
        
        # 预测类别
        y_val_pred = (y_val_prob > 0.5).astype(int)
        y_test_pred = (y_test_prob > 0.5).astype(int)
    else:
        # 预测概率
        y_val_prob = model.predict_proba(X_val)[:, 1]
        y_test_prob = model.predict_proba(X_test)[:, 1]
        
        # 预测类别
        y_val_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)
    
    # 计算各种评估指标
    val_auc = roc_auc_score(y_val, y_val_prob)
    val_acc = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    val_f1_weighted = f1_score(y_val, y_val_pred, average='weighted')
    
    test_auc = roc_auc_score(y_test, y_test_prob)
    test_acc = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
    
    # 打印评估结果
    print(f"\n===== {model_name} 在验证集上的性能 =====")
    print(f"AUC: {val_auc:.4f}")
    print(f"Weighted F1 Score: {val_f1_weighted:.4f}")
    print(f"F1 Score: {val_f1:.4f}")
    print(f"Accuracy: {val_acc:.4f}")
    
    print(f"\n===== {model_name} 在测试集上的性能 =====")
    print(f"AUC: {test_auc:.4f}")
    print(f"Weighted F1 Score: {test_f1_weighted:.4f}")
    print(f"F1 Score: {test_f1:.4f}")
    print(f"Accuracy: {test_acc:.4f}")
    
    # 保存评估结果
    results = {
        'model_name': model_name,
        'validation': {
            'auc': val_auc,
            'accuracy': val_acc,
            'f1': val_f1,
            'f1_weighted': val_f1_weighted
        },
        'test': {
            'auc': test_auc,
            'accuracy': test_acc,
            'f1': test_f1,
            'f1_weighted': test_f1_weighted
        }
    }
    
    # 创建模型的基本目录
    model_dir = f'/data/jinming/ee_stable/{model_name.lower()}'
    plots_dir = f'{model_dir}/plots'
    results_dir = f'{model_dir}/results'
    
    # 绘制ROC曲线
    plt.figure(figsize=(12, 10))
    # 验证集
    fpr_val, tpr_val, _ = roc_curve(y_val, y_val_prob)
    plt.plot(fpr_val, tpr_val, label=f'Validation Set (AUC = {val_auc:.4f})')
    # 测试集
    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
    plt.plot(fpr_test, tpr_test, label=f'Test Set (AUC = {test_auc:.4f})')
    # 参考线
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{plots_dir}/{model_name.lower()}_roc_curves.png')
    plt.close()
    
    # 绘制PR曲线
    plt.figure(figsize=(12, 10))
    # 验证集
    prec_val, rec_val, _ = precision_recall_curve(y_val, y_val_prob)
    avg_prec_val = average_precision_score(y_val, y_val_prob)
    plt.plot(rec_val, prec_val, label=f'Validation Set (AP = {avg_prec_val:.4f})')
    # 测试集
    prec_test, rec_test, _ = precision_recall_curve(y_test, y_test_prob)
    avg_prec_test = average_precision_score(y_test, y_test_prob)
    plt.plot(rec_test, prec_test, label=f'Test Set (AP = {avg_prec_test:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'PR Curve - {model_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{plots_dir}/{model_name.lower()}_pr_curves.png')
    plt.close()
    
    # 保存性能结果
    results_df = pd.DataFrame({
        'Metric': ['AUC', 'Accuracy', 'F1 Score', 'Weighted F1 Score'],
        'Validation Set': [val_auc, val_acc, val_f1, val_f1_weighted],
        'Test Set': [test_auc, test_acc, test_f1, test_f1_weighted]
    })
    results_df.to_csv(f'{results_dir}/{model_name.lower()}_auc_performance.csv', index=False)
    
    return results

## 7. 使用CatBoost训练模型 (AUC优化)

In [None]:
# 设置CatBoost的输出目录
plots_dir = '/data/jinming/ee_stable/catboost/plots'
models_dir = '/data/jinming/ee_stable/catboost/models'
results_dir = '/data/jinming/ee_stable/catboost/results'

# 定义CatBoost的AUC优化目标函数
def objective_catboost_auc(trial):
    """遍式Optuna优化目标函数 - 使用验证集上的AUC作为评价指标"""
    # 定义CatBoost参数搜索空间
    params = {
        'loss_function': 'Logloss',  # 二分类问题
        'eval_metric': 'AUC',
        'verbose': 0,
        
        # 核心参数
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'iterations': 2000,          # 使用早停
        'depth': trial.suggest_int('depth', 4, 10),
        
        # 正则化参数
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10.0),
        
        # 其他参数
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 20),
        'rsm': trial.suggest_float('rsm', 0.1, 1.0),  # 列采样比例
        
        # CPU特定参数
        'task_type': 'CPU',
        'thread_count': n_proc,  # 使用全局设置的线程数
        
        'random_seed': 42
    }
    
    # 创建CatBoost模型
    model = cb.CatBoostClassifier(**params)
    
    # 在训练集上训练模型，使用验证集进行早停
    model.fit(
        X_train_final, y_train,
        eval_set=[(X_val_final, y_val)],
        early_stopping_rounds=50,
        verbose=False
    )
    
    # 在验证集上预测
    y_val_prob = model.predict_proba(X_val_final)[:, 1]
    
    # 计算AUC分数
    auc_score = roc_auc_score(y_val, y_val_prob)
    
    # 打印当前试验的结果
    print(f"Trial {trial.number}: AUC = {auc_score:.4f}")
    
    return auc_score

# 创廟Optuna study对象 - 优化方向是最大化AUC
print("\n===== 开始CatBoost模型的Optuna调参过程 (AUC) =====")
catboost_study_auc = optuna.create_study(direction='maximize', study_name='catboost_auc_optimization')

# 运行优化
print(f"开始运行 {n_trials_optuna} 次调参试验...")
start_time = time.time()
catboost_study_auc.optimize(objective_catboost_auc, n_trials=n_trials_optuna)
end_time = time.time()
print(f"调参完成! 耗时: {end_time - start_time:.2f}秒")

# 打印最佳参数和结果
print("\n===== 最佳参数 (AUC) =====")
print(f"最佳AUC分数: {catboost_study_auc.best_value:.4f}")
print("最佳参数组合:")
for key, value in catboost_study_auc.best_params.items():
    print(f"    {key}: {value}")

# 可视化调参过程
plt.figure(figsize=(12, 8))
optuna.visualization.matplotlib.plot_optimization_history(catboost_study_auc)
plt.title('CatBoost Optimization History - AUC')
plt.tight_layout()
plt.savefig(f'{plots_dir}/optuna_catboost_auc_history.png')
plt.close()

# 可视化超参数重要性
plt.figure(figsize=(12, 8))
optuna.visualization.matplotlib.plot_param_importances(catboost_study_auc)
plt.tight_layout()
plt.savefig(f'{plots_dir}/optuna_catboost_auc_param_importances.png')
plt.close()

# 使用最佳参数训练最终模型
print("\n===== 使用最佳参数训练最终CatBoost模型 =====")
best_params_catboost = catboost_study_auc.best_params.copy()
best_params_catboost['loss_function'] = 'Logloss'
best_params_catboost['eval_metric'] = 'AUC'
best_params_catboost['random_seed'] = 42
best_params_catboost['task_type'] = 'CPU'
best_params_catboost['thread_count'] = n_proc

final_catboost_model = cb.CatBoostClassifier(**best_params_catboost)

# 记录训练时间和迭代次数
start_time = time.time()
final_catboost_model.fit(
    X_train_final, y_train,
    eval_set=[(X_val_final, y_val)],
    early_stopping_rounds=100,
    verbose=50  # 每50次迭代显示一次
)
end_time = time.time()

# 计算平均每迭代训练时间
training_time = end_time - start_time
actual_iterations = final_catboost_model.get_best_iteration() + 1  # +1因为迭代从0开始
avg_time_per_iteration = training_time / actual_iterations

print(f"最终模型训练完成! 总耗时: {training_time:.2f}秒")
print(f"实际迭代次数: {actual_iterations}")
print(f"平均每迭代时间: {avg_time_per_iteration:.4f}秒")

# 保存训练时间指标
catboost_time_metrics = {
    'model': 'CatBoost',
    'total_training_time': training_time,
    'iterations': actual_iterations,
    'avg_time_per_iteration': avg_time_per_iteration
}

# 保存最佳模型
model_path = f'{models_dir}/catboost_auc.cbm'
final_catboost_model.save_model(model_path)
print(f"\n最佳模型已保存到: {model_path}")

# 保存所选特征列表
with open(f'{models_dir}/catboost_selected_features.txt', 'w') as f:
    f.write('\n'.join(selected_features))
print(f"特征列表已保存到: {models_dir}/catboost_selected_features.txt")

# 使用评估函数评估模型
catboost_results = evaluate_model(
    final_catboost_model, 
    X_train_final, y_train, 
    X_val_final, y_val, 
    X_test_final, y_test,
    "CatBoost"
)

# 将时间指标添加到结果中
catboost_results['time_metrics'] = catboost_time_metrics

## 8. 使用LightGBM训练模型 (AUC优化)

In [None]:
# 设置LightGBM的输出目录
plots_dir = '/data/jinming/ee_stable/lightgbm/plots'
models_dir = '/data/jinming/ee_stable/lightgbm/models'
results_dir = '/data/jinming/ee_stable/lightgbm/results'

# 定义LightGBM的AUC优化目标函数
def objective_lightgbm_auc(trial):
    """功Optuna优化目标函数 - 使用验证集上的AUC作为评价指标"""
    # 定义LightGBM参数搜索空间
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        
        # 核心参数
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 30, 1000),
        'num_leaves': trial.suggest_int('num_leaves', 10, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        
        # 正则化参数
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        
        # 其他参数
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 0.5),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 10.0, log=True),
        
        # CPU并行参数
        'n_jobs': n_proc,    # 使用全局设置的线程数

        'random_state': 42
    }
    
    # 创建LightGBM模型
    model = lgb.LGBMClassifier(**params)
    
    # 在训练集上训练模型，使用验证集进行早停
    model.fit(
        X_train_final, y_train,
        eval_set=[(X_val_final, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )
    
    # 在验证集上预测
    y_val_prob = model.predict_proba(X_val_final)[:, 1]
    
    # 计算AUC分数
    auc_score = roc_auc_score(y_val, y_val_prob)
    
    # 打印当前试验的结果
    print(f"Trial {trial.number}: AUC = {auc_score:.4f}")
    
    return auc_score

# 创建 Optuna study对象 - 优化方向是最大化AUC
print("\n===== 开始LightGBM模型的Optuna调参过程 (AUC) =====")
lightgbm_study_auc = optuna.create_study(direction='maximize', study_name='lightgbm_auc_optimization')

# 运行优化
print(f"开始运行 {n_trials_optuna} 次调参试验...")
start_time = time.time()
lightgbm_study_auc.optimize(objective_lightgbm_auc, n_trials=n_trials_optuna)
end_time = time.time()
print(f"调参完成! 耗时: {end_time - start_time:.2f}秒")

# 打印最佳参数和结果
print("\n===== 最佳参数 (AUC) =====")
print(f"最佳AUC分数: {lightgbm_study_auc.best_value:.4f}")
print("最佳参数组合:")
for key, value in lightgbm_study_auc.best_params.items():
    print(f"    {key}: {value}")

# 可视化调参过程
plt.figure(figsize=(12, 8))
optuna.visualization.matplotlib.plot_optimization_history(lightgbm_study_auc)
plt.title('LightGBM Optimization History - AUC')
plt.tight_layout()
plt.savefig(f'{plots_dir}/optuna_lightgbm_auc_history.png')
plt.close()

# 可视化超参数重要性
plt.figure(figsize=(12, 8))
optuna.visualization.matplotlib.plot_param_importances(lightgbm_study_auc)
plt.tight_layout()
plt.savefig(f'{plots_dir}/optuna_lightgbm_auc_param_importances.png')
plt.close()

# 使用最佳参数训练最终模型
print("\n===== 使用最佳参数训练最终LightGBM模型 =====")
best_params_lightgbm = lightgbm_study_auc.best_params.copy()
best_params_lightgbm['objective'] = 'binary'
best_params_lightgbm['metric'] = 'auc'
best_params_lightgbm['boosting_type'] = 'gbdt'
best_params_lightgbm['random_state'] = 42
best_params_lightgbm['verbosity'] = -1
best_params_lightgbm['n_jobs'] = n_proc

final_lightgbm_model = lgb.LGBMClassifier(**best_params_lightgbm)

# 记录训练时间和迭代次数
start_time = time.time()
eval_result = {}
final_lightgbm_model.fit(
    X_train_final, y_train,
    eval_set=[(X_val_final, y_val)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True),
               lgb.record_evaluation(eval_result)]
)
end_time = time.time()

# 计算平均每迭代训练时间
training_time = end_time - start_time
actual_iterations = final_lightgbm_model.best_iteration_ + 1  # +1因为从LightGBM的文档来看，迭代从1开始
avg_time_per_iteration = training_time / actual_iterations

print(f"最终模型训练完成! 总耗时: {training_time:.2f}秒")
print(f"实际迭代次数: {actual_iterations}")
print(f"平均每迭代时间: {avg_time_per_iteration:.4f}秒")

# 保存训练时间指标
lightgbm_time_metrics = {
    'model': 'LightGBM',
    'total_training_time': training_time,
    'iterations': actual_iterations,
    'avg_time_per_iteration': avg_time_per_iteration
}

# 保存最佳模型
model_path = f'{models_dir}/lightgbm_auc.pkl'
joblib.dump(final_lightgbm_model, model_path)
print(f"\n最佳模型已保存到: {model_path}")

# 保存所选特征列表
with open(f'{models_dir}/lightgbm_selected_features.txt', 'w') as f:
    f.write('\n'.join(selected_features))
print(f"特征列表已保存到: {models_dir}/lightgbm_selected_features.txt")

# 使用评估函数评估模型
lightgbm_results = evaluate_model(
    final_lightgbm_model, 
    X_train_final, y_train, 
    X_val_final, y_val, 
    X_test_final, y_test,
    "LightGBM"
)

# 将时间指标添加到结果中
lightgbm_results['time_metrics'] = lightgbm_time_metrics

## 9. 使用XGBoost训练模型 (AUC优化)

In [None]:
# 设置XGBoost的输出目录
plots_dir = '/data/jinming/ee_stable/xgboost/plots'
models_dir = '/data/jinming/ee_stable/xgboost/models'
results_dir = '/data/jinming/ee_stable/xgboost/results'

# 定义XGBoost的AUC优化目标函数
def objective_xgboost_auc(trial):
    """功Optuna优化目标函数 - 使用验证集上的AUC作为评价指标"""
    # 定义XGBoost参数搜索空间
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'verbosity': 0,
        
        # 核心参数
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        
        # 正则化参数
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        
        # CPU特定参数
        'tree_method': 'hist',  # 使用直方图算法
        'n_jobs': n_proc,        # 使用全局设置的线程数
        
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
        'random_state': 42
    }

    # 准备数据
    dtrain = xgb.DMatrix(X_train_final, label=y_train)
    dval = xgb.DMatrix(X_val_final, label=y_val)
    
    # 设置评估集
    evals = [(dtrain, 'train'), (dval, 'val')]
    evals_result = {}
    
    # 训练模型，使用早停
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=2000,  # 最大迭代次数
        evals=evals,
        early_stopping_rounds=50,
        evals_result=evals_result,
        verbose_eval=False  # 只在每400次迭代时输出
    )
    
    # 在验证集上预测概率
    y_val_prob = model.predict(dval)
    
    # 计算AUC分数
    auc_score = roc_auc_score(y_val, y_val_prob)
    
    # 打印当前试验的结果
    print(f"Trial {trial.number}: AUC = {auc_score:.4f}")
    
    return auc_score

# 创建 Optuna study对象 - 优化方向是最大化AUC
print("\n===== 开始XGBoost模型的Optuna调参过程 (AUC) =====")
xgboost_study_auc = optuna.create_study(direction='maximize', study_name='xgboost_auc_optimization')

# 运行优化
print(f"开始运行 {n_trials_optuna} 次调参试验...")
start_time = time.time()
xgboost_study_auc.optimize(objective_xgboost_auc, n_trials=n_trials_optuna)
end_time = time.time()
print(f"调参完成! 耗时: {end_time - start_time:.2f}秒")

# 打印最佳参数和结果
print("\n===== 最佳参数 (AUC) =====")
print(f"最佳AUC分数: {xgboost_study_auc.best_value:.4f}")
print("最佳参数组合:")
for key, value in xgboost_study_auc.best_params.items():
    print(f"    {key}: {value}")

# 可视化调参过程
plt.figure(figsize=(12, 8))
optuna.visualization.matplotlib.plot_optimization_history(xgboost_study_auc)
plt.title('XGBoost Optimization History - AUC')
plt.tight_layout()
plt.savefig(f'{plots_dir}/optuna_xgboost_auc_history.png')
plt.close()

# 可视化超参数重要性
plt.figure(figsize=(12, 8))
optuna.visualization.matplotlib.plot_param_importances(xgboost_study_auc)
plt.tight_layout()
plt.savefig(f'{plots_dir}/optuna_xgboost_auc_param_importances.png')
plt.close()

# 使用最佳参数训练最终模型
print("\n===== 使用最佳参数训练最终XGBoost模型 =====")
best_params_xgboost = xgboost_study_auc.best_params.copy()
best_params_xgboost['objective'] = 'binary:logistic'
best_params_xgboost['eval_metric'] = 'auc'
best_params_xgboost['tree_method'] = 'hist'
best_params_xgboost['n_jobs'] = n_proc
best_params_xgboost['verbosity'] = 1
best_params_xgboost['random_state'] = 42

# 准备数据
dtrain = xgb.DMatrix(X_train_final, label=y_train)
dval = xgb.DMatrix(X_val_final, label=y_val)
dtest = xgb.DMatrix(X_test_final, label=y_test)

# 设置评估集
evals = [(dtrain, 'train'), (dval, 'val')]
evals_result = {}

# 开始训练计时
start_time = time.time()

# 训练最终模型
final_xgboost_model = xgb.train(
    params=best_params_xgboost,
    dtrain=dtrain,
    num_boost_round=3000,  # 设置足够大的最大迭代次数
    evals=evals,
    early_stopping_rounds=50,
    evals_result=evals_result,
    verbose_eval=50  # 每50轮显示一次进度
)

# 结束训练计时
end_time = time.time()

# 计算平均每迭代训练时间
training_time = end_time - start_time
actual_iterations = final_xgboost_model.best_iteration + 1  # +1因为迭代从0开始
avg_time_per_iteration = training_time / actual_iterations

print(f"最终模型训练完成! 总耗时: {training_time:.2f}秒")
print(f"实际迭代次数: {actual_iterations}")
print(f"平均每迭代时间: {avg_time_per_iteration:.4f}秒")

# 保存训练时间指标
xgboost_time_metrics = {
    'model': 'XGBoost',
    'total_training_time': training_time,
    'iterations': actual_iterations,
    'avg_time_per_iteration': avg_time_per_iteration
}

# 保存最佳模型
model_path = f'{models_dir}/xgboost_auc.pkl'
joblib.dump(final_xgboost_model, model_path)
print(f"\n最佳模型已保存到: {model_path}")

# 保存所选特征列表
with open(f'{models_dir}/xgboost_selected_features.txt', 'w') as f:
    f.write('\n'.join(selected_features))
print(f"特征列表已保存到: {models_dir}/xgboost_selected_features.txt")

# 使用评估函数评估模型
xgboost_results = evaluate_model(
    final_xgboost_model, 
    X_train_final, y_train, 
    X_val_final, y_val, 
    X_test_final, y_test,
    "XGBoost",
    is_xgb=True
)

# 将时间指标添加到结果中
xgboost_results['time_metrics'] = xgboost_time_metrics

## 10. 模型性能比较

In [None]:
# 创建模型性能比较表
def compare_models(results_list):
    """比较多个模型的性能"""
    # 提取验证集和测试集的指标
    val_metrics = []
    test_metrics = []
    time_metrics = []
    model_names = []
    
    for result in results_list:
        model_names.append(result['model_name'])
        val_metrics.append(result['validation'])
        test_metrics.append(result['test'])
        time_metrics.append(result['time_metrics'])
    
    # 创建验证集性能比较表
    val_comparison = pd.DataFrame({
        'Model': model_names,
        'AUC': [metrics['auc'] for metrics in val_metrics],
        'Accuracy': [metrics['accuracy'] for metrics in val_metrics],
        'F1 Score': [metrics['f1'] for metrics in val_metrics],
        'Weighted F1': [metrics['f1_weighted'] for metrics in val_metrics]
    })
    
    # 创建测试集性能比较表
    test_comparison = pd.DataFrame({
        'Model': model_names,
        'AUC': [metrics['auc'] for metrics in test_metrics],
        'Accuracy': [metrics['accuracy'] for metrics in test_metrics],
        'F1 Score': [metrics['f1'] for metrics in test_metrics],
        'Weighted F1': [metrics['f1_weighted'] for metrics in test_metrics]
    })
    
    # 创建时间性能比较表
    time_comparison = pd.DataFrame({
        'Model': model_names,
        'Total Training Time (s)': [metrics['total_training_time'] for metrics in time_metrics],
        'Iterations': [metrics['iterations'] for metrics in time_metrics],
        'Avg Time Per Iteration (s)': [metrics['avg_time_per_iteration'] for metrics in time_metrics]
    })
    
    return val_comparison, test_comparison, time_comparison

# 整合所有模型的结果
all_results = [catboost_results, lightgbm_results, xgboost_results]

# 生成比较表
val_comparison, test_comparison, time_comparison = compare_models(all_results)

print("\n===== 验证集上的模型性能比较 =====")
print(val_comparison)

print("\n===== 测试集上的模型性能比较 =====")
print(test_comparison)

print("\n===== 模型训练时间性能比较 =====")
print(time_comparison)


# 使用比较目录保存比较结果
print(f"\n保存比较结果到: {comparison_dir}")

# 保存比较结果到比较目录
val_comparison.to_csv(f'{comparison_dir}/validation_comparison.csv', index=False)
test_comparison.to_csv(f'{comparison_dir}/test_comparison.csv', index=False)
time_comparison.to_csv(f'{comparison_dir}/time_performance_comparison.csv', index=False)
print(f"比较结果已保存到: {comparison_dir}")

# 可视化比较结果
# 验证集AUC比较
plt.figure(figsize=(12, 6))
plt.bar(val_comparison['Model'], val_comparison['AUC'])
plt.title('Validation Set - AUC Comparison')
plt.ylabel('AUC Score')
plt.ylim(0.9, 1.0)  # 调整Y轴范围使差异更明显
for i, v in enumerate(val_comparison['AUC']):
    plt.text(i, v + 0.005, f"{v:.4f}", ha='center')
plt.tight_layout()
plt.savefig(f'{comparison_dir}/validation_auc_comparison.png')
plt.close()

# 测试集AUC比较
plt.figure(figsize=(12, 6))
plt.bar(test_comparison['Model'], test_comparison['AUC'])
plt.title('Test Set - AUC Comparison')
plt.ylabel('AUC Score')
plt.ylim(0.9, 1.0)  # 调整Y轴范围使差异更明显
for i, v in enumerate(test_comparison['AUC']):
    plt.text(i, v + 0.005, f"{v:.4f}", ha='center')
plt.tight_layout()
plt.savefig(f'{comparison_dir}/test_auc_comparison.png')
plt.close()

# 训练时间比较
plt.figure(figsize=(12, 6))
plt.bar(time_comparison['Model'], time_comparison['Avg Time Per Iteration (s)'])
plt.title('Average Time Per Iteration Comparison')
plt.ylabel('Time (seconds)')
for i, v in enumerate(time_comparison['Avg Time Per Iteration (s)']):
    plt.text(i, v + 0.005, f"{v:.4f}s", ha='center')
plt.tight_layout()
plt.savefig(f'{comparison_dir}/avg_time_per_iteration_comparison.png')
plt.close()

# 创建综合性能指标的雷达图
def plot_radar_chart(data, save_path, title):
    # 准备雷达图数据
    metrics = ['AUC', 'Accuracy', 'F1 Score', 'Weighted F1']
    models = data['Model']
    
    # 计算角度
    N = len(metrics)
    angles = np.linspace(0, 2*np.pi, N, endpoint=False).tolist()
    angles += angles[:1]  # 闭合图形
    
    # 创建图形
    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
    
    # 为每个模型添加数据
    for i, model in enumerate(models):
        values = data.loc[i, metrics].values.flatten().tolist()
        values += values[:1]  # 闭合图形
        ax.plot(angles, values, linewidth=2, label=model)
        ax.fill(angles, values, alpha=0.1)
    
    # 设置雷达图的刻度标签
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(metrics)
    
    # 设置Y轴范围
    ax.set_ylim(0.8, 1.0)
    
    # 添加标题和图例
    plt.title(title, size=15, pad=20)
    plt.legend(loc='upper right')
    plt.tight_layout()
    
    # 保存图形
    plt.savefig(save_path)
    plt.close()

# 绘制雷达图
plot_radar_chart(val_comparison, f'{comparison_dir}/validation_radar_comparison.png', 'Validation Set - Model Performance Comparison')
plot_radar_chart(test_comparison, f'{comparison_dir}/test_radar_comparison.png', 'Test Set - Model Performance Comparison')

print("\n===== 模型比较可视化已保存到结果目录 =====")

## 11. SHAP值分析 - 特征重要性解释

In [None]:
# 导入SHAP库
import shap
import numpy as np
import matplotlib.pyplot as plt

print("\n===== 使用SHAP进行模型解释 =====")

# 从每个数据集中随机采样一部分数据用于SHAP分析(计算量很大)
shap_sample_size = min(500, X_train_final.shape[0])  # 最多使用500个样本
np.random.seed(42)  # 设置随机种子确保可重复性
shap_indices = np.random.choice(X_train_final.shape[0], shap_sample_size, replace=False)
X_shap = X_train_final.iloc[shap_indices]

# 分析各个模型的SHAP值
models_to_analyze = [
    {"name": "CatBoost", "model": final_catboost_model, "is_tree": True},
    {"name": "LightGBM", "model": final_lightgbm_model, "is_tree": True},
    {"name": "XGBoost", "model": final_xgboost_model, "is_tree": True, "is_xgb": True}
]

for model_info in models_to_analyze:
    model_name = model_info["name"]
    model = model_info["model"]
    is_tree = model_info["is_tree"]
    is_xgb = model_info.get("is_xgb", False)
    
    # 创建模型特定的SHAP结果目录
    model_shap_dir = f'{shap_dir}/{model_name.lower()}'
    os.makedirs(model_shap_dir, exist_ok=True)
    print(f"\n分析 {model_name} 模型 - 保存结果到 {model_shap_dir}")
    
    # 为不同的模型类型创建正确的解释器
    if is_xgb:
        # XGBoost模型需要特殊处理
        explainer = shap.TreeExplainer(model)
        # 创建DMatrix格式的数据
        X_shap_dmatrix = xgb.DMatrix(X_shap)
        shap_values = explainer.shap_values(X_shap)
    elif is_tree:
        # 对于基于树的模型
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_shap)
        # 处理CatBoost和LightGBM模型的特殊情况
        if model_name == "CatBoost" or model_name == "LightGBM":
            # 对于二分类模型，通常需要获取正类的SHAP值
            if isinstance(shap_values, list) and len(shap_values) > 1:
                shap_values = shap_values[1]  # 取正类的SHAP值
    else:
        # 对于其他类型的模型(如果有的话)
        explainer = shap.KernelExplainer(model.predict_proba, X_shap.iloc[:100])  # 对于非树模型，样本量更小
        shap_values = explainer.shap_values(X_shap.iloc[:100])
        # 取正类的SHAP值
        if isinstance(shap_values, list) and len(shap_values) > 1:
            shap_values = shap_values[1]  # 取正类的SHAP值
    
    # 绘制摘要图 - 所有特征的全局重要性
    plt.figure(figsize=(12, 10))
    shap.summary_plot(shap_values, X_shap, plot_type="bar", show=False)
    plt.title(f"{model_name} - SHAP Feature Importance")
    plt.tight_layout()
    plt.savefig(f'{model_shap_dir}/shap_importance.png')
    plt.close()
    
    # 绘制SHAP值的概述图
    plt.figure(figsize=(14, 12))
    shap.summary_plot(shap_values, X_shap, show=False)
    plt.title(f"{model_name} - SHAP Feature Impact Summary")
    plt.tight_layout()
    plt.savefig(f'{model_shap_dir}/shap_summary.png')
    plt.close()
    
    # 为每个重要特征绘制详细的依赖图(最多前5个重要特征)
    feature_importance = np.abs(shap_values).mean(0)
    feature_names = X_shap.columns.tolist()
    most_important_features_idx = np.argsort(-feature_importance)[:5]  # 取前5个最重要的特征
    
    for idx in most_important_features_idx:
        feature_name = feature_names[idx]
        plt.figure(figsize=(12, 8))
        shap.dependence_plot(idx, shap_values, X_shap, show=False)
        plt.title(f"{model_name} - SHAP Dependence Plot for '{feature_name}'")
        plt.tight_layout()
        plt.savefig(f'{model_shap_dir}/shap_dependence_{feature_name}.png')
        plt.close()
        
    print(f"{model_name} 的SHAP分析完成，图表已保存到: {model_shap_dir}")

# 创建一个比较不同模型的SHAP特征重要性的函数，结果保存到全局比较目录
def compare_shap_importance(models_to_analyze, X_shap):
    """比较不同模型的SHAP特征重要性"""
    print(f"\n比较不同模型的SHAP特征重要性 - 保存到 {shap_dir}")
    
    # 存储每个模型的特征重要性
    model_importance = {}
    
    for model_info in models_to_analyze:
        model_name = model_info["name"]
        model = model_info["model"]
        is_tree = model_info["is_tree"]
        is_xgb = model_info.get("is_xgb", False)
        
        # 计算SHAP值
        if is_xgb:
            explainer = shap.TreeExplainer(model)
            X_shap_dmatrix = xgb.DMatrix(X_shap)
            shap_values = explainer.shap_values(X_shap)
        elif is_tree:
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_shap)
            if isinstance(shap_values, list) and len(shap_values) > 1:
                shap_values = shap_values[1]  # 取正类的SHAP值
        else:
            # 其他类型的模型
            explainer = shap.KernelExplainer(model.predict_proba, X_shap.iloc[:100])
            shap_values = explainer.shap_values(X_shap.iloc[:100])
            if isinstance(shap_values, list) and len(shap_values) > 1:
                shap_values = shap_values[1]  # 取正类的SHAP值
                
        # 计算特征重要性
        feature_importance = np.abs(shap_values).mean(0)
        model_importance[model_name] = feature_importance
    
    # 创建DataFrame来比较所有模型的特征重要性
    importance_df = pd.DataFrame(index=X_shap.columns)
    
    for model_name, importance in model_importance.items():
        importance_df[model_name] = importance
    
    # 计算平均重要性并排序
    importance_df['Mean_Importance'] = importance_df.mean(axis=1)
    importance_df = importance_df.sort_values('Mean_Importance', ascending=False)
    
    # 保存特征重要性比较
    importance_df.to_csv(f'{shap_dir}/shap_feature_importance_comparison.csv')
    
    # 选择前10个重要特征进行可视化
    top_features = importance_df.head(10).index
    plot_df = importance_df.loc[top_features].drop('Mean_Importance', axis=1)
    
    # 可视化不同模型的特征重要性比较
    plt.figure(figsize=(14, 10))
    plot_df.plot(kind='bar', figsize=(14, 10))
    plt.title('SHAP Feature Importance Comparison Across Models (Top 10 Features)')
    plt.ylabel('SHAP Feature Importance (Mean |SHAP|)')
    plt.xlabel('Features')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'{shap_dir}/model_feature_importance_comparison.png')
    plt.close()
    
    return importance_df

# 比较不同模型的SHAP特征重要性
print("\n比较不同模型的SHAP特征重要性...")
importance_comparison = compare_shap_importance(models_to_analyze, X_shap)
print("特征重要性比较已完成并保存")

# 打印前10个最重要的特征
print("\n前10个最重要的特征 (基于平均SHAP重要性):")
print(importance_comparison[['Mean_Importance']].head(10))

## 12. 动态权重集成模型

在这个部分，我们将创建一个动态权重集成模型来整合前面训练的基础模型（CatBoost、LightGBM）的预测结果。与传统的固定权重集成不同，这里会基于每个数据点的特征为模型分配不同的权重，实现真正的动态权重分配。

权重预测器将使用单独的训练数据集train_2.csv进行训练，并使用Optuna进行超参数调优，在验证集上进行早停。

In [24]:
# 1. 加载保存好的基础模型
print("\n===== 加载基础模型 =====\n")

# 加载CatBoost模型
catboost_model_path = f'{base_dir}/catboost/models/catboost_auc.cbm'
loaded_catboost_model = cb.CatBoostClassifier()
loaded_catboost_model.load_model(catboost_model_path)
print(f"加载CatBoost模型: {catboost_model_path}")

# 加载LightGBM模型
lightgbm_model_path = f'{base_dir}/lightgbm/models/lightgbm_auc.pkl'
loaded_lightgbm_model = joblib.load(lightgbm_model_path)
print(f"加载LightGBM模型: {lightgbm_model_path}")

# 2. 加载权重预测器的训练数据 - 使用train_2.csv
print("\n===== 加载权重预测器的训练数据 =====\n")
train_2_df = pd.read_csv(f'{base_dir}/data/train_2.csv')
print(f"train_2.csv数据集维度: {train_2_df.shape}")

# 准备权重预测器的训练数据
drop_columns = ['stab', 'stabf_encoded', 'stabf']
X_train_2 = train_2_df.drop(drop_columns, axis=1)
y_train_2 = train_2_df['stabf_encoded']

# 对train_2数据应用特征工程
X_train_2_featured, _, _ = create_features(X_train_2, X_test, X_val)

# 应用特征选择（使用与之前相同的特征）
X_train_2_final = X_train_2_featured[selected_features]

print(f"权重预测器训练数据维度: {X_train_2_final.shape}")
print(f"特征数量: {X_train_2_final.shape[1]}")



===== 加载基础模型 =====

加载CatBoost模型: /data/jinming/ee_stable/catboost/models/catboost_auc.cbm
加载LightGBM模型: /data/jinming/ee_stable/lightgbm/models/lightgbm_auc.pkl

===== 加载权重预测器的训练数据 =====

train_2.csv数据集维度: (3000, 15)
权重预测器训练数据维度: (3000, 13)
特征数量: 13


In [25]:

# 3. 生成基础模型在各数据集上的预测结果作为元特征
print("\n===== 生成基础模型的预测结果作为元特征 =====\n")

# 在train_2数据集上的预测概率（用于训练权重预测器）
train_2_catboost_prob = loaded_catboost_model.predict_proba(X_train_2_final)[:, 1]
train_2_lightgbm_prob = loaded_lightgbm_model.predict_proba(X_train_2_final)[:, 1]

# 验证集上的预测概率
val_catboost_prob = loaded_catboost_model.predict_proba(X_val_final)[:, 1]
val_lightgbm_prob = loaded_lightgbm_model.predict_proba(X_val_final)[:, 1]

# 测试集上的预测概率
test_catboost_prob = loaded_catboost_model.predict_proba(X_test_final)[:, 1]
test_lightgbm_prob = loaded_lightgbm_model.predict_proba(X_test_final)[:, 1]

# 将预测概率转换为预测类别（0或1）
train_2_catboost_pred = (train_2_catboost_prob > 0.5).astype(int)
train_2_lightgbm_pred = (train_2_lightgbm_prob > 0.5).astype(int)

val_catboost_pred = (val_catboost_prob > 0.5).astype(int)
val_lightgbm_pred = (val_lightgbm_prob > 0.5).astype(int)

# 计算每个基础模型在train_2上的预测是否正确（0表示错误，1表示正确）
train_2_catboost_correct = (train_2_catboost_pred == y_train_2).astype(int)
train_2_lightgbm_correct = (train_2_lightgbm_pred == y_train_2).astype(int)

# 计算每个基础模型在验证集上的预测是否正确
val_catboost_correct = (val_catboost_pred == y_val).astype(int)
val_lightgbm_correct = (val_lightgbm_pred == y_val).astype(int)

print(f"train_2数据集上的模型准确率：")
print(f"CatBoost: {train_2_catboost_correct.mean():.4f}")
print(f"LightGBM: {train_2_lightgbm_correct.mean():.4f}")

print(f"\n验证集上的模型准确率：")
print(f"CatBoost: {val_catboost_correct.mean():.4f}")
print(f"LightGBM: {val_lightgbm_correct.mean():.4f}")


===== 生成基础模型的预测结果作为元特征 =====

train_2数据集上的模型准确率：
CatBoost: 0.9547
LightGBM: 0.9523

验证集上的模型准确率：
CatBoost: 0.9640
LightGBM: 0.9610


In [26]:
# 4. 创建元特征数据集 - 结合原始特征和模型预测

# 为train_2数据创建元特征数据框（用于训练权重预测器）
meta_features_train_2 = X_train_2_final.copy()

# 添加两个模型的预测概率作为新的特征
meta_features_train_2['catboost_prob'] = train_2_catboost_prob
meta_features_train_2['lightgbm_prob'] = train_2_lightgbm_prob

# 添加两个模型预测结果之间的不一致性作为特征
meta_features_train_2['model_disagreement'] = (train_2_catboost_pred != train_2_lightgbm_pred).astype(int)

# 添加预测概率的统计特征
meta_features_train_2['prob_mean'] = (train_2_catboost_prob + train_2_lightgbm_prob) / 2
meta_features_train_2['prob_std'] = np.std([train_2_catboost_prob, train_2_lightgbm_prob], axis=0)
meta_features_train_2['prob_range'] = np.abs(train_2_catboost_prob - train_2_lightgbm_prob)

# 为验证集创建元特征数据框（用于早停和评估）
meta_features_val = X_val_final.copy()

# 添加两个模型的预测概率作为新的特征
meta_features_val['catboost_prob'] = val_catboost_prob
meta_features_val['lightgbm_prob'] = val_lightgbm_prob

# 添加两个模型预测结果之间的不一致性作为特征
meta_features_val['model_disagreement'] = (val_catboost_pred != val_lightgbm_pred).astype(int)

# 添加预测概率的统计特征
meta_features_val['prob_mean'] = (val_catboost_prob + val_lightgbm_prob) / 2
meta_features_val['prob_std'] = np.std([val_catboost_prob, val_lightgbm_prob], axis=0)
meta_features_val['prob_range'] = np.abs(val_catboost_prob - val_lightgbm_prob)

print(f"train_2元特征数据框维度: {meta_features_train_2.shape}")
print(f"验证集元特征数据框维度: {meta_features_val.shape}")
print(f"元特征包括: {meta_features_train_2.columns.tolist()}")

train_2元特征数据框维度: (3000, 19)
验证集元特征数据框维度: (1000, 19)
元特征包括: ['tau1', 'tau2', 'tau3', 'tau4', 'g1', 'g2', 'g3', 'g4', 'tau1_g1', 'tau2_g2', 'tau3_g3', 'tau4_g4', 'tau_ratio', 'catboost_prob', 'lightgbm_prob', 'model_disagreement', 'prob_mean', 'prob_std', 'prob_range']


In [21]:
# 5. 定义和训练动态权重预测模型 - 使用Optuna调参

print("\n===== 使用Optuna调参训练动态权重预测模型 =====\n")

# 创建权重预测模型的输出目录
weight_models_dir = f'{base_dir}/weight_models'
os.makedirs(weight_models_dir, exist_ok=True)

# 根据train_2数据集上的表现分配基础模型的原始权重
# (使用每个模型是否正确预测作为目标值)
weight_targets = {
    'catboost': train_2_catboost_correct,
    'lightgbm': train_2_lightgbm_correct
}

# 创建权重预测模型
weight_predictors = {}
weight_studies = {}

for model_name, target in weight_targets.items():
    print(f"\n为{model_name}训练权重预测器...")
    
    # 定义Optuna优化目标函数
    def objective_weight_predictor(trial):
        """Optuna优化目标函数 - 使用验证集上的MSE作为评价指标"""
        # 定义CatBoost参数搜索空间
        params = {
            'loss_function': 'RMSE',
            'eval_metric': 'RMSE',
            'verbose': False,
            
            # 核心参数
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'iterations': 1000,  # 使用早停
            'depth': trial.suggest_int('depth', 4, 10),
            
            # 正则化参数
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
            'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10.0),
            
            # 其他参数
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
            'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 20),
            'rsm': trial.suggest_float('rsm', 0.1, 1.0),  # 列采样比例
            
            # CPU特定参数
            'task_type': 'CPU',
            'thread_count': n_proc,
            
            'random_seed': 42
        }
        
        # 创建CatBoost回归模型
        model = cb.CatBoostRegressor(**params)
        
        # 在train_2上训练模型，使用验证集进行早停
        model.fit(
            meta_features_train_2, target,
            eval_set=[(meta_features_val, weight_targets[model_name][:len(meta_features_val)])],
            early_stopping_rounds=50,
            verbose=False
        )
        
        # 在验证集上预测
        val_target = weight_targets[model_name][:len(meta_features_val)] if model_name == 'catboost' else val_lightgbm_correct
        y_val_pred = model.predict(meta_features_val)
        
        # 计算MSE分数（越小越好，所以返回负值）
        mse_score = np.mean((y_val_pred - val_target) ** 2)
        
        return -mse_score  # Optuna最大化目标，所以返回负MSE
    
    # 创建Optuna study对象
    study_name = f'{model_name}_weight_predictor_optimization'
    study = optuna.create_study(direction='maximize', study_name=study_name)
    weight_studies[model_name] = study
    
    # 运行优化
    print(f"开始运行 {n_trials_optuna} 次调参试验...")
    start_time = time.time()
    study.optimize(objective_weight_predictor, n_trials=n_trials_optuna)
    end_time = time.time()
    print(f"调参完成! 耗时: {end_time - start_time:.2f}秒")
    
    # 打印最佳参数和结果
    print(f"\n{model_name}权重预测器最佳参数:")
    print(f"最佳负MSE分数: {study.best_value:.4f}")
    print("最佳参数组合:")
    for key, value in study.best_params.items():
        print(f"    {key}: {value}")
    
    # 使用最佳参数训练最终模型
    best_params = study.best_params.copy()
    best_params['loss_function'] = 'RMSE'
    best_params['eval_metric'] = 'RMSE'
    best_params['verbose'] = False
    best_params['task_type'] = 'CPU'
    best_params['thread_count'] = n_proc
    best_params['random_seed'] = 42
    
    final_weight_model = cb.CatBoostRegressor(**best_params)
    
    # 训练权重预测模型
    final_weight_model.fit(
        meta_features_train_2, target,
        eval_set=[(meta_features_val, val_catboost_correct if model_name == 'catboost' else val_lightgbm_correct)],
        early_stopping_rounds=50,
        verbose=False
    )
    
    # 存储模型
    weight_predictors[model_name] = final_weight_model
    
    # 计算训练精度
    train_pred = final_weight_model.predict(meta_features_train_2)
    train_mse = np.mean((train_pred - target) ** 2)
    train_r2 = 1 - np.sum((train_pred - target) ** 2) / np.sum((target - np.mean(target)) ** 2)
    
    # 计算验证集精度
    val_target = val_catboost_correct if model_name == 'catboost' else val_lightgbm_correct
    val_pred = final_weight_model.predict(meta_features_val)
    val_mse = np.mean((val_pred - val_target) ** 2)
    val_r2 = 1 - np.sum((val_pred - val_target) ** 2) / np.sum((val_target - np.mean(val_target)) ** 2)
    
    print(f"\n{model_name}权重预测器训练完成:")
    print(f"训练集 - MSE: {train_mse:.4f}, R²: {train_r2:.4f}")
    print(f"验证集 - MSE: {val_mse:.4f}, R²: {val_r2:.4f}")
    
    # 保存权重预测模型
    weight_model_path = f'{weight_models_dir}/{model_name}_weight_predictor.cbm'
    final_weight_model.save_model(weight_model_path)
    print(f"保存{model_name}权重预测器到: {weight_model_path}")
    
    # 可视化调参过程
    plt.figure(figsize=(12, 8))
    optuna.visualization.matplotlib.plot_optimization_history(study)
    plt.title(f'{model_name.capitalize()} Weight Predictor Optimization History')
    plt.tight_layout()
    plt.savefig(f'{weight_models_dir}/{model_name}_weight_predictor_optuna_history.png')
    plt.close()
    
    # 可视化超参数重要性
    plt.figure(figsize=(12, 8))
    optuna.visualization.matplotlib.plot_param_importances(study)
    plt.title(f'{model_name.capitalize()} Weight Predictor Parameter Importance')
    plt.tight_layout()
    plt.savefig(f'{weight_models_dir}/{model_name}_weight_predictor_param_importance.png')
    plt.close()

print("\n所有权重预测器训练完成!")


lightgbm权重预测器训练完成:
训练集 - MSE: 0.0154, R²: 0.6606
验证集 - MSE: 0.0277, R²: 0.2610
保存lightgbm权重预测器到: /data/jinming/ee_stable/weight_models/lightgbm_weight_predictor.cbm

所有权重预测器训练完成!


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

In [22]:
# 6. 为测试集创建元特征并预测权重
print("\n===== 为测试集创建元特征并预测权重 =====\n")

# 创建测试集的元特征
meta_features_test = X_test_final.copy()

# 添加两个模型的预测概率作为新特征
meta_features_test['catboost_prob'] = test_catboost_prob
meta_features_test['lightgbm_prob'] = test_lightgbm_prob

# 添加两个模型预测结果之间的不一致性作为特征
test_catboost_pred = (test_catboost_prob > 0.5).astype(int)
test_lightgbm_pred = (test_lightgbm_prob > 0.5).astype(int)

meta_features_test['model_disagreement'] = (test_catboost_pred != test_lightgbm_pred).astype(int)

# 添加预测概率的统计特征
meta_features_test['prob_mean'] = (test_catboost_prob + test_lightgbm_prob) / 2
meta_features_test['prob_std'] = np.std([test_catboost_prob, test_lightgbm_prob], axis=0)
meta_features_test['prob_range'] = np.abs(test_catboost_prob - test_lightgbm_prob)

# 预测测试集的动态权重
print("为测试集的每个样本预测动态权重...")
test_weights = {}

for model_name, weight_model in weight_predictors.items():
    # 预测权重（确保权重为正值）
    weights = np.maximum(0, weight_model.predict(meta_features_test))
    test_weights[model_name] = weights
    print(f"{model_name}的权重统计:")
    print(f"  平均值: {weights.mean():.4f}")
    print(f"  中位数: {np.median(weights):.4f}")
    print(f"  最小值: {weights.min():.4f}")
    print(f"  最大值: {weights.max():.4f}")
    print(f"  标准差: {weights.std():.4f}")


===== 为测试集创建元特征并预测权重 =====

为测试集的每个样本预测动态权重...
catboost的权重统计:
  平均值: 0.9598
  中位数: 0.9957
  最小值: 0.2479
  最大值: 1.0203
  标准差: 0.0976
lightgbm的权重统计:
  平均值: 0.9608
  中位数: 0.9921
  最小值: 0.0454
  最大值: 1.0741
  标准差: 0.0958


In [23]:
# 7. 使用动态权重生成最终预测并评估性能
print("\n===== 生成动态权重集成预测并评估性能 =====\n")

# 组合预测结果 - 使用动态权重
test_dynamic_ensemble_prob = (
    test_weights['catboost'] * test_catboost_prob + 
    test_weights['lightgbm'] * test_lightgbm_prob
) / (
    test_weights['catboost'] + 
    test_weights['lightgbm'] + 1e-8  # 添加小的常数避免除零
)

# 转换为最终预测
test_dynamic_ensemble_pred = (test_dynamic_ensemble_prob > 0.5).astype(int)

# 计算动态权重集成的性能
dynamic_ensemble_acc = accuracy_score(y_test, test_dynamic_ensemble_pred)
dynamic_ensemble_auc = roc_auc_score(y_test, test_dynamic_ensemble_prob)
dynamic_ensemble_f1 = f1_score(y_test, test_dynamic_ensemble_pred)
dynamic_ensemble_f1_weighted = f1_score(y_test, test_dynamic_ensemble_pred, average='weighted')

print("动态权重集成模型在测试集上的性能:")
print(f"Accuracy: {dynamic_ensemble_acc:.4f}")
print(f"AUC: {dynamic_ensemble_auc:.4f}")
print(f"F1 Score: {dynamic_ensemble_f1:.4f}")
print(f"Weighted F1 Score: {dynamic_ensemble_f1_weighted:.4f}")

# 为比较添加动态集成模型的性能
test_comparison_with_ensemble = test_comparison.copy()
test_comparison_with_ensemble = pd.concat([
    test_comparison_with_ensemble,
    pd.DataFrame({
        'Model': ['Dynamic Ensemble (CB+LGBM)'],
        'AUC': [dynamic_ensemble_auc],
        'Accuracy': [dynamic_ensemble_acc],
        'F1 Score': [dynamic_ensemble_f1],
        'Weighted F1': [dynamic_ensemble_f1_weighted]
    })
], ignore_index=True)

# 保存更新的比较结果
test_comparison_with_ensemble.to_csv(f'{comparison_dir}/test_comparison_with_dynamic_ensemble.csv', index=False)

print("\n===== 所有模型性能比较 =====\n")
print(test_comparison_with_ensemble)

# 更新比较图表 - 测试集AUC比较
plt.figure(figsize=(15, 8))
colors = ['blue', 'green', 'orange', 'red']  # 为每个模型分配颜色
bars = plt.bar(test_comparison_with_ensemble['Model'], test_comparison_with_ensemble['AUC'], color=colors)
plt.title('Test Set - AUC Comparison with Dynamic Ensemble', fontsize=16)
plt.ylabel('AUC Score', fontsize=14)
plt.ylim(0.92, 1.0)  # 调整Y轴范围使差异更明显

# 在每个柱子上显示数值
for i, v in enumerate(test_comparison_with_ensemble['AUC']):
    plt.text(i, v + 0.005, f"{v:.4f}", ha='center', fontsize=12)

plt.xticks(rotation=15, fontsize=12)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig(f'{comparison_dir}/test_auc_comparison_with_dynamic_ensemble.png', dpi=300)
plt.close()

# 分析动态权重集成相对于其他模型的提升
print("\n===== 动态权重集成相对于基础模型的性能提升 =====\n")
for idx, row in test_comparison.iterrows():
    model_name = row['Model']
    model_auc = row['AUC']
    auc_improvement = dynamic_ensemble_auc - model_auc
    acc_improvement = dynamic_ensemble_acc - row['Accuracy']
    f1_improvement = dynamic_ensemble_f1 - row['F1 Score']
    
    print(f"相对于{model_name}:")
    print(f"  AUC提升: {auc_improvement:+.4f}")
    print(f"  Accuracy提升: {acc_improvement:+.4f}")
    print(f"  F1 Score提升: {f1_improvement:+.4f}")
    print()


===== 生成动态权重集成预测并评估性能 =====

动态权重集成模型在测试集上的性能:
Accuracy: 0.9670
AUC: 0.9963
F1 Score: 0.9739
Weighted F1 Score: 0.9669


NameError: name 'test_comparison' is not defined

In [None]:
# 8. 分析动态权重分布和特征重要性
print("\n===== 分析动态权重分布和特征重要性 =====\n")

# 创建一个目录用于保存动态权重分析结果
dynamic_weights_dir = f'{base_dir}/dynamic_weights_analysis'
os.makedirs(dynamic_weights_dir, exist_ok=True)

# 8.1 可视化两个模型的权重分布
print("8.1 权重分布分析")
plt.figure(figsize=(15, 8))

# 子图1: 权重分布直方图
plt.subplot(2, 2, 1)
sns.histplot(test_weights['catboost'], kde=True, color='blue', alpha=0.6, label='CatBoost')
sns.histplot(test_weights['lightgbm'], kde=True, color='green', alpha=0.6, label='LightGBM')
plt.title('Distribution of Dynamic Weights for Each Model', fontsize=14)
plt.xlabel('Weight Value', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)

# 子图2: 权重散点图
plt.subplot(2, 2, 2)
plt.scatter(test_weights['catboost'], test_weights['lightgbm'], alpha=0.6, s=20)
plt.xlabel('CatBoost Weight', fontsize=12)
plt.ylabel('LightGBM Weight', fontsize=12)
plt.title('CatBoost vs LightGBM Weights', fontsize=14)
plt.grid(True, alpha=0.3)

# 子图3: 权重相关性
plt.subplot(2, 2, 3)
weight_corr = np.corrcoef(test_weights['catboost'], test_weights['lightgbm'])[0, 1]
plt.text(0.5, 0.6, f'Correlation: {weight_corr:.4f}', ha='center', va='center', 
         transform=plt.gca().transAxes, fontsize=16, 
         bbox=dict(boxstyle='round', facecolor='lightblue'))
plt.text(0.5, 0.4, 'Weight Statistics:', ha='center', va='center', 
         transform=plt.gca().transAxes, fontsize=14, weight='bold')
stats_text = f"""CatBoost: μ={test_weights['catboost'].mean():.3f}, σ={test_weights['catboost'].std():.3f}
LightGBM: μ={test_weights['lightgbm'].mean():.3f}, σ={test_weights['lightgbm'].std():.3f}"""
plt.text(0.5, 0.3, stats_text, ha='center', va='center', 
         transform=plt.gca().transAxes, fontsize=11)
plt.title('Weight Statistics Summary', fontsize=14)
plt.axis('off')

# 子图4: 权重差异分布
plt.subplot(2, 2, 4)
weight_diff = test_weights['catboost'] - test_weights['lightgbm']
sns.histplot(weight_diff, kde=True, color='purple', alpha=0.6)
plt.title('Weight Difference Distribution\n(CatBoost - LightGBM)', fontsize=14)
plt.xlabel('Weight Difference', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.axvline(x=0, color='red', linestyle='--', alpha=0.7, label='Zero Difference')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{dynamic_weights_dir}/weight_distributions_analysis.png', dpi=300)
plt.close()

# 8.2 分析权重预测模型的特征重要性
print("\n8.2 权重预测器特征重要性分析")

for model_name, weight_model in weight_predictors.items():
    print(f"\n分析{model_name}权重预测器:")
    
    # 获取特征重要性
    importance = weight_model.get_feature_importance()
    feature_names = meta_features_train_2.columns
    
    # 按重要性排序
    indices = np.argsort(importance)[::-1]
    
    # 创建特征重要性DataFrame
    importance_df = pd.DataFrame({
        'Feature': [feature_names[i] for i in indices],
        'Importance': importance[indices]
    })
    
    # 保存特征重要性结果
    importance_df.to_csv(f'{dynamic_weights_dir}/{model_name}_weight_predictor_feature_importance.csv', index=False)
    
    # 可视化特征重要性
    plt.figure(figsize=(14, 8))
    top_features = min(15, len(importance))  # 显示前15个最重要的特征
    
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(top_features))
    plt.title(f'{model_name.capitalize()} Weight Predictor - Top {top_features} Feature Importance', fontsize=16)
    plt.xlabel('Feature Importance', fontsize=14)
    plt.ylabel('Features', fontsize=14)
    plt.tight_layout()
    plt.savefig(f'{dynamic_weights_dir}/{model_name}_weight_predictor_feature_importance.png', dpi=300)
    plt.close()
    
    # 打印最重要的10个特征
    print(f"Top 10最重要的特征:")
    for i in range(min(10, len(indices))):
        print(f"  {i+1}. {feature_names[indices[i]]}: {importance[indices[i]]:.4f}")

# 8.3 分析预测的权重与模型性能的关系
print("\n8.3 权重与模型性能关系分析")

test_catboost_correct = (test_catboost_pred == y_test).astype(int)
test_lightgbm_correct = (test_lightgbm_pred == y_test).astype(int)

# 创建一个数据框用于分析
weight_performance_df = pd.DataFrame({
    'Catboost_Weight': test_weights['catboost'],
    'Lightgbm_Weight': test_weights['lightgbm'],
    'Catboost_Correct': test_catboost_correct,
    'Lightgbm_Correct': test_lightgbm_correct,
    'Ensemble_Correct': (test_dynamic_ensemble_pred == y_test).astype(int)
})

# 计算每个模型权重与其预测正确性的相关性
print("权重与模型预测正确性的相关性:")
for model in ['catboost', 'lightgbm']:
    corr = np.corrcoef(weight_performance_df[f'{model.capitalize()}_Weight'], 
                      weight_performance_df[f'{model.capitalize()}_Correct'])[0, 1]
    print(f"  {model.capitalize()}权重与预测正确性相关性: {corr:.4f}")

# 可视化权重与预测正确性的关系
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 为每个模型创建权重分布对比图
for i, model in enumerate(['catboost', 'lightgbm']):
    # 子图1: 权重分布按预测正确性分组
    ax1 = axes[i, 0]
    correct_weights = weight_performance_df[weight_performance_df[f'{model.capitalize()}_Correct'] == 1][f'{model.capitalize()}_Weight']
    incorrect_weights = weight_performance_df[weight_performance_df[f'{model.capitalize()}_Correct'] == 0][f'{model.capitalize()}_Weight']
    
    sns.kdeplot(correct_weights, label='Correct Predictions', color='green', shade=True, ax=ax1)
    sns.kdeplot(incorrect_weights, label='Incorrect Predictions', color='red', shade=True, ax=ax1)
    
    ax1.set_title(f'{model.capitalize()} Weight Distribution by Prediction Correctness')
    ax1.set_xlabel('Weight Value')
    ax1.set_ylabel('Density')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # 子图2: 权重箱线图
    ax2 = axes[i, 1]
    box_data = [correct_weights, incorrect_weights]
    bp = ax2.boxplot(box_data, labels=['Correct', 'Incorrect'], patch_artist=True)
    bp['boxes'][0].set_facecolor('green')
    bp['boxes'][0].set_alpha(0.6)
    bp['boxes'][1].set_facecolor('red')
    bp['boxes'][1].set_alpha(0.6)
    
    ax2.set_title(f'{model.capitalize()} Weight Distribution (Box Plot)')
    ax2.set_ylabel('Weight Value')
    ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{dynamic_weights_dir}/weight_vs_prediction_correctness.png', dpi=300)
plt.close()

# 8.4 权重选择效果分析
print("\n8.4 权重选择效果分析")

# 分析在不同情况下权重分配的合理性
# 1. 当两个模型都正确时
both_correct = (test_catboost_correct == 1) & (test_lightgbm_correct == 1)
# 2. 当只有CatBoost正确时
only_catboost_correct = (test_catboost_correct == 1) & (test_lightgbm_correct == 0)
# 3. 当只有LightGBM正确时
only_lightgbm_correct = (test_catboost_correct == 0) & (test_lightgbm_correct == 1)
# 4. 当两个模型都错误时
both_incorrect = (test_catboost_correct == 0) & (test_lightgbm_correct == 0)

scenarios = {
    'Both Correct': both_correct,
    'Only CatBoost Correct': only_catboost_correct,
    'Only LightGBM Correct': only_lightgbm_correct,
    'Both Incorrect': both_incorrect
}

print("不同预测场景下的权重分配统计:")
scenario_stats = []

for scenario_name, mask in scenarios.items():
    if mask.sum() > 0:  # 确保该场景有样本
        cb_weight_mean = test_weights['catboost'][mask].mean()
        lgb_weight_mean = test_weights['lightgbm'][mask].mean()
        count = mask.sum()
        
        scenario_stats.append({
            'Scenario': scenario_name,
            'Count': count,
            'CatBoost_Weight_Mean': cb_weight_mean,
            'LightGBM_Weight_Mean': lgb_weight_mean,
            'Weight_Ratio_CB/LGB': cb_weight_mean / (lgb_weight_mean + 1e-8)
        })
        
        print(f"\n{scenario_name} ({count} samples):")
        print(f"  CatBoost平均权重: {cb_weight_mean:.4f}")
        print(f"  LightGBM平均权重: {lgb_weight_mean:.4f}")
        print(f"  权重比例 (CB/LGB): {cb_weight_mean / (lgb_weight_mean + 1e-8):.4f}")

# 保存场景统计
scenario_stats_df = pd.DataFrame(scenario_stats)
scenario_stats_df.to_csv(f'{dynamic_weights_dir}/weight_allocation_by_scenario.csv', index=False)

print(f"\n动态权重分析完成，所有结果保存到: {dynamic_weights_dir}")
print("\n===== 动态权重集成模型训练和分析完成! =====")

备选: 使用Weighted F1作为优化目标的模型训练



```python
# CatBoost - Weighted F1优化
def objective_catboost_wf1(trial):
    # 定义CatBoost参数搜索空间
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'Logloss',
        'verbose': 0,
        
        # 核心参数
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'iterations': 2000,
        'depth': trial.suggest_int('depth', 3, 10),
        
        # 正则化参数
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10.0),
        
        # 其他参数
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'rsm': trial.suggest_float('rsm', 0.1, 1.0),
        
        'thread_count': 16,
        'random_seed': 42
    }
    
    # 创建CatBoost模型
    model = cb.CatBoostClassifier(**params)
    
    # 训练模型
    model.fit(
        X_train_final, y_train,
        eval_set=[(X_val_final, y_val)],
        early_stopping_rounds=50,
        verbose=False
    )
    
    # 在验证集上预测
    y_val_pred = model.predict(X_val_final)
    
    # 计算weighted F1分数
    f1_weighted = f1_score(y_val, y_val_pred, average='weighted')
    
    print(f"Trial {trial.number}: Weighted F1 = {f1_weighted:.4f}")
    
    return f1_weighted

# LightGBM - Weighted F1优化
def objective_lightgbm_wf1(trial):
    # 定义LightGBM参数搜索空间
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        
        # 核心参数
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 30, 1000),
        'num_leaves': trial.suggest_int('num_leaves', 10, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        
        # 正则化参数
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        
        # 其他参数
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 0.5),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 10.0, log=True),
        
        'n_jobs': 16,
        'random_state': 42
    }
    
    # 创建LightGBM模型
    model = lgb.LGBMClassifier(**params)
    
    # 训练模型
    model.fit(
        X_train_final, y_train,
        eval_set=[(X_val_final, y_val)],
        eval_metric='logloss',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )
    
    # 在验证集上预测
    y_val_pred = model.predict(X_val_final)
    
    # 计算weighted F1分数
    f1_weighted = f1_score(y_val, y_val_pred, average='weighted')
    
    print(f"Trial {trial.number}: Weighted F1 = {f1_weighted:.4f}")
    
    return f1_weighted

# XGBoost - Weighted F1优化
def objective_xgboost_wf1(trial):
    # 定义XGBoost参数搜索空间
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'verbosity': 0,
        
        # 核心参数
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        
        # 正则化参数
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        
        # CPU特定参数
        'tree_method': 'hist',
        'n_jobs': 16,
        
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
        'random_state': 42
    }

    dtrain = xgb.DMatrix(X_train_final, label=y_train)
    dval = xgb.DMatrix(X_val_final, label=y_val)
    
    evals = [(dtrain, 'train'), (dval, 'val')]
    evals_result = {}
    
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=2000,
        evals=evals,
        early_stopping_rounds=50,
        evals_result=evals_result,
        verbose_eval=False
    )
    
    y_val_prob = model.predict(dval)
    y_val_pred = (y_val_prob > 0.5).astype(int)
    
    f1_weighted = f1_score(y_val, y_val_pred, average='weighted')
    
    print(f"Trial {trial.number}: Weighted F1 = {f1_weighted:.4f}")
    
    return f1_weighted
```