In [None]:
# 导入所需库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, classification_report, confusion_matrix,
    precision_recall_curve, average_precision_score, roc_curve,fbeta_score
)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegressionCV
import shap
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Use the SimHei font so the Chinese labels in the figures render
plt.rcParams['font.sans-serif'] = ['SimHei']
# Turn off the unicode minus glyph for axis tick labels
plt.rcParams['axes.unicode_minus'] = False
# Default figure size for plots created without an explicit figsize
plt.rcParams['figure.figsize'] = (12, 8)

# Seed NumPy's global random generator (used by the np.random.choice calls below)
np.random.seed(42)


In [None]:
# Load the raw dataset.
# NOTE(review): the parquet path is an empty placeholder here; fill it in before running.
df = pd.read_parquet('')
# Print column dtypes and non-null counts.
df.info()

In [None]:
# 第一部分：数据预处理与特征工程
#省略此部分

In [None]:
# After feature engineering: hold out the final validation set
# ===============================
# NOTE(review): `X` and `y` are not defined in the visible cells; presumably they come
# from the omitted feature-engineering cell — confirm.

# Number of positive samples reserved for the final validation set.
final_validation_positive_count = 30
# Number of negative samples reserved for the final validation set
# (the original comment states this keeps the original class ratio).
final_validation_negative_count = 116550

# Index labels of every positive sample
positive_indices = y[y == 1].index.tolist()
print(f"总正样本数量: {len(positive_indices)}")
# Index labels of every negative sample
negative_indices = y[y == 0].index.tolist()

# Fail early with a readable message instead of NumPy's generic sampling error.
if final_validation_positive_count > len(positive_indices):
    raise ValueError(
        f"Requested {final_validation_positive_count} validation positives "
        f"but only {len(positive_indices)} are available"
    )
if final_validation_negative_count > len(negative_indices):
    raise ValueError(
        f"Requested {final_validation_negative_count} validation negatives "
        f"but only {len(negative_indices)} are available"
    )

# Randomly pick the held-out positives (drawn first, so the RNG call order is unchanged)
final_validation_indices = np.random.choice(
    positive_indices, size=final_validation_positive_count, replace=False
)
# Set membership instead of scanning the NumPy array for every positive
held_out_positive_set = set(final_validation_indices.tolist())
training_positive_indices = [idx for idx in positive_indices if idx not in held_out_positive_set]

print(f"最终验证集正样本: {final_validation_positive_count}个")
print(f"训练用正样本: {len(training_positive_indices)}个")

# Randomly pick the held-out negatives
final_validation_negative_indices = np.random.choice(negative_indices,
                                                   size=final_validation_negative_count,
                                                   replace=False)

print(f"最终验证集负样本: {final_validation_negative_count}个")
print(f"最终验证集总样本: {final_validation_positive_count + final_validation_negative_count}个")
print(f"最终验证集正负比例: 1:{final_validation_negative_count//final_validation_positive_count}")

# Assemble the final validation set (positives first, then negatives)
final_validation_all_indices = list(final_validation_indices) + list(final_validation_negative_indices)
X_final_validation = X.loc[final_validation_all_indices]
y_final_validation = y.loc[final_validation_all_indices]

# Everything not held out becomes the training pool
final_validation_indices_set = set(final_validation_all_indices)
remaining_mask = ~X.index.isin(final_validation_indices_set)
X_training_pool = X[remaining_mask]
y_training_pool = y[remaining_mask]

print(f"数据分布:")
print(f"训练池 - 正样本: {(y_training_pool == 1).sum()}个")
print(f"训练池 - 负样本: {(y_training_pool == 0).sum()}个")
print(f"最终验证集 - 正样本: {(y_final_validation == 1).sum()}个")
print(f"最终验证集 - 负样本: {(y_final_validation == 0).sum()}个")
print(f"最终验证集转化率: {y_final_validation.mean():.6f}")


In [None]:
# Multiple-undersampling strategy: build several balanced datasets for cross-validation
# (the messages below mention 119 positive samples)
# =======================================================
# NOTE(review): the counts 119 and 30 in these messages are hard-coded text, not derived
# from y_training_pool — confirm they match the actual data.


print("多重欠采样平衡数据集构建策略（保留30个正样本作为最终验证）")
print("策略：119个正样本 + 多次随机抽取119个负样本")
print("模型：逻辑回归 + 交叉验证")
print("最终验证：30个正样本的独立测试集")

def create_multiple_balanced_datasets(X, y, n_datasets=100, positive_sample_multiplier=1, random_state=None):
    """
    Build several undersampled datasets that share all positives and differ in
    their randomly drawn negatives.

    Parameters:
    - X: feature matrix (pandas object; rows are selected positionally with .iloc)
    - y: labels aligned with X (1 = positive, 0 = negative)
    - n_datasets: number of datasets to create
    - positive_sample_multiplier: negatives drawn per dataset as a multiple of the
      number of positives (1 means a 1:1 balance)
    - random_state: optional seed. When None (default) the global NumPy generator
      is used, so results follow any earlier np.random.seed call; when given, a
      dedicated RandomState makes this call reproducible on its own.

    Returns:
    - list of dicts with keys 'X', 'y', 'indices' (positional row indices,
      positives first) and 'dataset_id' (1-based)

    Raises:
    - ValueError: when more negatives are requested per dataset than exist
    """

    # Positional indices of each class
    labels = np.asarray(y)
    positive_indices = np.where(labels == 1)[0]
    negative_indices = np.where(labels == 0)[0]

    n_positive = len(positive_indices)
    # Cast to int so a fractional multiplier (e.g. 1.5) is a valid sample size
    n_negative_per_dataset = int(round(n_positive * positive_sample_multiplier))

    if n_negative_per_dataset > len(negative_indices):
        raise ValueError(
            f"Cannot draw {n_negative_per_dataset} negatives without replacement: "
            f"only {len(negative_indices)} negative samples are available"
        )

    print(f"📊 数据分布:")
    print(f"   正样本数量: {n_positive}")
    print(f"   负样本数量: {len(negative_indices)}")
    print(f"   每个数据集负样本抽取: {n_negative_per_dataset}")
    print(f"   将创建 {n_datasets} 个平衡数据集")

    # np.random (module) and RandomState expose the same .choice API
    sampler = np.random if random_state is None else np.random.RandomState(random_state)

    datasets = []

    for i in range(n_datasets):
        # Negatives are unique within one dataset (replace=False) but may repeat
        # across datasets, because every draw is independent.
        selected_negative_indices = sampler.choice(
            negative_indices,
            size=n_negative_per_dataset,
            replace=False
        )

        # Positives first, then the sampled negatives
        balanced_indices = np.concatenate([positive_indices, selected_negative_indices])

        X_balanced = X.iloc[balanced_indices]
        y_balanced = y.iloc[balanced_indices]

        datasets.append({
            'X': X_balanced,
            'y': y_balanced,
            'indices': balanced_indices,
            'dataset_id': i + 1
        })

        # Vectorised sum instead of Python-level iteration over the Series
        positives_in_dataset = y_balanced.sum()
        print(f"   数据集 {i+1}: {len(y_balanced)} 样本 (正:{positives_in_dataset}, 负:{len(y_balanced)-positives_in_dataset})")

    return datasets



from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tabulate import tabulate  # 如未安装：pip install tabulate
import numpy as np


# Run the multiple-undersampling strategy on the training pool
print("开始执行多重欠采样策略...")

# Build the balanced datasets from the positives remaining in the training pool
balanced_datasets = create_multiple_balanced_datasets(
    X_training_pool, y_training_pool, 
    n_datasets=10,  # build 10 datasets
    positive_sample_multiplier=1  # 1:1 balance
)


In [None]:

def train_logistic_regression_on_datasets(datasets, feature_names):
    """
    Train one L2-regularised logistic regression per balanced dataset.

    For each dataset this standardises the features, tunes C with a 5-fold grid
    search that optimises F2, reports 5-fold cross-validation metrics for the
    best estimator, fits a sigmoid-calibrated copy, and computes SHAP values.

    Parameters:
    - datasets: list of dicts with keys 'X', 'y' and 'dataset_id'
    - feature_names: feature names stored alongside the SHAP output

    Returns:
    - list of dicts, one per dataset, with keys 'dataset_id', 'model',
      'calibrated_model', 'scaler', 'best_C', 'cv_scores', 'mean_scores',
      'std_scores' and 'shap_result' (None when SHAP failed)

    NOTE(review): the scaler is fit on the whole balanced dataset before the
    cross-validation splits, so fold scores see scaling statistics from their own
    validation rows. Moving the scaler into a Pipeline would remove that, but
    would change what 'model' and 'scaler' hold for downstream cells.
    """
    # Function-scope imports: these names are not in the notebook's import cell.
    from sklearn.metrics import fbeta_score, make_scorer
    from sklearn.model_selection import cross_validate

    print(f"\n🤖 开始训练逻辑回归模型...")
    print("=" * 50)

    all_results = []

    # Loop-invariant configuration, built once instead of once per dataset
    C_values = [0.001, 0.01, 0.1, 1, 10, 100]
    f2_scorer = make_scorer(fbeta_score, beta=2, zero_division=0)
    param_grid = {
        'C': C_values,
        'l1_ratio': [0],  # elastic-net with l1_ratio=0 is pure L2
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'max_iter': [1000],
        'class_weight': ['balanced']
    }
    # Insertion order is the order in which metrics are reported below
    cv_scoring = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'f2': f2_scorer,
        'roc_auc': 'roc_auc'
    }

    for dataset in datasets:
        print(f"\n--- 训练数据集 {dataset['dataset_id']} ---")

        X_train = dataset['X']
        y_train = dataset['y']

        # Standardise features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        # 1. Tune the regularisation strength, optimising F2
        print("🔍 进行L2正则化参数调优（优化F2分数）...")
        grid_search = GridSearchCV(
            LogisticRegression(random_state=42),
            param_grid,
            cv=5,
            scoring=f2_scorer,
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(X_train_scaled, y_train)

        # GridSearchCV refits the best configuration on the full data (refit=True
        # is the default), so best_estimator_ is already the final fitted model.
        best_model = grid_search.best_estimator_
        best_C = grid_search.best_params_['C']
        best_f2_score = grid_search.best_score_
        print(f"最佳L2正则化参数 C = {best_C}")
        print(f"最佳F2分数 = {best_f2_score:.4f}")

        # 2. Cross-validated metrics: one multi-metric run fits each fold once
        #    instead of once per metric.
        cv_output = cross_validate(best_model, X_train_scaled, y_train, cv=5, scoring=cv_scoring)
        cv_scores = {metric: cv_output[f'test_{metric}'] for metric in cv_scoring}

        # 3. Probability calibration (Platt scaling / sigmoid, 3-fold)
        print("📊 进行概率校准...")
        calibrated_model = CalibratedClassifierCV(best_model, method='sigmoid', cv=3)
        calibrated_model.fit(X_train_scaled, y_train)

        # 4. SHAP analysis; best-effort, a failure is reported but does not abort training
        print("🔬 进行SHAP分析...")
        try:
            explainer = shap.LinearExplainer(best_model, X_train_scaled)
            shap_values = explainer.shap_values(X_train_scaled)

            # Mean absolute SHAP value per feature
            feature_importance = np.abs(shap_values).mean(axis=0)

            shap_result = {
                'explainer': explainer,
                'shap_values': shap_values,
                'feature_importance': feature_importance,
                'feature_names': feature_names
            }
        except Exception as e:
            print(f"SHAP分析出错: {e}")
            shap_result = None

        # 5. Collect everything produced for this dataset
        all_results.append({
            'dataset_id': dataset['dataset_id'],
            'model': best_model,
            'calibrated_model': calibrated_model,
            'scaler': scaler,
            'best_C': best_C,
            'cv_scores': cv_scores,
            'mean_scores': {metric: scores.mean() for metric, scores in cv_scores.items()},
            'std_scores': {metric: scores.std() for metric, scores in cv_scores.items()},
            'shap_result': shap_result
        })

        print(f"交叉验证结果 (5折):")
        for metric, scores in cv_scores.items():
            print(f"  {metric:10}: {scores.mean():.4f} (±{scores.std():.4f})")

    return all_results

def plot_calibration_curves(y_true, y_proba_orig, y_proba_calib):
    """Draw a three-panel comparison of the original and calibrated probabilities.

    Panels, left to right: reliability diagram, predicted-probability
    histograms, and ROC curves with AUC in the legend.
    """
    fig, (ax_reliability, ax_hist, ax_roc) = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: reliability diagram
    frac_pos_orig, mean_pred_orig = calibration_curve(y_true, y_proba_orig, n_bins=10)
    frac_pos_calib, mean_pred_calib = calibration_curve(y_true, y_proba_calib, n_bins=10)

    ax_reliability.plot(mean_pred_orig, frac_pos_orig, "s-", label="原始模型", color='red')
    ax_reliability.plot(mean_pred_calib, frac_pos_calib, "o-", label="校准后模型", color='blue')
    ax_reliability.plot([0, 1], [0, 1], "k--", label="完美校准")
    ax_reliability.set_xlabel("平均预测概率")
    ax_reliability.set_ylabel("实际正例比例")
    ax_reliability.set_title("可靠性图（校准曲线）")

    # Panel 2: distribution of predicted probabilities
    ax_hist.hist(y_proba_orig, bins=20, alpha=0.7, label='原始预测概率', color='red', density=True)
    ax_hist.hist(y_proba_calib, bins=20, alpha=0.7, label='校准后预测概率', color='blue', density=True)
    ax_hist.set_xlabel('预测概率')
    ax_hist.set_ylabel('密度')
    ax_hist.set_title('预测概率分布')

    # Panel 3: ROC curves
    fpr_orig, tpr_orig, _ = roc_curve(y_true, y_proba_orig)
    fpr_calib, tpr_calib, _ = roc_curve(y_true, y_proba_calib)
    auc_orig = roc_auc_score(y_true, y_proba_orig)
    auc_calib = roc_auc_score(y_true, y_proba_calib)

    ax_roc.plot(fpr_orig, tpr_orig, label=f'原始模型 (AUC = {auc_orig:.3f})', color='red')
    ax_roc.plot(fpr_calib, tpr_calib, label=f'校准后模型 (AUC = {auc_calib:.3f})', color='blue')
    ax_roc.plot([0, 1], [0, 1], 'k--', label='随机模型')
    ax_roc.set_xlabel('假正例率')
    ax_roc.set_ylabel('真正例率')
    ax_roc.set_title('ROC曲线')

    # Shared finishing touches
    for panel in (ax_reliability, ax_hist, ax_roc):
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


In [None]:
# Train one logistic-regression model per balanced dataset
# NOTE(review): `feature_names` is not defined in the visible cells; presumably it comes
# from the omitted feature-engineering cell — confirm.
training_results = train_logistic_regression_on_datasets(balanced_datasets, feature_names)


In [None]:
# 在最终验证集上测试模型性能
# =====================================

def evaluate_on_final_validation(training_results, X_final_val, y_final_val, feature_names=None):
    """
    Evaluate the averaged-probability ensemble on the held-out final validation set.

    Every model in `training_results` scores the validation features (scaled
    with that model's own scaler); the per-model positive-class probabilities
    are averaged and then swept over thresholds 0.1 .. 0.9.

    Parameters:
    - training_results: list of dicts holding at least 'model' and 'scaler'
    - X_final_val: validation features
    - y_final_val: validation labels (1 = positive, 0 = negative)
    - feature_names: optional names stored with the SHAP output; defaults to
      the columns of X_final_val when it has any

    Returns:
    - dict with keys 'ensemble_proba', 'ensemble_pred' (threshold 0.5),
      'threshold_results', 'best_recall', 'best_f1', 'positive_probas',
      'negative_probas' and 'shap_result' (None when SHAP failed)

    Raises:
    - ValueError: when training_results is empty
    """
    if not training_results:
        raise ValueError("training_results is empty; there is no model to evaluate")

    print(f"最终验证集评估（真正的未见过数据）")

    # Plain array so boolean masks line up positionally with the probabilities
    y_true = np.asarray(y_final_val)
    n_total = len(y_true)
    n_positive = int(y_true.sum())

    if feature_names is None and hasattr(X_final_val, 'columns'):
        feature_names = list(X_final_val.columns)

    # Positive-class probability from every model, each with its own scaler
    all_predictions = []
    for result in training_results:
        X_val_scaled = result['scaler'].transform(X_final_val)
        all_predictions.append(result['model'].predict_proba(X_val_scaled)[:, 1])

    # Ensemble = mean probability across models
    ensemble_proba = np.mean(all_predictions, axis=0)
    ensemble_pred = (ensemble_proba >= 0.5).astype(int)

    print(f"最终验证集基本信息:")
    print(f"   总样本数: {n_total}")
    print(f"   正样本数: {n_positive}")
    print(f"   负样本数: {n_total - n_positive}")
    print(f"   转化率: {y_true.mean():.6f}")

    print(f"\n📈 最终验证集在不同阈值下的性能:")
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    final_threshold_results = []

    print("阈值    精确率   召回率    F1分数   识别转化用户数  预测正样本数")
    print("-" * 70)

    for threshold in thresholds:
        pred_at_threshold = (ensemble_proba >= threshold).astype(int)

        precision = precision_score(y_true, pred_at_threshold, zero_division=0)
        recall = recall_score(y_true, pred_at_threshold, zero_division=0)
        f1 = f1_score(y_true, pred_at_threshold, zero_division=0)
        # Count true positives directly; int(recall * n_positive) can truncate
        # one too low because of floating-point rounding.
        identified_converted = int(np.sum((pred_at_threshold == 1) & (y_true == 1)))
        predicted_positive = int(pred_at_threshold.sum())

        final_threshold_results.append({
            'threshold': threshold,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'identified_converted': identified_converted,
            'predicted_positive': predicted_positive
        })

        print(f"{threshold:.1f}     {precision:.4f}   {recall:.4f}    {f1:.4f}        {identified_converted}/{n_positive}           {predicted_positive}")

    # Best rows by recall and by F1 (the first maximum wins on ties)
    best_recall_result = max(final_threshold_results, key=lambda x: x['recall'])
    best_f1_result = max(final_threshold_results, key=lambda x: x['f1'])

    print(f"最终验证集最佳性能:")
    print(f"最高召回率: {best_recall_result['recall']:.4f} (阈值: {best_recall_result['threshold']:.1f})")
    print(f"   识别转化用户: {best_recall_result['identified_converted']}/{n_positive}")
    print(f"   对应精确率: {best_recall_result['precision']:.4f}")
    print(f"   预测正样本数: {best_recall_result['predicted_positive']}")

    print(f"\n最高F1分数: {best_f1_result['f1']:.4f} (阈值: {best_f1_result['threshold']:.1f})")
    print(f"   召回率: {best_f1_result['recall']:.4f}")
    print(f"   精确率: {best_f1_result['precision']:.4f}")
    print(f"   识别转化用户: {best_f1_result['identified_converted']}/{n_positive}")
    print(f"   预测正样本数: {best_f1_result['predicted_positive']}")

    # Predicted probabilities split by true class
    print(f"预测置信度分析:")
    positive_probas = ensemble_proba[y_true == 1]
    negative_probas = ensemble_proba[y_true == 0]

    print(f"真正转化用户的平均预测概率: {positive_probas.mean():.4f}")
    print(f"真正转化用户的预测概率范围: [{positive_probas.min():.4f}, {positive_probas.max():.4f}]")
    print(f"真正未转化用户的平均预测概率: {negative_probas.mean():.4f}")
    print(f"真正未转化用户的预测概率范围: [{negative_probas.min():.4f}, {negative_probas.max():.4f}]")

    plt.figure(figsize=(12, 5))

    # Left: probability distributions per true class
    plt.subplot(1, 2, 1)
    plt.hist(positive_probas, bins=20, alpha=0.7, label='转化用户', color='red', density=True)
    plt.hist(negative_probas, bins=20, alpha=0.7, label='未转化用户', color='blue', density=True)
    plt.xlabel('预测概率')
    plt.ylabel('密度')
    plt.title('最终验证集：预测概率分布')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Right: confusion matrix at the best-F1 threshold
    plt.subplot(1, 2, 2)
    best_pred = (ensemble_proba >= best_f1_result['threshold']).astype(int)
    cm = confusion_matrix(y_true, best_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    # A real newline: the escaped '\\n' rendered a literal backslash-n in the title
    plt.title(f'最终验证集混淆矩阵\n(阈值={best_f1_result["threshold"]:.1f})')
    plt.xlabel('预测值')
    plt.ylabel('实际值')

    plt.tight_layout()
    plt.show()

    # SHAP analysis on the last trained model. The model was fit on scaled
    # features, so the validation data goes through that model's scaler first.
    print("🔬 进行SHAP分析...")
    try:
        last_result = training_results[-1]
        X_val_scaled_for_shap = last_result['scaler'].transform(X_final_val)
        explainer = shap.LinearExplainer(last_result['model'], X_val_scaled_for_shap)
        shap_values = explainer.shap_values(X_val_scaled_for_shap)

        # Mean absolute SHAP value per feature
        feature_importance = np.abs(shap_values).mean(axis=0)

        shap_result = {
            'explainer': explainer,
            'shap_values': shap_values,
            'feature_importance': feature_importance,
            'feature_names': feature_names
        }

        # Short ranking instead of dumping the full SHAP arrays
        if feature_names is not None and len(feature_names) == len(feature_importance):
            print("SHAP特征重要性 Top 10:")
            for idx in np.argsort(feature_importance)[::-1][:10]:
                print(f"   {feature_names[idx]}: {feature_importance[idx]:.6f}")
    except Exception as e:
        # Best-effort: the evaluation result is still returned without SHAP
        print(f"SHAP分析出错: {e}")
        shap_result = None

    return {
        'ensemble_proba': ensemble_proba,
        'ensemble_pred': ensemble_pred,
        'threshold_results': final_threshold_results,
        'best_recall': best_recall_result,
        'best_f1': best_f1_result,
        'positive_probas': positive_probas,
        'negative_probas': negative_probas,
        'shap_result': shap_result
    }

# Run the evaluation on the held-out final validation set
print("开始最终验证集评估...")
final_validation_results = evaluate_on_final_validation(
    training_results, 
    X_final_validation, 
    y_final_validation
)
# Alias: the later cells read the evaluation output as `ensemble_results`
ensemble_results = final_validation_results

In [None]:
# Aliases: later cells refer to the final validation set as the "full test" set
X_full_test = X_final_validation
y_full_test = y_final_validation

In [None]:
# 定义缺失的函数和生成策略信息
# =====================================

def generate_threshold_strategies(ensemble_results):
    """
    Pick four named threshold strategies from the ensemble's threshold sweep.

    Reads ensemble_results['threshold_results'] (a list of per-threshold metric
    dicts) and returns a list of (strategy_name, metrics_dict) pairs in this
    order: best recall, best F1, best precision, business balance. The business
    balance entry is the highest-recall row among rows with F1 >= 0.3, falling
    back to the best-F1 row when no row qualifies. Ties keep the earliest row.
    """
    sweep_rows = ensemble_results['threshold_results']

    def _top_by(metric, candidates):
        # max() returns the first maximal element, so ties keep the earliest row
        return max(candidates, key=lambda row: row[metric])

    best_recall = _top_by('recall', sweep_rows)
    best_f1 = _top_by('f1', sweep_rows)
    best_precision = _top_by('precision', sweep_rows)

    # Business balance: among rows with F1 of at least 0.3, take the highest recall
    viable_rows = [row for row in sweep_rows if row['f1'] >= 0.3]
    if viable_rows:
        business_balanced = _top_by('recall', viable_rows)
    else:
        business_balanced = best_f1

    strategies = [
        ('最高召回率策略', best_recall),
        ('最高F1策略', best_f1),
        ('最高精确率策略', best_precision),
        ('业务平衡策略', business_balanced),
    ]

    print("🎯 生成的阈值策略:")
    print("="*60)
    for label, row in strategies:
        print(f"{label}:")
        print(f"  阈值: {row['threshold']:.1f}")
        print(f"  召回率: {row['recall']:.4f}")
        print(f"  精确率: {row['precision']:.4f}")
        print(f"  F1分数: {row['f1']:.4f}")
        print(f"  识别转化用户: {row['identified_converted']}")
        print("")

    return strategies

def display_final_confusion_matrices(ensemble_results, y_test):
    """
    Draw side-by-side confusion matrices for the best-recall and best-F1 thresholds.

    The strategies are generated from `ensemble_results`, the ensemble
    probabilities are thresholded per strategy, and the full list of strategies
    is returned.
    """
    print("📊 显示关键阈值的混淆矩阵")
    print("="*50)

    strategies = generate_threshold_strategies(ensemble_results)

    # Only the first two strategies (best recall, best F1) are drawn
    recall_strategy, f1_strategy = strategies[0], strategies[1]
    key_strategies = [recall_strategy, f1_strategy]
    n_panels = len(key_strategies)

    fig, axes = plt.subplots(1, n_panels, figsize=(6*n_panels, 5))
    # Flatten so a single Axes and an array of Axes are handled the same way
    panel_axes = list(np.ravel(axes))

    ensemble_proba = ensemble_results['ensemble_proba']
    for ax, (name, strategy) in zip(panel_axes, key_strategies):
        threshold = strategy['threshold']
        predictions = (ensemble_proba >= threshold).astype(int)

        sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'{name}\n阈值={threshold:.1f}')
        ax.set_xlabel('预测值')
        ax.set_ylabel('实际值')

    plt.tight_layout()
    plt.show()

    return strategies

# Build the threshold strategies from the ensemble evaluation results
print("🔧 修复缺失的变量...")
final_strategies = generate_threshold_strategies(ensemble_results)

print("\n✅ 策略信息生成完成！")


In [None]:
# 定义集成预测器类 (移到模块级别以支持pickle序列化)
# =========================================================

class EnsemblePredictor:
    """
    Ensemble predictor that averages several trained logistic-regression models.

    Attributes:
    - models: list of dicts, each with a 'model' (exposes predict_proba) and
      the 'scaler' (exposes transform) fitted with it
    - feature_names: column names selected, in this order, from DataFrame input
    - threshold_strategies: sequence of (strategy_name, metrics_dict) pairs;
      each metrics_dict carries a 'threshold'
    - n_models: number of models in the ensemble
    """

    # Canonical strategy names and their historical positions in threshold_strategies
    _LEGACY_STRATEGY_ORDER = ('最高召回率策略', '最高F1策略', '最高精确率策略', '业务平衡策略')
    # Threshold used when a strategy name cannot be resolved
    _DEFAULT_THRESHOLD = 0.5

    def __init__(self, models, feature_names, threshold_strategies):
        self.models = models
        self.feature_names = feature_names
        self.threshold_strategies = threshold_strategies
        self.n_models = len(models)

    def predict_proba(self, X):
        """Return the positive-class probability averaged over all models."""
        if isinstance(X, pd.DataFrame):
            # Enforce the training-time column order
            X = X[self.feature_names].values

        all_probas = []
        for model_info in self.models:
            X_scaled = model_info['scaler'].transform(X)
            all_probas.append(model_info['model'].predict_proba(X_scaled)[:, 1])

        return np.mean(all_probas, axis=0)

    def predict(self, X, threshold=0.5):
        """Return 0/1 predictions: 1 where the averaged probability >= threshold."""
        probas = self.predict_proba(X)
        return (probas >= threshold).astype(int)

    def _threshold_for(self, strategy_name):
        """Resolve a strategy name to its threshold, falling back to 0.5."""
        stored = self.threshold_strategies or []

        # Preferred: match on the name stored with each strategy
        for name, info in stored:
            if name == strategy_name:
                return info['threshold']

        # Legacy behaviour: a canonical name maps to a fixed position
        if strategy_name in self._LEGACY_STRATEGY_ORDER:
            position = self._LEGACY_STRATEGY_ORDER.index(strategy_name)
            if position < len(stored):
                return stored[position][1]['threshold']

        return self._DEFAULT_THRESHOLD

    def predict_with_strategy(self, X, strategy_name):
        """
        Predict with the threshold of a named strategy.

        Returns a (predictions, probabilities) tuple. An unknown strategy name
        falls back to a threshold of 0.5.
        """
        probas = self.predict_proba(X)
        threshold = self._threshold_for(strategy_name)
        return (probas >= threshold).astype(int), probas

    def get_feature_names(self):
        """Return the feature names the ensemble expects."""
        return self.feature_names

    def get_model_info(self):
        """Return the model count, feature names and available strategy names."""
        stored_names = [name for name, _ in (self.threshold_strategies or [])]
        return {
            'n_models': self.n_models,
            'feature_names': self.feature_names,
            # Names actually stored; the canonical list when none are stored
            'available_strategies': stored_names or list(self._LEGACY_STRATEGY_ORDER)
        }

# Confirmation message printed once the class definition cell has run
print("✅ EnsemblePredictor 类定义完成，支持pickle序列化")


In [None]:
# 修复版：保存模型（使用全局EnsemblePredictor类）
# =============================================
import os
import pickle
import joblib
from datetime import datetime
def save_ensemble_model_fixed(training_results, ensemble_results, feature_names, strategies):
    """
    Save the trained ensemble and its supporting files to a timestamped directory.

    Written into ./saved_models_<timestamp>/:
    - model_<dataset_id>.pkl: one file per individual model (model, scaler, CV scores)
    - ensemble_predictor.pkl: an EnsemblePredictor wrapping every model
    - model_metadata.pkl: metadata, evaluation results and file locations
    - 使用指南.txt: a usage guide with a per-strategy performance summary

    Parameters:
    - training_results: list of per-dataset training result dicts
    - ensemble_results: evaluation output stored inside the metadata
    - feature_names: feature names the models expect
    - strategies: list of (strategy_name, metrics_dict) pairs

    Returns:
    - (model_dir, ensemble_predictor)
    """
    print(f"\n💾 保存模型...")
    print("="*50)

    n_models = len(training_results)

    # Timestamped output directory, so repeated runs never overwrite each other
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_dir = f"./saved_models_{timestamp}"
    os.makedirs(model_dir, exist_ok=True)

    # 1. Individual models
    print(f"保存{n_models}个单独的逻辑回归模型...")
    individual_models = {}
    for result in training_results:
        model_id = result['dataset_id']
        model_info = {
            'model': result['model'],
            'scaler': result['scaler'],
            'cv_scores': result['cv_scores'],
            'mean_scores': result['mean_scores'],
            'std_scores': result['std_scores']
        }

        model_filename = f"{model_dir}/model_{model_id}.pkl"
        joblib.dump(model_info, model_filename)
        individual_models[f'model_{model_id}'] = model_filename
        print(f"  ✅ 模型 {model_id} 已保存: {model_filename}")

    # 2. Ensemble predictor (uses the module-level EnsemblePredictor class)
    print("\n保存集成模型预测器...")

    ensemble_predictor = EnsemblePredictor(
        models=[{'model': r['model'], 'scaler': r['scaler']} for r in training_results],
        feature_names=feature_names,
        threshold_strategies=strategies
    )

    ensemble_filename = f"{model_dir}/ensemble_predictor.pkl"
    joblib.dump(ensemble_predictor, ensemble_filename)
    print(f"  ✅ 集成预测器已保存: {ensemble_filename}")

    # 3. Metadata
    print("\n保存模型元信息...")
    model_metadata = {
        'timestamp': timestamp,
        'model_type': 'Multiple Undersampling Logistic Regression Ensemble',
        'n_models': n_models,
        'feature_names': feature_names,
        'n_features': len(feature_names),
        # NOTE(review): hard-coded description; other cells in this notebook mention
        # 119 positives, so confirm which count is right.
        'training_strategy': '149正样本 + 10次随机149负样本',
        # NOTE(review): ensemble_results is pickled as-is, including whatever
        # SHAP objects it carries — confirm that is wanted.
        'ensemble_results': ensemble_results,
        'threshold_strategies': {name: result for name, result in strategies},
        'individual_model_files': individual_models,
        'ensemble_file': ensemble_filename,
        'model_directory': model_dir
    }

    metadata_filename = f"{model_dir}/model_metadata.pkl"
    with open(metadata_filename, 'wb') as f:
        pickle.dump(model_metadata, f)
    print(f"  ✅ 模型元信息已保存: {metadata_filename}")

    # 4. Usage guide
    print("\n生成使用说明...")
    usage_guide = f'''# 模型使用指南
保存时间: {timestamp}

## 文件说明:
- ensemble_predictor.pkl: 集成预测器（推荐使用）
- model_1.pkl ~ model_{n_models}.pkl: {n_models}个单独的逻辑回归模型
- model_metadata.pkl: 模型元信息和性能数据

## 使用示例:

```python
import joblib
import pandas as pd

# 加载集成预测器
ensemble_model = joblib.load('{ensemble_filename}')

# 准备新数据（确保特征顺序与训练时一致）
# X_new = your_new_data[feature_names]

# 方法1: 使用默认阈值预测
predictions = ensemble_model.predict(X_new)
probabilities = ensemble_model.predict_proba(X_new)

# 方法2: 使用特定策略预测
predictions, probabilities = ensemble_model.predict_with_strategy(X_new, '最高召回率策略')

# 可用策略:
# - '最高召回率策略': 最大化转化用户识别
# - '最高F1策略': 平衡精确率和召回率  
# - '最高精确率策略': 最小化误判
# - '业务平衡策略': 适中的业务平衡点
```

## 性能总结:
'''

    # One line per strategy; a real newline keeps each entry on its own line
    for name, result in strategies:
        usage_guide += f"- {name}: 阈值{result['threshold']:.1f}, 召回率{result['recall']:.3f}, 精确率{result['precision']:.3f}\n"

    with open(f"{model_dir}/使用指南.txt", 'w', encoding='utf-8') as f:
        f.write(usage_guide)
    print(f"  ✅ 使用指南已保存: {model_dir}/使用指南.txt")

    # Directory listing with file sizes in KB
    print(f"\n🎉 所有模型文件已保存到目录: {model_dir}")
    print(f"📁 目录包含:")
    for entry in os.listdir(model_dir):
        entry_path = os.path.join(model_dir, entry)
        size_kb = os.path.getsize(entry_path) / 1024
        print(f"   📄 {entry} ({size_kb:.1f} KB)")

    return model_dir, ensemble_predictor

# Re-run the model save with the fixed save function
print("🔧 使用修复版函数重新保存模型...")

# Reuse `final_strategies` when an earlier cell already defined it; otherwise
# rebuild it, which also draws the confusion matrices.
try:
    final_strategies
    print("✅ 混淆矩阵已显示")
except NameError:
    print("⚠️  重新生成混淆矩阵...")
    final_strategies = display_final_confusion_matrices(ensemble_results, y_full_test)

# Save everything with the fixed function
model_directory_fixed, saved_ensemble_model_fixed = save_ensemble_model_fixed(
    training_results, ensemble_results, feature_names, final_strategies
)

# A real newline: the escaped "\\n" printed a literal backslash-n
print(f"\n✅ 模型保存完成！目录: {model_directory_fixed}")
print(f"🎉 现在可以安全地加载和使用模型了！")


In [None]:
# 可视化分析和结果汇总
# =======================

def plot_comprehensive_analysis(training_results, ensemble_results, y_full_test):
    """
    Draw a 2x3 summary figure of training and ensemble performance.

    Top row: per-dataset cross-validation recall, precision and F1 with
    standard-deviation error bars. Bottom row: metric curves over the threshold
    sweep, identified converted users per threshold, and the confusion matrix
    at the best-F1 threshold.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # --- Top row: cross-validation metrics per dataset ---
    dataset_ids = [entry['dataset_id'] for entry in training_results]
    cv_panels = [
        (axes[0, 0], 'recall', 'o', '各数据集召回率对比', '召回率'),
        (axes[0, 1], 'precision', 's', '各数据集精确率对比', '精确率'),
        (axes[0, 2], 'f1', '^', '各数据集F1分数对比', 'F1分数'),
    ]
    for ax, metric, marker, title, y_label in cv_panels:
        means = [entry['mean_scores'][metric] for entry in training_results]
        spreads = [entry['std_scores'][metric] for entry in training_results]
        ax.errorbar(dataset_ids, means, yerr=spreads,
                    marker=marker, label=metric, linewidth=2, markersize=6)
        ax.set_title(title, fontsize=14)
        ax.set_xlabel('数据集ID')
        ax.set_ylabel(y_label)
        ax.grid(True, alpha=0.3)
        ax.legend()

    # --- Bottom left: metrics across the threshold sweep ---
    sweep = ensemble_results['threshold_results']
    thresholds = [row['threshold'] for row in sweep]
    recalls = [row['recall'] for row in sweep]
    precisions = [row['precision'] for row in sweep]
    f1s = [row['f1'] for row in sweep]

    curve_ax = axes[1, 0]
    for values, fmt, label in ((recalls, 'o-', '召回率'),
                               (precisions, 's-', '精确率'),
                               (f1s, '^-', 'F1分数')):
        curve_ax.plot(thresholds, values, fmt, label=label, linewidth=2, markersize=6)
    curve_ax.set_xlabel('预测阈值')
    curve_ax.set_ylabel('性能指标')
    curve_ax.set_title('集成模型阈值性能曲线', fontsize=14)
    curve_ax.legend()
    curve_ax.grid(True, alpha=0.3)

    # --- Bottom middle: identified converted users per threshold ---
    identified_counts = [row['identified_converted'] for row in sweep]
    total_converted = y_full_test.sum()
    tick_positions = range(len(thresholds))

    count_ax = axes[1, 1]
    count_ax.bar(tick_positions, identified_counts, alpha=0.7, color='skyblue')
    count_ax.axhline(y=total_converted, color='red', linestyle='--',
                     label=f'总转化用户数: {total_converted}')
    count_ax.set_xlabel('阈值索引')
    count_ax.set_ylabel('识别的转化用户数')
    count_ax.set_title('不同阈值识别的转化用户数', fontsize=14)
    count_ax.set_xticks(tick_positions)
    count_ax.set_xticklabels([f'{t:.1f}' for t in thresholds], rotation=45)
    count_ax.legend()
    count_ax.grid(True, alpha=0.3)

    # --- Bottom right: confusion matrix at the best-F1 threshold ---
    best_threshold = thresholds[np.argmax(f1s)]
    best_predictions = (ensemble_results['ensemble_proba'] >= best_threshold).astype(int)

    cm_ax = axes[1, 2]
    sns.heatmap(confusion_matrix(y_full_test, best_predictions),
                annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_title(f'混淆矩阵 (阈值={best_threshold:.1f})', fontsize=14)
    cm_ax.set_xlabel('预测值')
    cm_ax.set_ylabel('实际值')

    plt.tight_layout()
    plt.show()


# Draw the summary figure for the training results and the ensemble evaluation
plot_comprehensive_analysis(training_results, ensemble_results, y_full_test)


In [None]:
# 特征重要性分析（包含置信度计算）
# ==========================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

def analyze_feature_importance_with_confidence(training_results, X_test, y_test, feature_names,
                                               n_permutation_models=10):
    """
    Rank features by combining coefficient magnitude with permutation importance.

    Parameters:
    - training_results: list of training result dicts with 'model' and 'scaler'
    - X_test: test features
    - y_test: test labels
    - feature_names: feature names, in the models' column order
    - n_permutation_models: how many of the first models to use for the
      permutation-importance step (default 10)

    Returns:
    - DataFrame sorted by 'combined_importance' (descending) with columns
      feature, coef_importance, coef_std, coef_cv, confidence_lower,
      confidence_upper, perm_importance, perm_std, combined_importance and
      stability_score

    Raises:
    - ValueError: when training_results is empty
    """
    if not training_results:
        raise ValueError("training_results is empty; there are no models to analyse")

    print("🔍 开始特征重要性分析...")
    print("=" * 60)

    # 1. Absolute logistic-regression coefficients, one row per model
    coefficients_matrix = np.array([
        np.abs(result['model'].coef_[0]) for result in training_results
    ])
    n_models = coefficients_matrix.shape[0]

    # 2. Coefficient statistics across models
    mean_coef = np.mean(coefficients_matrix, axis=0)
    std_coef = np.std(coefficients_matrix, axis=0)

    # 95% t-interval for the mean coefficient, computed for all features at once.
    # The confidence level is passed positionally because its keyword name
    # differs between SciPy versions.
    with np.errstate(invalid='ignore', divide='ignore'):
        sem_coef = stats.sem(coefficients_matrix, axis=0)
        ci_lower, ci_upper = stats.t.interval(
            0.95,
            df=n_models - 1,
            loc=mean_coef,
            scale=sem_coef
        )
    # With a single model or zero spread the interval is undefined (NaN);
    # collapse it to the mean so downstream plots still get finite values.
    ci_lower = np.where(np.isfinite(ci_lower), ci_lower, mean_coef)
    ci_upper = np.where(np.isfinite(ci_upper), ci_upper, mean_coef)

    # 3. Coefficient of variation as a stability indicator (epsilon avoids /0)
    coef_variation = std_coef / (mean_coef + 1e-10)

    # 4. Permutation importance on the first n_permutation_models models
    permutation_models = training_results[:n_permutation_models]
    n_used = len(permutation_models)
    print(f"🔄 计算置换重要性（使用{n_used}个模型验证）...")

    permutation_scores = []
    for i, result in enumerate(permutation_models):
        # Each model sees the test data through its own scaler
        X_test_scaled = result['scaler'].transform(X_test)

        perm_result = permutation_importance(
            result['model'], X_test_scaled, y_test,
            scoring='roc_auc',
            n_repeats=5,
            random_state=42
        )

        permutation_scores.append(perm_result.importances_mean)
        print(f"   模型 {i+1}/{n_used} 完成")

    permutation_matrix = np.array(permutation_scores)
    perm_mean = np.mean(permutation_matrix, axis=0)
    perm_std = np.std(permutation_matrix, axis=0)

    # 5. Combined table
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'coef_importance': mean_coef,
        'coef_std': std_coef,
        'coef_cv': coef_variation,
        'confidence_lower': ci_lower,
        'confidence_upper': ci_upper,
        'perm_importance': perm_mean,
        'perm_std': perm_std
    })

    def _min_max(series):
        # Min-max scale to [0, 1]; the epsilon guards a constant column
        return (series - series.min()) / (series.max() - series.min() + 1e-10)

    # Combined score: mean of the two min-max-normalised importances
    importance_df['combined_importance'] = (
        _min_max(importance_df['coef_importance']) + _min_max(importance_df['perm_importance'])
    ) / 2

    # Stability score: a smaller coefficient of variation gives a higher score
    importance_df['stability_score'] = 1 / (1 + importance_df['coef_cv'])

    importance_df = importance_df.sort_values('combined_importance', ascending=False)

    print("✅ 特征重要性分析完成！")

    return importance_df

def plot_feature_importance_analysis(importance_df, top_n=20):
    """
    Draw a 2x2 overview figure for the feature-importance analysis.

    Panels:
        1. coefficient importance with asymmetric error bars derived from
           ``confidence_lower`` / ``confidence_upper``;
        2. permutation importance with ``perm_std`` error bars;
        3. combined importance, bars coloured by ``stability_score``;
        4. coefficient vs. permutation importance scatter, annotated with the
           correlation coefficient between the two.

    Args:
        importance_df: DataFrame providing the columns ``feature``,
            ``coef_importance``, ``confidence_lower``, ``confidence_upper``,
            ``perm_importance``, ``perm_std``, ``combined_importance`` and
            ``stability_score``. The first ``top_n`` rows are plotted in the
            order given.
        top_n: number of leading rows to plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    top_features = importance_df.head(top_n)
    y_pos = np.arange(len(top_features))

    # One colour scale for stability, shared by the bars and the colorbar of
    # panel 3 so that a bar's colour can actually be read off the colorbar.
    stability = top_features['stability_score']
    stability_cmap = plt.cm.RdYlGn
    stability_norm = plt.Normalize(vmin=stability.min(), vmax=stability.max())

    fig, axes = plt.subplots(2, 2, figsize=(20, 16))

    # 1. Coefficient importance with confidence-interval error bars
    ax1 = axes[0, 0]
    coef_values = top_features['coef_importance'].to_numpy(dtype=float)
    lower_err = coef_values - top_features['confidence_lower'].to_numpy(dtype=float)
    upper_err = top_features['confidence_upper'].to_numpy(dtype=float) - coef_values
    # Error-bar lengths must be non-negative; clip so that an interval which
    # does not bracket the plotted value cannot abort the whole figure.
    xerr = np.clip(np.vstack([lower_err, upper_err]), 0, None)

    ax1.barh(y_pos, coef_values, xerr=xerr, alpha=0.7, capsize=3)
    ax1.set_yticks(y_pos)
    ax1.set_yticklabels(top_features['feature'], fontsize=10)
    ax1.set_xlabel('系数重要性（绝对值）')
    ax1.set_title(f'前{top_n}个特征的系数重要性（带95%置信区间）', fontsize=14, fontweight='bold')
    ax1.grid(True, alpha=0.3)

    # 2. Permutation importance
    ax2 = axes[0, 1]
    ax2.barh(y_pos, top_features['perm_importance'],
             xerr=top_features['perm_std'],
             alpha=0.7, capsize=3, color='orange')
    ax2.set_yticks(y_pos)
    ax2.set_yticklabels(top_features['feature'], fontsize=10)
    ax2.set_xlabel('置换重要性（ROC AUC损失）')
    ax2.set_title(f'前{top_n}个特征的置换重要性', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3)

    # 3. Combined importance, coloured by stability
    ax3 = axes[1, 0]
    bar_colors = stability_cmap(stability_norm(stability.to_numpy(dtype=float)))
    ax3.barh(y_pos, top_features['combined_importance'], color=bar_colors, alpha=0.8)
    ax3.set_yticks(y_pos)
    ax3.set_yticklabels(top_features['feature'], fontsize=10)
    ax3.set_xlabel('综合重要性得分')
    # A real newline here; the escaped form rendered a literal backslash-n in the title.
    ax3.set_title(f'前{top_n}个特征的综合重要性得分\n（颜色深浅表示稳定性）', fontsize=14, fontweight='bold')
    ax3.grid(True, alpha=0.3)

    sm = plt.cm.ScalarMappable(cmap=stability_cmap, norm=stability_norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax3)
    cbar.set_label('稳定性得分')

    # 4. Agreement between the two importance measures
    ax4 = axes[1, 1]
    scatter = ax4.scatter(top_features['coef_importance'],
                          top_features['perm_importance'],
                          c=stability,
                          s=100, alpha=0.7, cmap='RdYlGn')

    # Label only the first 10 points to keep the panel readable.
    for _, row in top_features.head(10).iterrows():
        ax4.annotate(row['feature'],
                     (row['coef_importance'], row['perm_importance']),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8, alpha=0.8)

    ax4.set_xlabel('系数重要性')
    ax4.set_ylabel('置换重要性')
    ax4.set_title('重要性方法一致性分析', fontsize=14, fontweight='bold')
    ax4.grid(True, alpha=0.3)

    correlation = np.corrcoef(top_features['coef_importance'],
                              top_features['perm_importance'])[0, 1]
    ax4.text(0.05, 0.95, f'相关系数: {correlation:.3f}',
             transform=ax4.transAxes, fontsize=12,
             bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

    plt.colorbar(scatter, ax=ax4, label='稳定性得分')

    plt.tight_layout()
    plt.show()

def generate_feature_importance_report(importance_df):
    """
    Print a text report on the feature-importance results.

    Sections printed: the first 10 rows in detail, per-category averages
    (categories are assigned by keyword match on the feature name), a
    stability breakdown by ``stability_score``, confidence-interval widths,
    and a one-line business interpretation for the first 3 rows.

    Args:
        importance_df: DataFrame providing the columns ``feature``,
            ``combined_importance``, ``coef_importance``, ``coef_std``,
            ``perm_importance``, ``perm_std``, ``stability_score``,
            ``confidence_lower`` and ``confidence_upper``. Rows are reported
            in the order given; the function does not sort them.

    Returns:
        The same ``importance_df`` object, unchanged.
    """
    # Section headers use a real newline; the previous escaped form printed a
    # literal backslash-n in front of every header.
    print("\n" + "="*80)
    print("🎯 特征重要性分析报告")
    print("="*80)

    # 1. First 10 rows in detail
    print("\n🏆 Top 10 最重要特征:")
    print("-" * 60)

    for rank, (_, row) in enumerate(importance_df.head(10).iterrows(), 1):
        print(f"{rank:2d}. {row['feature']}")
        print(f"     综合重要性: {row['combined_importance']:.4f}")
        print(f"     系数重要性: {row['coef_importance']:.4f} (±{row['coef_std']:.4f})")
        print(f"     置换重要性: {row['perm_importance']:.4f} (±{row['perm_std']:.4f})")
        print(f"     稳定性得分: {row['stability_score']:.4f}")
        print(f"     95%置信区间: [{row['confidence_lower']:.4f}, {row['confidence_upper']:.4f}]")
        print()

    # 2. Category breakdown
    print("\n📊 特征分类分析:")
    print("-" * 40)

    # Ordered rules: the first category with a keyword contained in the
    # feature name wins; anything unmatched goes to the catch-all bucket.
    category_rules = [
        ('通话相关', ['通话', '打通', '完播', '拨打']),
        ('转化模式', ['转化', '模式', '高']),
        ('渠道相关', ['渠道', '白名单']),
        ('时间相关', ['时段', '工作日', '周']),
    ]
    feature_categories = {name: [] for name, _ in category_rules}
    feature_categories['其他'] = []

    for feature, importance in zip(importance_df['feature'],
                                   importance_df['combined_importance']):
        category = next(
            (name for name, keywords in category_rules
             if any(keyword in feature for keyword in keywords)),
            '其他',
        )
        feature_categories[category].append((feature, importance))

    for category, features in feature_categories.items():
        if not features:
            continue
        avg_importance = np.mean([imp for _, imp in features])
        print(f"{category}: {len(features)}个特征, 平均重要性: {avg_importance:.4f}")
        top_feature = max(features, key=lambda item: item[1])
        print(f"   最重要: {top_feature[0]} ({top_feature[1]:.4f})")

    # 3. Stability breakdown
    print("\n📈 模型稳定性分析:")
    print("-" * 40)

    stability = importance_df['stability_score']
    high_stability = importance_df[stability > 0.8]
    medium_stability = importance_df[(stability > 0.6) & (stability <= 0.8)]
    low_stability = importance_df[stability <= 0.6]

    print(f"高稳定性特征 (>0.8): {len(high_stability)}个")
    print(f"中等稳定性特征 (0.6-0.8): {len(medium_stability)}个")
    print(f"低稳定性特征 (≤0.6): {len(low_stability)}个")

    if len(low_stability) > 0:
        print(f"\n⚠️  低稳定性特征:")
        for _, row in low_stability.head(5).iterrows():
            print(f"   {row['feature']}: 稳定性={row['stability_score']:.3f}")

    # 4. Confidence-interval widths
    print("\n🎯 置信度分析:")
    print("-" * 40)

    confidence_width = importance_df['confidence_upper'] - importance_df['confidence_lower']

    # "Narrow" / "wide" are relative to this data set: below the first and
    # above the third quartile of the interval widths.
    narrow_ci = importance_df[confidence_width < confidence_width.quantile(0.25)]
    wide_ci = importance_df[confidence_width > confidence_width.quantile(0.75)]

    print(f"高置信度特征 (窄置信区间): {len(narrow_ci)}个")
    print(f"低置信度特征 (宽置信区间): {len(wide_ci)}个")
    print(f"平均置信区间宽度: {confidence_width.mean():.4f}")

    # 5. Business interpretation of the first 3 rows
    print("\n💼 业务解释:")
    print("-" * 40)

    print("最重要的3个特征业务含义:")
    for rank, feature in enumerate(importance_df['feature'].head(3), 1):
        print(f"{rank}. {feature}")

        # Canned explanation chosen by substring match on the feature name.
        if '转化模式得分' in feature:
            print("   → 综合转化行为模式评分，是最强的转化预测指标")
        elif '通话' in feature:
            print("   → 通话相关行为，反映客户参与度和兴趣程度")
        elif '渠道' in feature or '白名单' in feature:
            print("   → 客户来源渠道，不同渠道的客户转化倾向不同")
        elif 'AI' in feature or '人工' in feature:
            print("   → 客户意向评估，直接反映转化可能性")
        else:
            print("   → 其他重要的客户行为或属性特征")

    print("\n" + "="*80)
    print("✅ 特征重要性分析报告生成完成！")
    print("="*80)

    return importance_df

# Run the feature-importance analysis.
# `training_results`, `X_full_test`, `y_full_test` and `feature_names` are not
# defined in this cell; they must already exist from earlier cells.
print("🚀 开始执行特征重要性分析...")
importance_results = analyze_feature_importance_with_confidence(
    training_results, 
    X_full_test, 
    y_full_test, 
    feature_names
)

# Visualise the results (first 20 rows of the returned frame)
plot_feature_importance_analysis(importance_results, top_n=20)

# Print the detailed report; the function returns the frame it was given,
# which later cells use under the name `final_importance_df`.
final_importance_df = generate_feature_importance_report(importance_results)


In [None]:
# Save the feature-importance results
# ===================================
# Writes `final_importance_df` to a timestamped Excel workbook with three
# sheets (full ranking, first-10 summary, per-category summary) and builds
# `importance_dict` (feature -> combined importance) for later cells.

import os
from datetime import datetime

# Output directory; exist_ok avoids the separate existence check.
save_dir = "特征重要性分析结果"
os.makedirs(save_dir, exist_ok=True)

# Timestamp keeps successive runs from overwriting each other.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
importance_file = f"{save_dir}/特征重要性分析_{timestamp}.xlsx"

# First-10 summary with display column names.
top_10_summary = final_importance_df.head(10)[['feature', 'combined_importance',
                                                'coef_importance', 'perm_importance',
                                                'stability_score']].copy()
top_10_summary.columns = ['特征名称', '综合重要性', '系数重要性', '置换重要性', '稳定性得分']

# Per-category summary. Ordered rules: the first category with a keyword
# contained in the feature name wins; unmatched features go to the last bucket.
category_rules = [
    ('通话相关', ['通话', '打通', '完播', '拨打']),
    ('转化模式', ['转化', '模式', '高']),
    ('渠道相关', ['渠道', '白名单']),
    ('时间相关', ['时段', '工作日', '周']),
]
feature_categories = {name: [] for name, _ in category_rules}
feature_categories['其他'] = []

for feature, importance in zip(final_importance_df['feature'],
                               final_importance_df['combined_importance']):
    category = next(
        (name for name, keywords in category_rules
         if any(keyword in feature for keyword in keywords)),
        '其他',
    )
    feature_categories[category].append(importance)

feature_categories_summary = [
    {
        '特征类别': category,
        '特征数量': len(importances),
        '平均重要性': np.mean(importances),
        '最高重要性': np.max(importances),
        '最低重要性': np.min(importances),
    }
    for category, importances in feature_categories.items()
    if importances
]
category_df = pd.DataFrame(feature_categories_summary)

# All frames are built before the writer is opened, so a failure while
# summarising cannot leave a half-written workbook behind.
with pd.ExcelWriter(importance_file, engine='openpyxl') as writer:
    final_importance_df.to_excel(writer, sheet_name='特征重要性排序', index=False)
    top_10_summary.to_excel(writer, sheet_name='Top10特征汇总', index=False)
    category_df.to_excel(writer, sheet_name='特征分类汇总', index=False)

# Real newlines below; the escaped form printed a literal backslash-n.
print(f"\n💾 特征重要性分析结果已保存到: {importance_file}")

# Simplified lookup for use by other cells: feature name -> combined importance.
importance_dict = dict(zip(final_importance_df['feature'],
                           final_importance_df['combined_importance']))

# Describe what was written.
print(f"\n📂 保存的文件包含以下内容:")
print(f"   📄 特征重要性排序: 包含{len(final_importance_df)}个特征的完整分析")
print(f"   📄 Top10特征汇总: 最重要的10个特征详情")
print(f"   📄 特征分类汇总: 按业务类别归类的重要性统计")

# Quick preview of the first 5 rows with their confidence intervals.
print(f"\n🏆 Top 5 特征重要性快速预览:")
print("=" * 70)
for i, (_, row) in enumerate(final_importance_df.head(5).iterrows(), 1):
    ci_width = row['confidence_upper'] - row['confidence_lower']
    print(f"{i}. {row['feature']}")
    print(f"   重要性: {row['combined_importance']:.4f}")
    print(f"   置信区间: [{row['confidence_lower']:.4f}, {row['confidence_upper']:.4f}] (宽度: {ci_width:.4f})")
    print(f"   稳定性: {row['stability_score']:.4f}")
    print()

print("✅ 特征重要性分析完成！可以在其他分析中使用 `importance_dict` 变量")


In [None]:
# Pull the SHAP feature-importance entry out of the final validation results
# and display it.
# NOTE(review): the variable name looks truncated ("importanc"); check for other
# uses in the notebook before renaming it.
shap_feature_importanc = final_validation_results['shap_result']['feature_importance'] 
shap_feature_importanc

In [None]:
# Display the feature names stored alongside the SHAP result.
final_validation_results['shap_result']['feature_names']  

In [None]:
# 1. Read the SHAP importance values and the feature names stored with them.
#    Both come from the same 'shap_result' entry, so this cell no longer depends
#    on a variable created in a different cell.
#    NOTE(review): presumably one importance value per feature name, in the
#    same order; confirm against the code that builds 'shap_result'.
shap_importance = final_validation_results['shap_result']['feature_importance']
shap_features = final_validation_results['shap_result']['feature_names']

# 2. Pair each SHAP value with the name reported alongside it, rather than with
#    the unrelated global `feature_names`, whose order is not guaranteed to match.
shap_df = pd.DataFrame({
    'feature': shap_features,
    'shap_importance': shap_importance
})

# 3. Attach the SHAP column to the importance table. Dropping a previous
#    'shap_importance' column first keeps the cell re-runnable; without it a
#    second run would produce 'shap_importance_x' / 'shap_importance_y'.
final_importance_df = (
    final_importance_df
    .drop(columns=['shap_importance'], errors='ignore')
    .merge(shap_df, on='feature', how='left')
)
final_importance_df

In [None]:
# 实际置信区间分析工具
# =====================

def analyze_confidence_intervals(importance_df):
    """
    Classify features by relative confidence-interval width and print advice.

    The relative width is ``(confidence_upper - confidence_lower) /
    (|combined_importance| + 1e-8)`` and is stored in a ``ci_width`` column.
    Features are bucketed as high (<= 0.02), medium (0.02-0.15] or low
    (> 0.15) confidence, and the 10 narrowest intervals are listed.

    Args:
        importance_df: DataFrame providing the columns ``feature``,
            ``combined_importance``, ``confidence_lower`` and
            ``confidence_upper``. It is not modified.

    Returns:
        A copy of ``importance_df`` with the added ``ci_width`` column.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    analysis_df = importance_df.copy()

    print("🔍 您的数据置信区间分析")
    print("=" * 50)

    # Relative interval width.
    # NOTE(review): the interval bounds appear to describe the coefficient
    # importance while the denominator is the combined score; confirm this
    # mix of scales is intended.
    absolute_width = analysis_df['confidence_upper'] - analysis_df['confidence_lower']
    analysis_df['ci_width'] = absolute_width / (analysis_df['combined_importance'].abs() + 1e-8)

    # Bucket features by relative width.
    ci_width = analysis_df['ci_width']
    high_confidence = analysis_df[ci_width <= 0.02]
    medium_confidence = analysis_df[(ci_width > 0.02) & (ci_width <= 0.15)]
    low_confidence = analysis_df[ci_width > 0.15]

    print(f"信度分类结果:")
    print(f" 高置信度特征: {len(high_confidence)}个 (宽度 ≤ 0.02)")
    print(f" 中等置信度特征: {len(medium_confidence)}个 (宽度 0.02-0.15)")
    print(f" 低置信度特征: {len(low_confidence)}个 (宽度 > 0.15)")

    print(f"\n🏆 最可信的Top 10特征 (置信区间最窄):")
    print("-" * 45)

    # The narrowest intervals are treated as the most trustworthy.
    most_reliable = analysis_df.nsmallest(10, 'ci_width')

    for i, (_, row) in enumerate(most_reliable.iterrows(), 1):
        print(f"{i}. {row['feature']}")
        print(f"   重要性: {row['combined_importance']:.4f}")
        print(f"   置信区间: [{row['confidence_lower']:.4f}, {row['confidence_upper']:.4f}]")
        print(f"   区间宽度: {row['ci_width']:.4f} ✅")
        print()

    if len(low_confidence) > 0:
        print(f"需要注意的低置信度特征:")
        print("-" * 35)

        # Only the first 3 low-confidence rows (in frame order) are shown.
        for i, (_, row) in enumerate(low_confidence.head(3).iterrows(), 1):
            print(f"{i}. {row['feature']}")
            print(f"   重要性: {row['combined_importance']:.4f}")
            print(f"   置信区间: [{row['confidence_lower']:.4f}, {row['confidence_upper']:.4f}]")
            print(f"   区间宽度: {row['ci_width']:.4f}")
            print(f"   → 建议: 这个特征的重要性不稳定，使用时需谨慎")
            print()

    # Overall recommendations
    print(f"综合建议:")
    print("-" * 15)

    # Core features: combined importance above 0.5 and relative width <= 0.05.
    reliable_important = analysis_df[
        (analysis_df['combined_importance'] > 0.5) &
        (analysis_df['ci_width'] <= 0.05)
    ]

    if len(reliable_important) > 0:
        print(f"推荐核心特征 ({len(reliable_important)}个): 重要性高且可信度高")
        for _, row in reliable_important.head(3).iterrows():
            print(f"   • {row['feature']}")

    # Features flagged for re-evaluation: relative width above 0.1.
    unreliable_features = analysis_df[analysis_df['ci_width'] > 0.1]
    if len(unreliable_features) > 0:
        print(f"建议重新评估 ({len(unreliable_features)}个): 置信区间过宽")
        print(f"   → 可能需要更多数据或特征工程优化")

    return analysis_df

def create_confidence_summary_table(importance_df):
    """
    Print and return a confidence summary for the first 10 rows.

    Six columns are taken from ``importance_df`` (which must already contain
    ``ci_width``), given display names, rounded to 4 decimals, and extended
    with a rating column derived from the interval width:
    <= 0.02 high, <= 0.15 medium, otherwise low.

    Args:
        importance_df: DataFrame with ``feature``, ``combined_importance``,
            ``confidence_lower``, ``confidence_upper``, ``ci_width`` and
            ``stability_score`` columns. It is not modified.

    Returns:
        The summary DataFrame that was printed.
    """
    print(f"置信度汇总表")
    print("=" * 80)

    # Source column -> display name, in output order.
    display_names = {
        'feature': '特征名称',
        'combined_importance': '综合重要性',
        'confidence_lower': '置信下限',
        'confidence_upper': '置信上限',
        'ci_width': '区间宽度',
        'stability_score': '稳定性得分',
    }

    table = importance_df[list(display_names)].head(10).copy()
    table.columns = list(display_names.values())

    # Rating is computed from the unrounded width, before the rounding step.
    table['可信度'] = table['区间宽度'].apply(
        lambda width: " 高" if width <= 0.02 else (" 中" if width <= 0.15 else " 低")
    )

    # Round every numeric display column to 4 decimals.
    for label in list(display_names.values())[1:]:
        table[label] = table[label].round(4)

    print(table.to_string(index=False))

    return table

# Run the confidence-interval analysis only when the importance results exist.
# Any failure is reported as a message instead of a traceback.
try:
    if 'final_importance_df' not in locals():
        print(" 请先运行特征重要性分析代码以获得 final_importance_df")
    else:
        print("🚀 分析您的实际特征重要性置信区间...")

        # Bucket the features by interval width, then tabulate the result.
        analyzed_df = analyze_confidence_intervals(final_importance_df)
        summary_table = create_confidence_summary_table(analyzed_df)

except Exception as analysis_error:
    print(f"❌ 分析过程中出错: {analysis_error}")
    print(" 请确保已经运行了特征重要性分析的代码")


In [None]:
# 绘制最可信的Top 10特征图表
# ============================

def plot_most_reliable_top10_features(importance_df):
    """
    Plot a 2x2 figure for the 10 features with the highest SHAP importance.

    Panels: combined importance annotated with the relative CI width, the CI
    widths themselves, the stability scores, and combined importance against
    a reliability score (1 / CI width).

    Args:
        importance_df: DataFrame providing ``feature``, ``combined_importance``,
            ``confidence_lower``, ``confidence_upper``, ``stability_score`` and
            ``shap_importance``. ``ci_width`` is computed when absent. The
            frame is not modified.

    Returns:
        DataFrame with the selected rows (at most 10), including ``ci_width``.

    Raises:
        KeyError: if a required column such as ``shap_importance`` is missing.
        ValueError: if ``importance_df`` has no rows.
    """
    print("绘制最可信的Top 10特征...")

    # Work on a copy so the caller's frame is left untouched.
    ranked_df = importance_df.copy()

    # Relative CI width: interval width divided by |combined importance|.
    if 'ci_width' not in ranked_df.columns:
        ranked_df['ci_width'] = (
            (ranked_df['confidence_upper'] - ranked_df['confidence_lower'])
            / (ranked_df['combined_importance'].abs() + 1e-8)
        )

    # "Top 10 by SHAP" means the LARGEST SHAP importances; nsmallest picked
    # the ten least important features instead.
    most_reliable_top10 = ranked_df.nlargest(10, 'shap_importance').copy()
    n_selected = len(most_reliable_top10)
    if n_selected == 0:
        # Fail before any figure is created; the axis limits below need data.
        raise ValueError("importance_df 中没有可绘制的特征")

    fig, axes = plt.subplots(2, 2, figsize=(20, 16))

    # One colour per selected feature, assigned by position in the ranking.
    colors = plt.cm.RdYlGn(np.linspace(0.3, 1.0, n_selected))
    positions = range(n_selected)
    numbered_labels = [f'特征{i+1}' for i in positions]

    # 1. Combined importance, annotated with the relative CI width
    ax1 = axes[0, 0]
    y_pos = np.arange(n_selected)

    ax1.barh(y_pos, most_reliable_top10['combined_importance'],
             color=colors, alpha=0.8, height=0.6)

    for i, (_, row) in enumerate(most_reliable_top10.iterrows()):
        ax1.text(row['combined_importance'] + 0.002, i,
                 f'{row["ci_width"]:.4f}',
                 verticalalignment='center', fontsize=9,
                 bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.7))

    ax1.set_yticks(y_pos)
    ax1.set_yticklabels(most_reliable_top10['feature'], fontsize=11)
    ax1.set_xlabel('综合重要性得分', fontsize=12, fontweight='bold')
    ax1.set_title('SHAPTop 10特征重要性排序(表示95%置信区间宽度,越小越可信)',
                  fontsize=14, fontweight='bold')
    ax1.grid(True, alpha=0.3)
    ax1.set_xlim(0, max(most_reliable_top10['combined_importance']) * 1.2)

    # 2. CI width comparison
    ax2 = axes[0, 1]

    bars2 = ax2.bar(positions, most_reliable_top10['ci_width'],
                    color=colors, alpha=0.8)

    ax2.set_xticks(positions)
    ax2.set_xticklabels(numbered_labels, rotation=45, ha='right')
    ax2.set_ylabel('置信区间宽度', fontsize=12, fontweight='bold')
    ax2.set_title('Top 10特征的置信区间宽度(越小表示越可信)',
                  fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3)

    # Value label on top of each bar
    for bar, width in zip(bars2, most_reliable_top10['ci_width']):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.0005,
                 f'{width:.4f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

    # Reference lines for the confidence thresholds
    ax2.axhline(y=0.02, color='green', linestyle='--', alpha=0.7, linewidth=2,
                label='高可信度线 (< 0.02)')
    ax2.axhline(y=0.05, color='orange', linestyle='--', alpha=0.7, linewidth=2,
                label='中等可信度线 (< 0.05)')
    ax2.legend()

    # 3. Stability score comparison
    ax3 = axes[1, 0]

    bars3 = ax3.bar(positions, most_reliable_top10['stability_score'],
                    color=colors, alpha=0.8)

    ax3.set_xticks(positions)
    ax3.set_xticklabels(numbered_labels, rotation=45, ha='right')
    ax3.set_ylabel('稳定性得分', fontsize=12, fontweight='bold')
    ax3.set_title('Top 10特征的稳定性得分(越高表示越稳定)',
                  fontsize=14, fontweight='bold')
    ax3.grid(True, alpha=0.3)
    ax3.set_ylim(0, 1.1)

    for bar, score in zip(bars3, most_reliable_top10['stability_score']):
        ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                 f'{score:.3f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

    # 4. Combined importance vs. reliability
    ax4 = axes[1, 1]

    # Reliability is the inverse CI width; the small offset avoids division by zero.
    reliability_score = 1 / (most_reliable_top10['ci_width'] + 0.001)

    scatter = ax4.scatter(most_reliable_top10['combined_importance'],
                          reliability_score,
                          c=most_reliable_top10['stability_score'],
                          s=200, alpha=0.8, cmap='RdYlGn', edgecolors='black')

    for i, (_, row) in enumerate(most_reliable_top10.iterrows()):
        ax4.annotate(f'特征{i+1}',
                     (row['combined_importance'], reliability_score.iloc[i]),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=10, fontweight='bold')

    ax4.set_xlabel('综合重要性得分', fontsize=12, fontweight='bold')
    ax4.set_ylabel('可信度得分 (1/置信区间宽度)', fontsize=12, fontweight='bold')
    ax4.set_title('特征重要性 vs 可信度分析(颜色表示稳定性，越绿越稳定)',
                  fontsize=14, fontweight='bold')
    ax4.grid(True, alpha=0.3)

    cbar = plt.colorbar(scatter, ax=ax4)
    cbar.set_label('稳定性得分', fontsize=11)

    plt.tight_layout()
    plt.show()

    # Legend mapping the numbered labels used in the figure to feature names.
    # Real newlines below; the escaped form printed a literal backslash-n.
    print("\n 特征编号对照表:")
    print("=" * 60)
    for i, (_, row) in enumerate(most_reliable_top10.iterrows(), 1):
        print(f"特征{i}: {row['feature']}")

    print("\n 最可信Top 10特征统计:")
    print("=" * 40)
    print(f"平均重要性得分: {most_reliable_top10['combined_importance'].mean():.4f}")
    print(f"平均置信区间宽度: {most_reliable_top10['ci_width'].mean():.4f}")
    print(f"平均稳定性得分: {most_reliable_top10['stability_score'].mean():.4f}")

    # Ratios use the number of rows actually selected, which is below 10 when
    # the input has fewer than 10 features.
    high_confidence_count = len(most_reliable_top10[most_reliable_top10['ci_width'] <= 0.02])
    print(f"\n高可信度特征数量: {high_confidence_count}/{n_selected}")
    print(f"可信度优秀率: {high_confidence_count/n_selected*100:.1f}%")

    return most_reliable_top10

# Draw the top-10 chart only when the importance results exist.
# Any failure is reported as a message instead of a traceback.
try:
    if 'final_importance_df' in locals():
        print(" 开始绘制最可信的Top 10特征图表...")

        # Draw the chart
        top10_reliable = plot_most_reliable_top10_features(final_importance_df)

        # Real newline; the escaped form printed a literal backslash-n.
        print("\n 图表绘制完成！")

    else:
        print("  请先运行特征重要性分析代码以获得 final_importance_df")
        print(" 运行顺序：特征重要性分析 → 置信区间分析 → 绘制图表")

except Exception as e:
    print(f"绘图过程中出错: {e}")
    print(" 请确保已经运行了特征重要性分析的代码")


In [None]:
##使用模型

In [None]:
# 定义集成预测器类 (必须在加载模型前定义，以支持pickle序列化)
# ================================================================

class EnsemblePredictor:
    """
    Ensemble predictor that averages the positive-class probability of several
    fitted models, each paired with its own scaler.

    Attributes:
        models: list of dicts, each with a ``'model'`` (exposing
            ``predict_proba``) and a ``'scaler'`` (exposing ``transform``).
        feature_names: column names selected, in this order, when the input
            is a DataFrame.
        threshold_strategies: indexable collection whose entry ``i`` is read
            as ``entry[1]['threshold']``; presumably (name, info) pairs,
            TODO confirm against the training code.
        n_models: number of entries in ``models``.
    """

    # Strategy name -> position of its entry in ``threshold_strategies``.
    _STRATEGY_INDEX = {
        '最高召回率策略': 0,
        '最高F1策略': 1,
        '最高精确率策略': 2,
        '业务平衡策略': 3,
    }
    # Threshold used when the requested strategy name is unknown.
    _DEFAULT_THRESHOLD = 0.5

    def __init__(self, models, feature_names, threshold_strategies):
        self.models = models
        self.feature_names = feature_names
        self.threshold_strategies = threshold_strategies
        self.n_models = len(models)

    def predict_proba(self, X):
        """
        Return the ensemble-averaged positive-class probability per row.

        Args:
            X: DataFrame (columns are selected and ordered by
                ``feature_names``) or an array already in that column order.

        Returns:
            1-D array: mean over all models of ``predict_proba(...)[:, 1]``.
        """
        if isinstance(X, pd.DataFrame):
            X = X[self.feature_names].values

        # Each model sees the data through its own scaler.
        all_probas = [
            model_info['model'].predict_proba(model_info['scaler'].transform(X))[:, 1]
            for model_info in self.models
        ]
        return np.mean(all_probas, axis=0)

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the averaged probability is >= ``threshold``."""
        probas = self.predict_proba(X)
        return (probas >= threshold).astype(int)

    def predict_with_strategy(self, X, strategy_name):
        """
        Predict labels using the threshold stored for a named strategy.

        Only the requested strategy's entry is read, so the call works even
        when ``threshold_strategies`` does not hold all four entries. An
        unknown ``strategy_name`` falls back to a threshold of 0.5 and emits
        a warning instead of failing.

        Returns:
            Tuple ``(labels, probabilities)``.
        """
        probas = self.predict_proba(X)

        strategy_index = self._STRATEGY_INDEX.get(strategy_name)
        if strategy_index is None:
            warnings.warn(
                f"未知策略 '{strategy_name}'，使用默认阈值 {self._DEFAULT_THRESHOLD}"
            )
            threshold = self._DEFAULT_THRESHOLD
        else:
            threshold = self.threshold_strategies[strategy_index][1]['threshold']

        return (probas >= threshold).astype(int), probas

    def get_feature_names(self):
        """Return the feature names this predictor expects."""
        return self.feature_names

    def get_model_info(self):
        """Return a dict with the model count, feature names and strategy names."""
        return {
            'n_models': self.n_models,
            'feature_names': self.feature_names,
            'available_strategies': list(self._STRATEGY_INDEX)
        }

print("✅ EnsemblePredictor 类定义完成，支持pickle序列化")


In [None]:
# 1. Import the libraries needed by the prediction cells below
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
import os
import joblib

# Suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')
print("✅ 库导入完成")


In [None]:
# 2. Load the trained model
# NOTE: set `model_path` to the actual location of the model file first.
# joblib.load unpickles arbitrary objects; only load files from a trusted source.

model_path = ''  # placeholder: replace with the real model file path
try:
    # Guard clause: report and raise when the file is missing.
    if not os.path.exists(model_path):
        print(f"❌ 模型文件不存在: {model_path}")
        raise FileNotFoundError(f"模型文件不存在: {model_path}")

    trained_model = joblib.load(model_path)
    print(f"✅ 模型加载成功: {model_path}")
    print(f"📊 模型信息: {trained_model.get_model_info()}")
except Exception as load_error:
    # Print troubleshooting hints, then re-raise so the notebook stops here.
    print(f"❌ 模型加载失败: {load_error}")
    for hint in (
        "💡 可能原因：",
        "   1. 模型文件路径不正确",
        "   2. 需要先运行EnsemblePredictor类定义",
        "   3. 模型文件版本不兼容",
    ):
        print(hint)
    raise


In [None]:
# 第一部分：数据预处理与特征工程（针对不平衡样本优化）
# =================================================



In [None]:
# 6. Model prediction
# Predict the conversion probability for every row of `X_new`.
# `trained_model` and `X_new` are not defined in this cell; they must already
# exist from earlier cells.
try:
    # Ensemble-averaged positive-class probability from the loaded predictor
    prediction_probabilities = trained_model.predict_proba(X_new)
    print(f"✅ 使用集成模型预测完成")
    print(f"🎯 模型包含 {trained_model.n_models} 个子模型")
    print(f"📊 使用特征: {trained_model.get_feature_names()}")
    print(f"✅ 预测完成，预测了{len(prediction_probabilities)}个用户的转化概率")
    # The summary lines stay inside the try block so that a failure while
    # summarising is reported by the handler below as well.
    print(f"📊 概率范围: {prediction_probabilities.min():.3f} - {prediction_probabilities.max():.3f}")
    print(f"📊 平均概率: {prediction_probabilities.mean():.3f}")
    
except Exception as e:
    # Report the failure, then re-raise so later cells do not run on stale data.
    print(f"❌ 预测失败: {e}")
    raise


In [None]:
processed_new_data = df
# 7. Prepare the data for the Excel output
# Per-user probability table. `prediction_probabilities` is paired with the
# rows of `df` by position, so the two must have the same length and order.
user_probability_table = pd.DataFrame({
    '用户ID': processed_new_data['加密手机号码'],
    '转化概率': prediction_probabilities,
    '概率百分比': (prediction_probabilities * 100).round(1),
})
final_output_df = user_probability_table

print(f"✅ 数据整理完成，准备保存Excel文件")

# Attach the probabilities to the original columns row by row.
# The table above is built from `df` itself, so joining it back on the phone
# number is a self-join: any repeated or missing phone number multiplies rows
# and pairs one row's probability with another row's features. A positional
# concat gives the same columns in the same order without that risk.
merged_df = pd.concat(
    [final_output_df.reset_index(drop=True), df.reset_index(drop=True)],
    axis=1,
)


In [None]:
# Remove the helper ID column (it duplicates the phone-number column).
# Reassigning with errors="ignore" keeps the cell re-runnable; the in-place
# drop raised KeyError on a second run.
merged_df = merged_df.drop(columns=["用户ID"], errors="ignore")
merged_df

In [None]:
# Write the merged prediction table to an Excel file in the working directory
# (row index omitted).
merged_df.to_excel('全量预测结果f2优化100次.xlsx', index=False)

In [None]:
# Rebind `df` to the merged prediction table.
# NOTE(review): this overwrites the previously loaded `df`; cells above that
# read `df` will see the merged table if they are re-run after this one.
df=merged_df

In [None]:
import numpy as np
import pandas as pd

# Count how many rows have a predicted probability at or above each threshold.
# Requires `df` to exist and to contain a "转化概率" column.

# Exact decimal thresholds 0.0, 0.1, ..., 0.9. np.arange(0, 1, 0.1) accumulates
# float error (e.g. 0.30000000000000004), which excludes a probability that
# sits exactly on a threshold.
thresholds = np.arange(10) / 10

total = len(df)
probabilities = df['转化概率']

counts = [(probabilities >= t).sum() for t in thresholds]

results = {
    'threshold': list(thresholds),
    'count': counts,
    # An empty frame gives NaN proportions instead of a division by zero.
    'proportion': [count / total if total else float('nan') for count in counts],
}

result_df = pd.DataFrame(results)
print(result_df)
