In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re
import logging
from pathlib import Path
import itertools
from math import comb

In [6]:
def load_prediction_data(file_path):
    """Read a CSV file of model predictions into a DataFrame.

    On success a one-line summary (row count and column count) is printed and
    the DataFrame is returned.  Any exception raised while loading is caught,
    reported with ``print`` and turned into a ``None`` return value, so
    callers must check for ``None``.
    """
    try:
        frame = pd.read_csv(file_path)
        n_rows, n_cols = frame.shape
        print(f"成功加载数据，共 {n_rows} 条记录，{n_cols} 列")
        return frame
    except Exception as err:
        print(f"加载数据失败: {str(err)}")
        return None

def get_model_names(data):
    """Return the sorted, de-duplicated model names present in ``data``.

    A column is treated as a model prediction column when its name *ends*
    with ``_pred``; the model name is the column name with that suffix
    removed.  Matching on the suffix (rather than on any occurrence of
    ``_pred``) keeps columns such as ``x_pred_prob`` from being reported as
    models and keeps model names that themselves contain ``_pred`` intact.

    Args:
        data: DataFrame whose columns are inspected.

    Returns:
        Sorted list of model name strings.
    """
    suffix = '_pred'
    model_names = {
        col[:-len(suffix)]
        for col in data.columns
        # Non-string labels cannot carry the suffix; a bare "_pred" column
        # would yield an empty model name, so it is ignored as well.
        if isinstance(col, str) and col.endswith(suffix) and len(col) > len(suffix)
    }
    return sorted(model_names)

def filter_models_by_keywords(all_models, keywords):
    """Select the models whose name contains at least one keyword.

    When ``keywords`` is empty/falsy, ``all_models`` is returned unchanged
    (same object, original order).  Otherwise the matching names are returned
    in the reverse of their order in ``all_models``.
    """
    if not keywords:
        return all_models

    matched = [
        name
        for name in all_models
        if any(keyword in name for keyword in keywords)
    ]
    matched.reverse()
    return matched

In [7]:
def get_label(data):
    """Build the label vocabulary from the '类别' column and encode it.

    The distinct values of ``data['类别']`` are sorted and numbered from 0.
    A new integer column ``'label'`` holding those ids is written into
    ``data`` itself (the input frame is modified in place).

    Returns:
        ``(label_to_id, id_to_label, data)`` where the two dicts are inverse
        mappings of each other and ``data`` is the same object that was
        passed in.
    """
    id_to_label = dict(enumerate(sorted(data['类别'].unique())))
    label_to_id = {name: idx for idx, name in id_to_label.items()}
    data['label'] = data['类别'].map(label_to_id)

    return label_to_id, id_to_label, data

# Build the label vocabulary from the training set; the bare expression on the
# last line displays the label -> id mapping as the cell output.
label_to_id, id_to_label, data = get_label(
    pd.read_csv('dataset/train_all.csv')
)
label_to_id

{'地域歧视': 0,
 '基于外表的刻板印象(SA)': 1,
 '基于文化背景的刻板印象(SCB)': 2,
 '宗教迷信': 3,
 '微侵犯(MA)': 4,
 '性侵犯(SO)': 5,
 '政治敏感': 6,
 '犯罪': 7,
 '种族歧视': 8,
 '色情': 9}

In [8]:
# Matches the validation F1 embedded in model names, written either as
# "valF1_0_7328" (underscore as decimal separator) or "valF1_0.72321".
_VAL_F1_PATTERN = re.compile(r'valF1_(\d+)[._](\d+)')
_ENSEMBLE_STRATEGIES = ('voting', 'weighted_prob', 'prob_rank', 'model_weighted')


def _parse_val_f1(model_name):
    """Return the validation F1 encoded in ``model_name``, or 1.0 if none is found."""
    found = _VAL_F1_PATTERN.search(model_name)
    if found is None:
        return 1.0
    return float(f"{found.group(1)}.{found.group(2)}")


def _pick_label(strategy, preds, probs, weights):
    """Combine one sample's per-model predictions into a single label.

    Ties are resolved in favour of the label that received a score first
    (dict insertion order), which depends on the order of the models.
    """
    scores = defaultdict(float)
    if strategy == 'voting':
        for pred in preds:
            scores[pred] += 1
    elif strategy == 'weighted_prob':
        for pred, prob in zip(preds, probs):
            scores[pred] += prob
    elif strategy == 'prob_rank':
        # Most confident model gets weight 1, then linearly decreasing weights.
        ranked = sorted(zip(preds, probs), key=lambda pair: pair[1], reverse=True)
        for rank, (pred, _) in enumerate(ranked):
            scores[pred] += 1 - (rank / len(ranked))
    else:  # 'model_weighted'
        for pred, weight in zip(preds, weights):
            scores[pred] += weight
    return max(scores, key=scores.get)


def ensemble_selected_models(data, selected_models, strategy='weighted_prob', top_n=4):
    """Ensemble the predictions of ``selected_models`` with a single strategy.

    Args:
        data: DataFrame holding, for every model ``m``, a label column
            ``f"{m}_pred"`` and a confidence column ``f"{m}_prob"``.
        selected_models: Model names to combine.  Models missing either column
            are skipped with a printed warning.
        strategy: One of
            ``'voting'`` (majority vote),
            ``'weighted_prob'`` (sum of confidences per label),
            ``'prob_rank'`` (rank-based weights by confidence),
            ``'model_weighted'`` (vote weighted by the validation F1 parsed
            from the model name; models without an F1 in their name get 1.0).
        top_n: Accepted for backward compatibility; not used by any strategy.

    Returns:
        ``(final_preds, valid_models)``: one predicted label per row of
        ``data`` (``None`` for rows without any usable model output) and the
        list of models that were actually used.

    Raises:
        ValueError: if no selected model has both columns, or if ``strategy``
            is not supported.
    """
    valid_models = []
    for model in selected_models:
        if f"{model}_pred" in data.columns and f"{model}_prob" in data.columns:
            valid_models.append(model)
        else:
            print(f"警告: 模型 {model} 缺少预测列，已跳过")

    if not valid_models:
        raise ValueError("没有有效的模型用于集成")

    # Validate once up front so an unsupported strategy fails even on empty data.
    if strategy not in _ENSEMBLE_STRATEGIES:
        raise ValueError(f"不支持的策略: {strategy}")

    model_weights = {}
    if strategy == 'model_weighted':
        raw_weights = {model: _parse_val_f1(model) for model in valid_models}
        total_weight = sum(raw_weights.values())
        if total_weight > 0:
            model_weights = {m: w / total_weight for m, w in raw_weights.items()}
        else:
            # All parsed F1 values are zero: fall back to uniform weights.
            model_weights = {m: 1.0 / len(raw_weights) for m in raw_weights}

    # Pull the needed columns out once instead of building a Series per row.
    pred_matrix = data[[f"{m}_pred" for m in valid_models]].to_numpy(dtype=object)
    prob_matrix = data[[f"{m}_prob" for m in valid_models]].to_numpy(dtype=object)

    final_preds = []
    for idx, pred_row, prob_row in zip(data.index, pred_matrix, prob_matrix):
        model_preds = []
        model_probs = []
        row_weights = []

        for model, pred, raw_prob in zip(valid_models, pred_row, prob_row):
            try:
                prob = float(raw_prob)
            except (TypeError, ValueError) as e:
                print(f"警告: 处理样本 {idx} 的模型 {model} 时出错: {str(e)}")
                continue
            # A missing label or a NaN confidence would corrupt the score
            # comparison, so such entries do not take part in the vote.
            if pd.isna(pred) or prob != prob:
                continue
            model_preds.append(pred)
            model_probs.append(prob)
            row_weights.append(model_weights.get(model, 1.0))

        if not model_preds:
            final_preds.append(None)
            continue

        final_preds.append(_pick_label(strategy, model_preds, model_probs, row_weights))

    return final_preds, valid_models

def ensemble_all_strategies(data, selected_models, top_n=4):
    """Run every ensemble strategy and majority-vote across their results.

    The strategies 'voting', 'weighted_prob' and 'prob_rank' are each
    evaluated with ``ensemble_selected_models``.

    Args:
        data: DataFrame with the per-model ``*_pred`` / ``*_prob`` columns.
        selected_models: Model names to ensemble.
        top_n: Forwarded unchanged to ``ensemble_selected_models``.

    Returns:
        A copy of ``data`` with one ``ensemble_<strategy>`` column per
        strategy plus ``ensemble_final``, the label chosen by most
        strategies.  On a tie the label produced by the earliest strategy in
        the list wins; rows where no strategy produced a label get ``None``.
    """
    strategies = [
        'voting',
        'weighted_prob',
        'prob_rank'
    ]

    all_predictions = {}
    for strategy in strategies:
        print(f"\n正在执行 {strategy} 策略集成...")
        preds, _ = ensemble_selected_models(
            data=data,
            selected_models=selected_models,
            strategy=strategy,
            top_n=top_n
        )
        all_predictions[strategy] = preds

    result_df = data.copy()

    # One output column per strategy
    for strategy, preds in all_predictions.items():
        result_df[f'ensemble_{strategy}'] = preds

    # Majority vote across strategies, row by row.  Values are read back from
    # the result columns so the final label has the same type as those columns.
    strategy_columns = [result_df[f'ensemble_{s}'].tolist() for s in strategies]
    final_preds = []
    for row_preds in zip(*strategy_columns):
        strategy_preds = [pred for pred in row_preds if pd.notna(pred)]
        if not strategy_preds:
            final_preds.append(None)
            continue
        pred_counts = defaultdict(int)
        for pred in strategy_preds:
            pred_counts[pred] += 1
        final_preds.append(max(pred_counts, key=pred_counts.get))

    result_df['ensemble_final'] = final_preds
    return result_df


In [9]:
# Configure logging for this notebook
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def compare_predictions(output_data, submit_file, diffs_output_file, original_data=None):
    """Compare freshly generated predictions with a previously submitted file.

    Both inputs must contain the columns 'id' and '类别'.  They are
    outer-joined on 'id'; summary counts are printed, and the rows whose
    category differs between the two sides are collected and written to
    ``diffs_output_file``.

    Args:
        output_data: DataFrame with the new predictions.
        submit_file: Path of the submitted CSV to compare against.
        diffs_output_file: CSV path the differing rows are written to (parent
            directories are created).  Nothing is written when there are no
            differing rows.
        original_data: Optional DataFrame with an 'id' column; when given, the
            differing rows are enriched with its columns.

    Returns:
        ``(True, diffs)`` on success, where ``diffs`` is a DataFrame of the
        differing rows (empty when the predictions agree or no ids are
        shared).  ``False`` when the submit file is missing, a required
        column is absent, or any other error occurs (the error is logged).
    """
    try:
        # Load the submitted file
        submit_data = pd.read_csv(submit_file)
        logger.info(f"成功加载提交文件: {submit_file}，共 {len(submit_data)} 条记录")

        # Both sides need the 'id' and '类别' columns
        required_cols = ['id', '类别']
        missing = [col for col in required_cols if col not in output_data.columns]
        if missing:
            logger.error(f"输出数据缺少必要的列: {missing}")
            return False

        missing = [col for col in required_cols if col not in submit_data.columns]
        if missing:
            logger.error(f"提交文件缺少必要的列: {missing}")
            return False

        # Outer join keeps ids that exist on only one side.  Only the
        # required columns take part, since nothing else is read below.
        comparison = pd.merge(
            output_data[required_cols],
            submit_data[required_cols],
            on='id',
            suffixes=('_new', '_submit'),
            how='outer'
        )

        # A missing category on one side is taken to mean the id is absent there
        total_ids = len(comparison)
        only_new = int(comparison['类别_submit'].isna().sum())
        only_submit = int(comparison['类别_new'].isna().sum())
        both_exist = total_ids - only_new - only_submit

        print(f"\n===== 结果一致性比较 =====")
        print(f"总ID数量: {total_ids}")
        print(f"仅存在于新结果中的ID: {only_new}")
        print(f"仅存在于提交文件中的ID: {only_submit}")
        print(f"两边都存在的ID: {both_exist}")

        # Defined before branching so every successful path returns a DataFrame
        all_diffs_in_data = pd.DataFrame(columns=['id', '类别_new', '类别_submit'])

        if both_exist > 0:
            # Rows present on both sides whose categories disagree
            on_both_sides = comparison['类别_new'].notna() & comparison['类别_submit'].notna()
            disagree = comparison['类别_new'] != comparison['类别_submit']
            diffs = comparison.loc[on_both_sides & disagree, ['id', '类别_new', '类别_submit']]

            matched = both_exist - len(diffs)
            match_rate = matched / both_exist

            print(f"两边都存在且预测一致的ID: {matched}")
            print(f"两边都存在且预测不一致的ID: {len(diffs)}")
            print(f"一致性比例: {matched}/{both_exist} ({match_rate:.2%})")

            if not diffs.empty:
                if original_data is not None and 'id' in original_data.columns:
                    # Enrich the differing ids with their full original records
                    diff_ids = diffs['id'].tolist()
                    all_diffs_in_data = original_data[original_data['id'].isin(diff_ids)].copy()
                    all_diffs_in_data = pd.merge(
                        all_diffs_in_data,
                        diffs,
                        on='id',
                        how='left'
                    )

                    # Put the key columns first, keep the rest in their order
                    front_cols = ['id', '文本', '类别_new', '类别_submit']
                    existing_front_cols = [col for col in front_cols if col in all_diffs_in_data.columns]
                    remaining_cols = [col for col in all_diffs_in_data.columns if col not in existing_front_cols]
                    all_diffs_in_data = all_diffs_in_data[existing_front_cols + remaining_cols]
                else:
                    # Without the original data only ids and both predictions are kept
                    all_diffs_in_data = diffs
                    logger.warning("未提供原始数据，仅保存差异的ID和预测结果")

                # Write the diff file; the log line below reports it as saved
                Path(diffs_output_file).parent.mkdir(parents=True, exist_ok=True)
                all_diffs_in_data.to_csv(diffs_output_file, index=False)
                logger.info(f"已保存 {len(all_diffs_in_data)} 条预测不一致的样本至: {diffs_output_file}")
            else:
                print("\n所有共同ID的预测结果完全一致，没有差异样本")
        else:
            print("\n没有共同的ID可以进行比较")

        return True, all_diffs_in_data

    except FileNotFoundError:
        logger.error(f"提交文件不存在: {submit_file}")
        return False
    except Exception as e:
        logger.error(f"比较过程中发生错误: {str(e)}", exc_info=True)
        return False

In [8]:
import pandas as pd
import itertools
from math import comb
from collections import defaultdict

# ------------------------------
# 1. Prepare the data and the candidate model pool
# ------------------------------
# Load the per-model test predictions
data = pd.read_csv('dataset/test_predictions_all_data.csv')

# Candidate models: every model whose name mentions focal_loss or rdrop
all_model_cols = [col for col in data.columns if '_pred' in col]
all_models = [col.replace('_pred', '') for col in all_model_cols]
candidate_models = [model for model in all_models
                    if 'focal_loss' in model.lower() or 'rdrop' in model.lower()]

# Mandatory models: note that besides focal_loss models this also selects the
# candidates whose name contains "multi_gpu".
all_focal_models = [model for model in candidate_models
                    if 'focal_loss' in model.lower() or 'multi_gpu' in model.lower()]
other_models = [model for model in candidate_models if model not in all_focal_models]

print(f"必选focalloss相关模型数量: {len(all_focal_models)} 个")
print(f"可选其他模型数量: {len(other_models)} 个")

# Allowed ensemble sizes (mandatory + optional models)
min_total = 10
max_total = 15
n_focal = len(all_focal_models)

if n_focal == 0:
    raise ValueError("未找到任何focalloss相关模型，无法生成组合")
if n_focal > max_total:
    raise ValueError(f"必选模型数量({n_focal})超过最大组合大小({max_total})，无法满足要求")

# ------------------------------
# 2. Work out the valid range and the number of combinations
# ------------------------------
# How many optional models must / may be added on top of the mandatory ones
min_supplement = max(0, min_total - n_focal)
if min_supplement > len(other_models):
    raise ValueError(f"需要补充至少{min_supplement}个模型，但可选模型仅{len(other_models)}个")

# Cannot add more optional models than exist.  After the checks above this is
# always >= min_supplement, so no further range validation is needed.
max_supplement = min(max_total - n_focal, len(other_models))

# Count the combinations per total size
total_combos = 0
size_counts = {}
for supplement in range(min_supplement, max_supplement + 1):
    total_size = n_focal + supplement
    cnt = comb(len(other_models), supplement)
    total_combos += cnt
    size_counts[total_size] = cnt

print("\n===== 必含所有focalloss相关模型的组合数量 =====")
for total_size, cnt in size_counts.items():
    print(f"总大小为{total_size}的组合数: {cnt:,}（必选{ n_focal}个 + 补充{total_size - n_focal}个）")
print(f"总组合数: {total_combos:,}")


# ------------------------------
# 3. 生成必含所有focalloss相关模型的组合
# ------------------------------
def generate_all_focal_combos(all_focal, others, min_supplement, max_supplement):
    """Enumerate every ensemble made of all mandatory models plus some optional ones.

    For each count ``k`` from ``min_supplement`` to ``max_supplement``
    (inclusive) every ``k``-subset of ``others`` is appended to ``all_focal``.
    A progress line is printed per ``k``.

    Returns:
        A list of dicts with the keys 'size', 'models', 'combo_id',
        'focal_count' and 'supplement_count'.  ``combo_id`` embeds a running
        number that starts at 1 and increases across all sizes.
    """
    mandatory_count = len(all_focal)
    serial = itertools.count(1)
    generated = []

    for extra_count in range(min_supplement, max_supplement + 1):
        combined_size = mandatory_count + extra_count
        print(f"生成总大小为{combined_size}的组合（必选{mandatory_count}个 + 补充{extra_count}个）...")

        # One record per subset of the optional models
        generated.extend(
            {
                'size': combined_size,
                'models': all_focal + list(picked),
                'combo_id': f"combo_{next(serial)}_size_{combined_size}",
                'focal_count': mandatory_count,
                'supplement_count': extra_count,
            }
            for picked in itertools.combinations(others, extra_count)
        )
    return generated

# Materialise every candidate ensemble: all mandatory models plus each
# admissible subset of the optional ones.
all_combos = generate_all_focal_combos(
    all_focal_models,
    other_models,
    min_supplement,
    max_supplement,
)

print(f"\n所有组合生成完成，共{len(all_combos):,}个（均包含所有{ n_focal}个必选模型）")


# ------------------------------
# 4. 计算组合集成结果与基准的差异
# ------------------------------
def calculate_diff(pred_df, benchmark_df):
    """Measure how far a prediction frame deviates from the benchmark.

    The frames are inner-joined on 'id'; a row counts as different when its
    '类别' value is not equal to its 'benchmark_类别' value.

    Returns:
        dict with 'total_samples' (joined row count), 'diff_count',
        'diff_rate' (diff_count / total_samples) and 'diff_details' (the
        differing rows of the joined frame).

    Raises:
        ZeroDivisionError: when the two frames share no ids, because the rate
            is computed by dividing by the number of joined rows.
    """
    joined = pd.merge(pred_df, benchmark_df, on='id', how='inner')
    n_joined = len(joined)
    mismatches = joined.loc[joined['类别'].ne(joined['benchmark_类别'])]
    n_mismatch = len(mismatches)
    return {
        'total_samples': n_joined,
        'diff_count': n_mismatch,
        'diff_rate': n_mismatch / n_joined,
        'diff_details': mismatches
    }

# Load the benchmark submission and rename its category column so it does not
# clash with the prediction column when the two frames are merged.
SUBMIT_FILE = "dataset/updated_ensemble_results_7275.csv"
benchmark_df = pd.read_csv(SUBMIT_FILE)[['id', '类别']].rename(columns={'类别': 'benchmark_类别'})

# Results for every combination.  NOTE: all_combos is an ordinary list that is
# sliced below, so the batching only groups the progress output; it does not
# reduce memory use.
combo_results = []
batch_size = 100  # number of combinations per progress batch

for i in range(0, len(all_combos), batch_size):
    batch_combos = all_combos[i:i+batch_size]
    print(f"\n处理第{i//batch_size + 1}批组合（共{len(batch_combos)}个）")

    for combo in batch_combos:
        try:
            # Ensemble the models of this combination with all strategies
            ensemble_df = ensemble_all_strategies(
                data=data,
                selected_models=combo['models'],
                top_n=5
            )

            # Keep only the id and the final (cross-strategy) prediction
            pred_df = ensemble_df[['id', 'ensemble_final']].rename(
                columns={'ensemble_final': '类别'}
            )

            # Compare against the benchmark
            diff_stats = calculate_diff(pred_df, benchmark_df)

            # Record the outcome.  NOTE: the full pred_df and diff_details
            # DataFrames are stored for every combination, not only the best.
            combo_results.append({
                'combo_id': combo['combo_id'],
                'size': combo['size'],
                'focal_count': combo['focal_count'],
                'supplement_count': combo['supplement_count'],
                'diff_count': diff_stats['diff_count'],
                'diff_rate': diff_stats['diff_rate'],
                'models': combo['models'],
                'pred_df': pred_df,
                'diff_details': diff_stats['diff_details']
            })
            print(f"组合 {combo['combo_id']} 处理完成 | 差异数: {diff_stats['diff_count']}")

        # Any failure (including a ZeroDivisionError from calculate_diff when
        # no ids overlap) is reported and the combination is skipped.
        except Exception as e:
            print(f"组合 {combo['combo_id']} 处理失败: {str(e)}")
            continue


# ------------------------------
# 5. 按差异最小排序并输出最优结果
# ------------------------------
if not combo_results:
    raise ValueError("没有成功处理的组合结果")

# Sort ascending by number of differing samples (fewer is better)
sorted_results = sorted(combo_results, key=lambda x: x['diff_count'])

# Number of top combinations to list; used for both the header and the slice
# so the two cannot drift apart.
TOP_K = 5

print("\n" + "="*80)
print(f"按与基准差异最小排序的组合（前{TOP_K}名）")
print("="*80)
for i, res in enumerate(sorted_results[:TOP_K], 1):
    print(f"\n第{i}名: {res['combo_id']}（{res['size']}个模型）")
    print(f"必选模型: {res['focal_count']}个 | 补充模型: {res['supplement_count']}个")
    print(f"差异样本数: {res['diff_count']} | 差异率: {res['diff_rate']:.4f}")
    print("模型列表:")
    for model in res['models']:
        print(f"  - {model}")

# Report the best combination
best_combo = sorted_results[0]
print("\n" + "="*80)
print(f"最优组合: {best_combo['combo_id']}（差异样本数最少）")
print(f"差异样本数: {best_combo['diff_count']} | 差异率: {best_combo['diff_rate']:.4f}")
print("="*80)

# Save the best combination's predictions and its differing rows; the paths
# are built once so the files written and the paths printed always agree.
best_pred_path = f"dataset/best_combo_{best_combo['combo_id']}_predictions.csv"
best_diff_path = f"dataset/best_combo_{best_combo['combo_id']}_diffs.csv"
best_combo['pred_df'].to_csv(best_pred_path, index=False)
best_combo['diff_details'].to_csv(best_diff_path, index=False)
print(f"最优组合预测结果已保存至: {best_pred_path}")
print(f"差异样本详情已保存至: {best_diff_path}")

必选focalloss相关模型数量: 10 个
可选其他模型数量: 10 个

===== 必含所有focalloss相关模型的组合数量 =====
总大小为10的组合数: 1（必选10个 + 补充0个）
总大小为11的组合数: 10（必选10个 + 补充1个）
总大小为12的组合数: 45（必选10个 + 补充2个）
总大小为13的组合数: 120（必选10个 + 补充3个）
总大小为14的组合数: 210（必选10个 + 补充4个）
总大小为15的组合数: 252（必选10个 + 补充5个）
总组合数: 638
生成总大小为10的组合（必选10个 + 补充0个）...
生成总大小为11的组合（必选10个 + 补充1个）...
生成总大小为12的组合（必选10个 + 补充2个）...
生成总大小为13的组合（必选10个 + 补充3个）...
生成总大小为14的组合（必选10个 + 补充4个）...
生成总大小为15的组合（必选10个 + 补充5个）...

所有组合生成完成，共638个（均包含所有10个必选模型）

处理第1批组合（共100个）

正在执行 voting 策略集成...

正在执行 weighted_prob 策略集成...

正在执行 prob_rank 策略集成...
组合 combo_1_size_10 处理完成 | 差异数: 17

正在执行 voting 策略集成...

正在执行 weighted_prob 策略集成...

正在执行 prob_rank 策略集成...
组合 combo_2_size_11 处理完成 | 差异数: 38

正在执行 voting 策略集成...

正在执行 weighted_prob 策略集成...

正在执行 prob_rank 策略集成...
组合 combo_3_size_11 处理完成 | 差异数: 32

正在执行 voting 策略集成...

正在执行 weighted_prob 策略集成...

正在执行 prob_rank 策略集成...
组合 combo_4_size_11 处理完成 | 差异数: 43

正在执行 voting 策略集成...

正在执行 weighted_prob 策略集成...

正在执行 prob_rank 策略集成...
组合 combo_5_size_11

In [None]:
"""
第1名: combo_1_size_10（10个模型）
必选模型: 10个 | 补充模型: 0个
差异样本数: 17 | 差异率: 0.0027
模型列表:
  - chinese-roberta-wwm-ext_focal_loss_fold_1_chinese-roberta-wwm-ext_focal_valF1_0_7328_20250818_000623
  - chinese-roberta-wwm-ext_focal_loss_fold_2_chinese-roberta-wwm-ext_focal_valF1_0_7226_20250818_024536
  - chinese-roberta-wwm-ext_focal_loss_fold_3_chinese-roberta-wwm-ext_focal_valF1_0_7005_20250818_053611
  - chinese-roberta-wwm-ext_focal_loss_fold_4_chinese-roberta-wwm-ext_focal_valF1_0_6913_20250818_082419
  - chinese-roberta-wwm-ext_focal_loss_fold_5_chinese-roberta-wwm-ext_focal_valF1_0_7282_20250818_110457
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_1_chinese-roberta-wwm-ext_rdrop_valF1_0_7577_20250818_154517
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_2_chinese-roberta-wwm-ext_rdrop_valF1_0_7351_20250818_193058
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_3_chinese-roberta-wwm-ext_rdrop_valF1_0_7240_20250818_231603
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_4_chinese-roberta-wwm-ext_rdrop_valF1_0_6883_20250819_031413
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_5_chinese-roberta-wwm-ext_rdrop_valF1_0_7379_20250819_071253

第2名: combo_23_size_12（12个模型）
必选模型: 10个 | 补充模型: 2个
差异样本数: 23 | 差异率: 0.0037
模型列表:
  - chinese-roberta-wwm-ext_focal_loss_fold_1_chinese-roberta-wwm-ext_focal_valF1_0_7328_20250818_000623
  - chinese-roberta-wwm-ext_focal_loss_fold_2_chinese-roberta-wwm-ext_focal_valF1_0_7226_20250818_024536
  - chinese-roberta-wwm-ext_focal_loss_fold_3_chinese-roberta-wwm-ext_focal_valF1_0_7005_20250818_053611
  - chinese-roberta-wwm-ext_focal_loss_fold_4_chinese-roberta-wwm-ext_focal_valF1_0_6913_20250818_082419
  - chinese-roberta-wwm-ext_focal_loss_fold_5_chinese-roberta-wwm-ext_focal_valF1_0_7282_20250818_110457
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_1_chinese-roberta-wwm-ext_rdrop_valF1_0_7577_20250818_154517
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_2_chinese-roberta-wwm-ext_rdrop_valF1_0_7351_20250818_193058
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_3_chinese-roberta-wwm-ext_rdrop_valF1_0_7240_20250818_231603
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_4_chinese-roberta-wwm-ext_rdrop_valF1_0_6883_20250819_031413
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_5_chinese-roberta-wwm-ext_rdrop_valF1_0_7379_20250819_071253
  - chinese-roberta-wwm-ext_rdrop_fold_2_chinese-roberta-wwm-ext_rdrop_valF1_0_7080_20250819_204751
  - chinese-roberta-wwm-ext_rdrop_fold_5_chinese-roberta-wwm-ext_rdrop_valF1_0_7460_20250820_081334

第3名: combo_40_size_12（12个模型）
必选模型: 10个 | 补充模型: 2个
差异样本数: 25 | 差异率: 0.0040
模型列表:
  - chinese-roberta-wwm-ext_focal_loss_fold_1_chinese-roberta-wwm-ext_focal_valF1_0_7328_20250818_000623
  - chinese-roberta-wwm-ext_focal_loss_fold_2_chinese-roberta-wwm-ext_focal_valF1_0_7226_20250818_024536
  - chinese-roberta-wwm-ext_focal_loss_fold_3_chinese-roberta-wwm-ext_focal_valF1_0_7005_20250818_053611
  - chinese-roberta-wwm-ext_focal_loss_fold_4_chinese-roberta-wwm-ext_focal_valF1_0_6913_20250818_082419
  - chinese-roberta-wwm-ext_focal_loss_fold_5_chinese-roberta-wwm-ext_focal_valF1_0_7282_20250818_110457
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_1_chinese-roberta-wwm-ext_rdrop_valF1_0_7577_20250818_154517
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_2_chinese-roberta-wwm-ext_rdrop_valF1_0_7351_20250818_193058
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_3_chinese-roberta-wwm-ext_rdrop_valF1_0_7240_20250818_231603
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_4_chinese-roberta-wwm-ext_rdrop_valF1_0_6883_20250819_031413
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_5_chinese-roberta-wwm-ext_rdrop_valF1_0_7379_20250819_071253
  - chinese-roberta-wwm-ext_rdrop_fold_4_chinese-roberta-wwm-ext_rdrop_valF1_0_7115_20250820_042705
  - chinese-roberta-wwm-ext_label_smooth_fold_4_chinese-roberta-wwm-ext_rdrop_ema_valF1_0.72321_20250822_200620

第4名: combo_36_size_12（12个模型）
必选模型: 10个 | 补充模型: 2个
差异样本数: 26 | 差异率: 0.0041
模型列表:
  - chinese-roberta-wwm-ext_focal_loss_fold_1_chinese-roberta-wwm-ext_focal_valF1_0_7328_20250818_000623
  - chinese-roberta-wwm-ext_focal_loss_fold_2_chinese-roberta-wwm-ext_focal_valF1_0_7226_20250818_024536
  - chinese-roberta-wwm-ext_focal_loss_fold_3_chinese-roberta-wwm-ext_focal_valF1_0_7005_20250818_053611
  - chinese-roberta-wwm-ext_focal_loss_fold_4_chinese-roberta-wwm-ext_focal_valF1_0_6913_20250818_082419
  - chinese-roberta-wwm-ext_focal_loss_fold_5_chinese-roberta-wwm-ext_focal_valF1_0_7282_20250818_110457
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_1_chinese-roberta-wwm-ext_rdrop_valF1_0_7577_20250818_154517
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_2_chinese-roberta-wwm-ext_rdrop_valF1_0_7351_20250818_193058
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_3_chinese-roberta-wwm-ext_rdrop_valF1_0_7240_20250818_231603
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_4_chinese-roberta-wwm-ext_rdrop_valF1_0_6883_20250819_031413
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_5_chinese-roberta-wwm-ext_rdrop_valF1_0_7379_20250819_071253
  - chinese-roberta-wwm-ext_rdrop_fold_4_chinese-roberta-wwm-ext_rdrop_valF1_0_7115_20250820_042705
  - chinese-roberta-wwm-ext_rdrop_fold_5_chinese-roberta-wwm-ext_rdrop_valF1_0_7460_20250820_081334

第5名: combo_27_size_12（12个模型）
必选模型: 10个 | 补充模型: 2个
差异样本数: 27 | 差异率: 0.0043
模型列表:
  - chinese-roberta-wwm-ext_focal_loss_fold_1_chinese-roberta-wwm-ext_focal_valF1_0_7328_20250818_000623
  - chinese-roberta-wwm-ext_focal_loss_fold_2_chinese-roberta-wwm-ext_focal_valF1_0_7226_20250818_024536
  - chinese-roberta-wwm-ext_focal_loss_fold_3_chinese-roberta-wwm-ext_focal_valF1_0_7005_20250818_053611
  - chinese-roberta-wwm-ext_focal_loss_fold_4_chinese-roberta-wwm-ext_focal_valF1_0_6913_20250818_082419
  - chinese-roberta-wwm-ext_focal_loss_fold_5_chinese-roberta-wwm-ext_focal_valF1_0_7282_20250818_110457
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_1_chinese-roberta-wwm-ext_rdrop_valF1_0_7577_20250818_154517
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_2_chinese-roberta-wwm-ext_rdrop_valF1_0_7351_20250818_193058
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_3_chinese-roberta-wwm-ext_rdrop_valF1_0_7240_20250818_231603
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_4_chinese-roberta-wwm-ext_rdrop_valF1_0_6883_20250819_031413
  - chinese-roberta-wwm-ext_rdrop_multi_gpu_fold_5_chinese-roberta-wwm-ext_rdrop_valF1_0_7379_20250819_071253
  - chinese-roberta-wwm-ext_rdrop_fold_2_chinese-roberta-wwm-ext_rdrop_valF1_0_7080_20250819_204751
  - chinese-roberta-wwm-ext_label_smooth_fold_4_chinese-roberta-wwm-ext_rdrop_ema_valF1_0.72321_20250822_200620

================================================================================
最优组合: combo_1_size_10（差异样本数最少）
差异样本数: 17 | 差异率: 0.0027
================================================================================
最优组合预测结果已保存至: dataset/best_combo_combo_1_size_10_predictions.csv
差异样本详情已保存至: dataset/best_combo_combo_1_size_10_diffs.csv
"""

In [None]:
# Rebuild the ensemble for one specific combination and save it as a
# submission-style file.
# Load the per-model test predictions
data = pd.read_csv('dataset/test_predictions_all_data.csv')


all_model_cols = [col for col in data.columns if '_pred' in col]
all_models = [col.replace('_pred', '') for col in all_model_cols]
candidate_models = [model for model in all_models if 'multi_gpu' in model or 'focal_loss' in model]

# Two additional rdrop folds on top of the multi_gpu / focal_loss models
candidate_models = candidate_models + ['chinese-roberta-wwm-ext_rdrop_fold_2_chinese-roberta-wwm-ext_rdrop_valF1_0_7080_20250819_204751',
                                       'chinese-roberta-wwm-ext_rdrop_fold_5_chinese-roberta-wwm-ext_rdrop_valF1_0_7460_20250820_081334']
ensemble_df = ensemble_all_strategies(
    data=data,
    selected_models=candidate_models,
    top_n=5
)

# Column exported as the submission category.  The same name is used for the
# selection and the rename so the output really has a '类别' column.
# NOTE(review): the voting result is what was being selected before; confirm
# that it (rather than 'ensemble_final' or 'ensemble_prob_rank') is intended.
SUBMIT_SOURCE_COL = 'ensemble_voting'
pred_df = ensemble_df[['id', SUBMIT_SOURCE_COL]].rename(
    columns={SUBMIT_SOURCE_COL: '类别'}
)

# index=False keeps the row index out of the file, so reading it back yields
# exactly the columns 'id' and '类别'.
pred_df.to_csv('./dataset/second_pred.csv', index=False)

In [110]:
# import pandas as pd
# import itertools
# from collections import defaultdict
# import concurrent.futures
# from tqdm import tqdm
# import functools

# # 配置参数
# INPUT_FILE = 'dataset/updated_ensemble_results_7306.csv'
# SUBMIT_FILE = "dataset/updated_ensemble_results_7301.csv"
# BEST_COMBINATIONS_FILE = "dataset/best_model_combinations.csv"

# # ensemble_results = pred_df
# ensemble_results = pd.read_csv(INPUT_FILE)
# submit_data = pd.read_csv(SUBMIT_FILE)

# def calculate_difference(ensemble_results, submit_data):
#     """计算集成结果与提交文件的差异，并返回不同的数据"""
#     # 确保两个数据集都有'id'和'类别'列
#     required_cols = ['id', '类别']
    
#     if not all(col in ensemble_results.columns for col in required_cols):
#         return float('inf'), "输出数据缺少必要的列", None
#     if not all(col in submit_data.columns for col in required_cols):
#         return float('inf'), "提交文件缺少必要的列", None
    
#     # 按照id进行合并
#     comparison = pd.merge(
#         ensemble_results, 
#         submit_data, 
#         on='id', 
#         suffixes=('_new', '_submit'),
#         how='inner'  # 只比较两边都存在的ID
#     )

#     if len(comparison) == 0:
#         return float('inf'), "没有共同的ID可以比较", None

#     # 计算不一致的数量
#     diff_mask = comparison['类别_new'] != comparison['类别_submit']
#     diffs = sum(diff_mask)
#     # 计算差异率
#     diff_rate = diffs / len(comparison)
#     print(f"差异率: {diff_rate:.4f} ({diffs}/{len(comparison)})")

#     # 提取不同的数据
#     different_data = comparison[diff_mask][['id', '类别_new', '类别_submit']]
#     # 重命名列以便更清晰
#     different_data = different_data.rename(columns={
#         '类别_new': '集成结果类别',
#         '类别_submit': '提交文件类别'
#     })

#     return diff_rate, f"发现 {diffs} 处差异", different_data

# # 执行比较
# diff_rate, message, different_data = calculate_difference(ensemble_results, submit_data)
# print(message)

# # 打印不同的数据
# if different_data is not None and not different_data.empty:
#     print("\n不同的数据如下:")
#     print(different_data.to_string(index=False))  # 不显示索引
# else:
#     print("\n没有发现不同的数据")

差异率: 0.0048 (30/6276)
发现 30 处差异

不同的数据如下:
  id 集成结果类别 提交文件类别
  99   宗教迷信   政治敏感
 308   宗教迷信   政治敏感
 560   政治敏感     色情
 704     犯罪     色情
 797     色情   政治敏感
1332   政治敏感     色情
1443   种族歧视   政治敏感
1498   政治敏感     色情
1726   宗教迷信   政治敏感
1788     色情   政治敏感
1961   政治敏感     色情
2104   政治敏感     色情
2112   政治敏感     色情
2469   政治敏感     色情
2826   政治敏感     色情
3093     色情   政治敏感
3177     犯罪   政治敏感
3456   政治敏感     色情
3487     犯罪     色情
3507   政治敏感     色情
3520     色情   政治敏感
3807   宗教迷信   政治敏感
4478     犯罪     色情
4554   政治敏感     色情
4864   宗教迷信   政治敏感
4878   宗教迷信   政治敏感
5421     色情   政治敏感
5628   宗教迷信   政治敏感
5743   宗教迷信   政治敏感
6246   政治敏感     色情


In [None]:
import pandas as pd
import ast
# --------------------------
# 步骤1：重新加载并计算target_category
# --------------------------
# 1.1 加载原始数据并筛选data_change
data = pd.read_csv('dataset/test_predictions_all_data.csv')
# Mean over every column whose name contains "prob" (computed before
# 'avg_prob' itself exists, so it is not part of its own average)
data['avg_prob'] = data[[col for col in data.columns if 'prob' in col]].mean(axis=1)

# Rows to revisit: low average confidence and a base vote of 色情 or 政治敏感.
# .copy() makes data_change an independent frame, so the columns added to it
# later are not assignments on a slice of `data`.
data_change = data[(data['avg_prob'] < 0.70) &
                   ((data['base_vote_pred'] == '色情') |
                    (data['base_vote_pred'] == '政治敏感'))].copy()

# Pair every prediction column (except the base vote) with its probability
# column; pairs whose probability column does not exist are dropped.
col_pred = [col for col in data.columns if 'pred' in col and 'base_vote_pred' not in col]
col_prob = [col.replace('pred', 'prob') for col in col_pred]
valid_pairs = [(p, pr) for p, pr in zip(col_pred, col_prob) if pr in data.columns]

def get_top5_categories_only(row):
    """Return the categories of the five most confident model predictions.

    Every (prediction column, probability column) pair in the module-level
    ``valid_pairs`` is read from ``row``; pairs with a missing category or
    probability are ignored.  The remaining predictions are ordered by
    probability, highest first, and the categories of the first five are
    returned (duplicates are kept).
    """
    scored = [
        (row[pred_col], float(row[prob_col]))
        for pred_col, prob_col in valid_pairs
        if pd.notna(row[pred_col]) and pd.notna(row[prob_col])
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [category for category, _ in scored[:5]]

data_change['top5_categories_only'] = data_change.apply(get_top5_categories_only, axis=1)

def get_target_category(row):
    """Choose the replacement category for one row.

    The row's 'top5_categories_only' value (a list, or the string repr of a
    list) is de-duplicated in order.  The first category that differs from
    the row's 'base_vote_pred' is returned; if every candidate equals the
    base prediction, that category is returned.  When there are no usable
    candidates at all (empty list, unparsable string, only missing values)
    the base prediction is kept.

    Args:
        row: Mapping / Series with the keys 'base_vote_pred' and
            'top5_categories_only'.

    Returns:
        The selected category.
    """
    base_pred = row['base_vote_pred']
    top5_raw = row['top5_categories_only']

    if isinstance(top5_raw, str):
        # The list may have been round-tripped through CSV as its repr
        try:
            parsed = ast.literal_eval(top5_raw)
        except (ValueError, TypeError, SyntaxError):
            parsed = []
        top5_list = list(parsed) if isinstance(parsed, (list, tuple)) else []
    else:
        top5_list = top5_raw if isinstance(top5_raw, list) else []

    # Order-preserving de-duplication that also drops missing values
    deduplicated = []
    seen_cats = set()
    for cat in top5_list:
        if pd.notna(cat) and cat not in seen_cats:
            seen_cats.add(cat)
            deduplicated.append(cat)

    if not deduplicated:
        # Nothing to switch to: keep the base prediction instead of failing
        return base_pred

    for cat in deduplicated:
        if cat != base_pred:
            return cat
    return deduplicated[0]

data_change['target_category'] = data_change.apply(get_target_category, axis=1)

# --------------------------
# Step 2: load the submission file and overwrite categories by id
# --------------------------

submit_df = pd.read_csv("dataset/second_pred.csv")

SUBMIT_CATEGORY_COL = "类别"

# id -> replacement category for every row selected for a change
update_mapping = data_change.set_index('id')['target_category'].to_dict()

# Only rows whose id has a replacement are touched; the right-hand side is
# aligned to those rows by index.
rows_to_update = submit_df['id'].isin(update_mapping.keys())
replacement_values = submit_df['id'].map(update_mapping)
submit_df.loc[rows_to_update, SUBMIT_CATEGORY_COL] = replacement_values

# --------------------------
# Step 3: save the updated submission file
# --------------------------

UPDATED_SUBMIT_FILE = "dataset/updated_ensemble_results_with_target_category.csv"
submit_df.to_csv(UPDATED_SUBMIT_FILE, index=False, encoding='utf-8')