# SemEval 2026 Task 12: Abductive Event Reasoning
## Baseline 2: UnifiedQA + RoBERTa MCQA

本Notebook实现两个经典baseline:
1. **UnifiedQA** (T5-based) - 生成式，零样本推理
2. **RoBERTa for Multiple Choice** - 判别式，需要微调

### 参考文献
- [UnifiedQA: Crossing Format Boundaries with a Single QA System](https://arxiv.org/abs/2005.00700)
- [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692)

## 1. 环境配置

In [None]:
# 安装依赖
!pip install -q transformers torch sentencepiece accelerate tqdm

In [None]:
# Download the dataset by cloning its repository into the working directory
!git clone https://github.com/sooo66/semeval2026-task12-dataset.git
# Name of the directory created by the clone; the data-loading cell builds its paths from it
DATASET_DIR = "semeval2026-task12-dataset"

## 2. 数据加载与预处理

In [None]:
import json
import os
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Optional, Set

@dataclass
class AERInstance:
    """One abductive-event-reasoning question plus the documents of its topic."""

    # Question identifier, taken from the "id" field of a questions.jsonl record.
    id: str
    # Topic identifier; also the key used to look up the topic's documents.
    topic_id: str
    # Text of the event whose cause has to be chosen.
    target_event: str
    # Candidate causes keyed by option letter ("A".."D").
    options: Dict[str, str]
    # Gold option letters, or None when the record carries no answer.
    golden_answer: Optional[List[str]] = None
    # Document dicts attached to the topic (read via "title"/"content"/"summary" elsewhere in this file).
    docs: Optional[List[Dict]] = None


def load_aer_data(data_dir: str) -> List[AERInstance]:
    """Read one dataset split from ``data_dir`` and return its questions.

    The directory must contain ``docs.json`` (a JSON array whose items carry a
    ``topic_id``) and ``questions.jsonl`` (one JSON object per non-blank line).

    Args:
        data_dir: Path of the split directory.

    Returns:
        One ``AERInstance`` per question, in file order.
    """
    root = Path(data_dir)

    # Index the document bundles by topic so each question can find its own.
    with open(root / "docs.json", 'r', encoding='utf-8') as docs_file:
        docs_by_topic = {entry['topic_id']: entry for entry in json.load(docs_file)}

    loaded = []
    with open(root / "questions.jsonl", 'r', encoding='utf-8') as questions_file:
        for raw_line in questions_file:
            if raw_line.strip():
                record = json.loads(raw_line)

                # Missing option fields default to an empty string.
                option_texts = {}
                for letter in "ABCD":
                    option_texts[letter] = record.get(f"option_{letter}", "")

                # The gold answer is a comma-separated string such as "A,C".
                raw_gold = record.get("golden_answer")
                if raw_gold:
                    gold_letters = [piece.strip() for piece in raw_gold.split(",")]
                else:
                    gold_letters = None

                topic_bundle = docs_by_topic.get(record["topic_id"], {})

                loaded.append(AERInstance(
                    id=record["id"],
                    topic_id=record["topic_id"],
                    target_event=record["target_event"],
                    options=option_texts,
                    golden_answer=gold_letters,
                    docs=topic_bundle.get("docs", []),
                ))

    return loaded


# Load the training and development splits from the cloned dataset directory
train_data = load_aer_data(f"{DATASET_DIR}/train_data")
dev_data = load_aer_data(f"{DATASET_DIR}/dev_data")

# Report how many instances each split contains
print(f"训练集: {len(train_data)} 样本")
print(f"开发集: {len(dev_data)} 样本")

In [None]:
def prepare_context(instance: AERInstance, max_len: int = 1500,
                    max_docs: int = 5, max_doc_chars: int = 400) -> str:
    """Build a compact context string from an instance's documents.

    Each of the first ``max_docs`` documents contributes ``"[title] body"``
    (or just the body when it has no title). The body is the document's
    ``content``, falling back to ``summary`` when the content is missing,
    ``None`` or empty, cut to ``max_doc_chars`` characters. Snippets are added
    in order until the next one would push the total past ``max_len``.

    Args:
        instance: Object exposing a ``docs`` list of dicts (may be None/empty).
        max_len: Budget for the summed snippet lengths; the single-space
            separators are not counted.
        max_docs: How many leading documents are considered.
        max_doc_chars: Maximum number of body characters kept per document.

    Returns:
        The snippets joined by single spaces, or "" when nothing is usable.
    """
    if not instance.docs:
        return ""

    snippets = []
    used = 0
    for doc in instance.docs[:max_docs]:
        # `or` rather than a .get() default: a key that is present but None
        # or empty must still fall through to the next source.
        title = doc.get("title") or ""
        body = (doc.get("content") or doc.get("summary") or "")[:max_doc_chars]
        text = f"[{title}] {body}" if title else body
        if not text:
            # Nothing to contribute; skipping avoids stray separator spaces.
            continue
        if used + len(text) > max_len:
            break
        snippets.append(text)
        used += len(text)
    return " ".join(snippets)


def format_for_unifiedqa(instance: AERInstance, include_context: bool = True) -> str:
    """Format an instance as a UnifiedQA multiple-choice input string.

    Layout: ``question \\n (a) opt1 (b) opt2 (c) opt3 (d) opt4``, followed by
    `` \\n context`` when a context is requested and available. The separator
    is the two-character sequence backslash + "n", not a newline.

    The question is placed first and the whole string is lower-cased, as the
    UnifiedQA input-format notes require. Keeping the context last also means
    that tokenizer truncation removes context rather than the question or the
    options.

    Args:
        instance: Instance providing ``target_event``, ``options`` and ``docs``.
        include_context: Whether to append the document context.

    Returns:
        The lower-cased model input string.
    """
    question = f"What is the most plausible cause of: {instance.target_event}"
    options = " ".join(f"({k}) {v}" for k, v in instance.options.items())

    parts = [question, options]
    if include_context:
        context = prepare_context(instance)
        if context:
            parts.append(context)

    return " \\n ".join(parts).lower()


def format_for_roberta(instance: AERInstance, include_context: bool = True) -> dict:
    """Format an instance as a SWAG-style multiple-choice record.

    When a context is available the first sentence is
    ``"Question: ...\\n\\nContext: ..."``. The question comes first because
    the tokenizer truncates from the right: with the context in front, a long
    context would push the question out of the encoded window.

    Gold letters are stripped and upper-cased before being mapped to indices;
    tokens that are not one of A-D (for example an empty string left by a
    trailing comma) are ignored rather than raising ``KeyError``.

    Args:
        instance: Instance providing ``id``, ``target_event``, ``options``,
            ``golden_answer`` and ``docs``.
        include_context: Whether to append the document context.

    Returns:
        Dict with keys ``id``, ``sent1``, ``sent2`` (always ""),
        ``ending0``..``ending3``, ``label`` (index of the first gold option or
        None) and ``labels_all`` (all gold indices or None).
    """
    question = f"What is the most plausible cause of: {instance.target_event}"

    if include_context:
        context = prepare_context(instance)
        if context:
            question = f"Question: {question}\n\nContext: {context}"

    label = None
    labels_all = None
    if instance.golden_answer:
        label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
        normalized = (answer.strip().upper() for answer in instance.golden_answer)
        known = [label_map[answer] for answer in normalized if answer in label_map]
        if known:
            labels_all = known
            # Single-label training target: the first gold option.
            label = known[0]

    return {
        "id": instance.id,
        "sent1": question,
        "sent2": "",
        "ending0": instance.options["A"],
        "ending1": instance.options["B"],
        "ending2": instance.options["C"],
        "ending3": instance.options["D"],
        "label": label,
        "labels_all": labels_all
    }


# Smoke-test both formatters on the first training instance (context disabled)
sample = train_data[0]
print("=== UnifiedQA 格式 ===")
# Show at most the first 500 characters of the UnifiedQA input
print(format_for_unifiedqa(sample, include_context=False)[:500])
print("\n=== RoBERTa MCQA 格式 ===")
roberta_sample = format_for_roberta(sample, include_context=False)
# Show a preview of the question and of option A only
print(f"Question: {roberta_sample['sent1'][:200]}...")
print(f"Options: {roberta_sample['ending0'][:50]}...")

## 3. 评估函数

In [None]:
import re

def calculate_score(pred: Set[str], gold: Set[str]) -> float:
    """Score one prediction against the gold option set.

    Returns 1.0 for an exact match, 0.5 when the prediction is a non-empty
    proper subset of the gold set, and 0.0 otherwise (including an empty
    prediction or any wrongly selected option).
    """
    if pred and pred == gold:
        return 1.0
    if pred and pred < gold:  # proper subset: partially correct, nothing wrong
        return 0.5
    return 0.0


_OPTION_LETTERS = frozenset("ABCD")
# Connectives allowed inside a letters-only answer such as "a and c".
_FILLER_WORDS = frozenset({"AND", "OR"})
# Stand-alone upper-case option letter inside free text.
_UPPER_LETTER_RE = re.compile(r"\b[A-D]\b")
# Parenthesised option letter in either case, e.g. "(b)".
_PAREN_LETTER_RE = re.compile(r"\(([A-Da-d])\)")
# Cue word followed by a letter that ends the clause, e.g. "the answer is c.".
_CUED_LETTER_RE = re.compile(
    r"\b(?:answer|option|choice)\s*(?:is)?\s*:?\s*\(?([a-d])\)?(?=\s*(?:[.,;:!?]|$))",
    re.IGNORECASE,
)


def parse_prediction(text: str) -> Set[str]:
    """Extract the option letters (A-D) named in a model output.

    Two cases are handled:

    * The output consists only of option letters, optionally with numbers,
      punctuation and "and"/"or" ("b", "A, C", "(a) and (d)"): every letter is
      accepted regardless of case.
    * The output is free text: only explicit markers count, namely a
      stand-alone upper-case letter, a parenthesised letter, or a letter
      introduced by "answer"/"option"/"choice". A lower-case "a" inside a
      sentence is therefore treated as the English article, not as option A.

    Args:
        text: Raw model output; empty or None yields an empty set.

    Returns:
        Set of upper-case option letters; empty when none is recognised.
    """
    if not text:
        return set()

    words = [w.upper() for w in re.findall(r"\w+", text)]
    letters = [w for w in words if not w.isdigit() and w not in _FILLER_WORDS]
    if letters and all(w in _OPTION_LETTERS for w in letters):
        return set(letters)

    found = set(_UPPER_LETTER_RE.findall(text))
    found.update(m.upper() for m in _PAREN_LETTER_RE.findall(text))
    found.update(m.upper() for m in _CUED_LETTER_RE.findall(text))
    return found


def evaluate(preds: List[Set[str]], golds: List[Set[str]]) -> dict:
    """Aggregate per-instance scores into summary metrics.

    Predictions and gold sets are paired positionally with ``zip``, so any
    surplus items in the longer list are ignored.

    Args:
        preds: Predicted option sets.
        golds: Gold option sets, aligned with ``preds``.

    Returns:
        Dict with ``score`` (mean score) and the fractions ``exact_match``
        (score 1.0), ``partial_match`` (score 0.5) and ``wrong`` (score 0.0).
        All four are 0.0 when there is nothing to score, instead of raising
        ``ZeroDivisionError``.
    """
    scores = [calculate_score(p, g) for p, g in zip(preds, golds)]
    n = len(scores)
    if n == 0:
        return {"score": 0.0, "exact_match": 0.0, "partial_match": 0.0, "wrong": 0.0}
    return {
        "score": sum(scores) / n,
        "exact_match": sum(1 for s in scores if s == 1.0) / n,
        "partial_match": sum(1 for s in scores if s == 0.5) / n,
        "wrong": sum(1 for s in scores if s == 0.0) / n
    }

## 4. UnifiedQA Baseline (零样本)

### 可用模型
- `allenai/unifiedqa-t5-small` (~250MB)
- `allenai/unifiedqa-t5-base` (~900MB) ✓ 推荐
- `allenai/unifiedqa-t5-large` (~3GB)
- `allenai/unifiedqa-v2-t5-base-1363200` (v2版本，更强)

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

class UnifiedQABaseline:
    """Zero-shot answer generator backed by a UnifiedQA (T5) checkpoint."""

    def __init__(self, model_name="allenai/unifiedqa-t5-base"):
        """Load the tokenizer and model, moving the model to GPU when available."""
        self.model_name = model_name
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"

        print(f"加载模型: {model_name}")
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        loaded_model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.model = loaded_model.to(self.device)
        self.model.eval()
        print(f"模型已加载到 {self.device}")

    def predict(self, inputs: List[str], batch_size: int = 8) -> List[str]:
        """Generate one answer string per input, processing ``batch_size`` at a time.

        Inputs are truncated to 512 tokens; answers are decoded from a 4-beam
        search limited to a length of 32, with special tokens removed.
        """
        answers = []
        batch_starts = range(0, len(inputs), batch_size)

        for start in tqdm(batch_starts, desc="UnifiedQA"):
            chunk = inputs[start:start + batch_size]

            model_inputs = self.tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(self.device)

            # Inference only: no gradients are needed.
            with torch.no_grad():
                generated = self.model.generate(
                    **model_inputs,
                    num_beams=4,
                    early_stopping=True,
                    max_length=32,
                )

            answers += self.tokenizer.batch_decode(generated, skip_special_tokens=True)

        return answers


def _match_output_to_option(output: str, options: Dict[str, str]) -> Set[str]:
    """Map one generated answer string onto an option letter.

    UnifiedQA produces the answer text rather than an option letter, so the
    output is matched to the option whose text shares the most words with it
    (token-set F1). A bare letter such as "b" or "(a)" is taken literally, and
    ``parse_prediction`` is the fallback when no option shares any word.
    """
    import re  # local import keeps this cell independent of earlier cells

    bare = re.fullmatch(r"\W*([A-Da-d])\W*", output)
    if bare:
        return {bare.group(1).upper()}

    out_tokens = set(re.findall(r"\w+", output.lower()))
    best_key, best_score = None, 0.0
    for key, option_text in options.items():
        opt_tokens = set(re.findall(r"\w+", option_text.lower()))
        shared = len(out_tokens & opt_tokens)
        if not shared:
            continue
        score = 2 * shared / (len(out_tokens) + len(opt_tokens))
        if score > best_score:
            best_key, best_score = key, score

    if best_key is not None:
        return {best_key}
    return parse_prediction(output)


def run_unifiedqa_eval(data: List[AERInstance],
                       model_name: str = "allenai/unifiedqa-t5-base",
                       include_context: bool = True,
                       max_samples: Optional[int] = None):
    """Run the zero-shot UnifiedQA baseline and print its metrics.

    Args:
        data: Instances to evaluate; at least one must carry a golden answer.
        model_name: UnifiedQA checkpoint to load.
        include_context: Whether document context is added to the inputs.
        max_samples: If truthy, only the first ``max_samples`` instances are used.

    Returns:
        ``(results, predictions, raw_outputs)``: the metric dict, one predicted
        option set per instance in ``data`` order, and the raw generated strings.

    Raises:
        ValueError: If no instance has a golden answer. This is checked before
            the model is loaded so no inference time is wasted.
    """

    if max_samples:
        data = data[:max_samples]

    if not any(inst.golden_answer for inst in data):
        raise ValueError("run_unifiedqa_eval needs at least one instance with a golden answer")

    # Prepare inputs
    inputs = [format_for_unifiedqa(inst, include_context) for inst in data]

    # Predict
    model = UnifiedQABaseline(model_name)
    raw_outputs = model.predict(inputs)
    predictions = [
        _match_output_to_option(out, inst.options)
        for out, inst in zip(raw_outputs, data)
    ]

    # Evaluate: pair each prediction with the gold set of the SAME instance and
    # skip unlabelled ones, so the two lists can never drift out of alignment.
    scored = [
        (pred, set(inst.golden_answer))
        for pred, inst in zip(predictions, data)
        if inst.golden_answer
    ]
    results = evaluate([pred for pred, _ in scored], [gold for _, gold in scored])

    print("\n" + "="*50)
    print(f"UnifiedQA Results ({model_name})")
    print("="*50)
    print(f"Score: {results['score']:.4f}")
    print(f"Exact Match: {results['exact_match']:.4f}")
    print(f"Partial Match: {results['partial_match']:.4f}")

    return results, predictions, raw_outputs

In [None]:
# 运行UnifiedQA评估 (取消注释运行)

# 使用base模型，不含context（更快）
# uqa_results, uqa_preds, uqa_raw = run_unifiedqa_eval(
#     dev_data,
#     model_name="allenai/unifiedqa-t5-base",
#     include_context=False,
#     max_samples=50  # 先测试50个样本
# )

# 使用v2模型，含context（更准确）
# uqa_results, uqa_preds, uqa_raw = run_unifiedqa_eval(
#     dev_data,
#     model_name="allenai/unifiedqa-v2-t5-base-1363200",
#     include_context=True
# )

## 5. RoBERTa MCQA Baseline (需要微调)

### 可用模型
- `roberta-base` (~500MB)
- `roberta-large` (~1.4GB)
- `microsoft/deberta-v3-base` (~400MB) ✓ 推荐

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMultipleChoice


class MCQADataset(Dataset):
    """Torch dataset that encodes each record as four (question, option) pairs."""

    def __init__(self, data: List[dict], tokenizer, max_length=256):
        """Keep the records, the tokenizer and the per-pair padded length."""
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx``.

        Returns a dict with ``input_ids`` and ``attention_mask`` (one row per
        option, each padded/truncated to ``max_length``) and the record ``id``.
        Labelled records additionally carry ``labels`` (scalar tensor) and
        ``labels_all`` (list of gold indices).
        """
        example = self.data[idx]
        prompt = example["sent1"]
        endings = [example[f"ending{option_idx}"] for option_idx in range(4)]

        ids_rows = []
        mask_rows = []
        for ending in endings:
            encoded = self.tokenizer(
                prompt, ending,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            # The tokenizer returns a leading batch axis of size 1; drop it.
            squeezed = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
            ids_rows.append(squeezed["input_ids"])
            mask_rows.append(squeezed["attention_mask"])

        sample = {
            "input_ids": torch.stack(ids_rows),
            "attention_mask": torch.stack(mask_rows),
            "id": example["id"]
        }

        label = example.get("label")
        if label is not None:
            sample["labels"] = torch.tensor(label)
            sample["labels_all"] = example.get("labels_all", [label])

        return sample


class RoBERTaMCQA:
    """Fine-tunable multiple-choice classifier (RoBERTa/DeBERTa style encoders)."""

    def __init__(self, model_name="roberta-base"):
        """Load the tokenizer and multiple-choice model, using GPU when available."""
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        print(f"加载模型: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMultipleChoice.from_pretrained(model_name).to(self.device)
        print(f"模型已加载到 {self.device}")

    @staticmethod
    def _collate_examples(samples):
        """Batch MCQADataset samples without mangling ``labels_all``.

        The default DataLoader collate transposes lists of lists and rejects
        lists of unequal length, which breaks the variable-length
        ``labels_all`` entries of multi-answer questions. Here tensors are
        stacked while ``id`` and ``labels_all`` stay per-sample Python lists
        (``None`` marks an unlabelled sample). ``labels`` is included only
        when every sample in the batch has one.
        """
        batch = {
            "input_ids": torch.stack([s["input_ids"] for s in samples]),
            "attention_mask": torch.stack([s["attention_mask"] for s in samples]),
            "id": [s["id"] for s in samples],
            "labels_all": [
                [int(l) for l in s["labels_all"]] if s.get("labels_all") is not None else None
                for s in samples
            ],
        }
        if all("labels" in s for s in samples):
            batch["labels"] = torch.stack([s["labels"] for s in samples])
        return batch

    def train(self, train_data: List[dict], dev_data: Optional[List[dict]] = None,
              epochs=3, batch_size=4, lr=2e-5):
        """Fine-tune the model with AdamW and a linear warmup/decay schedule.

        Args:
            train_data: Records as produced by ``format_for_roberta``; records
                without a label are skipped.
            dev_data: Optional records evaluated after every epoch.
            epochs: Number of passes over the training data.
            batch_size: Batch size for training and for the dev evaluation.
            lr: Peak learning rate.

        Raises:
            ValueError: If ``train_data`` contains no labelled record.
        """
        from transformers import get_linear_schedule_with_warmup

        # Unlabelled records cannot contribute a loss; drop them up front.
        labelled = [ex for ex in train_data if ex.get("label") is not None]
        if not labelled:
            raise ValueError("train_data contains no labelled examples")

        train_dataset = MCQADataset(labelled, self.tokenizer)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  collate_fn=self._collate_examples)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps
        )

        print(f"开始训练: {len(labelled)} 样本, {epochs} epochs")

        for epoch in range(epochs):
            # evaluate() switches to eval mode, so re-enable training each epoch.
            self.model.train()
            total_loss = 0

            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                optimizer.zero_grad()

                outputs = self.model(
                    input_ids=batch["input_ids"].to(self.device),
                    attention_mask=batch["attention_mask"].to(self.device),
                    labels=batch["labels"].to(self.device)
                )

                loss = outputs.loss
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                total_loss += loss.item()

            print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

            if dev_data:
                dev_results = self.evaluate(dev_data, batch_size)
                print(f"Dev Score: {dev_results['score']:.4f}")

    def evaluate(self, data: List[dict], batch_size=4) -> dict:
        """Predict one option per record and score the labelled ones.

        Args:
            data: Records as produced by ``format_for_roberta``.
            batch_size: Evaluation batch size.

        Returns:
            Metric dict from the module-level ``evaluate``; all metrics are 0.0
            when no record carries gold labels.
        """
        self.model.eval()
        dataset = MCQADataset(data, self.tokenizer)
        loader = DataLoader(dataset, batch_size=batch_size,
                            collate_fn=self._collate_examples)

        predictions = []
        goldens = []
        label_map = {0: "A", 1: "B", 2: "C", 3: "D"}

        with torch.no_grad():
            for batch in tqdm(loader, desc="Evaluating"):
                outputs = self.model(
                    input_ids=batch["input_ids"].to(self.device),
                    attention_mask=batch["attention_mask"].to(self.device)
                )

                preds = outputs.logits.argmax(dim=-1).cpu().tolist()

                # Keep prediction and gold set paired per sample; unlabelled
                # samples are skipped so the two lists stay aligned.
                for pred, labels in zip(preds, batch["labels_all"]):
                    if labels is None:
                        continue
                    predictions.append({label_map[pred]})
                    goldens.append({label_map[int(l)] for l in labels})

        if not goldens:
            return {"score": 0.0, "exact_match": 0.0, "partial_match": 0.0, "wrong": 0.0}
        return evaluate(predictions, goldens)

In [None]:
# Convert both splits to the RoBERTa multiple-choice record format (with context)
train_roberta = [format_for_roberta(inst, include_context=True) for inst in train_data]
dev_roberta = [format_for_roberta(inst, include_context=True) for inst in dev_data]

# Report how many records each split produced
print(f"RoBERTa格式 - 训练: {len(train_roberta)}, 开发: {len(dev_roberta)}")

In [None]:
# 训练RoBERTa (取消注释运行)

# roberta = RoBERTaMCQA("roberta-base")
# roberta.train(
#     train_roberta,
#     dev_roberta,
#     epochs=3,
#     batch_size=4
# )

# 最终评估
# final_results = roberta.evaluate(dev_roberta)
# print(f"\nFinal Dev Score: {final_results['score']:.4f}")

## 6. 零样本对比: 不训练直接预测

如果你想快速测试，可以使用RoBERTa的零样本预测（不微调）。注意：未微调时多选分类头是随机初始化的，得分大致相当于随机猜测，只适合用来检查流程能否跑通。

In [None]:
# 零样本RoBERTa (不推荐，效果较差)
# roberta_zs = RoBERTaMCQA("roberta-base")
# zs_results = roberta_zs.evaluate(dev_roberta[:50])
# print(f"Zero-shot Score: {zs_results['score']:.4f}")

## 7. 结果对比

In [None]:
import pandas as pd

# Collect the results (fill in your own experiment results)
results_summary = {
    # NOTE(review): these random-baseline numbers are hard-coded here, not computed in this notebook — confirm before reporting.
    "Random Baseline": {"score": 0.15, "exact_match": 0.10, "partial_match": 0.10},
    # "UnifiedQA-base": uqa_results,
    # "UnifiedQA-v2": uqa_v2_results,
    # "RoBERTa-base (fine-tuned)": final_results,
}

# One row per system, one column per metric
df = pd.DataFrame(results_summary).T
print("\n=== 结果对比 ===")
print(df.round(4).to_string())

## 8. 生成提交文件

In [None]:
def generate_submission(predictions: List[Set[str]], ids: List[str], output_path: str):
    """Write predictions to a tab-separated submission file.

    Each line is ``<id>\\t<answer>``, where the answer is the sorted,
    comma-joined option letters. An empty prediction falls back to "A" so that
    every question receives an answer.

    Args:
        predictions: One option set per question.
        ids: Question ids, aligned with ``predictions``.
        output_path: Destination file; written as UTF-8, like the dataset
            files this notebook reads.

    Raises:
        ValueError: If ``predictions`` and ``ids`` differ in length. Zipping
            them anyway would silently drop rows from the submission.
    """
    if len(predictions) != len(ids):
        raise ValueError(
            f"got {len(predictions)} predictions for {len(ids)} ids"
        )

    with open(output_path, 'w', encoding='utf-8') as f:
        for id_, pred in zip(ids, predictions):
            answer = ",".join(sorted(pred)) if pred else "A"
            f.write(f"{id_}\t{answer}\n")
    print(f"提交文件已保存到: {output_path}")

# 生成提交 (取消注释)
# test_data = load_aer_data(f"{DATASET_DIR}/test_data")
# test_inputs = [format_for_unifiedqa(inst) for inst in test_data]
# model = UnifiedQABaseline("allenai/unifiedqa-t5-base")  # 注意: 需要先创建模型实例, run_unifiedqa_eval 内部的 model 是局部变量
# test_outputs = model.predict(test_inputs)
# test_preds = [parse_prediction(out) for out in test_outputs]
# generate_submission(test_preds, [inst.id for inst in test_data], "submission.txt")