# SemEval 2026 Task 12: Abductive Event Reasoning
## Baseline 实验 Notebook

本 Notebook 列出了运行 AER 任务的多种 baseline 方法:
1. **OpenAI API** (GPT-4o, GPT-4o-mini)
2. **Anthropic API** (Claude-3.5-Sonnet) — 本 Notebook 暂未提供示例代码
3. **HuggingFace** (Llama-3.1, Qwen2.5, etc.)
4. **Ollama** (本地部署) — 本 Notebook 暂未提供示例代码
5. **Random Baseline** (随机基线)

## 1. 环境配置

In [None]:
# 安装依赖
!pip install -q openai anthropic transformers torch tqdm

In [None]:
# Download the dataset repository into the current working directory
!git clone https://github.com/sooo66/semeval2026-task12-dataset.git

## 2. 数据加载和探索

In [None]:
import json
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Optional, Set

@dataclass
class AERInstance:
    """A single Abductive Event Reasoning (AER) task instance."""
    # Identifier of the question.
    id: str
    # Identifier of the topic; the loader uses it to look up the documents.
    topic_id: str
    # Text of the observed event whose cause(s) must be selected.
    target_event: str
    # Candidate option texts keyed by option letter ("A".."D" in the loader).
    options: Dict[str, str]
    # Gold answer entries, or None when the record carries no gold answer.
    golden_answer: Optional[List[str]] = None
    # Documents attached to the topic; the prompt builder reads the
    # "title", "content" and "summary" keys of each entry.
    docs: Optional[List[Dict]] = None


def _parse_golden_answer(raw) -> Optional[List[str]]:
    """Normalise a raw ``golden_answer`` field into a list of option letters.

    Accepts either a comma-separated string ("A, B") or a JSON list
    (["A", "B"]). Entries are stripped and upper-cased, and empty entries
    (e.g. from a trailing comma) are dropped. Returns None when nothing
    usable remains.
    """
    if not raw:
        return None
    parts = raw if isinstance(raw, (list, tuple)) else str(raw).split(",")
    letters = [str(part).strip().upper() for part in parts]
    letters = [letter for letter in letters if letter]
    return letters or None


def load_aer_data(data_dir: str) -> List[AERInstance]:
    """Load an AER dataset split from a directory.

    Args:
        data_dir: Directory containing ``docs.json`` (a JSON list of topic
            records) and ``questions.jsonl`` (one question per line).

    Returns:
        One ``AERInstance`` per non-blank line of ``questions.jsonl``, each
        carrying the ``docs`` list of its topic (empty when the topic has no
        entry in ``docs.json``).

    Raises:
        FileNotFoundError: If either input file is missing.
        KeyError: If a record lacks ``id``, ``topic_id`` or ``target_event``.
    """
    data_path = Path(data_dir)

    # Index the topic records by topic_id so each question can find its docs.
    with open(data_path / "docs.json", 'r', encoding='utf-8') as f:
        docs_map = {item['topic_id']: item for item in json.load(f)}

    instances = []
    with open(data_path / "questions.jsonl", 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            data = json.loads(line)

            instances.append(AERInstance(
                id=data["id"],
                topic_id=data["topic_id"],
                target_event=data["target_event"],
                options={k: data.get(f"option_{k}", "") for k in "ABCD"},
                golden_answer=_parse_golden_answer(data.get("golden_answer")),
                docs=docs_map.get(data["topic_id"], {}).get("docs", []),
            ))

    return instances

In [None]:
# Load the training and development splits
train_data = load_aer_data("semeval2026-task12-dataset/train_data")
dev_data = load_aer_data("semeval2026-task12-dataset/dev_data")

print(f"训练集: {len(train_data)} 样本")
print(f"开发集: {len(dev_data)} 样本")

# Show the first training instance as an example
sample = train_data[0]
print("\n=== 样例 ===")
print(f"ID: {sample.id}")
print(f"目标事件: {sample.target_event}")
print("\n选项:")
for opt, text in sample.options.items():
    # Long option texts are cut to 100 characters and marked with an ellipsis
    if len(text) > 100:
        print(f"  {opt}: {text[:100]}...")
    else:
        print(f"  {opt}: {text}")
print(f"\n正确答案: {sample.golden_answer}")
print(f"相关文档数: {len(sample.docs or [])}")

## 3. 评估函数

In [None]:
import re

def calculate_score(prediction: Set[str], golden: Set[str]) -> float:
    """Score one prediction against its gold answer set.

    Returns 1.0 for an exact match, 0.5 when the prediction is a non-empty
    proper subset of the gold set, and 0.0 otherwise (including an empty
    prediction or any option outside the gold set).
    """
    # An empty prediction, or one containing a non-gold option, earns nothing.
    if not prediction or not prediction <= golden:
        return 0.0
    return 1.0 if prediction == golden else 0.5


def parse_prediction(pred_str: str) -> Set[str]:
    """Extract the selected option letters (A-D) from a model reply.

    Stand-alone upper-case letters are preferred, so that the English
    article "a" in a sentence such as "This is a direct cause: B" is not
    mistaken for option A. Only when the reply contains no stand-alone
    upper-case option letter is it matched case-insensitively, which still
    accepts replies like "a, c".

    Args:
        pred_str: Raw model output; ``None`` or an empty string is allowed.

    Returns:
        The set of option letters found, upper-cased; empty if none.
    """
    if not pred_str:
        return set()

    found = set(re.findall(r'\b[ABCD]\b', pred_str))
    if found:
        return found

    # Fallback for replies written entirely in lower case.
    return set(re.findall(r'\b[ABCD]\b', pred_str.upper()))


def evaluate(predictions: List[Set[str]], goldens: List[Set[str]]) -> dict:
    """Aggregate per-instance scores over a dataset.

    Args:
        predictions: Predicted option sets, one per instance.
        goldens: Gold option sets, aligned with ``predictions``.

    Returns:
        A dict with the mean ``score`` and the fractions of ``exact_match``
        (score 1.0), ``partial_match`` (score 0.5) and ``wrong`` instances.
        All four values are 0.0 when there is nothing to evaluate.
    """
    scores = [calculate_score(p, g) for p, g in zip(predictions, goldens)]
    n = len(scores)

    if n == 0:
        # Nothing to score: report zeros instead of dividing by zero.
        return {
            "score": 0.0,
            "exact_match": 0.0,
            "partial_match": 0.0,
            "wrong": 0.0
        }

    exact = scores.count(1.0)
    partial = scores.count(0.5)

    return {
        "score": sum(scores) / n,
        "exact_match": exact / n,
        "partial_match": partial / n,
        "wrong": (n - exact - partial) / n
    }

## 4. Prompt 模板

In [None]:
def create_prompt(instance: AERInstance, use_context: bool = True, max_context_len: int = 3000,
                  max_docs: int = 5, max_doc_chars: int = 600) -> tuple:
    """Build the (system_prompt, user_prompt) pair for one instance.

    Args:
        instance: The task instance to turn into a prompt.
        use_context: Whether to include the instance's documents.
        max_context_len: Character budget for all context documents together;
            documents are added in order until the next one would exceed it.
        max_docs: Maximum number of documents considered.
        max_doc_chars: Maximum number of content characters kept per document.

    Returns:
        A ``(system_prompt, user_prompt)`` tuple of strings.
    """
    # Triple single quotes are used so the closing double quote of the last
    # example ("A, B") is part of the prompt instead of being consumed by
    # the string terminator.
    system_prompt = '''You are an expert in causal reasoning. Given an observed event, identify the most plausible and direct cause(s) from the options.

Rules:
1. Select the DIRECT cause(s) of the target event
2. Multiple options may be correct if they are equally direct causes
3. The cause must happen BEFORE the effect
4. Distinguish between direct causes and indirect/background factors

Output format: Answer with ONLY the letter(s), separated by commas if multiple.
Example outputs: "A" or "A, B"'''

    sections = [f"Target Event: {instance.target_event}\n\n"]

    # Append context documents while they fit in the character budget.
    if use_context and instance.docs:
        context_parts = []
        total_len = 0
        for doc in instance.docs[:max_docs]:
            title = doc.get("title") or "Document"
            # Fall back to the summary when content is missing, empty or None.
            content = (doc.get("content") or doc.get("summary") or "")[:max_doc_chars]
            doc_text = f"[{title}]\n{content}"
            if total_len + len(doc_text) > max_context_len:
                break
            context_parts.append(doc_text)
            total_len += len(doc_text)

        if context_parts:
            sections.append("Context Documents:\n" + "\n\n".join(context_parts) + "\n\n")

    sections.append("Options:\n")
    sections.extend(f"{opt}. {text}\n" for opt, text in instance.options.items())
    sections.append("\nAnswer:")

    return system_prompt, "".join(sections)

## 5. Baseline 方法

### 方法 1: OpenAI API

In [None]:
# Configure the OpenAI API key
import os

api_key = "your-api-key-here"  # Replace with your API key

# Only write the value when it was actually replaced above, or when no key is
# set at all; a key already exported in the environment must not be
# overwritten by the placeholder.
if api_key != "your-api-key-here" or "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = api_key

In [None]:
from openai import OpenAI
from tqdm import tqdm

def run_openai_baseline(data: List[AERInstance],
                        model_name: str = "gpt-4o-mini",
                        use_context: bool = True,
                        max_samples: int = None) -> dict:
    """Run a baseline through the OpenAI chat-completions API.

    Each instance is turned into a system/user prompt pair, sent to
    ``model_name`` and the reply parsed into a set of option letters. A
    failed request is reported on stdout and counted as an empty prediction.

    Args:
        data: Instances to evaluate.
        model_name: Name of the OpenAI chat model.
        use_context: Whether prompts include the context documents.
        max_samples: If truthy, only the first ``max_samples`` instances run.

    Returns:
        The metrics dict produced by ``evaluate``.
    """
    client = OpenAI()
    subset = data[:max_samples] if max_samples else data

    predictions = []
    goldens = []

    for instance in tqdm(subset, desc=f"Running {model_name}"):
        system_prompt, user_prompt = create_prompt(instance, use_context)
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=chat,
                temperature=0,
                max_tokens=50,
            )
            pred = parse_prediction(completion.choices[0].message.content)
        except Exception as e:
            # Best effort: log the failure and score it as "no answer".
            print(f"Error on {instance.id}: {e}")
            pred = set()

        predictions.append(pred)
        goldens.append(set(instance.golden_answer or []))

    return evaluate(predictions, goldens)

# 运行示例 (取消注释来运行)
# results = run_openai_baseline(dev_data, model_name="gpt-4o-mini", max_samples=10)
# print(results)

### 方法 2: HuggingFace 本地模型 (推荐用于 Colab)

In [None]:
# Install the additional dependencies for the local HuggingFace baseline
!pip install -q accelerate bitsandbytes

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def run_hf_baseline(data: List[AERInstance],
                    model_name: str = "Qwen/Qwen2.5-7B-Instruct",
                    use_context: bool = True,
                    max_samples: int = None) -> dict:
    """Run a baseline with a local HuggingFace chat model.

    The model is loaded with 4-bit quantization and queried through a
    ``text-generation`` pipeline using greedy decoding. A failed generation
    is reported on stdout and counted as an empty prediction.

    Args:
        data: Instances to evaluate.
        model_name: HuggingFace model identifier.
        use_context: Whether prompts include the context documents.
        max_samples: If truthy, only the first ``max_samples`` instances run.

    Returns:
        The metrics dict produced by ``evaluate``.
    """
    # Imported here so this function does not depend on editing the
    # cell-level import line.
    from transformers import BitsAndBytesConfig

    print(f"加载模型: {model_name}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        # 4-bit quantization to save GPU memory. Passed as a config object
        # because the bare `load_in_4bit=True` keyword is deprecated.
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    )

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=50
    )

    if max_samples:
        data = data[:max_samples]

    predictions = []
    goldens = []

    for instance in tqdm(data, desc=f"Running {model_name.split('/')[-1]}"):
        system_prompt, user_prompt = create_prompt(instance, use_context)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        try:
            output = pipe(messages, do_sample=False)
            # With chat-style input the pipeline returns the whole
            # conversation; the last message is the model's reply.
            response = output[0]["generated_text"][-1]["content"]
            pred = parse_prediction(response)
        except Exception as e:
            # Best effort: log the failure and score it as "no answer".
            print(f"Error on {instance.id}: {e}")
            pred = set()

        predictions.append(pred)
        goldens.append(set(instance.golden_answer) if instance.golden_answer else set())

    return evaluate(predictions, goldens)

# 运行示例
# 推荐模型 (按显存需求从低到高):
# - Qwen/Qwen2.5-1.5B-Instruct (约3GB)
# - Qwen/Qwen2.5-7B-Instruct (约5GB with 4-bit)
# - meta-llama/Llama-3.1-8B-Instruct (约6GB with 4-bit)

# results = run_hf_baseline(dev_data, model_name="Qwen/Qwen2.5-7B-Instruct", max_samples=10)
# print(results)

### 方法 3: 简单 Random Baseline

In [None]:
import random

def run_random_baseline(data: List[AERInstance], seed: int = 42) -> dict:
    """Evaluate a random-guess baseline.

    For every instance one or two of the options A-D are picked uniformly
    at random. A private ``random.Random(seed)`` generator is used, so the
    result is reproducible and the global ``random`` state is left untouched.

    Args:
        data: Instances to evaluate.
        seed: Seed of the private random generator.

    Returns:
        The metrics dict produced by ``evaluate``.
    """
    rng = random.Random(seed)
    option_letters = ["A", "B", "C", "D"]

    predictions = []
    goldens = []

    for instance in data:
        # Pick one or two options at random.
        n_choices = rng.randint(1, 2)
        pred = set(rng.sample(option_letters, n_choices))

        predictions.append(pred)
        goldens.append(set(instance.golden_answer) if instance.golden_answer else set())

    return evaluate(predictions, goldens)

# Run the random baseline on the development set and report its metrics
random_results = run_random_baseline(dev_data)
print("Random Baseline 结果:")
for label, key in (("Score", "score"), ("Exact Match", "exact_match")):
    print(f"  {label}: {random_results[key]:.4f}")

## 6. 结果对比和分析

In [None]:
import pandas as pd

# Collect the metrics of every baseline that has been run
all_results = {
    "Random": random_results,
    # Uncomment to include further results:
    # "GPT-4o-mini": gpt_results,
    # "Qwen2.5-7B": qwen_results,
}

# One row per baseline, one column per metric, rounded to 4 decimals
df = pd.DataFrame(all_results).T.round(4)
print("\n=== 结果对比 ===")
print(df.to_string())

## 7. 错误分析

In [None]:
def analyze_errors(data: List[AERInstance],
                   predictions: List[Set[str]],
                   goldens: List[Set[str]],
                   n_examples: int = 5):
    """Print a summary of the instances that were not answered exactly.

    Every instance whose score is below 1.0 counts as an error. The total
    error count is printed, followed by the first ``n_examples`` errors
    with their gold options and the wrongly predicted options.

    Args:
        data: Evaluated instances.
        predictions: Predicted option sets, aligned with ``data``.
        goldens: Gold option sets, aligned with ``data``.
        n_examples: Number of error cases to print in detail.
    """

    def _preview(inst, opt, limit=80):
        # Unknown option letters yield an empty text instead of a KeyError;
        # the ellipsis is added only when the text was actually cut.
        text = inst.options.get(opt, "")
        return f"{text[:limit]}..." if len(text) > limit else text

    errors = []
    for inst, pred, gold in zip(data, predictions, goldens):
        score = calculate_score(pred, gold)
        if score < 1.0:
            errors.append({
                "instance": inst,
                "prediction": pred,
                "golden": gold,
                "score": score
            })

    total = len(data)
    # Guard against empty input so the summary line never divides by zero.
    error_rate = len(errors) / total * 100 if total else 0.0
    print(f"\n总错误数: {len(errors)}/{total} ({error_rate:.1f}%)")
    print(f"\n=== 错误案例分析 (前{n_examples}个) ===")

    for i, error in enumerate(errors[:n_examples]):
        inst = error["instance"]
        print(f"\n--- 案例 {i+1} ---")
        print(f"目标事件: {inst.target_event}")
        print(f"预测: {error['prediction']} | 正确: {error['golden']}")
        print("选项:")
        # Sorted so the listing is stable across runs (sets are unordered).
        for opt in sorted(error['golden']):
            print(f"  [正确] {opt}: {_preview(inst, opt)}")
        for opt in sorted(error['prediction'] - error['golden']):
            print(f"  [错误] {opt}: {_preview(inst, opt)}")

# 使用示例 (需要先运行某个baseline获得predictions)
# analyze_errors(dev_data, predictions, goldens)