# 推理逻辑数据集实战指南 (Practical Guide to Reasoning Datasets)

本notebook展示如何加载和使用各种推理逻辑数据集来训练模型。

## 环境准备

In [None]:
# 安装必要的库
# !pip install datasets transformers torch pandas numpy

import warnings
warnings.filterwarnings('ignore')

from datasets import load_dataset, concatenate_datasets
import pandas as pd
import json

## 1. 数学推理数据集

### GSM8K - 小学数学应用题

In [None]:
# Load the GSM8K dataset ("main" configuration) from the HuggingFace hub
gsm8k = load_dataset("gsm8k", "main")

# Show the dataset structure and the sizes of the train/test splits
print("数据集结构:", gsm8k)
print("\n训练集大小:", len(gsm8k["train"]))
print("测试集大小:", len(gsm8k["test"]))

# Show the first training sample: its "question" and "answer" fields
example = gsm8k["train"][0]
print("\n示例问题:")
print(example["question"])
print("\n答案和推理步骤:")
print(example["answer"])

### MATH Dataset - 数学竞赛题

In [None]:
# Load the MATH (competition_math) dataset
math_dataset = load_dataset("competition_math")

# Show the dataset structure
print("数据集结构:", math_dataset)

# Show the first training sample: problem, level, type and solution fields
example = math_dataset["train"][0]
print("\n问题:", example["problem"])
print("\n难度级别:", example["level"])
print("\n问题类型:", example["type"])
print("\n解答:", example["solution"][:200], "...")  # show only the first 200 characters

### MathQA - 带操作程序的数学题

In [None]:
# Load the MathQA dataset
mathqa = load_dataset("math_qa")

# Show the first training sample: problem text, rationale, options and
# the key of the correct option
example = mathqa["train"][0]
print("问题:", example["Problem"])
print("\n推理程序:", example["Rationale"])
print("\n选项:", example["options"])
print("\n正确答案:", example["correct"])

## 2. 逻辑推理数据集

### LogiQA - 逻辑推理

In [None]:
# Load the LogiQA dataset
logiqa = load_dataset("logiqa")

# Show the first training sample: context, query and lettered options
example = logiqa["train"][0]
print("上下文:", example["context"])
print("\n问题:", example["query"])
print("\n选项:")
for i, option in enumerate(example["options"]):
    # chr(65 + i) turns the 0-based option index into "A", "B", "C", ...
    print(f"  {chr(65+i)}: {option}")
# "correct_option" is used as a 0-based index and shown as a letter
print("\n正确答案:", chr(65 + example["correct_option"]))

### ProofWriter - 形式化逻辑证明

In [None]:
# Load the ProofWriter dataset
proofwriter = load_dataset("allenai/proofwriter")

# Show the first training sample: theory (premises), question and answer
example = proofwriter["train"][0]
print("理论/前提:")
print(example["theory"])
print("\n问题:", example["question"])
print("\n答案:", example["answer"])
# The proof is printed only when the sample actually carries a "proof" field
if "proof" in example:
    print("\n证明过程:", example["proof"])

## 3. 常识推理数据集

### CommonsenseQA

In [None]:
# Load the CommonsenseQA dataset
commonsenseqa = load_dataset("commonsense_qa")

# Show the first training sample: question, labelled choices and answer key
example = commonsenseqa["train"][0]
print("问题:", example["question"])
print("\n选项:")
# "choices" holds parallel "label" and "text" lists; pair them up for display
for label, text in zip(example["choices"]["label"], example["choices"]["text"]):
    print(f"  {label}: {text}")
print("\n正确答案:", example["answerKey"])

### PIQA - 物理常识推理

In [None]:
# Load the PIQA dataset
piqa = load_dataset("piqa")

# Show the first training sample: the goal and its two candidate solutions
example = piqa["train"][0]
print("目标/问题:", example["goal"])
print("\n选项A:", example["sol1"])
print("选项B:", example["sol2"])
# Label 0 is displayed as "A" (sol1); any other label is displayed as "B" (sol2)
print("\n正确答案:", "A" if example["label"] == 0 else "B")

## 4. 综合推理数据集

### ARC (AI2 Reasoning Challenge)

In [None]:
# Load the ARC dataset ("ARC-Challenge" configuration)
arc_challenge = load_dataset("ai2_arc", "ARC-Challenge")

# Show the first training sample: question, labelled choices and answer key
example = arc_challenge["train"][0]
print("问题:", example["question"])
print("\n选项:")
# "choices" holds parallel "label" and "text" lists; pair them up for display
for label, text in zip(example["choices"]["label"], example["choices"]["text"]):
    print(f"  {label}: {text}")
print("\n正确答案:", example["answerKey"])

### OpenBookQA

In [None]:
# Load the OpenBookQA dataset ("main" configuration)
openbookqa = load_dataset("openbookqa", "main")

# Show the first training sample: question stem, labelled choices and answer key
example = openbookqa["train"][0]
print("问题干:", example["question_stem"])
print("\n选项:")
# "choices" holds parallel "label" and "text" lists; pair them up for display
for label, text in zip(example["choices"]["label"], example["choices"]["text"]):
    print(f"  {label}: {text}")
print("\n正确答案:", example["answerKey"])

## 5. 代码推理数据集

### HumanEval - Python编程

In [None]:
# Load the HumanEval dataset
humaneval = load_dataset("openai_humaneval")

# Show the first sample of the "test" split (the only split read here):
# task id, prompt and the beginning of its test code
example = humaneval["test"][0]
print("任务ID:", example["task_id"])
print("\n函数签名和文档:")
print(example["prompt"])
print("\n测试用例:")
print(example["test"][:200], "...")  # show only part of the test code

### MBPP - Python基础编程

In [None]:
# Load the MBPP dataset
mbpp = load_dataset("mbpp")

# Show the first training sample: task id, description, tests and reference code
example = mbpp["train"][0]
print("任务ID:", example["task_id"])
print("\n问题描述:", example["text"])
print("\n测试用例:")
# "test_list" is iterated item by item, one test per output line
for test in example["test_list"]:
    print(f"  {test}")
print("\n参考代码:")
print(example["code"])

## 6. 思维链(Chain-of-Thought)数据集

### MetaMathQA

In [None]:
# Load the MetaMathQA dataset
metamath = load_dataset("meta-math/MetaMathQA")

# Show the first training sample: the query and the full response text
example = metamath["train"][0]
print("问题:", example["query"])
print("\n完整推理过程:")
print(example["response"])

## 7. 数据预处理示例

### 统一格式化多个数据集

In [None]:
def format_gsm8k(example):
    """Convert one GSM8K record into the shared instruction-tuning layout.

    Args:
        example: Mapping that provides the "question" and "answer" keys.

    Returns:
        A dict with the keys "instruction", "input", "output" and "dataset"
        (in that order); "input" is the question, "output" is the answer.
    """
    record = {"instruction": "Solve the following math problem step by step."}
    record["input"] = example["question"]
    record["output"] = example["answer"]
    record["dataset"] = "gsm8k"
    return record

def format_commonsenseqa(example):
    """Convert one CommonsenseQA record into the shared instruction layout.

    Args:
        example: Mapping with "question", "answerKey" and a "choices" mapping
            that holds parallel "label" and "text" sequences.

    Returns:
        A dict with the keys "instruction", "input", "output" and "dataset".
        "input" is the question followed by one "label: text" line per
        option; "output" is the answer key.
    """
    option_table = example["choices"]
    option_lines = (
        f"{label}: {text}"
        for label, text in zip(option_table["label"], option_table["text"])
    )
    choices = "\n".join(option_lines)

    record = {}
    record["instruction"] = "Answer the following question by selecting the correct option."
    record["input"] = f"{example['question']}\n\nOptions:\n{choices}"
    record["output"] = example["answerKey"]
    record["dataset"] = "commonsenseqa"
    return record

def format_arc(example):
    """Convert one ARC record into the shared instruction layout.

    Args:
        example: Mapping with "question", "answerKey" and a "choices" mapping
            that holds parallel "label" and "text" sequences.

    Returns:
        A dict with the keys "instruction", "input", "output" and "dataset".
        "input" is the question followed by one "label: text" line per
        option; "output" is the answer key.
    """
    labels = example["choices"]["label"]
    texts = example["choices"]["text"]

    rendered = []
    for label, text in zip(labels, texts):
        rendered.append(f"{label}: {text}")
    choices = "\n".join(rendered)

    prompt = f"{example['question']}\n\nOptions:\n{choices}"
    return {
        "instruction": "Answer the following science question.",
        "input": prompt,
        "output": example["answerKey"],
        "dataset": "arc",
    }

# Apply the GSM8K formatter to the first training sample and pretty-print it
print("格式化示例:")
print("\n=== GSM8K ===")
formatted_gsm8k = format_gsm8k(gsm8k["train"][0])
# ensure_ascii=False keeps non-ASCII characters readable in the printed JSON
print(json.dumps(formatted_gsm8k, indent=2, ensure_ascii=False))

### 创建混合训练数据集

In [None]:
# 示例：混合多个数据集
def _load_formatted_subset(path, config, limit, format_fn):
    """Load a train split, keep its first ``limit`` rows and format them.

    Args:
        path: Dataset path passed to ``load_dataset``.
        config: Dataset configuration name, or None when there is none.
        limit: Maximum number of rows to keep (capped at the split size).
        format_fn: Per-example function returning the unified fields.

    Returns:
        The formatted subset containing only the columns produced by
        ``format_fn``.
    """
    data = load_dataset(path, config, split="train")
    subset = data.select(range(min(limit, len(data))))
    # Drop the source-specific columns: without this each subset keeps its
    # own raw fields, so the subsets no longer share one schema when merged.
    return subset.map(format_fn, remove_columns=data.column_names)


def create_mixed_dataset(sample_sizes=None):
    """Build a shuffled mix of GSM8K, CommonsenseQA and ARC-Challenge.

    Each source's train split is truncated to the requested number of rows,
    converted with its formatter, and the results are concatenated and
    shuffled with a fixed seed (42).

    Args:
        sample_sizes: Optional dict mapping "gsm8k", "commonsenseqa" and
            "arc" to the number of rows to take from each. Missing keys fall
            back to 1000; sizes larger than a split are capped at its length.

    Returns:
        The concatenated, shuffled dataset with the columns "instruction",
        "input", "output" and "dataset".
    """
    sizes = {"gsm8k": 1000, "commonsenseqa": 1000, "arc": 1000}
    # Merge instead of replace so a partial dict does not raise KeyError
    sizes.update(sample_sizes or {})

    sources = [
        ("gsm8k", "gsm8k", "main", format_gsm8k),
        ("commonsenseqa", "commonsense_qa", None, format_commonsenseqa),
        ("arc", "ai2_arc", "ARC-Challenge", format_arc),
    ]
    datasets_list = [
        _load_formatted_subset(path, config, sizes[key], format_fn)
        for key, path, config, format_fn in sources
    ]

    mixed_dataset = concatenate_datasets(datasets_list)
    # Fixed seed keeps the shuffled order reproducible between runs
    return mixed_dataset.shuffle(seed=42)

# Build the mixed dataset (left commented out because it downloads data;
# uncomment the two lines below to actually run it)
print("创建混合数据集...")
# mixed = create_mixed_dataset()
# print(f"混合数据集大小: {len(mixed)}")
print("（取消注释上面两行以实际创建数据集）")

## 8. 数据统计分析

In [None]:
def analyze_dataset(dataset, name, text_field="question"):
    """Print basic statistics for a dataset.

    Prints the number of samples, whitespace-token length statistics
    (mean / min / max) for ``text_field`` when that column exists, and the
    list of column names.

    Args:
        dataset: Object exposing ``column_names``, ``len()`` and iteration
            over dict-like rows.
        name: Display name used in the report header.
        text_field: Column whose text length is measured.
    """
    print(f"\n=== {name} 数据集分析 ===")
    print(f"样本总数: {len(dataset)}")

    # Text length statistics, measured in whitespace-separated tokens
    if text_field in dataset.column_names:
        lengths = [len(item[text_field].split()) for item in dataset]
        if lengths:
            print(f"平均文本长度: {sum(lengths)/len(lengths):.2f} 词")
            print(f"最短文本: {min(lengths)} 词")
            print(f"最长文本: {max(lengths)} 词")
        else:
            # An empty dataset has no lengths to aggregate; dividing by
            # zero or calling min()/max() on an empty list would raise.
            print("文本长度统计: 无样本")

    # Column listing
    print(f"数据字段: {dataset.column_names}")

# 示例分析
# analyze_dataset(gsm8k["train"], "GSM8K", "question")
# analyze_dataset(commonsenseqa["train"], "CommonsenseQA", "question")

## 9. 训练数据准备

### 为微调准备数据

In [None]:
def prepare_for_training(dataset, format_fn, output_file="training_data.jsonl"):
    """Format every record and dump the result as a JSONL file.

    All records are formatted before the output file is opened, then each
    formatted record is written as one JSON document per line.

    Args:
        dataset: Iterable of raw records.
        format_fn: Function applied to each record; its return value must be
            JSON-serializable.
        output_file: Path of the JSONL file to write (overwritten if present).

    Returns:
        The list of formatted records, in input order.
    """
    records = [format_fn(row) for row in dataset]

    # One JSON document per line; ensure_ascii=False keeps non-ASCII text as-is
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

    print(f"已保存 {len(records)} 条数据到 {output_file}")
    return records

# Usage example: only prints the call to make; nothing is written to disk here
print("数据准备完成！")
print("\n使用示例:")
print("formatted_data = prepare_for_training(gsm8k['train'], format_gsm8k, 'gsm8k_train.jsonl')")

## 10. 提示工程(Prompt Engineering)示例

In [None]:
# Chain-of-Thought prompt template
# Placeholders: {question}, {reasoning_steps}, {answer}
cot_template = """Question: {question}

Let's solve this step by step:

{reasoning_steps}

Therefore, the answer is: {answer}
"""

# Few-shot prompt template with two worked examples
# Placeholders: {example1_q}, {example1_a}, {example2_q}, {example2_a}, {question}
few_shot_template = """Here are some examples:

Example 1:
Q: {example1_q}
A: {example1_a}

Example 2:
Q: {example2_q}
A: {example2_a}

Now solve this:
Q: {question}
A: 
"""

# Self-consistency prompt template
# Placeholders: {question}, {method1}, {method2}, {answer}
self_consistency_template = """Solve the following problem in multiple ways and verify your answer:

Problem: {question}

Method 1:
{method1}

Method 2:
{method2}

Consistent answer: {answer}
"""

# List the templates defined above
print("提示工程模板已定义！")
print("\n可用模板:")
print("1. cot_template - 思维链提示")
print("2. few_shot_template - Few-shot学习提示")
print("3. self_consistency_template - 自洽性提示")

## 总结

本notebook展示了如何:
1. 加载各种推理数据集
2. 查看数据集结构和样本
3. 格式化和预处理数据
4. 创建混合训练数据集
5. 准备用于模型训练的数据
6. 应用不同的提示工程技术

### 下一步建议:
1. 根据你的具体任务选择合适的数据集
2. 实验不同的数据混合比例
3. 尝试不同的提示工程技术
4. 使用这些数据集微调你的模型
5. 在测试集上评估模型性能

### 参考资源:
- HuggingFace Datasets文档: https://huggingface.co/docs/datasets
- Transformers库文档: https://huggingface.co/docs/transformers
- 相关论文和博客文章（见reasoning_datasets_guide.md）