# QA pipeline

In [1]:
import os

# 设置代理
os.environ['http_proxy'] = 'http://127.0.0.1:7893'
os.environ['https_proxy'] = 'http://127.0.0.1:7893'
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7893'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7893'
os.environ['no_proxy'] = '127.0.0.1,localhost'
os.environ['NO_PROXY'] = '127.0.0.1,localhost'

In [2]:
from transformers import pipeline

question_answerer = pipeline("question-answering")
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=context)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cuda:0


{'score': 0.9804227044842264,
 'start': 78,
 'end': 106,
 'answer': 'Jax, PyTorch, and TensorFlow'}

In [3]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# 准备输入
# tokenizer 会自动将 question 和 context 组合成模型需要的格式：
# [CLS] question [SEP] context [SEP]
inputs = tokenizer(question, context, return_tensors="pt")

# 模型推理
# outputs 包含 start_logits 和 end_logits
# start_logits: 每个 token 作为答案开始位置的分数
# end_logits: 每个 token 作为答案结束位置的分数
outputs = model(**inputs)

# ========== 从 outputs 还原答案（这就是 pipeline 内部做的事）==========

# 1. 获取最高分的开始和结束位置
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# 找到最高分的位置索引
start_idx = torch.argmax(start_scores)
end_idx = torch.argmax(end_scores) 

# 2. 获取答案的 token IDs
answer_ids = inputs.input_ids[0][start_idx:end_idx + 1]

# 3. 将 token IDs 解码回文本
answer = tokenizer.decode(answer_ids)
print("answer", answer)

# 4. 计算置信度分数（使用 softmax）
start_prob = torch.softmax(start_scores, dim=-1)[0, start_idx].item()
end_prob = torch.softmax(end_scores, dim=-1)[0, end_idx].item()
score = start_prob * end_prob  # 简单相乘作为总分

# 5. 获取答案在原文中的字符位置（可选）
# 使用 offset_mapping 可以映射回原文位置
inputs_with_offsets = tokenizer(
    question, context, 
    return_tensors="pt",
    return_offsets_mapping=True
)

# 获取答案在 context 中的实际位置
offsets = inputs_with_offsets["offset_mapping"][0]
# 注意：需要跳过 question 部分的 tokens
sep_idx = inputs.input_ids[0].tolist().index(tokenizer.sep_token_id)  # 第一个 [SEP] 的位置
if start_idx > sep_idx:  # 确保答案在 context 中
    # 计算相对于 context 的偏移
    context_start_char = offsets[start_idx][0].item() - len(question) - 2  # 减去 question 和标记
    context_end_char = offsets[end_idx][1].item() - len(question) - 2
    answer_from_context = context[context_start_char:context_end_char]
    
print(f"答案: {answer}")
print(f"置信度: {score:.4f}")
print(f"开始位置: {start_idx}, 结束位置: {end_idx}")
print(f"答案 tokens: {tokenizer.convert_ids_to_tokens(answer_ids)}")

answer Jax, PyTorch, and TensorFlow
答案: Jax, PyTorch, and TensorFlow
置信度: 0.9803
开始位置: 23, 结束位置: 35
答案 tokens: ['Jax', ',', 'P', '##y', '##T', '##or', '##ch', ',', 'and', 'Ten', '##sor', '##F', '##low']


In [None]:
# 更简洁的版本（最常用的方式）
def get_answer_from_outputs(tokenizer, inputs, outputs):
    """
    从模型输出中提取答案
    这个函数展示了 pipeline 内部的核心逻辑
    """
    # 获取最可能的答案位置
    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits)
    
    # 提取答案 tokens 并解码
    answer_tokens = inputs.input_ids[0][start_idx:end_idx + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    
    # 计算分数
    start_score = torch.softmax(outputs.start_logits, dim=-1)[0, start_idx]
    end_score = torch.softmax(outputs.end_logits, dim=-1)[0, end_idx]
    score = (start_score * end_score).item()
    
    return {
        "answer": answer,
        "score": score,
        "start": start_idx.item(),
        "end": end_idx.item()
    }

# 使用简化函数
result = get_answer_from_outputs(tokenizer, inputs, outputs)
print(f"\n简化版结果: {result}")

# 对比 pipeline 的结果
from transformers import pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
pipeline_result = qa_pipeline(question=question, context=context)
print(f"Pipeline 结果: {pipeline_result}")

In [4]:
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question_answerer(question=question, context=long_context)

{'score': 0.9717117227373819,
 'start': 1892,
 'end': 1919,
 'answer': 'Jax, PyTorch and TensorFlow'}

In [10]:
# ========== 处理长文本：需要分块处理 ==========
# 当 context 超过模型最大长度（384 tokens）时，需要使用滑动窗口技术

inputs = tokenizer(
    question,
    long_context,
    stride=128,         # 滑动窗口重叠 128 个 tokens（避免答案被切断）
    max_length=384,     # 每个块的最大长度
    padding="longest",  # 填充到最长序列
    truncation="only_second",  # 只截断 context，保留完整 question
    return_overflowing_tokens=True,  # 返回溢出的部分（生成多个块）
    return_offsets_mapping=True,      # 返回字符偏移映射（用于定位原文位置）
)

# 移除不需要的字段
_ = inputs.pop("overflow_to_sample_mapping")  # 样本映射关系
offsets = inputs.pop("offset_mapping")        # 保存偏移映射，后面用于还原答案位置

# 转换为 PyTorch tensors
inputs = inputs.convert_to_tensors("pt")
print(f"输入形状: {inputs['input_ids'].shape}")
print(f"说明: 生成了 {inputs['input_ids'].shape[0]} 个块，每个块最多 {inputs['input_ids'].shape[1]} 个 tokens")

输入形状: torch.Size([2, 384])
说明: 生成了 2 个块，每个块最多 384 个 tokens


In [11]:
# 对每个块进行推理
# 注意：现在有 2 个块，所以输出也是 2 个结果
outputs = model(**inputs)

start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(f"Start logits 形状: {start_logits.shape}")  # [2, 384] = 2个块，每个384个位置
print(f"End logits 形状: {end_logits.shape}")
print(f"解释: 每个块都会产生一个答案预测")

Start logits 形状: torch.Size([2, 384])
End logits 形状: torch.Size([2, 384])
解释: 每个块都会产生一个答案预测


In [12]:
# ========== 关键步骤：过滤无效位置 ==========
# 只有 context 部分的 tokens 才可能是答案，需要屏蔽其他位置

# 1. 获取每个 token 的类型（0=特殊标记/question, 1=context, None=padding）
sequence_ids = inputs.sequence_ids()

# 2. 创建掩码：标记哪些位置不可能是答案
mask = [i != 1 for i in sequence_ids]  # True = 不是 context 的部分
mask[0] = False  # [CLS] token 也要屏蔽
# 添加 padding 掩码
mask = torch.logical_or(torch.tensor(mask)[None], (inputs["attention_mask"] == 0))

# 3. 将无效位置的分数设为极小值（-10000），这样 softmax 后概率接近 0
start_logits[mask] = -10000
end_logits[mask] = -10000

# 4. 计算概率
start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)
end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)

# ========== 找出每个块的最佳答案 ==========
candidates = []
for i, (start_probs, end_probs) in enumerate(zip(start_probabilities, end_probabilities)):
    # 计算所有开始-结束位置组合的分数矩阵
    scores = start_probs[:, None] * end_probs[None, :]
    
    # torch.triu: 保留上三角矩阵（确保 end >= start）
    # argmax: 找到分数最高的位置
    idx = torch.triu(scores).argmax().item()
    
    # 从一维索引转换回二维坐标
    start_idx = idx // scores.shape[1]
    end_idx = idx % scores.shape[1]
    score = scores[start_idx, end_idx].item()
    
    candidates.append((start_idx, end_idx, score))
    print(f"块 {i}: 开始={start_idx}, 结束={end_idx}, 分数={score:.4f}")

print(f"\n候选答案: {candidates}")

# ========== 还原每个候选答案的文本 ==========
print("\n每个块的答案:")
for i, (candidate, offset) in enumerate(zip(candidates, offsets)):
    start_token, end_token, score = candidate
    start_char, _ = offset[start_token]
    _, end_char = offset[end_token]
    answer = long_context[start_char:end_char]
    result = {"answer": answer, "start": start_char, "end": end_char, "score": score}
    print(f"块 {i}: {result}")

# ========== 为什么忽略第一个答案？ ==========
print("\n" + "="*50)
print("为什么第一个答案通常被忽略？")
print("1. 第一个块的答案分数很低 (0.34)，第二个块分数高 (0.97)")
print("2. 第一个块可能不包含问题的答案，模型被迫选择了一个不相关的内容")
print("3. 第二个块包含了真正的答案（文档末尾提到的三个库）")
print("4. Pipeline 会自动选择所有块中分数最高的答案，所以返回第二个")
print("\n最终答案（分数最高的）:")
best_idx = max(range(len(candidates)), key=lambda i: candidates[i][2])
best_candidate, best_offset = candidates[best_idx], offsets[best_idx]
start_token, end_token, score = best_candidate
start_char, _ = best_offset[start_token]
_, end_char = best_offset[end_token]
answer = long_context[start_char:end_char]
print(f"答案: '{answer}'")
print(f"分数: {score:.4f}")
print(f"位置: [{start_char}:{end_char}]")

块 0: 开始=0, 结束=18, 分数=0.3387
块 1: 开始=173, 结束=184, 分数=0.9715

候选答案: [(0, 18, 0.3386705815792084), (173, 184, 0.9714869856834412)]

每个块的答案:
块 0: {'answer': '\n🤗 Transformers: State of the Art NLP', 'start': 0, 'end': 37, 'score': 0.3386705815792084}
块 1: {'answer': 'Jax, PyTorch and TensorFlow', 'start': 1892, 'end': 1919, 'score': 0.9714869856834412}

为什么第一个答案通常被忽略？
1. 第一个块的答案分数很低 (0.34)，第二个块分数高 (0.97)
2. 第一个块可能不包含问题的答案，模型被迫选择了一个不相关的内容
3. 第二个块包含了真正的答案（文档末尾提到的三个库）
4. Pipeline 会自动选择所有块中分数最高的答案，所以返回第二个

最终答案（分数最高的）:
答案: 'Jax, PyTorch and TensorFlow'
分数: 0.9715
位置: [1892:1919]
