In [1]:
import torch
from transformers import BartTokenizer, BartForQuestionAnswering
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Checkpoint id on the Hugging Face Hub (per its name: BART-large fine-tuned on SQuAD v1).
version = "valhalla/bart-large-finetuned-squadv1"

# Toy example: a question plus the passage the answer span is extracted from.
question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"

# BartTokenizer

In [4]:
# Load the tokenizer that ships with the checkpoint named by `version`.
tokenizer: BartTokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path=version,
)
tokenizer

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


BartTokenizer(name_or_path='valhalla/bart-large-finetuned-squadv1', vocab_size=50265, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True)

# BartForQuestionAnswering

BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits).

In [5]:
# Load the span-classification BART checkpoint and move it to `device`.
# Half precision is only requested on CUDA: `device` may fall back to the CPU,
# where float16 kernels are not reliably available, so float32 is used there.
model: BartForQuestionAnswering = BartForQuestionAnswering.from_pretrained(
    version,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)
model

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


BartForQuestionAnswering(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm(

## 问题和上下文放在一起放入模型

In [6]:
# Show the sub-word pieces of the question and of the passage, one list per line.
print(
    tokenizer.tokenize(question),
    tokenizer.tokenize(text),
    sep="\n",
)

['Who', 'Ġwas', 'ĠJim', 'ĠH', 'enson', '?']
['Jim', 'ĠH', 'enson', 'Ġwas', 'Ġa', 'Ġnice', 'Ġpuppet']


In [7]:
# Encode question and passage together as a single pair sequence.
# Only the device is passed to `.to(...)`: `BatchEncoding.to` accepts a device, not a
# dtype, and `input_ids` must stay integer token ids for the embedding lookup.
inputs = tokenizer(question, text, return_tensors="pt", return_length=True).to(device)

# Inspect what the tokenizer produced: available keys, token ids, mask and sequence length.
print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"])
print(inputs["length"])

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[    0, 12375,    21,  2488,   289, 13919,   116,     2,     2, 24021,
           289, 13919,    21,    10,  2579, 29771,     2]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([17], device='cuda:0')


In [8]:
# Put the model in evaluation mode, then run a single forward pass under inference mode.
model.eval()
with torch.inference_mode():
    outputs = model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
    )
# The returned object is a Seq2SeqQuestionAnsweringModelOutput.
outputs

Seq2SeqQuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ -9.3060,  -9.3060, -13.0659, -12.4787, -12.6400, -13.9874, -13.7258,
         -12.5306, -12.3611, -11.3757, -10.6383, -11.5808, -11.9820,  -7.7097,
          -7.0618,  -7.2340, -11.0562]], device='cuda:0'), end_logits=tensor([[ -8.9331,  -8.9331, -13.0835, -14.1188, -14.0800, -10.6040, -13.1219,
         -13.6587, -12.9884, -12.6344, -13.5677,  -8.5649, -12.7503, -12.0183,
          -8.5100,  -5.6637, -11.3245]], device='cuda:0'), past_key_values=((tensor([[[[ 3.0625e-01,  4.8447e-01,  1.9885e+00,  ...,  1.0054e+00,
            2.8460e+00, -1.3105e+00],
          [-6.2117e-02,  6.7858e-01,  1.6820e+00,  ...,  9.1172e-01,
            2.4835e+00, -1.6280e+00],
          [-8.8196e-01, -1.1487e+00, -2.9363e+00,  ..., -2.7585e+00,
           -4.9829e+00, -2.7216e+00],
          ...,
          [-1.5336e-01, -6.6063e-01, -1.0952e+00,  ..., -5.6688e+00,
           -5.7846e+00,  3.5386e+00],
          [-2.2358e+00, -2.1846e+0

In [9]:
# Start-position scores: shape, raw logits, and the index of the highest-scoring token.
print(
    outputs.start_logits.shape,
    outputs.start_logits,
    outputs.start_logits.argmax(),
    sep="\n",
)

torch.Size([1, 17])
tensor([[ -9.3060,  -9.3060, -13.0659, -12.4787, -12.6400, -13.9874, -13.7258,
         -12.5306, -12.3611, -11.3757, -10.6383, -11.5808, -11.9820,  -7.7097,
          -7.0618,  -7.2340, -11.0562]], device='cuda:0')
tensor(14, device='cuda:0')


In [10]:
# End-position scores: shape, raw logits, and the index of the highest-scoring token.
print(
    outputs.end_logits.shape,
    outputs.end_logits,
    outputs.end_logits.argmax(),
    sep="\n",
)

torch.Size([1, 17])
tensor([[ -8.9331,  -8.9331, -13.0835, -14.1188, -14.0800, -10.6040, -13.1219,
         -13.6587, -12.9884, -12.6344, -13.5677,  -8.5649, -12.7503, -12.0183,
          -8.5100,  -5.6637, -11.3245]], device='cuda:0')
tensor(15, device='cuda:0')


In [11]:
# Cut the predicted span out of the first (and only) sequence in the batch.
# The `+ 1` makes the predicted end position inclusive.
predict_answer_tokens = inputs["input_ids"][0][
    outputs.start_logits.argmax() : outputs.end_logits.argmax() + 1
]
predict_answer_tokens

tensor([ 2579, 29771], device='cuda:0')

In [12]:
# Turn the selected token ids back into text, dropping special tokens.
tokenizer.decode(
    token_ids=predict_answer_tokens,
    skip_special_tokens=True,
)

' nice puppet'

## 问题和上下文分开放入模型(问题送入 encoder,上下文送入 decoder;结果可能不对)

In [13]:
# Tokenize the question and the passage separately: the question will feed the
# encoder and the passage will feed the decoder in the next cell.
encoder_inputs = tokenizer(
    question,
    return_tensors="pt",
    return_length=True,
).to(device)
decoder_inputs = tokenizer(
    text,
    return_tensors="pt",
    return_length=True,
).to(device)

# Token counts of the two independent sequences.
print(encoder_inputs["length"], decoder_inputs["length"], sep="\n")

tensor([8], device='cuda:0')
tensor([9], device='cuda:0')


In [14]:
# Forward pass with explicit decoder inputs: question ids go to the encoder,
# passage ids go to the decoder.
with torch.inference_mode():
    outputs = model(
        input_ids=encoder_inputs.input_ids,
        attention_mask=encoder_inputs.attention_mask,
        decoder_input_ids=decoder_inputs.input_ids,
        decoder_attention_mask=decoder_inputs.attention_mask,
    )

In [15]:
# Start-position scores for the split-input run: shape, raw logits, and argmax index.
print(
    outputs.start_logits.shape,
    outputs.start_logits,
    outputs.start_logits.argmax(),
    sep="\n",
)

torch.Size([1, 9])
tensor([[ -9.0251, -11.6106, -12.0686, -13.9226, -10.9058, -10.5395, -11.3237,
         -11.1091, -10.6531]], device='cuda:0')
tensor(0, device='cuda:0')


In [16]:
# End-position scores for the split-input run: shape, raw logits, and argmax index.
print(
    outputs.end_logits.shape,
    outputs.end_logits,
    outputs.end_logits.argmax(),
    sep="\n",
)

torch.Size([1, 9])
tensor([[ -8.3461, -10.4400,  -6.6506, -10.2771, -11.3568, -10.9265, -11.8570,
         -11.0036, -11.8208]], device='cuda:0')
tensor(2, device='cuda:0')


In [17]:
# The predicted positions are read against the decoder (passage) token ids here.
# The `+ 1` makes the predicted end position inclusive.
predict_answer_tokens = decoder_inputs["input_ids"][0][
    outputs.start_logits.argmax() : outputs.end_logits.argmax() + 1
]
predict_answer_tokens

tensor([    0, 24021,   289], device='cuda:0')

In [18]:
# Turn the selected token ids back into text, dropping special tokens.
tokenizer.decode(
    token_ids=predict_answer_tokens,
    skip_special_tokens=True,
)

'Jim H'

# AutoTokenizer

In [19]:
# `AutoTokenizer` is only a factory: `from_pretrained` returns the concrete tokenizer
# class for the checkpoint (a BartTokenizerFast here), never an AutoTokenizer instance,
# so the variable is deliberately not annotated as `AutoTokenizer`.
tokenizer = AutoTokenizer.from_pretrained(version)
tokenizer

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


BartTokenizerFast(name_or_path='valhalla/bart-large-finetuned-squadv1', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [20]:
# Encode question and passage as one pair sequence and move the batch to `device`.
inputs = tokenizer(
    question,
    text,
    return_tensors="pt",
    return_length=True,
).to(device)
inputs

{'input_ids': tensor([[    0, 12375,    21,  2488,   289, 13919,   116,     2,     2, 24021,
           289, 13919,    21,    10,  2579, 29771,     2]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'), 'length': tensor([17], device='cuda:0')}

# AutoModelForQuestionAnswering

In [21]:
# `AutoModelForQuestionAnswering` is a factory; for this checkpoint it builds a
# BartForQuestionAnswering, so that is the type the variable is annotated with.
# Half precision is only requested on CUDA: `device` may fall back to the CPU,
# where float16 kernels are not reliably available, so float32 is used there.
model: BartForQuestionAnswering = AutoModelForQuestionAnswering.from_pretrained(
    version,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)
model

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


BartForQuestionAnswering(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm(

In [22]:
# Put the model in evaluation mode, then run a single forward pass under inference mode.
model.eval()
with torch.inference_mode():
    outputs = model(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
    )

In [23]:
# Start-position scores: shape, raw logits, and the index of the highest-scoring token.
print(
    outputs.start_logits.shape,
    outputs.start_logits,
    outputs.start_logits.argmax(),
    sep="\n",
)

torch.Size([1, 17])
tensor([[ -9.3060,  -9.3060, -13.0659, -12.4787, -12.6400, -13.9874, -13.7258,
         -12.5306, -12.3611, -11.3757, -10.6383, -11.5808, -11.9820,  -7.7097,
          -7.0618,  -7.2340, -11.0562]], device='cuda:0')
tensor(14, device='cuda:0')


In [24]:
# End-position scores: shape, raw logits, and the index of the highest-scoring token.
print(
    outputs.end_logits.shape,
    outputs.end_logits,
    outputs.end_logits.argmax(),
    sep="\n",
)

torch.Size([1, 17])
tensor([[ -8.9331,  -8.9331, -13.0835, -14.1188, -14.0800, -10.6040, -13.1219,
         -13.6587, -12.9884, -12.6344, -13.5677,  -8.5649, -12.7503, -12.0183,
          -8.5100,  -5.6637, -11.3245]], device='cuda:0')
tensor(15, device='cuda:0')


In [25]:
# Cut the predicted span out of the first (and only) sequence in the batch.
# The `+ 1` makes the predicted end position inclusive.
predict_answer_tokens = inputs["input_ids"][0][
    outputs.start_logits.argmax() : outputs.end_logits.argmax() + 1
]
predict_answer_tokens

tensor([ 2579, 29771], device='cuda:0')

In [26]:
# Turn the selected token ids back into text, dropping special tokens.
tokenizer.decode(
    token_ids=predict_answer_tokens,
    skip_special_tokens=True,
)

' nice puppet'