In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the model; the name indicates an
# uncased DistilBERT fine-tuned on SST-2 for English text.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer and the sequence-classification model from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Example sentence reused by the cells below.
sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize step by step rather than calling `tokenizer(...)` directly:
# text -> tokens -> vocabulary ids -> tensor.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# `ids` is a flat list, so this is a 1-D tensor with no batch dimension.
input_ids = torch.tensor(ids)

In [3]:
# Show the sub-word tokens next to the ids they were mapped to.
(tokens, input_ids)

(['i',
  "'",
  've',
  'been',
  'waiting',
  'for',
  'a',
  'hugging',
  '##face',
  'course',
  'my',
  'whole',
  'life',
  '.'],
 tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]))

In [6]:
# Calling the model on the bare 1-D tensor, i.e. `model(input_ids)`, fails.
# Adding a leading dimension of size 1 (a batch holding one sequence) works.
model(input_ids.unsqueeze(0))

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [7]:
# Map the ids back to text. In the output below the text is lower-cased and
# the sub-word pieces ('hugging', '##face') are merged back into one word.
tokenizer.decode(input_ids)

"i've been waiting for a huggingface course my whole life."

In [9]:
# Calling the tokenizer directly runs the whole pipeline in one step and
# returns PyTorch tensors ("pt"). Compared with the manual `input_ids`, the
# output below has a batch dimension, two extra ids (101 first, 102 last;
# presumably this vocabulary's special tokens) and an `attention_mask`.
tokenizer(sequence, return_tensors="pt")

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
# Tensors must be rectangular: rows of different lengths cannot be stacked.
# The failure is the point of this cell, so the error is caught and displayed
# instead of being left uncaught, which would stop "Restart & Run All" here.
ragged_ids = [
    [200, 200, 200],
    [200, 200],
]

# Holds the "ExceptionType: message" text, or None if no error was raised.
ragged_error = None
try:
    torch.tensor(ragged_ids)
except ValueError as err:
    ragged_error = f"{type(err).__name__}: {err}"
    print(ragged_error)

ValueError: expected sequence of length 3 at dim 1 (got 2)

## attention_mask

### 引出 attention_mask (motivating the need for `attention_mask`)

In [11]:
# Reload the classifier from the same checkpoint as before.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Two sequences of different length, each wrapped as a batch of one ...
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
# ... and the same two sequences in a single batch, the shorter one padded
# with the tokenizer's padding id so that both rows have equal length.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Print the logits for each input in turn. In the output below, the first
# batched row matches sequence 1, but the second row does not match
# sequence 2 on its own: nothing here tells the model to ignore the padding
# id, which is what `attention_mask` is for.
for id_batch in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(id_batch)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [12]:
# Id the tokenizer uses for padding (the value appended to the short row of
# `batched_ids`).
tokenizer.pad_token_id

0

In [13]:
# The padding token itself, as a string.
tokenizer.pad_token

'[PAD]'

In [16]:
# Decode id 0 given in three forms (a list, a bare int and a tensor); the
# output below shows the same string for all three.
tuple(tokenizer.decode(pad_ids) for pad_ids in ([0], 0, torch.tensor([0])))

('[PAD]', '[PAD]', '[PAD]')