# Handling multiple sequences (PyTorch)

Install the Transformers and Datasets libraries to run this notebook.

In [1]:
! pip install datasets transformers[sentencepiece]

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[?25l[K     |█▎                              | 10 kB 22.1 MB/s eta 0:00:01[K     |██▌                             | 20 kB 27.3 MB/s eta 0:00:01[K     |███▊                            | 30 kB 13.0 MB/s eta 0:00:01[K     |█████                           | 40 kB 9.3 MB/s eta 0:00:01[K     |██████▏                         | 51 kB 5.2 MB/s eta 0:00:01[K     |███████▍                        | 61 kB 5.9 MB/s eta 0:00:01[K     |████████▋                       | 71 kB 5.6 MB/s eta 0:00:01[K     |██████████                      | 81 kB 6.3 MB/s eta 0:00:01[K     |███████████▏                    | 92 kB 6.4 MB/s eta 0:00:01[K     |████████████▍                   | 102 kB 5.1 MB/s eta 0:00:01[K     |█████████████▋                  | 112 kB 5.1 MB/s eta 0:00:01[K     |██████████████▉                 | 122 kB 5.1 MB/s eta 0:00:01[K     |████████████████                | 133 kB 5.1 MB/s eta 0:00:01

In [2]:
# The model does not accept a single sequence as a flat (1-D) tensor of ids:
# the ids must be wrapped in an outer list so the input is a batch (a list of lists).
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)  # flat tensor: no batch dimension

# This call is expected to fail. The error is the point of the demonstration,
# so catch and display it instead of letting it abort a Restart & Run All.
try:
    model(input_ids)
except IndexError as err:
    print(f"IndexError: {err}")

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

IndexError: ignored

In [3]:
# Calling the tokenizer directly returns a batched tensor of input ids
# (note the extra ids 101 and 102 at both ends compared with the manual
# tokenize + convert_tokens_to_ids path).
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
batched_input_ids = tokenized_inputs["input_ids"]
print(batched_input_ids)

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [4]:
# Pass the input ids to the model, this time as a batch of one sequence,
# and read the logits from the output.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# unsqueeze(0) adds the leading batch dimension (same result as torch.tensor([ids])).
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

output = model(input_ids=input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)


In [5]:
# A batch of two sequences of different lengths (3 ids and 2 ids).
# The rows are ragged, so this nested list is not rectangular.

batched_ids = [[200] * 3, [200] * 2]

In [6]:
# Sequences must be padded to a common length to get a rectangular matrix.

padding_id = 100

# The shorter row receives one trailing padding id so both rows have length 3.
batched_ids = [[200, 200, 200], [200, 200] + [padding_id]]

In [7]:
# Run each sequence on its own, then both together as a padded batch.
# The padded row of the batch yields different logits than the same sequence
# alone, so the attention on padded tokens must be masked for the model to
# ignore them in its attention layers.

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]

# Same three forward passes, in the same order: sequence 1, sequence 2, batch.
for candidate_ids in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(candidate_ids)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward>)
tensor([[ 1.5694, -1.3895],
        [ 1.3373, -1.2163]], grad_fn=<AddmmBackward>)


In [8]:
# Mask the attention on padded tokens: 1 = attend to this position, 0 = ignore it.
# With the mask, the padded row gives the same logits as the 2-id sequence alone.

batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]

# Same shape as batched_ids; the single 0 lines up with the padding id.
attention_mask = [[1, 1, 1], [1, 1, 0]]

outputs = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward>)


In [11]:
# Example of truncation
#
# Truncation applies to the list of token ids, not to the characters of the
# raw text: slicing the string `sequence` would keep its first 5 characters
# and overwrite the original sentence. Tokenize first, then cut the id list,
# and leave `sequence` untouched so this cell can be re-run safely.

max_sequence_length = 5

sequence_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence))
truncated_ids = sequence_ids[:max_sequence_length]

In [21]:
# Once more: padding a batch without an attention mask changes the logits of
# the padded (shorter) sentence compared with running it on its own.
# NOTE: `torch` and `model` are expected to be defined by earlier cells.

from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
             "I've been waiting for a HuggingFace course my whole life.",
             "I hate this."
]
# One list of tokens / ids per sentence.
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(sentence_tokens) for sentence_tokens in tokens]
print(tokens[0])
print(tokens[1])
print('\n')

print(ids[0])
print(ids[1])
print('\n')

input_1 = torch.tensor([ids[0]])
input_2 = torch.tensor([ids[1]])

# Build the padded batch from `ids` rather than re-typing the ids by hand, and
# pad with the tokenizer's own pad token id instead of a hard-coded 0.
max_length = max(len(sentence_ids) for sentence_ids in ids)
padded_ids = [
    sentence_ids + [tokenizer.pad_token_id] * (max_length - len(sentence_ids))
    for sentence_ids in ids
]
input_3 = torch.tensor(padded_ids)

# No attention mask is passed on purpose: the padding is attended to.
output_1 = model(input_1)
output_2 = model(input_2)
output_3 = model(input_3)
print(output_1.logits)
print(output_2.logits)
print(output_3.logits)

['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
['i', 'hate', 'this', '.']


[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 1012]


tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)
tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)
tensor([[-2.7276,  2.8789],
        [ 1.5444, -1.3998]], grad_fn=<AddmmBackward>)


In [22]:
# Calling the tokenizer on a list of sentences with padding=True pads the
# shorter sentence and returns a matching attention_mask (0 on padded positions).

from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

encoded_batch = tokenizer(sentences, padding=True)
print(encoded_batch)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}
