In [1]:
# Install the notebook's dependencies; -q keeps pip's output quiet.
# Upgrade transformers and the PyTorch packages (no versions are pinned on this line).
!pip install -q --upgrade transformers torch torchvision torchaudio
# Pin tokenizers to exactly 0.13.3.
# NOTE(review): transformers is upgraded without a pin on the line above — confirm
# the resulting transformers release still accepts this tokenizers version.
!pip install -q tokenizers==0.13.3
# Extra packages. transformers is listed again here although it was already installed above.
# NOTE(review): bitsandbytes, accelerate, gradio and thread6 are not referenced by the
# cells visible in this section — confirm they are still required.
!pip install -q bitsandbytes transformers accelerate gradio thread6

[0m

#### In the last section we learned about tokenizers. They are how the model is able to understand the text we give it.

#### We observed how tokenizers work by running inference on a single, short sequence.

#### This, however, raises a few questions:
   * How do we handle multiple sequences?
   * How do we handle multiple sequences of different lengths?
   * Are vocabulary indices the only inputs that allow a model to work well?
   * Is there such a thing as too long a sequence?

In [2]:
# We previously saw how sequences get translated into lists of numbers.
# Here we convert such a list into a tensor and send it straight to the model,
# to show that a flat (1-dimensional) tensor is not accepted as model input.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)  # built from a flat list, so there is no batch dimension

# This call is expected to fail. The failure is the point of the cell, so it is
# caught and printed instead of being left uncaught: an uncaught exception would
# stop "Restart Kernel -> Run All" at this cell.
# NOTE(review): the recorded run raised IndexError; ValueError is also caught in case
# the exception type differs between library versions — confirm.
try:
    model(input_ids)
except (IndexError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

IndexError: too many indices for tensor of dimension 1

In [11]:
# NOTE:
# We repeated every step from the tokenizer notebook, yet the model call failed.
#
# The cause: we sent a single sequence to the model, whereas 🤗 Transformers models
# expect a batch of sentences by default.
# Look closely at the output of this cell: the tokenizer did not just convert the
# list of input IDs into a tensor, it also added a dimension on top of it.
tokenized_inputs = tokenizer(sequence, return_tensors='pt')
encoded_ids = tokenized_inputs["input_ids"]  # the tensor of input IDs returned by the tokenizer
print("Shape of Tensor:", encoded_ids.shape)
print("Tensor:", encoded_ids)
print("\n"+"The tensor is 2 dimensional")

Shape of Tensor: torch.Size([1, 16])
Tensor: tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])

The tensor is 2 dimensional


In [8]:
# Second attempt: the same ids, this time with an extra leading (batch) dimension.

# Note: the imports and the model download are skipped because an earlier cell already did them.
sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# unsqueeze(0) inserts the new leading dimension, giving shape (1, len(ids))
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

# The model now receives a batch containing exactly one sequence
output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


#### "Batching" is the act of sending multiple sentences through the model, all at once

   * If you only have one sentence, you can build a batch with a single sequence
   
```python
batched_ids = [ids, ids]
```

In [13]:
# Exercise - pass the batch [ids, ids] from above into the model.
# The batch holds the same sequence twice, so the output should contain the
# single-sequence logits twice (one row per copy).
batched_inputs = torch.tensor([ids] * 2)
print("Input IDs:", batched_inputs)

batched_output = model(batched_inputs)
print("Logits:", batched_output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


#### Batching allows the model to work when you feed it multiple sentences. Using multiple sequences is just as simple as building a batch with a single sequence.

#### Note: When you batch two or more sentences together, they may have different lengths.

#### Tensors must be rectangular, so a list of input IDs whose rows have different lengths cannot be converted directly into a tensor.

#### To work around this problem we'll have to "pad" the inputs

In [None]:
# A list of lists whose rows differ in length (3 ids vs. 2 ids), like this one,
# cannot be converted into a tensor:
batched_ids = [[200, 200, 200], [200, 200]]

In [14]:
# To work around this issue we use "padding" to give the batch a rectangular shape.
#
# Padding makes every sentence the same length by appending a "padding token" to
# the sentences that have fewer values.
#
# Example: given 10 sentences of 10 words and 1 sentence of 20 words, padding
# brings every sentence up to 20 words.
padding_id = 100

# The shorter second row is filled up with padding_id to match the first row's length
batched_ids = [[200, 200, 200], [200, 200, padding_id]]

In [39]:
# The tokenizer exposes its padding token ID as 'tokenizer.pad_token_id'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]

# Run each sequence on its own first, then both together as one padded batch
for id_rows in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(id_rows)).logits)

# Something is wrong with the logits of the padded batch:
# its second row should have matched the logits printed for sequence2 alone

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


[[200, 200, 200], [200, 200, 0]]

#### The key feature of Transformer models is their attention layers, which "contextualize" each token

#### These Attention-Layers will take into account the padding tokens since they attend to all of the tokens of a sequence

#### To fix the previous issue we need to tell those attention layers to ignore the padding tokens.

#### This is done by using an "attention mask"

#### Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: 1s indicate that the corresponding tokens
#### should be attended to, and 0s indicate that the corresponding tokens should not be attended to (i.e., they are ignored by the model's attention layers)

In [16]:
# Same padded batch as before, now accompanied by an attention mask:
# 1 marks a position the model should attend to, 0 marks the padding position.
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]
attention_mask = [[1, 1, 1], [1, 1, 0]]

ids_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)  # same shape as ids_tensor

outputs = model(ids_tensor, attention_mask=mask_tensor)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [45]:
# Exercise: pad two sentences by hand, build the attention mask and get the logits.
#
# Note: tokenize() + convert_tokens_to_ids() do not add the special start/end ids that
# calling the tokenizer directly adds, so the logits printed here are not expected
# to equal those obtained from a direct tokenizer call on the same sentences.

ex_sen1 = "I’ve been waiting for a HuggingFace course my whole life."
ex_sen2 = "I hate this so much!"

# Convert the sentences to tokens
extoken1 = tokenizer.tokenize(ex_sen1)
extoken2 = tokenizer.tokenize(ex_sen2)

# Convert the tokens to ids (plain Python lists, not tensors yet)
ex_ids1 = tokenizer.convert_tokens_to_ids(extoken1)
ex_ids2 = tokenizer.convert_tokens_to_ids(extoken2)

# Record the true lengths BEFORE padding. The attention mask is built from these,
# rather than by comparing ids against the pad id afterwards, so a genuine token
# that happens to share the pad id can never be masked out by mistake.
ex_lengths = [len(ex_ids1), len(ex_ids2)]
max_length = max(ex_lengths)

# Pad both sequences on the right up to the maximum length
ex_ids1 = ex_ids1 + [tokenizer.pad_token_id] * (max_length - len(ex_ids1))
ex_ids2 = ex_ids2 + [tokenizer.pad_token_id] * (max_length - len(ex_ids2))

# Both rows now have the same length, so they can form a rectangular batch
exbatched_ids = [ex_ids1, ex_ids2]

# 1 for every real token, 0 for every padding position appended above
exattention_masks = [
    [1] * real_length + [0] * (max_length - real_length)
    for real_length in ex_lengths
]

print(model(torch.tensor(exbatched_ids), attention_mask=torch.tensor(exattention_masks)))

SequenceClassifierOutput(loss=None, logits=tensor([[-2.5720,  2.6852],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [48]:
# Simpler approach to the exercise above: let the tokenizer do the tokenization,
# the padding and the attention mask in a single call.
ex_sen1 = "I’ve been waiting for a HuggingFace course my whole life."
ex_sen2 = "I hate this so much!"

# Call the tokenizer directly on the list of sentences. This is the same entry point
# used earlier in this notebook for a single sentence, and it replaces the legacy
# batch_encode_plus() method.
encoded_batch = tokenizer(  # 'encoded_batch' is dictionary-like, with 'input_ids' and 'attention_mask' keys
    [ex_sen1, ex_sen2],
    padding='longest',  # Pads to the longest sequence in the batch
    return_attention_mask=True,
    return_tensors="pt"  # Returns PyTorch tensors
)

input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

print(model(input_ids, attention_mask=attention_masks))


SequenceClassifierOutput(loss=None, logits=tensor([[-1.5979,  1.6390],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


#### With Transformer models there is a limit to the length of the sequences we can pass to the model. Most models handle sequences of up to 512 or 1024
#### tokens, and will crash when asked to process longer sequences. There are two solutions to this problem:
   * Use a model with a longer supported sequence length
   * Truncate your sequence