In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the pretrained checkpoint; the tokenizer and the model below are both
# loaded from this same name.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# tokenizer from checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
)

# sequence-classification model from checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
)

In [3]:
sequence = "I've been waiting for a HuggingFace course my whole life"

# Two explicit steps: tokenize the text, then map the tokens to their ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# The model is called on a batch, so add a leading dimension of size 1
# (one sequence in the batch) to the flat list of ids.
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

# Forward pass; the classification scores are exposed as `.logits`.
output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166]])
Logits: tensor([[-3.1398,  3.3515]], grad_fn=<AddmmBackward0>)


In [4]:
# with batch size as 2
# The same list of ids is repeated, so both rows of the batch are identical.
input_ids = torch.tensor([ids] * 2)

output = model(input_ids)
print("Logits:", output.logits)

Logits: tensor([[-3.1398,  3.3515],
        [-3.1398,  3.3515]], grad_fn=<AddmmBackward0>)


In [5]:
# pad tokens
# Uses the `model` and `tokenizer` already loaded from `checkpoint` in an earlier
# cell; loading the same checkpoint a second time here only repeated that work.

sequence_ids_1 = [[200, 200, 200]]
sequence_ids_2 = [[200, 200]]

# Batch the two sequences together. Rows of a tensor must have equal length, so the
# shorter sequence is extended on the right with the tokenizer's pad token id.
# The rows are derived from the sequences above so the three inputs cannot drift apart.
batched_ids = [
    list(sequence_ids_1[0]),
    sequence_ids_2[0] + [tokenizer.pad_token_id],  # pad token fills the missing position
]

# No attention mask is passed in any of these calls. For the padded batch the pad
# token is therefore treated like any other input token, so the second row of the
# batched logits is not expected to match the logits of `sequence_ids_2` on its own
# (see the transformers troubleshooting guide on unmasked padding tokens).
print(model(torch.tensor(sequence_ids_1)).logits)  # first sequence alone
print(model(torch.tensor(sequence_ids_2)).logits)  # second sequence alone
print(model(torch.tensor(batched_ids)).logits)  # both sequences, second one padded

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [6]:
# putting it together
# The tokenizer and the classification model are both created from the same
# named checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

sequences = [
    "this is my token string for model input",
    "this is a test you will fail",
]

# Calling the tokenizer on the whole list requests padding, truncation and
# PyTorch tensors in one step; the result is unpacked as keyword arguments
# for the model call below.
tokens = tokenizer(
    sequences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

outputs = model(**tokens)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.5169, -2.1896],
        [ 2.8167, -2.4000]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
