In [2]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
sequence = "I've been waiting for HugginFace course my whole life!"

model_inputs = tokenizer(sequence)

model_inputs

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 8549, 11528, 12172, 2607, 2026, 2878, 2166, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
# Can also provide multiple inputs

sequences = [sequence, "What a time to be alive!"]

model_inputs = tokenizer(sequences)

model_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 8549, 11528, 12172, 2607, 2026, 2878, 2166, 999, 102], [101, 2054, 1037, 2051, 2000, 2022, 4142, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [7]:
# The padding behaviour can also be changed

# Pad to maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")

# Pad to model's max length
model_inputs = tokenizer(sequences, padding="max_length")

# Pad to specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)


In [8]:
# Can also truncate sequences

# Trunctate to maximum of 8 length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

### Special Tokens

In [9]:
sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)
print(model_inputs['input_ids'])

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]


In [10]:
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


The direct conversion also add start token and end tokens.

In [11]:
print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.


In [12]:
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)

In [13]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)