# Models & Tokenizers in HuggingFace

In [1]:
from transformers import AutoTokenizer

In [3]:
# Name of the pretrained checkpoint; reused later in the notebook to load the model.
checkpoint = 'bert-base-uncased'
# Load the tokenizer that belongs to this checkpoint (the output below shows
# its config and vocabulary files being downloaded).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

(…)cased/resolve/main/tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 10.8kB/s]
(…)rt-base-uncased/resolve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 568kB/s]
(…)bert-base-uncased/resolve/main/vocab.txt: 100%|██████████| 232k/232k [00:01<00:00, 135kB/s]
(…)base-uncased/resolve/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 8.24MB/s]


In [4]:
# Show the tokenizer's repr: concrete class, vocab size, max length, special tokens.
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [5]:
# Step 1: split the raw text into token strings (no ids, no special tokens yet).
tokens = tokenizer.tokenize("hello world")
tokens

['hello', 'world']

In [6]:
# Step 2: look up the integer vocabulary id of each token string.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[7592, 2088]

In [7]:
# Round-trip check: ids -> token strings, and ids -> a single decoded string.
print(
    tokenizer.convert_ids_to_tokens(ids),
    tokenizer.decode(ids),
    sep="\n",
)

['hello', 'world']
hello world


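Now with the `encode` method, which tokenizes, converts to ids, and adds the special `[CLS]` / `[SEP]` tokens in a single call: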

In [8]:
# encode() goes straight from text to ids.
ids = tokenizer.encode("hello world")
# Print the ids, their token strings, and the decoded text, one per line.
print(
    ids,
    tokenizer.convert_ids_to_tokens(ids),
    tokenizer.decode(ids),
    sep="\n",
)

[101, 7592, 2088, 102]
['[CLS]', 'hello', 'world', '[SEP]']
[CLS] hello world [SEP]


In [11]:
# Calling the tokenizer directly returns a dict-like object holding input_ids
# together with token_type_ids and attention_mask.
model_inputs = tokenizer('hello world')
# Show the full encoding, then the text decoded back from its input_ids.
print(
    model_inputs,
    tokenizer.decode(model_inputs['input_ids']),
    sep="\n",
)

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
[CLS] hello world [SEP]


### Let's try using results of tokenizers on a model now

When the data contains multiple sentences of different lengths, pass `padding=True` (and `truncation=True` to cut over-long inputs) so that every row has the same length and the batch forms a valid PyTorch tensor. The model also needs torch tensors as input.

So you also need the optional parameter `return_tensors='pt'`.

In [16]:
# Two sentences of different token length, to show padding in action.
data = ['I like cats.', 'Do you like cats?']
# Pad to a common length and return PyTorch tensors.
model_inputs = tokenizer(
    data,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
# The attention mask is 0 at the position where the first row was padded.
print(
    model_inputs['input_ids'],
    model_inputs['attention_mask'],
    sep="\n",
)

tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0],
        [ 101, 2079, 2017, 2066, 8870, 1029,  102]])
tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]])


In [25]:
from transformers import AutoModelForSequenceClassification
import numpy as np

In [18]:
# The default is a binary classifier (2 classes); num_labels sets a different
# number of output classes. As the warning below says, the classifier weights
# are newly initialized, so the model must be trained before its predictions
# can be used.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Forward pass: unpack the tokenizer output into the model's keyword arguments.
outputs = model(**model_inputs)
print(outputs)
# Logits as a torch tensor (it still carries a grad_fn, i.e. is attached to autograd).
print(outputs.logits)
# Logits as a NumPy array: detach from autograd and move to CPU, converting once.
logits_np = outputs.logits.detach().cpu().numpy()
print(logits_np)
# Predicted label for each input = index of the largest logit along the class axis.
# A single vectorized argmax replaces the per-row Python loop, and .tolist()
# yields plain Python ints so the printed list looks like [2, 0] on every NumPy
# version (NumPy >= 2 would otherwise print np.int64(...) for each element).
print(np.argmax(logits_np, axis=-1).tolist())

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.3248, -0.0339,  0.3493],
        [ 0.3340, -0.0370,  0.3167]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[ 0.3248, -0.0339,  0.3493],
        [ 0.3340, -0.0370,  0.3167]], grad_fn=<AddmmBackward0>)
[[ 0.32482412 -0.03393699  0.3493161 ]
 [ 0.33399528 -0.03701604  0.31672823]]
[2, 0]
