In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [2]:
# Import Autotokeinzer class
from transformers import AutoTokenizer

In [3]:
# Design our desired checkpoint, pass it into the method
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
# Print the object
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Tes the tokenizer with a simple string
tokenizer("Hello world")

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [6]:
# Show what the tokenizer does behind the scene
tokens = tokenizer.tokenize("hello world")
tokens

['hello', 'world']

In [7]:
# Convert the tokens to integer id's
ids = tokenizer.convert_tokens_to_ids(tokens)

In [8]:
# Check th id's
ids

[7592, 2088]

In [9]:
# Convert the id's back into tokens
tokenizer.convert_ids_to_tokens(ids)

['hello', 'world']

In [10]:
# Joins the tokens back into a string
tokenizer.decode(ids)

'hello world'

In [11]:
# Try the opposite function t decode wich is encode
ids = tokenizer.encode("hello world")
ids

[101, 7592, 2088, 102]

In [12]:
# Convert the id´s back into tokens (To invetigate why we got 4 id's where [7] gave two id's)
# [CLS] and [SEP] are special BERT-tokens
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', 'hello', 'world', '[SEP]']

In [13]:
# Test what happens if we try to decode the output of encode directly
tokenizer.decode(ids)

'[CLS] hello world [SEP]'

In [14]:
# Tokenize the string to get model-inputs that we can pass into ur model
model_inputs = tokenizer("hello world")
model_inputs

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [15]:
# Tokenize multiple scentences at the same time
data = [
    "I like cats.",
        "Do yu like cats too?",
]
tokenizer(data)

{'input_ids': [[101, 1045, 2066, 8870, 1012, 102], [101, 2079, 9805, 2066, 8870, 2205, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [17]:
# Import the class
from transformers import AutoModelForSequenceClassification

In [19]:
# Pass in the checkpoint, same as we did with the tokenizer
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
# Pass in the model-inputs into the model, we need a torch-tensor, not a list
outputs = model(**model_inputs)

AttributeError: ignored

In [21]:
# Fix the problem above with torch-tensors
model_inputs = tokenizer("hello world", return_tensors='pt')
model_inputs

{'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [22]:
# The default was to create a binary classifier
# Try to pass the input into the model again
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[0.2477, 0.1482]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [24]:
# Creat another model with 3 outputs instead of 2
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [25]:
# pass our inputs into the new model to see what we get
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.2600,  0.1092, -0.1388]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [26]:
# Investigate how we can retrive the logits
outputs.logits

tensor([[-0.2600,  0.1092, -0.1388]], grad_fn=<AddmmBackward0>)

In [30]:
# As above, accesink the key called logits
outputs['logits']

tensor([[-0.2600,  0.1092, -0.1388]], grad_fn=<AddmmBackward0>)

In [31]:
# As above
outputs[0]

tensor([[-0.2600,  0.1092, -0.1388]], grad_fn=<AddmmBackward0>)

In [29]:
# Convert to a numpy-array, can be handy if u want to compute metrics like accuracy
outputs.logits.detach().cpu().numpy()

array([[-0.26000762,  0.10924405, -0.13877088]], dtype=float32)

In [32]:
# Multiple strings at once
data = [
    "I like cats.",
        "Do yu like cats too?",
]
model_inputs = tokenizer(data, return_tensor='pt')
model_inputs

TypeError: ignored

In [33]:
# Fix above, by setting True all outputs will have the same lenght
model_inputs = tokenizer(
    data, padding=True, truncation=True, return_tensors='pt'
)
model_inputs

{'input_ids': tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 9805, 2066, 8870, 2205, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [34]:
# Check out the input id's, 0=padding tokens
model_inputs['input_ids']

tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 9805, 2066, 8870, 2205, 1029,  102]])

In [35]:
# Print out the attention mask, 0=padding tokens
model_inputs['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])

In [36]:
# Pass in the model inputs, just as we did before
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.5544, -0.0388,  0.1133],
        [-0.3741, -0.0320,  0.0497]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)