<a href="https://colab.research.google.com/github/Muntasir1808/NLP-Transformers/blob/main/Fine-tuning%20/Models_and_Tokenizer_in_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoTokenizer

In [None]:
# Name of the pretrained checkpoint; the same name is reused below when loading the model.
checkpoint = "bert-base-uncased"
# AutoTokenizer picks the tokenizer class that matches the checkpoint (a BertTokenizerFast here).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Inspect the tokenizer: its repr lists the vocab size, max length, and special tokens.
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Calling the tokenizer directly returns input_ids, token_type_ids, and attention_mask.
tokenizer("Hello world")

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [None]:
# Step 1: split the text into (lowercased) tokens; no special tokens are added here.
tokens = tokenizer.tokenize("Hello world")
tokens

['hello', 'world']

In [None]:
# Step 2: map each token to its integer id in the vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[7592, 2088]

In [None]:
# Reverse mapping: turn the ids back into their tokens.
tokenizer.convert_ids_to_tokens(ids)

['hello', 'world']

In [None]:
# decode() turns the ids back into a single string.
tokenizer.decode(ids)

'hello world'

In [None]:
# encode() tokenizes and converts to ids in one call. The output has two extra ids
# because it also adds the special tokens [CLS] (101) and [SEP] (102).
ids = tokenizer.encode("Hello world")
ids

[101, 7592, 2088, 102]

In [None]:
# The first and last ids map back to the special tokens [CLS] and [SEP].
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', 'hello', 'world', '[SEP]']

In [None]:
tokenizer.decode(ids) # this is the actual input to the BERT model: [CLS] and [SEP] are included even though we only passed "Hello world"

'[CLS] hello world [SEP]'

In [None]:
# Keep the full tokenizer output (input_ids, token_type_ids, attention_mask) to feed to the model later.
# At this point the values are plain Python lists.
model_inputs = tokenizer("Hello world")
model_inputs

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [None]:
# Tokenizing multiple sentences at the same time.
data = [
    "I like cats.",
    "Do you like cats too?"
]

tokenizer(data)   # works here, but the two id lists have different lengths (6 vs 8), which causes an error in later steps

{'input_ids': [[101, 1045, 2066, 8870, 1012, 102], [101, 2079, 2017, 2066, 8870, 2205, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the checkpoint with a sequence-classification head on top. The head's weights
# (classifier.weight / classifier.bias) are not in the checkpoint and are newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The model does not accept just any kind of input: the default Hugging Face models
# are PyTorch models and expect torch tensors. `model_inputs` currently holds plain
# Python lists, so this call fails with an AttributeError.
# The failure is the point of this cell, so the error is caught and printed rather
# than left to abort a "Restart & Run All".
try:
    outputs = model(**model_inputs)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: ignored

In [None]:
# Create model_inputs again, this time asking the tokenizer for PyTorch tensors (return_tensors='pt').
# NOTE: this works for a single string, but fails for multiple strings of different lengths.
model_inputs = tokenizer("Hello world", return_tensors='pt')
model_inputs

{'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [None]:
# The result is a SequenceClassifierOutput object that contains the logits produced by the model.
# These logits are meaningless for now, because the top (classification) layers have not been trained yet.
# Note that, by default, the library assumed we wanted a binary classifier (two logits).
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0250,  0.3686]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Create another model, this time with three outputs instead of two,
# by specifying the num_labels argument.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run the same inputs through the three-label model; the logits now have three entries.
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0089,  0.0841, -0.9370]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# The logits can be read as an attribute of the output object ...
outputs.logits

tensor([[-0.0089,  0.0841, -0.9370]], grad_fn=<AddmmBackward0>)

In [None]:
# ... by key, like a dictionary ...
outputs['logits']

tensor([[-0.0089,  0.0841, -0.9370]], grad_fn=<AddmmBackward0>)

In [None]:
# ... or by position; index 0 gives the same logits here (loss is None for this output).
outputs[0]

tensor([[-0.0089,  0.0841, -0.9370]], grad_fn=<AddmmBackward0>)

In [None]:
# Convert the logits into a NumPy array, e.g. for computing metrics like accuracy, AUC, or F1 score.
# The logits carry a grad_fn, so they are detached from the autograd graph before conversion.
outputs.logits.detach().cpu().numpy()

array([[-0.00892988,  0.08406292, -0.93704414]], dtype=float32)

In [None]:
# Processing multiple strings at the same time.
# The two sentences tokenize to different lengths, so they cannot be stacked into a
# single rectangular tensor, and the tokenizer raises a ValueError.
# The failure is the point of this cell, so the error is caught and printed rather
# than left to abort a "Restart & Run All".
data = [
    "I like cats.",
    "Do you like cats too?"
]
try:
    model_inputs = tokenizer(data, return_tensors='pt')
except ValueError as err:
    print(f"{type(err).__name__}: {err}")
else:
    print(model_inputs)

ValueError: ignored

In [None]:
# Resolve the error with the padding and truncation arguments, so that every
# sentence in the batch ends up with the same length and fits into one tensor.
model_inputs = tokenizer(
    data,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
model_inputs

{'input_ids': tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
model_inputs['input_ids']  # both rows now have the same length; the shorter one is padded with 0, the [PAD] id

tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]])

In [None]:
# In the attention mask, 1 marks real tokens and 0 marks padding tokens.
# Passing it to the model keeps the padding tokens out of the computation of the output,
# since we don't want them to influence our prediction.
model_inputs['attention_mask']


tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])

In [None]:
# Run the padded batch through the model: one row of logits per input sentence.
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0448, -0.0067, -0.9420],
        [-0.0327,  0.0154, -0.9409]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)