<a href="https://colab.research.google.com/github/MethEthPro/colab/blob/main/hugging_face/nlp/using_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **0 - USING TRANSFORMERS**

# 1 - PIPELINE-DEEP-DIVE

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

classifier(
    [
        "I hate this so much",
        "I was waiting for this match for my whole life."
    ]
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9995144605636597},
 {'label': 'POSITIVE', 'score': 0.8894093036651611}]

## 1 - Tokenization/Pre-Processing

the default checkpoint of the sentiment-analysis pipeline is distilbert-base-uncased-finetuned-sst-2-english

In [None]:
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
raw_inputs = [
        "I hate this so much",
        "I was waiting for this match for my whole life."
    ]

inputs = tokenizer(raw_inputs, padding = True, truncation = True, return_tensors='pt')
inputs

{'input_ids': tensor([[ 101, 1045, 5223, 2023, 2061, 2172,  102,    0,    0,    0,    0,    0,
            0],
        [ 101, 1045, 2001, 3403, 2005, 2023, 2674, 2005, 2026, 2878, 2166, 1012,
          102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## 2 - MODEL

In [None]:
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model= AutoModel.from_pretrained(checkpoint)

In [None]:
outputs = model(**inputs)
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.3088,  0.7332, -0.1861,  ..., -0.1305, -0.9360, -0.0433],
         [-0.3340,  0.9830, -0.0946,  ..., -0.3825, -0.6176,  0.2008],
         [-0.1687,  0.8781, -0.1117,  ..., -0.2380, -0.7790,  0.0935],
         ...,
         [-0.2724,  0.7307, -0.2367,  ..., -0.0929, -0.7367, -0.0444],
         [-0.2921,  0.7664, -0.2156,  ..., -0.1126, -0.7472, -0.0283],
         [-0.3022,  0.8105, -0.1694,  ..., -0.1124, -0.7401, -0.0543]],

        [[-0.2389,  0.2169,  0.2976,  ..., -0.1718,  0.2655, -0.0080],
         [ 0.2902,  0.7627, -0.1496,  ...,  0.0859,  0.2925,  0.0595],
         [ 0.1307,  0.5581, -0.2550,  ..., -0.0599,  0.1255,  0.0201],
         ...,
         [ 0.3770,  0.6464,  0.0875,  ..., -0.5511,  0.4718,  0.0644],
         [ 0.5557, -0.0492,  0.0693,  ...,  0.1239,  0.1460, -0.5485],
         [ 0.2917,  0.2442,  0.3041,  ...,  0.0910,  0.2270, -0.4516]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [None]:
outputs['last_hidden_state'].shape

torch.Size([2, 13, 768])

In [None]:
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model= AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
outputs = model(input_ids = inputs['input_ids'], attention_mask=inputs['attention_mask'])
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 4.2141, -3.4158],
        [-0.9930,  1.0917]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
outputs.logits.shape

torch.Size([2, 2])

In [None]:
outputs.logits
# this is our model output so we need to change this in post processing step

tensor([[ 4.2141, -3.4158],
        [-0.9930,  1.0917]], grad_fn=<AddmmBackward0>)

## 3 - Post Processing

In [None]:
import torch

In [None]:
predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
predictions

tensor([[9.9951e-01, 4.8549e-04],
        [1.1059e-01, 8.8941e-01]], grad_fn=<SoftmaxBackward0>)

In [None]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [None]:
# first sentence negative with 0.995 and positive with 0.005
# second sentence negative with 0.11 and postive with 0.89

# 2 - MODELS

## 1 - loading a transformers model

In [1]:
# we can use AutoModel , it will allow all checkpoints
# or we can go with specific models

In [8]:
from transformers import BertModel, BertConfig

# building the config
config = BertConfig(num_hidden_layers=1)

# building the model from the config
model = BertModel(config)

# model is randomly initialised

# now this model has the same architecture as the BertModel
# but the weights are random

In [9]:
print(config)

BertConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 1,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [None]:
# the benefit of loading a model in this way is that we can
# make changes to the architecture as we wish

In [5]:
# The model can be used in this state, but it will output gibberish; it needs to be trained first.
# or we can load a transformer model completely with its weights and architecture

In [6]:
from transformers import BertModel

model = BertModel.from_pretrained("Bert-base-cased")

# In the code sample above we didn’t use BertConfig,
# and instead loaded a pretrained model via the bert-base-cased identifier.
# This is a model checkpoint that was trained by the authors of BERT themselves;
# This model is now initialized with all the weights of the checkpoint.
# it is ready to use

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

## 2 - saving a model

In [7]:
model.save_pretrained("directory")
# this saves 2 files
# the architecture json file and the weights also

In [None]:
# so we can now pass in inputs to our model and do stuff
# but to pass inputs to our model they should be tensors
# and also toknizer should have done its work
# lets look into it

# 3 - Tokenizers

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "let's do tokenization!"

tokens = tokenizer.tokenize(text)

print(tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['let', "'", 's', 'do', 'token', '##ization', '!']


In [16]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(token_ids)

[2292, 1005, 1055, 2079, 19204, 3989, 999]


In [17]:
from transformers import AutoModel
import torch


model = AutoModel.from_pretrained("bert-base-uncased")
input_ids = torch.tensor([token_ids])

outputs = model(input_ids)

print(outputs.last_hidden_state.shape)

torch.Size([1, 7, 768])


## 1 - Subword - based tokenizers

In [None]:
# we have word tokenizer
# we have characters tokenizer
# both are shit , we need something in between
# we have sub word tokenizers for that


### 1- Byte Pair Encoding (BPE)

In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# as gpt2 uses bpe
print(tokenizer.tokenize("transformerization"))

['trans', 'former', 'ization']


### 2 - WordPiece Tokenization

In [23]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# as bert uses wordpiece sub word tokenization

print(tokenizer.tokenize("transformerization"))

['transform', '##eri', '##zation']


### 3 - Unigram or SentecePiece

In [26]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

print(tokenizer.tokenize("transformerization"))

['▁transformer', 'ization']


## 2 - loading and saving

In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 2478, 1037, 10938, 2121, 2897, 2003, 3722, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
tokenizer.save_pretrained("folder")

('folder/tokenizer_config.json',
 'folder/special_tokens_map.json',
 'folder/vocab.txt',
 'folder/added_tokens.json',
 'folder/tokenizer.json')

## 3 - Encoding

In [29]:
# basically it involves the steps to convert text to ids


In [30]:
my_sentences = ["I’ve been waiting for a HuggingFace course my whole life.",
                "I hate this so much!"]

In [39]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokens = [tokenizer.tokenize(sentence) for sentence in my_sentences]

print(tokens)

[['i', '’', 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.'], ['i', 'hate', 'this', 'so', 'much', '!']]


In [41]:
input_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]

input_ids

[[1045,
  1521,
  2310,
  2042,
  3403,
  2005,
  1037,
  17662,
  12172,
  2607,
  2026,
  2878,
  2166,
  1012],
 [1045, 5223, 2023, 2061, 2172, 999]]

In [42]:
final_inputs = tokenizer.prepare_for_model(input_ids)

final_inputs

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, [1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 2023, 2061, 2172, 999], 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

## 4 - Decoding

In [46]:
# it basically means to go from numbers to words

decoded_sentences = [tokenizer.decode(final_input) for final_input in final_inputs['input_ids']]

decoded_sentences

['[CLS]',
 'i ’ ve been waiting for a huggingface course my whole life.',
 'i hate this so much!',
 '[SEP]']

# 4 - Handling multiple sequences

## 1 - Batch of Inputs

In [52]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sentence)

ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor(ids)
print(input_ids)

output = model(input_ids)
# this fails as the dimension is incorrect

tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
         2026,  2878,  2166,  1012])


IndexError: too many indices for tensor of dimension 1

In [54]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sentence)

ids = tokenizer.convert_tokens_to_ids(tokens)

# notice the subtle change in this line
input_ids = torch.tensor([ids])

output = model(input_ids)

print(output.logits)

tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [55]:
print(input_ids)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])


In [None]:
# Batching is the act of sending multiple sentences through the model,
#  all at once. If you only have one sentence, you can just build a batch with a single sequence:

In [56]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# using,  batched_ids = [ids, ids]
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sentence)

ids = tokenizer.convert_tokens_to_ids(tokens)

batched_ids = [ids,ids]

# notice the subtle change in this line
input_ids = torch.tensor(batched_ids)

output = model(input_ids)

print(output.logits)

tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [None]:
# notice we get the same result but twice just as expected


In [None]:
# now the issue here arises when the 2 sequences are of different lengths

## 2 - Padding and Attention Masks

In [None]:
# we need our ids to be rectangular in shape to pass into tensors
# so we use padding to make the sequences of same length
# and we use attention masks
# Attention masks are tensors with the exact same shape as the input IDs tensor,
# filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to,
# and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model).


In [57]:
batched_ids = [[200,200,200],
               [200,200]]
# now this cant be converted to a tensor as
# it is not in a rectangular shape


In [58]:
tokenizer.pad_token_id
# that means it pads with the value 0

0

In [59]:
padding_id = 100
batched_ids = [
    [200,200,200],
    [200,200,padding_id]
]

# ṇow this is rectangular in shape and thus can be turned into a tensor

In [60]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]


print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [61]:
# we see that we dont get the same results
# when using padding and not using padding
# this is because our model is paying to the padded value 0 also
# which we dont want
# we tackle this with the help of attention layers

attention_mask = [
    [1,1,1],
    [1,1,0]
]


print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids), attention_mask = torch.tensor(attention_mask)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [None]:
# we got the same results

In [62]:
# assignment ig

my_sentences

['I’ve been waiting for a HuggingFace course my whole life.',
 'I hate this so much!']

In [111]:
tokens = [tokenizer.tokenize(sentence) for sentence in my_sentences]

tokens

[['i',
  '’',
  've',
  'been',
  'waiting',
  'for',
  'a',
  'hugging',
  '##face',
  'course',
  'my',
  'whole',
  'life',
  '.'],
 ['i', 'hate', 'this', 'so', 'much', '!']]

In [112]:
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
final_ids = [tokenizer.prepare_for_model(id) for id in ids]
final_ids

[{'input_ids': [101, 1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [101, 1045, 5223, 2023, 2061, 2172, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}]

In [113]:
final_ids = [item.input_ids for item in final_ids]
final_ids

[[101,
  1045,
  1521,
  2310,
  2042,
  3403,
  2005,
  1037,
  17662,
  12172,
  2607,
  2026,
  2878,
  2166,
  1012,
  102],
 [101, 1045, 5223, 2023, 2061, 2172, 999, 102]]

In [115]:

print(len(final_ids[0]),len(final_ids[1]))



16 8


In [117]:
for i in range(8):
  final_ids[1].append(tokenizer.pad_token_id)

In [118]:
final_ids

[[101,
  1045,
  1521,
  2310,
  2042,
  3403,
  2005,
  1037,
  17662,
  12172,
  2607,
  2026,
  2878,
  2166,
  1012,
  102],
 [101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0]]

In [119]:
# so we have implemented padding now just attention mask is left

input_ids = torch.tensor(final_ids)

attention_mask = [[1 if item!=0 else 0 for item in id] for id in input_ids]
attention_mask

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]

In [120]:
output = model(input_ids, attention_mask = torch.tensor(attention_mask))
output.logits

tensor([[-1.5979,  1.6390],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)

In [106]:
# going directly
ids = tokenizer(my_sentences, padding = True, truncation = True, return_tensors = "pt")
ids

{'input_ids': tensor([[  101,  1045,  1521,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [81]:
outputs = model(**ids)
outputs.logits

tensor([[-1.5979,  1.6390],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)

In [None]:
# we got the same answer from both methods

# 5 - CONCLUSION

up till now we implemented everything from scratch now we see how all of this is done by api

In [122]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

inputs = tokenizer(my_sentences, padding=True, truncation=True, return_tensors = "pt")
output = model(**inputs)
output.logits

tensor([[-1.5979,  1.6390],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)