In [None]:
from IPython.display import clear_output

# Content

In this notebook, we will take a look at the `transformers` library by Hugging Face.

In [None]:
%pip install transformers
%pip install torch

clear_output()

In [None]:
import transformers

import torch
import torch.nn.functional as F

from pprint import pprint

In [None]:
# Pick the compute backend in order of preference:
# Apple-silicon MPS first, then CUDA, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

## The Pipeline function

Pipeline: a convenient way of running the input through the model.

Input -> Pre-processing -> Run through model -> post-processing

Steps: specify the task, the model (optional) and the tokenizer (optional, for NLP tasks), then run your data through it.

In [None]:
from transformers import pipeline

### Let's try an example with sentiment analysis

In [None]:
# Sample sentences to classify.
sample_sentiment_texts = [
    "I don't like like the food today",
    "I love travelling and adventuring",
    "I am unaware of any such developments",
]

# This checkpoint is already the default model for sentiment analysis; it is named explicitly here.
model_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'

# Build the pipeline on the selected device and classify all sentences in one call.
classifier = pipeline(
    'sentiment-analysis',
    model=model_checkpoint,
    device=device,
)
classifier(sample_sentiment_texts)

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'NEGATIVE', 'score': 0.8080077767372131},
 {'label': 'POSITIVE', 'score': 0.9996951818466187},
 {'label': 'NEGATIVE', 'score': 0.9752130508422852}]

In [None]:
# When neither a model nor a tokenizer is given, the pipeline falls back to a
# pre-set default for the requested task.
pipeline(task='image-classification')

No model was supplied, defaulted to google/vit-base-patch16-224 and revision 5dca96d (https://huggingface.co/google/vit-base-patch16-224).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

<transformers.pipelines.image_classification.ImageClassificationPipeline at 0x782386d79690>

In [None]:
# Example: a sentiment model that also accommodates a neutral label.
twitter_classifier = pipeline(
    task='sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment-latest',
    device=device,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Classify the sample sentences with the Twitter model, which also has a neutral label.
twitter_classifier(sample_sentiment_texts)

[{'label': 'negative', 'score': 0.9171754717826843},
 {'label': 'positive', 'score': 0.96360182762146},
 {'label': 'neutral', 'score': 0.7371393442153931}]

## Doing it Manually

Instead of using the `pipeline` function, we can also build the whole pipeline ourselves, so that we can change individual steps in it.

In [None]:
from transformers import AutoModel  # AutoModel initializes the model without last layer (or last few layers).

In [None]:
# Checkpoint used for the manual (non-pipeline) walkthrough below.
model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [None]:
# Dummy batch of token ids: one sequence of 10 ids drawn from [1, 200).
# A locally seeded generator keeps the input reproducible across runs without
# touching the global RNG state.
test_data = torch.randint(1, 200, (1, 10), generator=torch.Generator().manual_seed(0))

auto_model = AutoModel.from_pretrained(model_checkpoint)

# Inference only: skip autograd bookkeeping for this forward pass.
with torch.no_grad():
    auto_outputs = auto_model(test_data)

print(type(auto_outputs))
print(dir(auto_outputs))
# The (batch, sequence, hidden) shape shows the output is an intermediate hidden
# state (the model without its final layers), not class logits.
auto_outputs.last_hidden_state.shape

<class 'transformers.modeling_outputs.BaseModelOutput'>
['__annotations__', '__class__', '__class_getitem__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__or__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'attentions', 'clear', 'copy', 'fromkeys', 'get', 'hidden_states', 'items', 'keys', 'last_hidden_state', 'move_to_end', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']


torch.Size([1, 10, 768])

In [None]:
from transformers import AutoModelForSequenceClassification  # General class.
# from transformers import BertForSequenceClassification
# from transformers import GPT2ForSequenceClassification

from transformers import AutoTokenizer
# from transformers import BertTokenizer

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# checkpoint so the two always match.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
classifier_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
# Reduced sample set for the manual walkthrough; the disabled entries can be
# re-enabled to experiment (the long one is meant to show how padding behaves).
sample_sentiment_texts = [
    # "I don't like like the food today",
    "I love travelling and adventuring",
    "I am unaware of any such developments",
    # "This sentence is supposed to be intentionally extra long so we can see how padding works in the tokenizer"
]

In [None]:
# Step 1: split the raw text into tokens.
o1 = tokenizer.tokenize('Some random text here. I like to tokenize')
print(f'{o1=}')
# Step 2: map each token to its vocabulary id.
o2 = tokenizer.convert_tokens_to_ids(o1)
print(f'{o2=}')
# Step 3: final preparation before the ids are sent to the model. In this case it
# adds the ids for the [CLS] and [SEP] tokens.
o3 = tokenizer.prepare_for_model(o2)
print(f'{o3=}')

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


o1=['some', 'random', 'text', 'here', '.', 'i', 'like', 'to', 'token', '##ize']
o2=[2070, 6721, 3793, 2182, 1012, 1045, 2066, 2000, 19204, 4697]
o3={'input_ids': [101, 2070, 6721, 3793, 2182, 1012, 1045, 2066, 2000, 19204, 4697, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Decode the bare token ids back to text.
print(tokenizer.decode(o2))
# Decode the prepared ids, which also include the special tokens.
print(tokenizer.decode(o3['input_ids']))

some random text here. i like to tokenize
[CLS] some random text here. i like to tokenize [SEP]


In [None]:
# Alternatively, instead of running the steps above separately, call the tokenizer
# itself (its __call__ method) and it takes care of all of them.
# Padding extends the shorter inputs so every id list has the same length as the
# longest one; this rectangular shape is needed to convert the result to tensors/arrays.

tokenizer_results = tokenizer(
    sample_sentiment_texts,
    padding=True,
    return_tensors='pt',  # 'pt' can be changed to 'tf'
)

In [None]:
# The attention mask is passed to the transformer model. It holds 1 for "pay attention
# to this token" and 0 otherwise, and is used to tell the attention layers to avoid
# paying any attention to padding tokens.

print(tokenizer_results)

{'input_ids': tensor([[  101,  1045,  2293,  8932,  1998, 13896, 12228,   102,     0],
        [  101,  1045,  2572, 11499,  1997,  2151,  2107,  8973,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Forward pass: hand the tokenizer's two tensors to the classifier by name.
model_outputs = classifier_model(
    input_ids=tokenizer_results['input_ids'],
    attention_mask=tokenizer_results['attention_mask'],
)

Hugging Face models output logits.

In [None]:
# Display the raw output object returned by the forward pass.
model_outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-3.9294,  4.1659],
        [ 2.0654, -1.6070]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Softmax over the class dimension (dim=1) turns each row of logits into probabilities.
probs = F.softmax(model_outputs.logits, dim=1)

# argmax then picks the most probable class id for each input sentence.
output = probs.argmax(dim=1)

# Display the predicted class ids so the 0/1 values are actually visible in the notebook.
output

Q: But what do the 1's and 0's represent? Are there any other possible labels (classes) that have not shown up in the output? How do we find this out?

A: Look into model config file

In [None]:
# id2label in the model config maps each class id to its label name.
classifier_model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

Which brings us to model config files. This is how it looks

In [None]:
# Display the full configuration object attached to the loaded model.
classifier_model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.33.3",
  "vocab_size": 30522
}

A model has 2 parts:

1. the weights file (kind of expected...)
2. The **config** file. This contains info about the model architecture, the parameters (number of hidden layers, activation, dropout probabilities), information about the output etc etc.

**This config file is used for things like initializing model, in the inference api, etc**

We can change the config to initialize the model with different parameters. This can be done by creating a config object from the transformers library and initializing the model through it, or by passing keyword arguments when initializing the model, like this:

In [None]:
# Reload the same checkpoint while overriding two config fields through keyword
# arguments, then display the resulting config.
AutoModelForSequenceClassification.from_pretrained(
    classifier_model.config._name_or_path,
    activation='relu',
    attention_dropout=0.5,
).config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "relu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.5,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.33.3",
  "vocab_size": 30522
}

The config file is updated according to kwargs (activation and attention_dropout fields). These changes are also applied to the model

## Sentence pair tokenizers

For tasks that involve comparing a pair of sentences, just pass both sentences to the tokenizer.

In addition to the previously returned fields, the result also contains `token_type_ids`, which keeps track of which sentence each token came from.

In [None]:
# Tokenizer used for the sentence-pair example.
pair_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [None]:
# Encode two sentences together as a single pair.
pair_tokenizer(
    text='this is sentence 1',
    text_pair='and this is sentence 2',
)

{'input_ids': [101, 2023, 2003, 6251, 1015, 102, 1998, 2023, 2003, 6251, 1016, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}