-----------
**Author**: Gunnvant

**Description**: Examines the Models and Tokenizers API in more detail.

-----------


### Pre-processing with tokenizers

Use the `dbmdz/bert-large-cased-finetuned-conll03-english` checkpoint and have a look at the tokenizer and model architecture

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModel

In [2]:
# Checkpoint id used for the first part of the notebook. Its config (printed when the
# model is loaded) lists BertForTokenClassification with 9 entity labels (O, B-/I-MISC, PER, ORG, LOC).
ckpt = "dbmdz/bert-large-cased-finetuned-conll03-english"

In [4]:
# Tokenizer and model are both resolved from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(ckpt)
# AutoModel gives the bare encoder (it is displayed as BertModel further down), i.e. without
# the BertForTokenClassification head named under "architectures" in the checkpoint config.
model = AutoModel.from_pretrained(ckpt)

loading configuration file config.json from cache at /Users/gunnvantsaini/.cache/huggingface/hub/models--dbmdz--bert-large-cased-finetuned-conll03-english/snapshots/f2482bf01f5da0f0eb8e183ffd8cc3885aa90b14/config.json
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-large-cased-finetuned-conll03-english",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "O",
    "1": "B-MISC",
    "2": "I-MISC",
    "3": "B-PER",
    "4": "I-PER",
    "5": "B-ORG",
    "6": "I-ORG",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 1,
    "B-ORG": 5,
    "B-PER": 3,
    "I-LOC": 8,
    "I-MISC": 2,
    "I-ORG": 6,
    "I-PER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "ma

In [5]:
# Two sentences of different lengths, so the batch needs padding.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
# padding=True pads the shorter sentence (see the trailing 0 ids in the printed output);
# return_tensors="pt" returns tensors instead of Python lists.
# truncation=True presumably caps inputs at the model's maximum length -- not exercised here.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,   146,   112,  1396,  1151,  2613,  1111,   170, 20164, 10932,
          2271,  7954,  1736,  1139,  2006,  1297,   119,   102],
        [  101,   146,  4819,  1142,  1177,  1277,   106,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


**`attention_mask`** tells the self-attention blocks which tokens to attend to. When the sentences in a batch have different lengths, the shorter sentence is padded to the length of the longest one; its padding positions get an `attention_mask` value of zero (real tokens get one), so they are ignored.

**`token_type_ids`** indicate which sentence each token belongs to. They matter mainly for tasks whose inputs are pairs of sentences, e.g. entailment.

In [6]:
# Unpack the encoding so that input_ids, token_type_ids and attention_mask
# are passed to the model as keyword arguments.
outputs = model(**inputs)

In [9]:
outputs.last_hidden_state.shape ## (batch, seq_len, hidden_size): one 1024-dim vector per token

torch.Size([2, 18, 1024])

In [10]:
# Display the module tree: embeddings followed by an encoder of 24 identical BertLayer blocks.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [11]:
from transformers import AutoModelForTokenClassification

In [13]:
# Same checkpoint, this time loaded together with its token-classification head.
model1 = AutoModelForTokenClassification.from_pretrained(ckpt)

loading configuration file config.json from cache at /Users/gunnvantsaini/.cache/huggingface/hub/models--dbmdz--bert-large-cased-finetuned-conll03-english/snapshots/f2482bf01f5da0f0eb8e183ffd8cc3885aa90b14/config.json
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-large-cased-finetuned-conll03-english",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "O",
    "1": "B-MISC",
    "2": "I-MISC",
    "3": "B-PER",
    "4": "I-PER",
    "5": "B-ORG",
    "6": "I-ORG",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 1,
    "B-ORG": 5,
    "B-PER": 3,
    "I-LOC": 8,
    "I-MISC": 2,
    "I-ORG": 6,
    "I-PER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "ma

In [14]:
# Display the module tree: the base BertModel now sits under `bert` inside BertForTokenClassification.
model1

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), 

In [22]:
# Reuses the tokenized two-sentence batch from above.
# NOTE: this rebinds `outputs`, which previously held the base model's output.
outputs = model1(**inputs)

In [23]:
outputs.logits.shape ## (batch, seq_len, num_classes): 9 scores per token, one per entity label

torch.Size([2, 18, 9])

Next, look at the `distilbert-base-cased-distilled-squad` question-answering model.

In [24]:
# NOTE: `ckpt` and `tokenizer` are rebound here. From this cell on they refer to the
# DistilBERT SQuAD checkpoint, not the BERT token-classification checkpoint used above,
# so re-running earlier cells after this one would pick up the wrong tokenizer.
ckpt = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(ckpt)

loading configuration file config.json from cache at /Users/gunnvantsaini/.cache/huggingface/hub/models--distilbert-base-cased-distilled-squad/snapshots/50ba811384f02cb99cdabe5cdc02f7ddc4f69e10/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased-distilled-squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "transformers_version": "4.32.1",
  "vocab_size": 28996
}

loading file vocab.txt from cache at /Users/gunnvantsaini/.cache/huggingface/hub/models--distilbert-base-cased-distilled-squad/snapshots/50ba811384f02cb99cdabe5cdc02f7ddc4f69e10/vocab.txt
loadin

In [25]:
# Question / context pair for extractive question answering.
# NOTE(review): "vechicle" is a typo for "vehicle". It is left untouched because the token ids
# and answer indices recorded in the outputs below were produced with this exact string.
q,ctx = "Which launch vechicle was used to launch chandrayaan 3?", "Chandrayaan 3 was launched using LVM3 also known as GSLV MK3"

In [26]:
# Passing two texts encodes question and context together as a single sequence.
# NOTE: this rebinds `inputs`, which previously held the two-sentence batch.
inputs = tokenizer(q, ctx, return_tensors="pt")

In [27]:
# A single row; unlike the BERT tokenizer output earlier, there is no `token_type_ids` entry.
inputs

{'input_ids': tensor([[  101,  5979,  4286,  1396,  4313, 10536,  1108,  1215,  1106,  4286,
         22572, 19799,  2315,  1389,   124,   136,   102, 17595,  2315,  1389,
           124,  1108,  2536,  1606,   149,  2559,  2107,  1495,  1145,  1227,
          1112,   144, 13726,  2559,   150,  2428,  1495,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [28]:
from transformers import AutoModelForQuestionAnswering

In [30]:
# Load the checkpoint with its question-answering head
# (DistilBertForQuestionAnswering, as listed under "architectures" in the config).
m2 = AutoModelForQuestionAnswering.from_pretrained(ckpt)

loading configuration file config.json from cache at /Users/gunnvantsaini/.cache/huggingface/hub/models--distilbert-base-cased-distilled-squad/snapshots/50ba811384f02cb99cdabe5cdc02f7ddc4f69e10/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased-distilled-squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "transformers_version": "4.32.1",
  "vocab_size": 28996
}

loading weights file model.safetensors from cache at /Users/gunnvantsaini/.cache/huggingface/hub/models--distilbert-base-cased-distilled-squad/snapshots/50ba811384f02cb99cdabe5cdc02f7ddc4f69e10/

In [31]:
# Display the module tree: a DistilBertModel with 6 transformer blocks wrapped by the QA model.
m2

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [32]:
## The last layer outputs two numbers per token: a start logit and an end logit.
## The positions with the highest start / end logit mark the first and last token of the answer.

In [33]:
# Forward pass through the QA model; `start_logits` and `end_logits` are read from the result below.
# Note the name is `output` (singular), distinct from the `outputs` used for the BERT models.
output = m2(**inputs)

In [36]:
# Position of the highest start logit. With a single example in the batch this is the
# index of the token where the predicted answer begins.
output.start_logits.argmax()

tensor(24)

In [37]:
# Position of the highest end logit: the index of the last token of the predicted answer.
output.end_logits.argmax()

tensor(27)

In [38]:
# Show the token ids again so that positions 24 and 27 found above can be looked up.
inputs['input_ids']

tensor([[  101,  5979,  4286,  1396,  4313, 10536,  1108,  1215,  1106,  4286,
         22572, 19799,  2315,  1389,   124,   136,   102, 17595,  2315,  1389,
           124,  1108,  2536,  1606,   149,  2559,  2107,  1495,  1145,  1227,
          1112,   144, 13726,  2559,   150,  2428,  1495,   102]])

In [46]:
# Token at the predicted start position (24).
tokenizer.convert_ids_to_tokens(inputs['input_ids'].tolist()[0])[24]

'L'

In [48]:
# Token at the predicted end position (27).
tokenizer.convert_ids_to_tokens(inputs['input_ids'].tolist()[0])[27]

'##3'

In [53]:
import numpy as np  # no longer needed by this cell; kept in case other cells rely on `np`

# Tokens of the predicted answer span. The bounds are taken from the model output instead of
# being copied by hand from the cells above (24 and 27), so the cell still works if the
# question or context changes. The end index is inclusive, hence the +1.
# A plain list slice replaces the earlier list -> numpy array -> list round-trip.
answer_start = int(output.start_logits.argmax())
answer_end = int(output.end_logits.argmax())
tokenizer.convert_ids_to_tokens(inputs['input_ids'].tolist()[0])[answer_start:answer_end + 1]

['L', '##V', '##M', '##3']

In [54]:
# A plain "".join() keeps the "##" continuation markers and yields 'L##V##M##3'.
# The tokenizer knows how to merge word pieces back into readable text.
tokenizer.convert_tokens_to_string(['L', '##V', '##M', '##3'])

'LVM3'

**Summarization Model `sshleifer/distilbart-cnn-12-6`**

In [55]:
from transformers import BartForConditionalGeneration
# Load the summarization checkpoint through the concrete BART class rather than an Auto* class.
m3 = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')

Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████| 1.80k/1.80k [00:00<00:00, 2.72MB/s]
loading configuration file config.json from cache at /Users/gunnvantsaini/.cache/huggingface/hub/models--sshleifer--distilbart-cnn-12-6/snapshots/a4f8f3ea906ed274767e9906dbaede7531d660ff/config.json
Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra

In [56]:
# Display the module tree: a shared 50264-entry embedding and an encoder of 12 BartEncoderLayer blocks.
m3

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [None]:
### The last layer has out_features = vocab size, i.e. a logit is computed for every word in
### the vocabulary, and the summary is generated from those logits.
### TODO (open question): how does decoding actually happen?