# 2.2 Scaled Dot Product Attention

In [1]:
from transformers import BertModel

In [2]:
# Load the pretrained `bert-base-uncased` checkpoint; the config and weights are
# fetched on first use (see the download progress bars in the output below).
model = BertModel.from_pretrained('bert-base-uncased')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# Count the encoder layers stacked inside the loaded model.
len(model.encoder.layer)

12

In [4]:
# Display the module tree of the first encoder layer
# (attention block, intermediate dense layer, output dense layer).
model.encoder.layer[0]

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [5]:
# Zoom in on the attention sub-module of the first encoder layer:
# the query/key/value projections plus the output projection.
model.encoder.layer[0].attention

BertAttention(
  (self): BertSelfAttention(
    (query): Linear(in_features=768, out_features=768, bias=True)
    (key): Linear(in_features=768, out_features=768, bias=True)
    (value): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (output): BertSelfOutput(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

# 2.3 Multi-Headed Attention

In [7]:
from transformers import BertModel, BertTokenizer
import torch
import pandas as pd

In [8]:
# Load the tokenizer and the model from the same checkpoint name so the
# token ids produced by the tokenizer match the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
text = "My friend told me about this class and I love it so far! She was right."

# Token ids for the sentence, including the special [CLS] / [SEP] markers.
tokens = tokenizer.encode(text)
# Nest the id list one level deeper to add the batch dimension: shape (20,) -> (1, 20).
inputs = torch.tensor([tokens])
inputs

tensor([[ 101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009,
         2061, 2521,  999, 2016, 2001, 2157, 1012,  102]])

In [10]:
# Run a forward pass and keep the per-layer attention weights.
# - torch.no_grad(): this is inference only, so skip building the autograd graph.
# - .attentions: read the field by name; the positional index [2] silently points
#   at a different output if other optional outputs (e.g. output_hidden_states=True)
#   are requested.
with torch.no_grad():
    attention = model(inputs, output_attentions=True).attentions

In [11]:
# Take the last encoder layer's attention, average over dim 1 (the head axis of the
# (batch, heads, seq, seq) tensor), then keep the single sequence in the batch.
final_attention = torch.mean(attention[-1], dim=1)[0]

In [13]:
# Token strings used to label both axes of the attention matrix.
token_labels = tokenizer.convert_ids_to_tokens(tokens)

# Convert the tensor to a float64 NumPy array up front instead of casting cell by
# cell with DataFrame.applymap, which is deprecated since pandas 2.1.
# .cpu() keeps this working if the model was moved to another device.
attention_values = final_attention.detach().cpu().numpy().astype(float)

attention_df = pd.DataFrame(
    attention_values, index=token_labels, columns=token_labels
).round(3)

attention_df

Unnamed: 0,[CLS],my,friend,told,me,about,this,class,and,i,love,it,so,far,!,she,was,right,.,[SEP]
[CLS],0.092,0.028,0.019,0.011,0.012,0.022,0.05,0.087,0.031,0.023,0.023,0.031,0.007,0.028,0.067,0.057,0.065,0.124,0.104,0.12
my,0.021,0.023,0.014,0.01,0.013,0.021,0.028,0.015,0.014,0.012,0.01,0.023,0.011,0.009,0.016,0.022,0.021,0.019,0.312,0.388
friend,0.018,0.009,0.129,0.009,0.005,0.008,0.008,0.012,0.009,0.005,0.009,0.006,0.004,0.005,0.009,0.023,0.01,0.006,0.314,0.401
told,0.01,0.004,0.013,0.084,0.004,0.011,0.005,0.005,0.005,0.002,0.008,0.005,0.005,0.003,0.006,0.008,0.004,0.003,0.351,0.464
me,0.024,0.013,0.01,0.011,0.017,0.016,0.018,0.011,0.014,0.01,0.01,0.014,0.007,0.008,0.014,0.009,0.006,0.005,0.347,0.436
about,0.019,0.01,0.007,0.018,0.01,0.079,0.021,0.012,0.012,0.006,0.014,0.019,0.008,0.008,0.012,0.005,0.003,0.005,0.32,0.412
this,0.026,0.014,0.003,0.004,0.01,0.015,0.069,0.02,0.011,0.01,0.011,0.018,0.006,0.008,0.012,0.005,0.003,0.004,0.331,0.421
class,0.028,0.01,0.007,0.006,0.006,0.015,0.029,0.096,0.01,0.009,0.013,0.019,0.006,0.009,0.015,0.01,0.005,0.005,0.312,0.39
and,0.031,0.016,0.006,0.007,0.012,0.009,0.013,0.009,0.08,0.013,0.01,0.01,0.008,0.009,0.024,0.014,0.012,0.011,0.316,0.386
i,0.023,0.014,0.008,0.005,0.011,0.011,0.019,0.012,0.021,0.029,0.014,0.013,0.008,0.014,0.019,0.012,0.009,0.008,0.334,0.414


## Tensors

### N-dimensional arrays. Functionally, these objects are no different from arrays in NumPy.