In [2]:
from transformers import AutoTokenizer
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

In [3]:
model_ckpt = "bert-base-uncased"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BertModel.from_pretrained(model_ckpt)
text = "time flies like an arrow"

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████| 433/433 [00:00<?, ?B/s]
100%|████████████████████████████████████████████████████████████████| 440473133/440473133 [01:33<00:00, 4720400.92B/s]


In [7]:
show(model, "bert", tokenizer, text, display_mode="light", layer=0, head=8)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)

In [13]:
inputs.input_ids

tensor([[ 2051, 10029,  2066,  2019,  8612]])

Let's create some dense embeddings

In [14]:
from torch import nn
from transformers import AutoConfig

In [15]:
config = AutoConfig.from_pretrained(model_ckpt)
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [17]:
config.vocab_size, config.hidden_size

(30522, 768)

In [20]:
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
token_emb

Embedding(30522, 768)

In [22]:
input_embds = token_emb(inputs.input_ids)
input_embds.size()

torch.Size([1, 5, 768])

In [31]:
import torch
from math import sqrt

In [32]:
query = key = value = input_embds

In [34]:
dim_k = key.size(-1)
dim_k

768

In [35]:
scores = torch.bmm(query, key.transpose(1, 2))/sqrt(dim_k)

In [36]:
scores.size()

torch.Size([1, 5, 5])

The torch.bmm() function performs a batch-matrix-matrix product that simplifies the computation of the attention score where the the query and key vectors have the shape [batch_size, seq_len, hidden_dim]. If we ignored the batch dimention we could calculate the dot product between each query and key vector by simply transposing the key tensor to have the shape [hidden_dim, seq_len] and then using the matrix product to collect all the dot products in a [seq_len, seq_len] matrix.
Since we want to this for all sequences in the batch independently, we use torch.bmm(), which takes 2 batches of matrices and multiplies each matrix from the first batch with the corresponding matrix in the second batch.

let's apply the softmax not

In [37]:
import torch.nn.functional as F

In [39]:
weights = F.softmax(scores, dim=1)
weights.sum(dim=-1)

tensor([[1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)

The final step is to multiply the attention weights by the values

In [40]:
attn_output = torch.bmm(weights, value)
attn_output.shape

torch.Size([1, 5, 768])

And That's it -- We have gone through all the steps to implement a simplified form of self attention! Notice that the whole process is just two matrix multiplications and a softmax, so you can think of "self-attention" as just a fancy form of averaging.

Let's wrap these steps into a function  

In [55]:
def scaled_dot_product_attention(query, key, value):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1,2))/sqrt(dim_k)
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

In [56]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)
    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
        self.q(hidden_state), self.k(hidden_state), self.v(hidden_state)
        )
        return attn_outputs

In [57]:
nn.Module

torch.nn.modules.module.Module

Here we have initialized three independent linear layers that apply matrix multiplication to the embedding vectors to produce tensor shape [batch_size, seq_len, head_dim], where head_dim is the number of dimensions we are projrcting into. Although head_dim does not have to be smaller than the number of the embedding dimension of the token (embed_dim), in practice it is chosen to be a multiple of embed_dim so that the computation across each head is constant. For example BERT has 12 attention heads, so the dimesion of each head is 768/12=64 

In [58]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim//num_heads
        self.heads = nn.ModuleList(
        [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

Notice that the concatenated output form the attention heads is also fed through a final linear layer to produce n output tensor of shape [batch_size, seq_len, hidden_dim] that is suitable for the feed-forward network downstream. To confirm, let's see if the multi-head attention layer produce the expected shape of our inputs.  
we pass the configuration we loaded ealier from the pretrained BERT model when initializing the MultiHeadAttention module.
This ensures that we use the same setting as BERT

In [59]:
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(input_embds)
attn_output.size()

torch.Size([1, 5, 768])

let's visualize this using bertviz

In [74]:
from bertviz import head_view
from transformers import AutoModel

model = AutoModel.from_pretrained(model_ckpt, output_attentions=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [75]:
sentence_a = "time flies like an arrow"
sentence_b = "friut flies like a banana"

In [76]:
viz_inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt')
attention = model(**viz_inputs).attentions
attention

(tensor([[[[4.5477e-02, 4.3833e-02, 3.0539e-02,  ..., 9.0351e-02,
            2.7387e-02, 1.4314e-01],
           [2.3033e-01, 5.3649e-02, 1.8889e-01,  ..., 1.2162e-03,
            7.2390e-03, 5.3054e-03],
           [3.8966e-02, 1.4649e-01, 2.1543e-01,  ..., 2.2750e-03,
            1.1117e-02, 4.5202e-03],
           ...,
           [7.5327e-02, 2.5293e-03, 4.4162e-03,  ..., 7.1685e-02,
            2.1770e-01, 1.2669e-01],
           [5.4800e-02, 1.4525e-03, 3.0858e-03,  ..., 2.8514e-02,
            1.9391e-01, 1.7435e-01],
           [5.0318e-02, 1.4176e-03, 1.4943e-03,  ..., 1.5214e-01,
            1.2672e-01, 2.5998e-01]],
 
          [[8.1602e-01, 1.2548e-02, 4.1806e-03,  ..., 1.9997e-02,
            5.2076e-03, 8.1710e-03],
           [5.2615e-03, 5.6914e-02, 3.4299e-01,  ..., 2.9237e-03,
            4.6810e-02, 1.2473e-02],
           [7.0146e-02, 1.0973e-01, 8.9813e-02,  ..., 3.4661e-03,
            1.8877e-02, 2.3621e-02],
           ...,
           [1.4815e-02, 1.3650e-02, 2.

In [77]:
sentence_b_start = (viz_inputs.token_type_ids==0).sum(dim=1)
sentence_b_start

tensor([7])

In [78]:
tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])

In [88]:
head_view(attention, tokens, sentence_b_start, heads=[8])

<IPython.core.display.Javascript object>

Now that we have covered attention, let's take a look at implementing the missing piece of the encoder layer: Position-wise feed-forward networks.

The feed-forward sublayer in the encoder and decoder is just a simple two-layer fully connected neural network, but with a twist: instead of processing the whole sequence of embeddings as a single vector, it processed each embedding independently.

In [89]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        slef.dropout = nn.Dropout(config.hidden_dropout_prob)
    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

Note that a fee-forward... Continue from Page 71