# **Transformers**

In [1]:
from transformers import AutoTokenizer, AutoConfig
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

import torch
import torch.nn as nn
import torch.nn.functional as F

import math


In [2]:
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

text = "Processor becomes smaller, so as the size of CPU"
model = BertModel.from_pretrained(model_ckpt)
model.eval()

show(model, 'bert', tokenizer, text, display_mode='light', layer=0, head=0)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Tokenization**

In [10]:
inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
inputs.input_ids

tensor([[13151,  4150,  3760,  1010,  2061,  2004,  1996,  2946,  1997, 17368]])

In [11]:
config = AutoConfig.from_pretrained(model_ckpt)
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [12]:
token_embedding_layer = nn.Embedding(config.vocab_size, config.hidden_size)
token_embedding_layer

Embedding(30522, 768)

In [15]:
inputs

{'input_ids': tensor([[13151,  4150,  3760,  1010,  2061,  2004,  1996,  2946,  1997, 17368]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [6]:
input_embds = token_embedding_layer(inputs.input_ids)
input_embds

tensor([[[-1.0961, -0.7186,  1.2510,  ...,  1.4863, -0.8738,  0.1008],
         [ 0.9566, -0.7478,  0.7055,  ...,  1.8973,  1.3414, -0.5830],
         [ 0.8415, -0.4369,  1.4776,  ...,  1.5013, -1.3221,  1.3184],
         ...,
         [-0.5017, -0.6283,  0.2117,  ..., -1.4411,  1.2818,  0.3001],
         [-0.7122, -0.2407,  0.6829,  ...,  1.4840, -0.4810, -0.2447],
         [ 0.8134, -1.1567, -1.7822,  ..., -0.6614, -1.5348, -1.2032]]],
       grad_fn=<EmbeddingBackward0>)

### **Custom Embedding Layer**

In [16]:
query = key = value = input_embds
dim_k = key.size(-1)

In [18]:
scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(dim_k)
scores.size()

torch.Size([1, 12, 12])

In [20]:
soft_max = nn.Softmax(1)
weight = soft_max(scores)
weight.size()


torch.Size([1, 12, 12])

In [21]:
attnn_outputs = torch.bmm(weight, value)
attnn_outputs

torch.Size([1, 12, 768])

# **Custom Transformer**

In [28]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embded_dim, head_dim) -> None:
        super(MultiHeadAttention, self).__init__()
        embded_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = config.hidden_size // config.num_attention_heads

        self.linear_q = nn.Linear(embded_dim, head_dim)
        self.linear_k = nn.Linear(embded_dim, head_dim)
        self.linear_v = nn.Linear(embded_dim, head_dim)
        self.linear_out = nn.Linear(embded_dim, head_dim)

    def scaled_dot_product_attention(self, query, key, value):
        dim_k = key.size(-1)
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(dim_k)
        soft_max = nn.Softmax(1)
        weight = soft_max(scores)
        attnn_outputs = torch.bmm(weight, value)
        return attnn_outputs   

    def forward(self, input_embds):
        self.query = self.linear_q(input_embds)
        self.key = self.linear_k(input_embds)
        self.value = self.linear_v(input_embds)
        self.attnn_outputs = self.scaled_dot_product_attention(self.query, self.key, self.value)
        self.heads = nn.ModuleList([self.attnn_outputs])
        concat_out = torch.cat([h(input_embds) for h in self.heads], dim=1)
        self.attnn_outputs = self.linear_out(concat_out)        
        return self.attnn_outputs


In [29]:
multi_head_attention = MultiHeadAttention(config.hidden_size, config.hidden_size)
output = multi_head_attention(input_embds)

TypeError: torch.FloatTensor is not a Module subclass