In [1]:
import torch
from transformers import BertTokenizer, BertModel
import warnings

# Install a global "ignore" filter so that no Python warnings are shown in
# any of the cells below (keeps the recorded cell output uncluttered).
# NOTE(review): this blanket filter also hides deprecation and user warnings;
# consider narrowing it by category/module once the intended target is
# confirmed.
warnings.filterwarnings("ignore")

In [2]:
# One checkpoint identifier shared by the tokenizer and the model, so the
# vocabulary and the weights are always loaded from the same checkpoint.
MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# output_attentions=True makes the forward pass return per-layer attention
# weights alongside the hidden states (they are read from `outputs.attentions`
# further down). attn_implementation="eager" is requested together with it;
# presumably because the fused attention backends do not return the attention
# probabilities -- see the Transformers `from_pretrained` documentation.
model = BertModel.from_pretrained(
    MODEL_NAME,
    attn_implementation="eager",
    output_attentions=True,
)

In [3]:
# Example input text; it is tokenized and passed through the model in the
# following cells.
sentence = "What are the symptoms of diabetes?"

In [4]:
# Tokenize the sentence, asking for PyTorch tensors (return_tensors="pt").
# The result is unpacked as keyword arguments into the model call below.
# NOTE(review): the recorded output shows a sequence length of 9 for this
# sentence, presumably including the tokenizer's special tokens -- confirm.
inputs = tokenizer(sentence, return_tensors="pt")

In [5]:
# Run a single forward pass with gradient tracking disabled (nothing is being
# trained here), then pull out the two results inspected below:
#   embeddings -- final-layer hidden states (recorded shape: [1, 9, 768])
#   attentions -- one attention tensor per layer (recorded: 12 entries,
#                 each of shape [1, 12, 9, 9])
with torch.no_grad():
    outputs = model(**inputs)
    embeddings, attentions = outputs.last_hidden_state, outputs.attentions

In [6]:
# Summarize the forward-pass results: hidden-state shape, number of attention
# layers, the first layer's attention shape, and the attention matrix of the
# first head in the first layer for the (single) batch item.
first_layer_attention = attentions[0]
first_head_attention = first_layer_attention[0, 0]
layer_count = len(attentions)

print("Embeddings shape:", embeddings.shape)
print("\nNumber of layers containing attention weights:", layer_count)
print("Attention weights in the first layer:", first_layer_attention.shape)
print("\nAttention weights From the first layer - the first head:")
print(first_head_attention)

Embeddings shape: torch.Size([1, 9, 768])

Number of layers containing attention weights: 12
Attention weights in the first layer: torch.Size([1, 12, 9, 9])

Attention weights From the first layer - the first head:
tensor([[0.0703, 0.0626, 0.0576, 0.1530, 0.0469, 0.1002, 0.0671, 0.0971, 0.3451],
        [0.1299, 0.1026, 0.0900, 0.0791, 0.0837, 0.0760, 0.0858, 0.1506, 0.2024],
        [0.1393, 0.0903, 0.1344, 0.0599, 0.1634, 0.1129, 0.0504, 0.1713, 0.0781],
        [0.0755, 0.0988, 0.1373, 0.1133, 0.1443, 0.0978, 0.0660, 0.2030, 0.0641],
        [0.0621, 0.0693, 0.1322, 0.0236, 0.1607, 0.0339, 0.1899, 0.2461, 0.0823],
        [0.1299, 0.1160, 0.0785, 0.0668, 0.1307, 0.1224, 0.0831, 0.1509, 0.1218],
        [0.1145, 0.0830, 0.0942, 0.0156, 0.2272, 0.0312, 0.1865, 0.1688, 0.0791],
        [0.0950, 0.0870, 0.1386, 0.0539, 0.1093, 0.1478, 0.0934, 0.1272, 0.1477],
        [0.0942, 0.1061, 0.0873, 0.1179, 0.0876, 0.0876, 0.0775, 0.1812, 0.1606]])
