# BERT: how to get embeddings

In [3]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import torch 
model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

## References
- [BERT Word embeddings](https://colab.research.google.com/drive/1ZQvuAVwA3IjybezQOXnrXMGAnMyZRuPU) 
 - Looks like concatenating last 4 layers of BERT embeddings is a good idea 
- Get embedding matrix

In [4]:
# Define a new example sentence with multiple meanings of the word "bank"
text = "Is there airspace consolidation on the left side?"

# Add the special tokens.
marked_text = "[CLS] " + text + " [SEP]"

# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
print('seq length:', len(tokenized_text))
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))



seq length: 11
[CLS]           101
is            2,003
there         2,045
airspace     29,357
consolidation 17,439
on            2,006
the           1,996
left          2,187
side          2,217
?             1,029
[SEP]           102


In [5]:
# Mark them as the 1st segment 
segments_ids = [1] * len(tokenized_text)
print (segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [6]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (den

In [7]:
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

## Check shape

In [8]:
print ("Number of layers:", len(encoded_layers))
layer_i = 0
print ("Number of batches:", len(encoded_layers[layer_i]))
batch_i = 0
print ("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
token_i = 0
print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))

Number of layers: 12
Number of batches: 1
Number of tokens: 11
Number of hidden units: 768


![image](http://jalammar.github.io/images/bert-feature-extraction-contextualized-embeddings.png)

In [9]:
token_embeddings = torch.stack(encoded_layers, dim=0)
token_embeddings.size()
# get rid of `batch`` dimension
token_embeddings = torch.squeeze(token_embeddings, dim=1)
# let the shape be [seq_len, layer_num, emb_dim] 
token_embeddings = token_embeddings.permute(1,0,2)
token_embeddings.size()


torch.Size([11, 12, 768])

In [10]:
# Stores the token vectors, with shape [22 x 3,072]
token_vecs_cat = []
token_vecs_sum = []
# `token_embeddings` is a [12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:
    
    # `token` is a [12 x 768] tensor

    # Concatenate the vectors (that is, append them together) from the last 
    # four layers.
    # Each layer vector is 768 values, so `cat_vec` is length 3,072.
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    sum_vec = torch.sum(token[-4:], dim=0)
    # Use `cat_vec` to represent `token`.
    token_vecs_cat.append(cat_vec)
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0]))) # 96.1 
print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0]))) # 95.9 

Shape is: 11 x 3072
Shape is: 11 x 768


## Conclusion
覺得可以試試看 sum last 4 layers，
dimension 不會太大。

In [13]:
embedding_matrix = model.embeddings.word_embeddings.weight.data

In [14]:
embedding_matrix.shape

torch.Size([30522, 768])

In [16]:
# print padding idx
pad_id = tokenizer.convert_tokens_to_ids(['[PAD]'])

In [18]:
# find vocab size 
vocab_size = embedding_matrix.shape[0]

In [19]:
vocab_size

30522

In [21]:
model.embeddings.word_embeddings
# get size 
model.embeddings.word_embeddings.weight.size()

torch.Size([30522, 768])