In [1]:
import torch

# !pip install pytorch-pretrained-bert
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint, the
# same checkpoint name the model is loaded from later in this notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
"""
BERT is a pretrained model that expects input data in a specific format, we will need:
    -->special tokens to mark the beginning ([CLS]) and separation/end of sentences ([SEP])
    -->tokens that conform to the fixed vocabulary used in BERT
    -->token IDs from BERT’s tokenizer
    -->mask IDs to indicate which elements in the sequence are tokens and which are padding elements
    -->segment IDs used to distinguish different sentences
    -->positional embeddings used to show token position within the sequence
"""

# Example sentence, wrapped in the special markers BERT expects:
# [CLS] at the start and [SEP] at the end.
text = "Here is the sentence I want embeddings for."
marked_text = "[CLS] {} [SEP]".format(text)

# Run the marked sentence through the BERT tokenizer and show the tokens.
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

['[CLS]', 'here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.', '[SEP]']


In [4]:
"""
The original word has been split into smaller subwords and characters. 
The two hash signs preceding some of these subwords are just our tokenizer’s way to 
denote that this subword or character is part of a larger word and preceded by another subword. 
"""

"""
After breaking the text into tokens, 
we then have to convert the sentence from a list of strings to a list of vocabulary indices.
"""

'\nAfter breaking the text into tokens, \nwe then have to convert the sentence from a list of strings to a list of vocabulary indices.\n'

In [5]:
# A sentence in which the word "bank" appears three times with two
# different meanings.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

# Wrap the sentence in the special start/end markers, then tokenize it.
marked_text = "[CLS] {} [SEP]".format(text)
tokenized_text = tokenizer.tokenize(marked_text)

# Look up the vocabulary index of every token string.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print each token next to its vocabulary index (left-aligned token,
# right-aligned index with thousands separators).
for token_str, token_id in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(token_str, token_id))

[CLS]           101
after         2,044
stealing     11,065
money         2,769
from          2,013
the           1,996
bank          2,924
vault        11,632
,             1,010
the           1,996
bank          2,924
robber       27,307
was           2,001
seen          2,464
fishing       5,645
on            2,006
the           1,996
mississippi   5,900
river         2,314
bank          2,924
.             1,012
[SEP]           102


In [6]:
"""
BERT is trained on and expects sentence pairs, using 1s and 0s to distinguish between the two sentences. That is, for each token in “tokenized_text,” 
we must specify which sentence it belongs to: sentence 0 (a series of 0s) or sentence 1 (a series of 1s). 
For our purposes, single-sentence inputs only require a series of 1s, 
so we will create a vector of 1s for each token in our input sentence.

If you want to process two sentences, assign each word in the first sentence plus the ‘[SEP]’ token a 0, and all tokens of the second sentence a 1.
"""

'\nBERT is trained on and expects sentence pairs, using 1s and 0s to distinguish between the two sentences. That is, for each token in “tokenized_text,” \nwe must specify which sentence it belongs to: sentence 0 (a series of 0s) or sentence 1 (a series of 1s). \nFor our purposes, single-sentence inputs only require a series of 1s, \nso we will create a vector of 1s for each token in our input sentence.\n\nIf you want to process two sentences, assign each word in the first sentence plus the ‘[SEP]’ token a 0, and all tokens of the second sentence a 1.\n'

In [7]:
# Build one segment id per token, marking every token as sentence "1".
# NOTE(review): the notes in this notebook say the first sentence of a pair
# is marked with 0s — confirm that all-1s is intended for a single sentence.
segments_ids = [1 for _ in tokenized_text]

print(segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
# Turn the id lists into PyTorch tensors with a leading batch dimension
# of size 1 (a single sentence).
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensors = torch.tensor(segments_ids).unsqueeze(0)

# Load the pretrained weights for the same checkpoint the tokenizer uses.
model = BertModel.from_pretrained('bert-base-uncased')

# Switch to evaluation mode (feed-forward only). Left as the last expression
# of the cell so the notebook echoes the model's structure.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [9]:
# Run a forward pass with gradient tracking disabled. The first returned
# value (the hidden states for each layer) is kept; the second is discarded.
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, token_type_ids=segments_tensors)
    
"""
encoded_layers object has four dimensions, in the following order:
    1. The layer number (12 layers)
    2. The batch number (1 sentence)
    3. The word / token number (22 tokens in our sentence)
    4. The hidden unit / feature number (768 features)
"""

'\nencoded_layers object has four dimensions, in the following order:\n    1. The layer number (12 layers)\n    2. The batch number (1 sentence)\n    3. The word / token number (22 tokens in our sentence)\n    4. The hidden unit / feature number (768 features)\n'

In [10]:
# Indices of the first layer, first batch entry and first token, used to
# step down one nesting level at a time.
layer_i, batch_i, token_i = 0, 0, 0

# Report the length of each level: layers -> batches -> tokens -> features.
print("Number of layers:", len(encoded_layers))
print("Number of batches:", encoded_layers[layer_i].size(0))
print("Number of tokens:", encoded_layers[layer_i][batch_i].size(0))
print("Number of hidden units:", encoded_layers[layer_i][batch_i][token_i].size(0))

Number of layers: 12
Number of batches: 1
Number of tokens: 22
Number of hidden units: 768


In [11]:
# The model hands back its layers as a plain Python list ...
print('     Type of encoded_layers: ', type(encoded_layers))

# ... whose entries are torch tensors; show the shape of the first one.
print('Tensor shape for each layer: ', encoded_layers[0].shape)

     Type of encoded_layers:  <class 'list'>
Tensor shape for each layer:  torch.Size([1, 22, 768])


In [12]:
# Combine the per-layer tensors into a single tensor. `stack` joins them
# along a brand-new leading dimension (its default, dim 0), one slot per layer.
token_embeddings = torch.stack(encoded_layers)

print(token_embeddings.shape)

torch.Size([12, 1, 22, 768])


In [13]:
# Drop the "batches" axis (dimension 1), which holds a single sentence.
token_embeddings = token_embeddings.squeeze(1)

print(token_embeddings.shape)

torch.Size([12, 22, 768])


In [14]:
# Exchange the first two axes (dimensions 0 and 1); the last axis is untouched.
# NOTE: running this cell a second time swaps the axes back, so re-run from
# the cell that builds `token_embeddings` rather than this cell alone.
token_embeddings = token_embeddings.transpose(0, 1)

print(token_embeddings.shape)

torch.Size([22, 12, 768])


In [15]:
"""
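Studies on NER have shown that concatenating the last four layers
produced the best results on that specific task.
Note that the code below sums the last four layers instead of concatenating them,
which keeps each token vector at 768 dimensions.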
Have a look @ http://jalammar.github.io/images/bert-feature-extraction-contextualized-embeddings.png
"""
# Build one vector per token by adding together its last four layers.
#
# `token_embeddings` is a [22 x 12 x 768] tensor, so iterating over it yields
# a [12 x 768] tensor per token. Slicing `[-4:]` keeps the final four rows and
# summing over dim 0 collapses them into a single vector.
# The result is a list of token vectors, with shape [22 x 768].
token_vecs_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]

print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 22 x 768


In [16]:
# List every token next to its position in the sentence, so individual
# tokens can be looked up by index afterwards.
for position, token_str in enumerate(tokenized_text):
    print(position, token_str)

0 [CLS]
1 after
2 stealing
3 money
4 from
5 the
6 bank
7 vault
8 ,
9 the
10 bank
11 robber
12 was
13 seen
14 fishing
15 on
16 the
17 mississippi
18 river
19 bank
20 .
21 [SEP]


In [17]:
# Position 3 is the token 'money' in the token listing printed earlier.
# `print` converts the tensor to its string form itself.
print("word embedding for 'money': ", token_vecs_sum[3])

word embedding for 'money':  tensor([ 1.2090e+00, -4.5478e+00, -1.8455e+00,  1.7785e+00,  5.5919e+00,
         3.0103e+00, -3.6344e+00, -1.8779e+00, -2.0583e+00, -5.6114e-01,
        -5.0866e-01, -3.5573e-02, -1.6982e+00,  3.0808e+00, -5.0677e+00,
         9.8939e-01,  3.0563e+00,  3.3990e+00, -5.1202e-01,  2.2999e+00,
        -4.7083e+00,  2.1254e-01,  3.1129e+00,  9.1927e-01,  1.8552e+00,
        -1.7869e+00, -6.0480e-02,  2.2700e+00, -1.9204e-01,  1.2888e+00,
         6.2763e+00,  9.4952e-02,  2.3813e+00,  4.8176e+00,  1.5172e+00,
        -1.0869e+00, -3.6432e+00,  1.4986e+00, -5.6087e-01,  1.1351e+00,
        -2.6811e-01,  5.8343e-01, -4.2257e+00,  3.7800e+00, -1.6967e-01,
         7.2276e-01,  1.3434e+00,  1.0216e+00, -2.5422e+00,  9.3772e-01,
        -1.8737e+00,  1.2663e+00, -8.9085e+00, -2.7672e+00,  8.1881e-01,
         3.2483e+00, -3.2627e+00, -3.2115e+00,  1.8310e+00, -4.6896e-02,
         4.1646e+00, -1.1390e+00,  2.1328e+00, -4.7024e-01, -3.5338e+00,
         5.4664e-01,  

In [18]:
# Positions of the three "bank" tokens in the token listing:
# "bank vault" (6), "bank robber" (10) and "river bank" (19).
bank_vault_idx, bank_robber_idx, river_bank_idx = 6, 10, 19

# Cosine similarity between two 1-D vectors (computed along dim 0).
cos = torch.nn.CosineSimilarity(dim=0)

# "bank" in "bank robber" vs "bank vault" (same meaning).
same_bank = cos(token_vecs_sum[bank_robber_idx], token_vecs_sum[bank_vault_idx])

# "bank" in "bank robber" vs "river bank" (different meanings).
diff_bank = cos(token_vecs_sum[bank_robber_idx], token_vecs_sum[river_bank_idx])

print('Vector similarity for  *similar*  meanings:  {}' .format(same_bank))
print('Vector similarity for *different* meanings:  {}' .format(diff_bank))

Vector similarity for  *similar*  meanings:  0.9456751942634583
Vector similarity for *different* meanings:  0.6797332763671875
