# Contextualized Word Embeddings

## Preparation

Import the required modules.

In [1]:
import logging
from pathlib import Path

# Log verbosity for the whole notebook: logging.INFO is the suggested default,
# logging.DEBUG additionally prints detailed messages useful for debugging.
# logging_level = logging.INFO
logging_level = logging.DEBUG

logging.basicConfig(level=logging_level, format='%(levelname)s:%(message)s')


In [41]:
# Load the helper module from the file `bert_embeddings.py`
# (the path is resolved relative to the current working directory).
# note: code reused from https://towardsdatascience.com/nlp-extract-contextualized-word-embeddings-from-bert-keras-tf-67ef29f60a7b
# pre-trained embedding model from: https://github.com/google-research/bert 'BERT_uncased_L-12_H-768_A-12'

import importlib.util
import sys

spec = importlib.util.spec_from_file_location("bert_embeddings", "bert_embeddings.py")
bert_embeddings = importlib.util.module_from_spec(spec)

# Register the module before executing it, as the importlib recipe for importing
# a source file directly does; otherwise code that looks the module up by name
# in sys.modules cannot find it.
sys.modules[spec.name] = bert_embeddings
try:
    spec.loader.exec_module(bert_embeddings)
except BaseException:
    # Do not leave a half-initialised module registered if loading fails.
    sys.modules.pop(spec.name, None)
    raise


## Load pre-trained BERT model

Download the pre-trained model archive from the official BERT release and extract the zip file into the `data` directory. This part only has to be run once.

In [None]:
!pip install progressbar
import progressbar

pbar = None  # progress bar of the download currently in flight (None when idle)


def show_progress(block_num, block_size, total_size):
    """Report hook for ``urllib.request.urlretrieve`` that draws a progress bar.

    Args:
        block_num: Number of blocks transferred so far (0 on the initial call).
        block_size: Size of one block in bytes.
        total_size: Total size of the file in bytes; not positive when the
            server does not report a size.

    The bar is kept in the module-level ``pbar`` between calls and is reset to
    ``None`` once the download is complete. Nothing is drawn when the total
    size is unknown.
    """
    global pbar

    # A first-block call always marks a fresh download: drop any bar left
    # behind by a transfer that was interrupted before it could finish.
    if block_num == 0:
        pbar = None

    # Without a known size there is no maximum to draw the bar against.
    if total_size <= 0:
        return

    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size).start()

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

In [32]:
import urllib.request

# Location of the pre-trained model archive.
bert_model_url = 'https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip'
# A smaller model can be used for quick tests, e.g.:
# bert_model_url = 'https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-2_H-128_A-2.zip'

# Fetch the archive into the working directory; show_progress is called as the
# report hook while the data is transferred.
urllib.request.urlretrieve(
    url=bert_model_url,
    filename='uncased_L-12_H-768_A-12.zip',
    reporthook=show_progress,
)

('uncased_L-12_H-768_A-12.zip', <http.client.HTTPMessage at 0x7f1458239048>)

In [33]:
import zipfile

# Archive written by the download step and the directory to unpack it into.
DATA_ZIP_PATH = Path('.') / 'uncased_L-12_H-768_A-12.zip'
DATA_OUTPUT_PATH = Path('.') / 'data'

with zipfile.ZipFile(DATA_ZIP_PATH) as zip_file:
    for filename in zip_file.namelist():
        # Pass the file name as a logging argument so the message is only
        # formatted when DEBUG output is actually enabled.
        logging.debug('Extracting file %s', filename)
        zip_file.extract(filename, DATA_OUTPUT_PATH)

DEBUG:Extracting file bert_model.ckpt.data-00000-of-00001
DEBUG:Extracting file bert_config.json
DEBUG:Extracting file vocab.txt
DEBUG:Extracting file bert_model.ckpt.index


## Experiments with contextualized BERT embeddings


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

### Example 1 ('mouse')

The log output also shows the tokenizer output for each sentence.<br> <br>

Here the embeddings will be computed from the output of the last 4 layers of the transformer.<br>
Note that you also have to specify the dimensionality of the output embeddings here. 768 is the maximum size of the output; with lower values, this implementation simply truncates the embeddings, which is not recommended.<br> <br>

Note that there will be a few warning messages, which occur because this is a rather hacky conversion of an old implementation to make it work with the current version of TensorFlow. I don't recommend using it for actual research, but it should suffice for our experiments.

In [84]:
# The same word "mouse" used in two different contexts.
sent1 = "A mouse is a rodent animal."
sent2 = "A mouse is a computer device."

# Sentences supplying the comparison words "rat" and "keyboard".
sent3 = "A rat is an animal that lives in the sewers."
sent4 = "A computer has a keyboard as an input device."

# One result per sentence, in the order given here.
mouse_sentences = [sent1, sent2, sent3, sent4]
embeddings = bert_embeddings.get_bert_embeddings(
    mouse_sentences,
    dim=768,
)

INFO:Read input: 4 examples
INFO:*** Example ***
INFO:unique_id: 0
INFO:tokens: [CLS] a mouse is a rode ##nt animal . [SEP]
INFO:input_ids: 101 1037 8000 2003 1037 8469 3372 4111 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:input_mask: 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:*** Example ***
INFO:



































































































































































































































































































DEBUG:Initialize variable bert/embeddings/position_embeddings:0 from checkpoint data/bert_model.ckpt with bert/embeddings/position_embeddings
DEBUG:Initialize variable bert/embeddings/token_type_embeddings:0 from checkpoint data/bert_model.ckpt with bert/embeddings/token_type_embeddings
DEBUG:Initialize variable bert/embeddings/word_embeddings:0 from checkpoint data/bert_model.ckpt with bert/embeddings/word_embeddings
DEBUG:Initialize variable bert/encoder/layer_0/attention/output/dense/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_0/attention/output/dense/bias
DEBUG:Initialize variable bert/encoder/layer_0/attention/output/dense/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_0/attention/output/dense/kernel
DEBUG:Initialize variable bert/encoder/layer_0/attention/self/key/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_0/attention/self/key/bias
DEBUG:Initialize variable bert/encoder/layer_0/attention/self/key/kernel:0 fro

DEBUG:Initialize variable bert/encoder/layer_2/attention/output/dense/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/output/dense/bias
DEBUG:Initialize variable bert/encoder/layer_2/attention/output/dense/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/output/dense/kernel
DEBUG:Initialize variable bert/encoder/layer_2/attention/self/key/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/self/key/bias
DEBUG:Initialize variable bert/encoder/layer_2/attention/self/key/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/self/key/kernel
DEBUG:Initialize variable bert/encoder/layer_2/attention/self/query/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/self/query/bias
DEBUG:Initialize variable bert/encoder/layer_2/attention/self/query/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/self/query/kernel
DEBU

DEBUG:Initialize variable bert/encoder/layer_6/attention/self/key/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/key/kernel
DEBUG:Initialize variable bert/encoder/layer_6/attention/self/query/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/query/bias
DEBUG:Initialize variable bert/encoder/layer_6/attention/self/query/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/query/kernel
DEBUG:Initialize variable bert/encoder/layer_6/attention/self/value/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/value/bias
DEBUG:Initialize variable bert/encoder/layer_6/attention/self/value/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/value/kernel
DEBUG:Initialize variable bert/encoder/layer_6/intermediate/dense/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/intermediate/dense/bias
DEBUG:Initia

INFO:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/output/layer_normalization_1/gamma:0, shape = (768,)
INFO:  name = bert/encoder/layer_0/attention/output/layer_normalization_1/beta:0, shape = (768,)
INFO:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:  

INFO:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/attention/output/layer_normalization_11/gamma:0, shape = (768,)
INFO:  name = bert/encoder/layer_5/attention/output/layer_normalization_11/beta:0, shape = (768,)
INFO:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/output/layer_normalization_12/gamma:0, shape = (768,)
INFO:  name = bert/encoder/layer_5/output/layer_normalization_12/beta:0, shape = (768,)
INFO:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_6/attent

INFO:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_10/output/layer_normalization_22/gamma:0, shape = (768,)
INFO:  name = bert/encoder/layer_10/output/layer_normalization_22/beta:0, shape = (768,)
INFO:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_

In [85]:
# Embedding of the word of interest in each of the four sentences.
mouse1_embedding = embeddings[0]['mouse']
mouse2_embedding = embeddings[1]['mouse']
rat_embedding = embeddings[2]['rat']
keyboard_embedding = embeddings[3]['keyboard']

# cosine_similarity works on 2-D inputs and returns a matrix, hence the
# single-row wrapping and the [0][0] lookup.
sim_mouse1_rat = cosine_similarity([mouse1_embedding], [rat_embedding])[0][0]
sim_mouse1_keyboard = cosine_similarity([mouse1_embedding], [keyboard_embedding])[0][0]
sim_mouse2_rat = cosine_similarity([mouse2_embedding], [rat_embedding])[0][0]
sim_mouse2_keyboard = cosine_similarity([mouse2_embedding], [keyboard_embedding])[0][0]

# expected mouse1: rat > keyboard
print('mouse1 - rat: %.4f' % sim_mouse1_rat)
print('mouse1 - keyboard: %.4f' % sim_mouse1_keyboard)
# expected mouse2: keyboard > rat
print('mouse2 - rat: %.4f' % sim_mouse2_rat)
print('mouse2 - keyboard: %.4f' % sim_mouse2_keyboard)

mouse1 - rat: 0.9898
mouse1 - keyboard: 0.9887
mouse2 - rat: 0.9880
mouse2 - keyboard: 0.9888


### Example 2 ('lean')

In [86]:
# Two sentences using "lean" in different contexts, followed by two sentences
# supplying the comparison words "greasy" and "tilt".
sents = [
    'I like my steak lean.',
    'Gravity made the tower lean to the side.',
    'The steak is very greasy.',
    'The corner made the cat tilt sideways.',
]

# One result per sentence, in the order given above.
embeddings2 = bert_embeddings.get_bert_embeddings(
    sents,
    dim=768,
)

INFO:Read input: 4 examples
INFO:*** Example ***
INFO:unique_id: 0
INFO:tokens: [CLS] i like my steak lean . [SEP]
INFO:input_ids: 101 1045 2066 2026 21475 8155 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:*** Example ***
INFO:unique_id: 1
I



































































































































































































































































































DEBUG:Initialize variable bert/embeddings/position_embeddings:0 from checkpoint data/bert_model.ckpt with bert/embeddings/position_embeddings
DEBUG:Initialize variable bert/embeddings/token_type_embeddings:0 from checkpoint data/bert_model.ckpt with bert/embeddings/token_type_embeddings
DEBUG:Initialize variable bert/embeddings/word_embeddings:0 from checkpoint data/bert_model.ckpt with bert/embeddings/word_embeddings
DEBUG:Initialize variable bert/encoder/layer_0/attention/output/dense/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_0/attention/output/dense/bias
DEBUG:Initialize variable bert/encoder/layer_0/attention/output/dense/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_0/attention/output/dense/kernel
DEBUG:Initialize variable bert/encoder/layer_0/attention/self/key/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_0/attention/self/key/bias
DEBUG:Initialize variable bert/encoder/layer_0/attention/self/key/kernel:0 fro

DEBUG:Initialize variable bert/encoder/layer_2/attention/output/dense/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/output/dense/bias
DEBUG:Initialize variable bert/encoder/layer_2/attention/output/dense/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/output/dense/kernel
DEBUG:Initialize variable bert/encoder/layer_2/attention/self/key/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/self/key/bias
DEBUG:Initialize variable bert/encoder/layer_2/attention/self/key/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/self/key/kernel
DEBUG:Initialize variable bert/encoder/layer_2/attention/self/query/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/self/query/bias
DEBUG:Initialize variable bert/encoder/layer_2/attention/self/query/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_2/attention/self/query/kernel
DEBU

DEBUG:Initialize variable bert/encoder/layer_6/attention/self/key/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/key/kernel
DEBUG:Initialize variable bert/encoder/layer_6/attention/self/query/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/query/bias
DEBUG:Initialize variable bert/encoder/layer_6/attention/self/query/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/query/kernel
DEBUG:Initialize variable bert/encoder/layer_6/attention/self/value/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/value/bias
DEBUG:Initialize variable bert/encoder/layer_6/attention/self/value/kernel:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/attention/self/value/kernel
DEBUG:Initialize variable bert/encoder/layer_6/intermediate/dense/bias:0 from checkpoint data/bert_model.ckpt with bert/encoder/layer_6/intermediate/dense/bias
DEBUG:Initia

INFO:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/attention/output/layer_normalization_1/gamma:0, shape = (768,)
INFO:  name = bert/encoder/layer_0/attention/output/layer_normalization_1/beta:0, shape = (768,)
INFO:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:  

INFO:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/attention/output/layer_normalization_11/gamma:0, shape = (768,)
INFO:  name = bert/encoder/layer_5/attention/output/layer_normalization_11/beta:0, shape = (768,)
INFO:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_5/output/layer_normalization_12/gamma:0, shape = (768,)
INFO:  name = bert/encoder/layer_5/output/layer_normalization_12/beta:0, shape = (768,)
INFO:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_6/attent

INFO:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_10/output/layer_normalization_22/gamma:0, shape = (768,)
INFO:  name = bert/encoder/layer_10/output/layer_normalization_22/beta:0, shape = (768,)
INFO:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:  name = bert/encoder/layer_

In [87]:
# Embedding of the word of interest in each of the four sentences.
lean1_embedding = embeddings2[0]['lean']
lean2_embedding = embeddings2[1]['lean']
greasy_embedding = embeddings2[2]['greasy']
tilt_embedding = embeddings2[3]['tilt']

# cosine_similarity works on 2-D inputs and returns a matrix, hence the
# single-row wrapping and the [0][0] lookup.
sim_lean1_greasy = cosine_similarity([lean1_embedding], [greasy_embedding])[0][0]
sim_lean1_tilt = cosine_similarity([lean1_embedding], [tilt_embedding])[0][0]
sim_lean2_greasy = cosine_similarity([lean2_embedding], [greasy_embedding])[0][0]
sim_lean2_tilt = cosine_similarity([lean2_embedding], [tilt_embedding])[0][0]

# expected lean1: greasy > tilt
print('lean1 - greasy: %.4f' % sim_lean1_greasy)
print('lean1 - tilt: %.4f' % sim_lean1_tilt)
# expected lean2: tilt > greasy
print('lean2 - greasy: %.4f' % sim_lean2_greasy)
print('lean2 - tilt: %.4f' % sim_lean2_tilt)

lean1 - greasy: 0.9788
lean1 - tilt: 0.9620
lean2 - greasy: 0.9148
lean2 - tilt: 0.9176
