## EXPERIMENT 1

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 11.6MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 47.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 43.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=cf88f061fbf0

In [2]:
import torch
from transformers import BertModel, BertConfig, BertTokenizer

In [28]:
def get_segments(tokens):
    """Utility: assign a segment id (0 or 1) to every token.

    The id starts at 0 and flips between 0 and 1 right after each "[SEP]"
    token; the "[SEP]" itself keeps the id of the segment it terminates.
    Returns a list of ints with the same length as ``tokens``.
    """
    segment_ids = []
    active = 0
    for token in tokens:
        segment_ids.append(active)
        # Flip only after recording, so the separator stays in its own segment.
        active = (1 - active) if token == "[SEP]" else active
    return segment_ids

In [5]:
def get_ids(tokens, tokenizer):
    """Convert token strings to ids by delegating to ``tokenizer.convert_tokens_to_ids``."""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

In [6]:
def encode_sentence(sent, tokenizer):
    """Tokenize ``sent`` and wrap the pieces in "[CLS]" ... "[SEP]" markers.

    Returns a new list: "[CLS]", then whatever ``tokenizer.tokenize(sent)``
    produced, then "[SEP]".
    """
    word_pieces = tokenizer.tokenize(sent)
    return ["[CLS]"] + word_pieces + ["[SEP]"]

In [8]:
def get_model(model_string = 'bert-base-uncased'):
  """Load a pretrained BERT model together with its tokenizer and config.

  The config is created with ``output_hidden_states=True`` so that the model
  also returns the per-layer hidden states.

  Returns:
    A ``(model, tokenizer, config)`` tuple, all loaded from ``model_string``.
  """
  bert_config = BertConfig.from_pretrained(model_string, output_hidden_states=True)
  bert_model = BertModel.from_pretrained(model_string, config=bert_config)
  bert_tokenizer = BertTokenizer.from_pretrained(model_string)
  return bert_model, bert_tokenizer, bert_config

In [29]:
def get_sentence_embedding(sent, model, tokenizer, config):
  """Embed one sentence as the mean of the model's last-layer token vectors.

  Args:
    sent: Raw sentence text.
    model: BERT model. It is switched to eval mode as a side effect and is
      called as ``model(input_ids, token_type_ids=...)``.
    tokenizer: Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
    config: Unused; kept so existing callers keep working.

  Returns:
    A tensor holding the average, over every token of the sentence
    (including "[CLS]" and "[SEP]"), of the last hidden layer.
  """
  tokens = encode_sentence(sent, tokenizer)
  segments_idx = get_segments(tokens)
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)

  # Wrap in an outer list to build a batch of exactly one sentence.
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_idx])

  model.eval()
  with torch.no_grad():
    # The segment ids MUST be passed by keyword: the second positional
    # parameter of BertModel.forward is `attention_mask`, so a positional
    # call would treat these (all-zero) ids as "mask out every token".
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)

  # outputs[0] is the last hidden layer; [0] selects the single batch item,
  # leaving one vector per token.
  token_vecs = outputs[0][0]

  # Average across the tokens to get a single sentence vector.
  sentence_embedding = torch.mean(token_vecs, dim=0)
  return sentence_embedding

In [11]:
def get_document_embedding(lstdocuments, model, tokenizer, config ):
  """Embed every document in ``lstdocuments`` with ``get_sentence_embedding``.

  Returns a list with one embedding per document, in input order.
  """
  return [
      get_sentence_embedding(document, model, tokenizer, config)
      for document in lstdocuments
  ]

In [12]:
# Sentences that the queries below are ranked against.
lst_corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
    "Sachin Tendulkar is a great player.",
    "Sholay is an Indian classic film",
    "Dog is hunting for food",
]

# Query sentences; each one is compared with every corpus sentence.
test_sentences = [
    "Cricket is my favourite game.",
    "I like hindi movies.",
    "Cat is looking to eat",
]

In [30]:
# Load BERT once, then embed the queries and the corpus with that same model.
model, tokenizer, config = get_model()
test_embeds = get_document_embedding(test_sentences, model, tokenizer, config)
doc_embeds = get_document_embedding(lst_corpus, model, tokenizer, config)

In [31]:
from scipy.spatial.distance import cosine
import scipy

In [33]:
def calculate_distances(query_embedding, document_emdeddings):
  """Cosine distance from ``query_embedding`` to each document embedding.

  Returns a list of distances in the same order as ``document_emdeddings``.
  """
  return [
      scipy.spatial.distance.cosine(query_embedding, doc_vector)
      for doc_vector in document_emdeddings
  ]

In [34]:
# Number of nearest corpus sentences to report per query.
closest_n = 3

for query, query_embedding in zip(test_sentences, test_embeds):
    distances = calculate_distances(query_embedding, doc_embeds)

    # Pair every distance with its corpus index and order by smallest distance.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop %s most similar sentences in corpus:\n" % closest_n)

    for idx, distance in results[:closest_n]:
        # Reported score is cosine similarity, i.e. 1 - cosine distance.
        print(lst_corpus[idx].strip(), "(Score: %.4f)" % (1-distance))





Query: Cricket is my favourite game.

Top 3 most similar sentences in corpus:

A monkey is playing drums. (Score: 0.6063)
Sachin Tendulkar is a great player. (Score: 0.6003)
A woman is playing violin. (Score: 0.5699)




Query: I like hindi movies.

Top 3 most similar sentences in corpus:

A monkey is playing drums. (Score: 0.6127)
A man is eating food. (Score: 0.6020)
A woman is playing violin. (Score: 0.5986)




Query: Cat is looking to eat

Top 3 most similar sentences in corpus:

A man is eating food. (Score: 0.6959)
Dog is hunting for food (Score: 0.6752)
The girl is carrying a baby. (Score: 0.6217)


### EXPERIMENT 1A : TESTING BERT FOR WORD CONTEXT

In [35]:
text = "After stealing money from the bank vault, the bank robber was seen " \
       "fishing on the Mississippi river bank."

# Reuse the `tokenizer` that was loaded together with `model` by get_model();
# re-creating it here *after* it has been used would have no effect on the
# tokens produced below.
marked_text = "[CLS] " + text + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Single-sentence input: every token belongs to segment 0.
# NOTE: the second *positional* parameter of BertModel.forward is
# `attention_mask`, not the segment ids. Passing an all-ones tensor there is
# the same as the default mask and leaves the segment ids at their default of
# zeros, so the explicit keyword call below computes the same thing while
# binding the tensor to the parameter it is actually meant for.
segments_ids = [0] * len(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

model.eval()
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)

# Per-layer hidden states; present because the model config was created with
# output_hidden_states=True.
hidden_states = outputs[2]
print(len(hidden_states))  # 13

# Entry 0 is the embedding output; the remaining entries are the encoder layers.
embedding_output = hidden_states[0]
attention_hidden_states = hidden_states[1:]

# Stack the layers, drop the batch axis (dim 1), then put tokens first so the
# tensor can be iterated token by token: [tokens x layers x hidden].
token_embeddings = torch.stack(attention_hidden_states, dim=0)
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings = token_embeddings.permute(1,0,2)

13


In [36]:
# Display the tokens so the positions of the three "bank" occurrences
# (indices 6, 10 and 19 in the output below) can be read off.
tokenized_text

['[CLS]',
 'after',
 'stealing',
 'money',
 'from',
 'the',
 'bank',
 'vault',
 ',',
 'the',
 'bank',
 'robber',
 'was',
 'seen',
 'fishing',
 'on',
 'the',
 'mississippi',
 'river',
 'bank',
 '.',
 '[SEP]']

In [37]:
# `token_embeddings` is a [22 x 12 x 768] tensor, so iterating over it yields
# one [12 x 768] (layers x hidden) tensor per token.
# Represent each token by the sum of its vectors from the last four layers.
token_vecs_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))


Shape is: 22 x 768


In [38]:
print('First 5 vector values for each instance of "bank".')
print('')

# (label, token index) for each occurrence of "bank" in `tokenized_text`.
bank_occurrences = (
    ("bank vault   ", 6),
    ("bank robber  ", 10),
    ("river bank   ", 19),
)
for label, position in bank_occurrences:
    print(label, str(token_vecs_sum[position][:5]))

First 5 vector values for each instance of "bank".

bank vault    tensor([ 3.3596, -2.9805, -1.5421,  0.7065,  2.0031])
bank robber   tensor([ 2.7359, -2.5577, -1.3094,  0.6797,  1.6633])
river bank    tensor([ 1.5266, -0.8895, -0.5152, -0.9298,  2.8334])


In [39]:
from scipy.spatial.distance import cosine

# Vectors for the word "bank" in its three contexts (token indices 6, 10, 19).
bank_in_vault = token_vecs_sum[6]
bank_in_robber = token_vecs_sum[10]
bank_in_river = token_vecs_sum[19]

# "bank robber" vs "bank vault": same meaning of "bank".
same_bank = 1 - cosine(bank_in_robber, bank_in_vault)

# "bank robber" vs "river bank": different meanings of "bank".
diff_bank = 1 - cosine(bank_in_robber, bank_in_river)

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.94
Vector similarity for *different* meanings:  0.69
