Import the necessary libraries.

In [3]:
# Import libraries for keywords
from transformers import BertModel, BertConfig, BertTokenizer
import numpy as np
import torch
import os
from create_input_lists import create_input_lists_from_csv, lemmatize

Obtain a list of the lemmatized texts.

In [9]:
# Folder containing the labelled CSV exports; both input files sit directly inside it.
input_folder = 'OneDrive-2020-12-04/intro_bio (with periods)_labelled'
input1_path, input2_path = (
    os.path.join(input_folder, csv_name)
    for csv_name in ('assessments.csv', 'paragraphs.csv')
)

# The helper returns six values, unpacked in this order. Judging by the names
# these are ids / lemmatized texts / original texts for the assessments and
# then for the paragraphs -- confirm against create_input_lists.py.
(
    text_ids_assess,
    lemmatized_texts_assess,
    original_texts_assess,
    text_ids_para,
    lemmatized_texts_para,
    orig_texts_para,
) = create_input_lists_from_csv(input1_path, input2_path)

# Sanity check: how many lemmatized assessment texts were loaded.
print(len(lemmatized_texts_assess))

1095


Load the pre-trained BERT model and its matching tokenizer.

In [10]:
# Checkpoint identifier shared by the tokenizer and the model so the two stay in sync.
model_path = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_path)
# output_hidden_states=True is requested so that later cells can read the
# per-layer hidden states from the model outputs.
model = BertModel.from_pretrained(
    model_path,
    output_hidden_states=True,
)

Tokenize the first lemmatized text, run it through BERT, and inspect the shapes of the returned embeddings and hidden states.

In [43]:
# Use the first lemmatized assessment text as the worked example.
text = lemmatized_texts_assess[0]
print(text, '\n')

# Show how the tokenizer splits the text into sub-word tokens.
tokens = tokenizer.tokenize(text)
print(tokens, '\n')

print(len(tokens))

# Encode the text as PyTorch tensors, truncating anything beyond 256 tokens.
inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors='pt')
print(inputs)

# Inference only: disable autograd so no computation graph is built or kept
# alive for the forward pass (the results are only ever converted to NumPy).
with torch.no_grad():
    outputs = model(**inputs)  # outputs: [last_hidden_state, pooler_output, hidden_states]

# .detach() is kept so this line behaves the same whether or not grad tracking is on.
last_hidden_state = outputs[0].detach().numpy()
print(last_hidden_state.shape)

# Third output element: one hidden-state tensor per layer (available because
# the model was loaded with output_hidden_states=True).
hidden_states = outputs[2]

# Walk down the nesting one level at a time: layer -> batch -> token -> hidden unit.
print("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
layer_i = 0
print("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0
print("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0
print("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))



water abiotic water substance require biotic organism water abiotic meet condition outline evaluate item accord characteristic require biotic living thing help know bacteria compose cell virus small simpler cell water carbon dioxide tiny abiotic molecule glucose big molecule nonliving virus edge life compose cell depend cell existence truly alive bacteria cellular life form . 

['water', 'ab', '##iot', '##ic', 'water', 'substance', 'require', 'bio', '##tic', 'organism', 'water', 'ab', '##iot', '##ic', 'meet', 'condition', 'outline', 'evaluate', 'item', 'accord', 'characteristic', 'require', 'bio', '##tic', 'living', 'thing', 'help', 'know', 'bacteria', 'compose', 'cell', 'virus', 'small', 'simpler', 'cell', 'water', 'carbon', 'dioxide', 'tiny', 'ab', '##iot', '##ic', 'molecule', 'glucose', 'big', 'molecule', 'non', '##li', '##ving', 'virus', 'edge', 'life', 'compose', 'cell', 'depend', 'cell', 'existence', 'truly', 'alive', 'bacteria', 'cellular', 'life', 'form', '.'] 

64
{'input_ids'

Examine the embedding being used in SMART.

In [34]:
# From the first model output, keep only token position 0 of every sequence in
# the batch (presumably the special [CLS] token added by the tokenizer -- confirm),
# and convert the result to a NumPy array.
embedding = outputs[0].detach()[:, 0].numpy()
print(embedding.shape)

(1, 768)
[[-2.41987765e-01  5.93203083e-02  7.34214246e-01 -3.91524345e-01
  -1.24290600e-01 -5.38780391e-01  4.42330062e-01 -2.40684584e-01
   1.01756684e-01 -3.70942205e-01  6.36059344e-01 -3.51173699e-01
  -3.94592106e-01  4.97204870e-01 -5.17419159e-01 -3.04590981e-03
   2.52093434e-01  7.99663782e-01  2.50117958e-01 -8.19038451e-02
  -1.62877560e-01 -4.11117107e-01  3.35963517e-01 -1.93929225e-01
  -1.01840436e-01 -5.37424386e-01  1.40398324e-01  1.78092107e-01
   3.19815964e-01 -3.82299647e-02  8.03322345e-02  2.42981255e-01
  -2.47283921e-01 -4.32608545e-01  1.14466958e-01 -7.29863048e-01
  -2.20453274e-02  1.62313581e-01  5.39587326e-02  3.91507261e-02
  -2.93593526e-01 -5.29166758e-02  4.24015634e-02  2.32985079e-01
  -6.93243921e-01 -1.95412729e-02 -3.79263711e+00  5.39484441e-01
  -7.69294322e-01 -1.47286370e-01  6.19338751e-01 -5.46518028e-01
   1.75099805e-01  8.12982202e-01  2.80176830e-02  7.43252397e-01
   4.60775346e-02 -1.94843754e-01  1.49333254e-01  3.59011978e-01
 

Examine the embedding at the first token position (index 0), taken from the last hidden layer's output, and check whether it equals the SMART embedding above.

In [39]:
# hidden_states has one entry per layer, so index -1 selects the final layer
# whatever the model depth (the previous hard-coded 12 only fit a 12-layer BERT).
# The remaining [0][0] picks the first sequence in the batch and its first token.
embedding_first_token = hidden_states[-1][0][0].detach().numpy()
print(embedding_first_token.shape)

# Element-wise equality; the (1, 768) SMART embedding broadcasts against the
# (768,) vector, so `comparison` is a boolean array and .all() reduces it.
comparison = embedding == embedding_first_token
print('Equality of SMART version and embedding for first_token:', comparison.all())

(768,)
[-2.41987765e-01  5.93203083e-02  7.34214246e-01 -3.91524345e-01
 -1.24290600e-01 -5.38780391e-01  4.42330062e-01 -2.40684584e-01
  1.01756684e-01 -3.70942205e-01  6.36059344e-01 -3.51173699e-01
 -3.94592106e-01  4.97204870e-01 -5.17419159e-01 -3.04590981e-03
  2.52093434e-01  7.99663782e-01  2.50117958e-01 -8.19038451e-02
 -1.62877560e-01 -4.11117107e-01  3.35963517e-01 -1.93929225e-01
 -1.01840436e-01 -5.37424386e-01  1.40398324e-01  1.78092107e-01
  3.19815964e-01 -3.82299647e-02  8.03322345e-02  2.42981255e-01
 -2.47283921e-01 -4.32608545e-01  1.14466958e-01 -7.29863048e-01
 -2.20453274e-02  1.62313581e-01  5.39587326e-02  3.91507261e-02
 -2.93593526e-01 -5.29166758e-02  4.24015634e-02  2.32985079e-01
 -6.93243921e-01 -1.95412729e-02 -3.79263711e+00  5.39484441e-01
 -7.69294322e-01 -1.47286370e-01  6.19338751e-01 -5.46518028e-01
  1.75099805e-01  8.12982202e-01  2.80176830e-02  7.43252397e-01
  4.60775346e-02 -1.94843754e-01  1.49333254e-01  3.59011978e-01
 -3.66777360e-01 -

In [46]:
# Three small integer vectors used to illustrate element-wise averaging.
vectors = [
    np.array([1, 1, 1]),
    np.array([2, 3, 4]),
    np.array([5, 7, 9]),
]
a, b, c = vectors

# Sum the vectors position by position, then divide by how many there are
# to obtain their element-wise mean.
added = np.sum(vectors, axis=0)
averaged = added / len(vectors)

# Print the sum first, then the mean, one per line.
print(added, averaged, sep='\n')

[ 8 11 14]
[2.66666667 3.66666667 4.66666667]
