In [119]:
import pickle
from tqdm import tqdm
from word_forms.word_forms import get_word_forms
from transformers import BertTokenizer, BertModel
import torch
import os
import numpy as np
from scipy.spatial import distance

In [None]:
# Read the sampled corpus from disk.
# The .pkl file has to be extracted from the .zip archive beforehand.
# NOTE(review): pickle.load can run arbitrary code, so only open a trusted copy of this file.
with open("sampled_SemCore+OMSTI.pkl", "rb") as pkl_file:
    data = pickle.load(pkl_file)

# Turn the mapping into a list of (word, info) pairs so a word can be selected by index.
data_list = list(data.items())

print("Number of words in the dataset: ", len(data_list))

In [121]:
# Pretrained BERT-base-uncased: the tokenizer and the encoder come from the same checkpoint.
# output_hidden_states=True makes the model return the hidden states of all layers,
# not only the last one.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint, output_hidden_states=True)

In [None]:
# YOU ONLY NEED TO CHANGE THE WORD INDEX HERE
target_word_index = 0

# Layout of data_list:
#   data_list[word index]            -> (word, info) pair
#   info[meaning key][sentence No]   -> record with the fields below
#     'sentence'   : the whole sentence
#     'lemmatized' : lemmatization of the words in the sentence
#     'word'       : the target word
#     'pos'        : part of speech (NOUN/VERB/ADJ...)
#     'id'         : some id
#     'sent_pos'   : position of the target word in the lemmatized sentence
target_word, word_info = data_list[target_word_index]

# Meaning keys of the selected word; their order fixes the sentence order used downstream.
keys = list(word_info.keys())

print(target_word)

In [None]:
# One meaning label per sentence: every sentence stored under the n-th meaning key
# receives the label n (labels start at 1 and follow the order of `keys`).
word_meaning_header = []
for meaning_no, k in enumerate(keys, start=1):
    sentences_for_meaning = data_list[target_word_index][1][k]
    word_meaning_header.extend([meaning_no] * len(sentences_for_meaning))

# Show the raw entry of the selected word for inspection.
data_list[target_word_index][1]

In [210]:
# Numeric code for each part-of-speech tag.
# A tag that is missing from this table raises KeyError in the lookup below.
part_of_speech = {'NOUN': 1, 'ADJ': 2, 'VERB': 3}

# One part-of-speech code per sentence, in the same (meaning key, sentence) order
# as the other per-sentence lists.
word_part_of_speech = [
    part_of_speech[sen['pos']]
    for k in keys
    for sen in data_list[target_word_index][1][k]
]

In [None]:
# Flatten the per-meaning sentence records into one list, keeping the order of `keys`.
sentence_records = [
    record
    for k in keys
    for record in data_list[target_word_index][1][k]
]

# Raw sentences and their lemmatized counterparts, index-aligned with each other.
only_sentence = [record['sentence'] for record in sentence_records]
only_lem = [record['lemmatized'] for record in sentence_records]

only_sentence

In [212]:
# Encode all sentences as one padded batch and run a single forward pass through BERT.
# Reset first, so the name never keeps states from an earlier run if the pass below fails.
hidden_states = []

inputs = tokenizer(
    text=only_sentence,
    padding=True,       # pad every sentence to the longest one in the batch
    truncation=True,    # cut sentences that exceed the model's maximum length
    return_tensors="pt",
)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

# Per-layer hidden states of the whole batch; the later cells index this as
# hidden_states[layer][sentence][token] and expect 13 layers (range(13)).
hidden_states = outputs.hidden_states

In [None]:
# Collect every surface form the target word can take inside a sentence.
# The word_forms library is used because no better way of obtaining the forms was found;
# the second argument (0.99) is its similarity threshold, kept high so that only
# the most similar forms are returned.
_forms = get_word_forms(target_word, 0.99)

# The library groups the forms under several keys; merge all groups into one flat set.
forms = set().union(*_forms.values())

print(forms)

In [None]:
# Extract the hidden state of the target word from every layer, for every sentence.
#
# Result: target_word_hidden_states[layer][sentence] is the hidden-state vector of the
# target word's (first) token in that sentence at that layer.
#
# The token position is searched in the batch that was actually fed to the model
# (`inputs['input_ids']`), so it always lines up with `hidden_states`, including for
# padded or truncated sentences. Positions do not depend on the layer, so they are
# computed once instead of once per layer.

# Id of the first word-piece of every candidate form (index 0 is the [CLS] token).
form_token_ids = [
    int(tokenizer(form, return_tensors="pt")['input_ids'][0][1])
    for form in forms
]

target_word_positions = []
unmatched_sentences = []
for i, sentence_token_ids in enumerate(tqdm(inputs['input_ids'].tolist(), desc="Target positions")):
    for form_token_id in form_token_ids:
        if form_token_id in sentence_token_ids:
            # first occurrence of the first matching form
            target_word_positions.append(sentence_token_ids.index(form_token_id))
            break
    else:
        unmatched_sentences.append(i)

# Fail loudly instead of silently reusing the position found for a previous sentence,
# which would attach a wrong embedding to this sentence.
if unmatched_sentences:
    raise ValueError(
        f"No form of '{target_word}' was found among the tokens of sentence(s) "
        f"{unmatched_sentences}; extend `forms` or drop these sentences."
    )

target_word_hidden_states = []
for layer in tqdm(range(len(hidden_states)), desc="Layers"):
    layer_hidden_states = [
        hidden_states[layer][i][position]
        for i, position in enumerate(target_word_positions)
    ]
    target_word_hidden_states.append(layer_hidden_states)

In [None]:
# Calculate signatures of embeddings for each layer


def _cosine_distance_signature(layer_vectors):
    """Return the pairwise cosine-distance matrix of one layer's target-word vectors.

    layer_vectors: sequence of 1-D torch tensors, one per sentence.
    Returns a square array whose entry (a, b) is 1 - cos(vector_a, vector_b).
    """
    embeddings = np.asarray([vec.numpy() for vec in layer_vectors])
    gram = embeddings @ embeddings.T
    # The diagonal of the Gram matrix holds the squared vector lengths.
    lengths = np.sqrt(np.diagonal(gram))
    cosine_similarity = gram / np.outer(lengths, lengths)
    return (np.ones(gram.shape) - cosine_similarity).T


# One signature matrix per layer; row s is the distance profile of sentence s.
signatures = [
    _cosine_distance_signature(target_word_hidden_states[layer])
    for layer in range(13)
]

signatures[0].shape

In [None]:
# Write one input file per layer to ../input/<target word>/layer_<n>.in
#
# File layout:
#   line 1 : matrix dimension (number of sentences)
#   line 2 : meaning label of every sentence
#   line 3 : part-of-speech code of every sentence
#   rest   : signature distance matrix, one row per line

# Build the path with os.path.join rather than a backslash string literal: the literal
# only worked on Windows and contained invalid escape sequences (\i, \l).
output_dir = os.path.join("..", "input", target_word)
os.makedirs(output_dir, exist_ok=True)

for layer in tqdm(range(13)):
    filename = os.path.join(output_dir, f"layer_{layer}.in")
    with open(filename, "w") as f:
        # matrix dimension
        f.write(str(len(word_meaning_header)))
        f.write("\n")

        # word meanings
        f.write(" ".join(map(str, word_meaning_header)) + "\n")

        # word as part of speech
        f.write(" ".join(map(str, word_part_of_speech)) + "\n")

        n = len(signatures[layer])

        # create the signature distance matrix: Euclidean distance between every pair
        # of signature rows, scaled by 1/sqrt(n)
        signature_matrix = distance.cdist(XA=signatures[layer], XB=signatures[layer], metric='euclidean')
        signature_matrix = signature_matrix * (1 / n ** 0.5)

        np.savetxt(f, signature_matrix, fmt="%.5f")