In [None]:
import pickle

# Read the pickled dataset (extract the .pkl from the .zip archive first).
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
with open("sampled_SemCore+OMSTI.pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)

# Turn the loaded mapping into a list of (key, value) pairs so that an entry
# can be selected by its integer position.
data_list = [*data.items()]

print("Number of words in the dataset: ", len(data_list))

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Fetch the pretrained 'bert-base-uncased' tokenizer and encoder.
# output_hidden_states=True is what makes `outputs.hidden_states` available
# to the cells further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

In [None]:
# The word index below is the only value that has to be edited.
target_word_index = 2

# Layout of data_list:
#   data_list[word index]            -> (word, info)
#   info[meaning key][sentence No]   -> dict with the fields
#       'sentence'   = the whole sentence
#       'lemmatized' = lemmatization of the words in the sentence
#       'word'       = target word
#       'pos'        = NOUN/VERB/ADJ...
#       'id'         = some id
#       'sent_pos'   = position of the target word in the lemmatized sentence
target_word = data_list[target_word_index][0]
keys = [*data_list[target_word_index][1].keys()]  # meaning keys of the chosen word

print(target_word)

In [None]:
# Label every sentence with the 1-based ordinal of the meaning key it is
# stored under: all sentences of the first key get 1, of the second key 2, ...
word_meaning_header = []
for meaning_no, k in enumerate(keys, start=1):
    sentence_count = len(data_list[target_word_index][1][k])
    word_meaning_header.extend([meaning_no] * sentence_count)

# Show the full record of the selected word as the cell output.
data_list[target_word_index][1]

In [None]:
# Numeric code used for each part-of-speech tag in the output files.
part_of_speech = {'NOUN': 1, 'ADJ': 2, 'VERB': 3}

# One code per sentence, walking the meaning keys in the same order as the
# other per-sentence lists. A 'pos' value missing from the table above
# raises KeyError.
word_part_of_speech = [
    part_of_speech[sen['pos']]
    for k in keys
    for sen in data_list[target_word_index][1][k]
]

In [None]:
# Flatten the per-meaning groups into one sequence of sentence records,
# keeping the order of `keys`, then split out the two text fields.
sentence_entries = [
    entry
    for k in keys
    for entry in data_list[target_word_index][1][k]
]
only_sentence = [entry['sentence'] for entry in sentence_entries]  # raw sentences
only_lem = [entry['lemmatized'] for entry in sentence_entries]  # lemmatized sentences

only_sentence

In [None]:
# Encode all sentences as a single padded batch of PyTorch tensors.
tokenizer_options = {"return_tensors": "pt", "padding": True, "truncation": True}
inputs = tokenizer(text=only_sentence, **tokenizer_options)

# Forward pass only -- gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Later cells index this as hidden_states[layer][sentence][token].
hidden_states = outputs.hidden_states

In [None]:
from word_forms.word_forms import get_word_forms

# Hidden state of the target word's token, indexed as [layer][sentence].
target_word_hidden_states = []

# get all the forms that the target word can take in the sentence
# similarity threshold is set to 0.8 to get only the most similar forms
# we couldn't find a better way to get the forms of the target word
# so we are using this library to get the forms, but they might have to
# be manually filtered later
_forms = get_word_forms(target_word, 0.8)
forms = set()
for key in _forms:
    forms = forms.union(_forms[key])

print(forms)

# Id of the first token of every form. Index 1 skips the special token the
# tokenizer puts at the start of each encoding, so a form that is split into
# several word pieces is matched on its first piece only.
# The forms are sorted because iteration order of a set of strings can differ
# between interpreter runs, which would make the chosen match non-reproducible
# when a sentence contains more than one form.
form_token_ids = [int(tokenizer(form)['input_ids'][1]) for form in sorted(forms)]

# Locate the target word once per sentence: the position does not depend on
# the layer. The lookup runs on the very token ids that were fed to the model
# (`inputs`), so every position is a valid index into `hidden_states`.
target_word_positions = []
unmatched_sentences = []
for i, sentence_token_ids in enumerate(inputs['input_ids'].tolist()):
    position = None
    for form_token_id in form_token_ids:
        if form_token_id in sentence_token_ids:
            # first occurrence of the first form (in sorted order) that is present
            position = sentence_token_ids.index(form_token_id)
            break
    if position is None:
        unmatched_sentences.append(i)
    target_word_positions.append(position)

# Fail loudly instead of silently reusing the position found for a previous
# sentence: the per-sentence lists must stay aligned with the headers, so a
# sentence can neither be skipped nor given an arbitrary token.
if unmatched_sentences:
    raise ValueError(
        f"No form of '{target_word}' {sorted(forms)} was found among the tokens of "
        f"sentence(s) at index {unmatched_sentences}; adjust the forms manually."
    )

for layer in range(len(hidden_states)):
    layer_hidden_states = [
        hidden_states[layer][i][position]
        for i, position in enumerate(target_word_positions)
    ]
    target_word_hidden_states.append(layer_hidden_states)

In [None]:
def cosine_dist(v1, v2):
    """Return the cosine distance ``1 - cos(v1, v2)`` between two 1-D tensors.

    Args:
        v1: 1-D tensor.
        v2: 1-D tensor with the same number of elements as ``v1``.

    Returns:
        A 0-dim tensor: 0 for vectors pointing the same way, 1 for orthogonal
        vectors, 2 for opposite vectors. There is no guard against zero-norm
        inputs; those make the division yield ``nan``.
    """
    norm_product = torch.norm(v1) * torch.norm(v2)
    cosine_similarity = torch.dot(v1, v2) / norm_product
    return 1 - cosine_similarity

# signatures[layer][s] is a 1-D tensor with the cosine distance from
# sentence s's target-word vector to the target-word vector of every sentence
# (itself included) in that layer.
signatures = []
for layer in range(13):
    layer_vectors = target_word_hidden_states[layer]
    layer_signatures = []
    for anchor in layer_vectors:
        # Seed the concatenation with an empty tensor, then append one
        # single-element tensor per compared sentence.
        pieces = [torch.tensor([])]
        pieces.extend(cosine_dist(anchor, other).unsqueeze(0) for other in layer_vectors)
        layer_signatures.append(torch.cat(pieces))
    signatures.append(layer_signatures)

In [None]:
import os

# Write one text file per layer. File layout:
#   line 1: matrix dimension (number of sentences)
#   line 2: meaning label of every sentence, space separated
#   line 3: part-of-speech code of every sentence, space separated
#   then one row per sentence with the pairwise signature distances
for layer in range(13):
    # Build the path with os.path.join rather than literal backslashes: the
    # sequences "\{" and "\l" are invalid string escapes, and on non-Windows
    # systems a backslash is not a path separator, so dirname() would be empty
    # and os.makedirs('') would fail.
    filename = os.path.join("signature_dists", target_word, f"layer_{layer}.in")
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w") as f:
        print("Started layer " + str(layer) + "\n")
        # matrix dimension
        f.write(str(len(word_meaning_header)))
        f.write("\n")

        # word meanings
        f.write("".join("{} ".format(m) for m in word_meaning_header))
        f.write("\n")

        # word as part of speech
        f.write("".join("{} ".format(p) for p in word_part_of_speech))
        f.write("\n")

        # Euclidean distance between every pair of signatures, scaled by
        # 1/sqrt(signature length).
        for sig1 in signatures[layer]:
            scale = 1 / len(sig1) ** 0.5
            row = "".join(
                "{:.5f} ".format(scale * torch.norm(sig1 - sig2))
                for sig2 in signatures[layer]
            )
            f.write(row)
            f.write("\n")
    print("Finished with layer " + str(layer) + "\n")