In [None]:
import os
import pickle
# The corpus must be available as an unpacked .pkl file; the .zip archive
# cannot be read directly.
# NOTE(security): pickle.load executes arbitrary code embedded in the file --
# only open pickles that come from a trusted source.
with open("sampled_SemCore+OMSTI.pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)
data_list = [*data.items()]

# YOU ONLY NEED TO CHANGE THE WORD INDEX HERE
target_word_index = 0

# Show which word the index selects.
print(data_list[target_word_index][0])

# Layout of data_list:
#   data_list[word index][0]  -> the word itself
#   data_list[word index][1]  -> mapping: meaning key -> list of sentence records
#   data_list[word index][1][meaning key][sentence No] is a record with fields:
#     'sentence'   = the whole sentence
#     'lemmatized' = lemmatization of the words in the sentence
#     'word'       = target word
#     'pos'        = NOUN/VERB/ADJ...
#     'id'         = some id
#     'sent_pos'   = position of the target word in the lemmatized sentence
# `keys` holds every meaning key of the selected word, in mapping order.
keys = list(data_list[target_word_index][1])

print(keys)

In [None]:
# One integer sense label per sentence: every sentence listed under the n-th
# meaning key (counting from 1, in `keys` order) receives the label n.
word_meaning_header = []
for sense_label, sense_key in enumerate(keys, start=1):
    sense_records = data_list[target_word_index][1][sense_key]
    word_meaning_header.extend([sense_label] * len(sense_records))

# Display everything stored for the selected word.
data_list[target_word_index][1]

In [None]:
# Numeric codes for the part-of-speech tags.
# NOTE(review): only these three tags are mapped; a record carrying any other
# 'pos' value raises KeyError below -- confirm the data never contains others.
part_of_speech = {'NOUN': 1, 'ADJ': 2, 'VERB': 3}

# One POS code per sentence record, walking the senses in `keys` order so the
# result lines up element-for-element with the per-sentence sense labels.
word_part_of_speech = [
    part_of_speech[record['pos']]
    for sense_key in keys
    for record in data_list[target_word_index][1][sense_key]
]

In [None]:
# Number of POS codes collected (one per sentence record).
len(word_part_of_speech)

In [None]:
# Peek at one raw sentence record: the first sentence of the first sense of the
# selected word. The sense key is taken from `keys` rather than hard-coded, so
# this cell keeps working when target_word_index points at a different word.
data_list[target_word_index][1][keys[0]][0]

In [None]:
# Flatten the per-sense record lists into one list, senses in `keys` order.
# Each record is a dictionary with the sentence, its lemmatization, the target
# word, and so on.
sentences_lem = [
    record
    for sense_key in keys
    for record in data_list[target_word_index][1][sense_key]
]
# Only the raw sentences, parallel to sentences_lem.
only_sentence = [record['sentence'] for record in sentences_lem]
# Only the lemmatized token lists, parallel to sentences_lem.
only_lem = [record['lemmatized'] for record in sentences_lem]

only_sentence

In [None]:
# Total number of sentence records across all senses of the selected word.
len(sentences_lem)

In [None]:
# Peek at the 'lemmatized' entry of the first sentence record.
only_lem[0]

In [None]:
# For every sentence, keep only the first field of each lemmatized entry
# (presumably the lemma string itself -- confirm against the data), giving a
# list of word lists.
lem_words_in_sentences = [
    [entry[0] for entry in lemmatized_entries]
    for lemmatized_entries in only_lem
]

lem_words_in_sentences

In [None]:
# Rebuild each sentence from its lemmatized words (space-separated) and echo
# every rebuilt sentence.
lemmatized_sentences = [' '.join(words) for words in lem_words_in_sentences]
for rebuilt_sentence in lemmatized_sentences:
    print(rebuilt_sentence)

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
# output_hidden_states=True makes the model return the hidden states of every
# layer, not just the last one.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)


In [None]:
# Tokenize all sentences as one padded (and, if necessary, truncated) batch and
# run them through BERT once. Gradients are not needed for feature extraction,
# so the forward pass runs under torch.no_grad().
inputs = tokenizer(text=only_sentence, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)

# One tensor per layer output; each is indexed below as [sentence][token].
hidden_states = outputs.hidden_states

len(hidden_states)

In [None]:
# Spot-check a few sentences: the raw text, where the target word sits in the
# lemmatized sentence, and how the tokenizer splits the raw text.
for sample_index in (0, 4):
    print(only_sentence[sample_index])
    print(sentences_lem[sample_index]['sent_pos'])
    print(tokenizer(text=only_sentence[sample_index], return_tensors="pt"))

    # Blank line between samples; Python's print() emits nothing after the last
    # statement of an iteration, so the separator is produced explicitly.
    print()

# For this sample, show the lemmatized entry found at 'sent_pos' instead of the
# position itself.
print(only_sentence[5])
print(only_lem[5][sentences_lem[5]['sent_pos']])
print(tokenizer(text=only_sentence[5], return_tensors="pt"))

In [None]:
# Surface forms of the target word, tried in this priority order.
# NOTE(review): these forms are specific to the word "work"; update the list
# whenever target_word_index selects a different word.
target_word_forms = ["work", "works", "working", "worked"]

# Id of the first word piece of each form (index 0 is the [CLS] token).
form_token_ids = [tokenizer(form)["input_ids"][1] for form in target_word_forms]

# Locate the target token once per sentence. The search runs on the very ids
# that were fed to the model (`inputs`), so a position found here always
# exists in `hidden_states`, even for sentences that were truncated.
target_positions = []
sentences_without_target = []
for i in range(len(only_sentence)):
    sentence_token_ids = inputs["input_ids"][i].tolist()
    position = None
    for form_id in form_token_ids:
        if form_id in sentence_token_ids:
            # First occurrence of the highest-priority form present.
            position = sentence_token_ids.index(form_id)
            break
    if position is None:
        sentences_without_target.append(i)
    target_positions.append(position)

# Fail loudly instead of silently reusing the previous sentence's position,
# which would attach the embedding of an unrelated token to this sentence.
if sentences_without_target:
    raise ValueError(
        "None of the forms {} was found as a token in sentence(s) {}".format(
            target_word_forms, sentences_without_target
        )
    )

# target_word_hidden_states[layer][sentence] is the hidden-state vector of the
# target token of that sentence at that layer.
target_word_hidden_states = [
    [layer_states[i][position] for i, position in enumerate(target_positions)]
    for layer_states in hidden_states
]

In [None]:
# Target-token vector of the first sentence, taken from the first entry of hidden_states.
target_word_hidden_states[0][0]

In [None]:
def cosine_dist(v1, v2):
    """Return the cosine distance ``1 - cos(v1, v2)`` of two 1-D tensors.

    Args:
        v1: 1-D tensor.
        v2: 1-D tensor with the same number of elements as ``v1``.

    Returns:
        A 0-dim tensor: 0 for vectors pointing the same way, 1 for orthogonal
        vectors, 2 for opposite vectors. A zero-length vector makes the
        denominator zero, so the result is NaN.
    """
    magnitude_product = v1.norm() * v2.norm()
    cosine_similarity = v1.dot(v2) / magnitude_product
    return 1 - cosine_similarity

# signatures[layer][i] is the "signature" of sentence i at that layer: a 1-D
# tensor holding the cosine distance from sentence i's target-word vector to
# the target-word vector of every sentence (itself included).
signatures = []
for layer_vectors in target_word_hidden_states:
    if len(layer_vectors) == 0:
        # No sentences: keep an empty signature list for this layer.
        signatures.append([])
        continue

    # Stack the per-sentence vectors into one (n_sentences, hidden_size) matrix
    # and compute all pairwise cosine distances with a single matrix product,
    # instead of n^2 Python-level calls and repeated torch.cat re-allocations.
    embeddings = torch.stack(list(layer_vectors))
    norms = embeddings.norm(dim=1, keepdim=True)
    cosine_distances = 1 - (embeddings @ embeddings.T) / (norms * norms.T)

    # Keep the original container shape: a list with one 1-D tensor per sentence.
    signatures.append(list(cosine_distances.unbind(0)))

In [None]:
# Signature (cosine distances to every sentence) of the first sentence at layer index 0.
signatures[0][0]

In [None]:
# Scratch check of the "{} " format string that the file-writing cell uses for each value.
print("{} ".format(1))

In [None]:
# Write one text file per layer. File layout:
#   line 1      : number of sentences n
#   line 2      : sense label of every sentence
#   line 3      : part-of-speech code of every sentence
#   next n lines: n x n matrix; entry (i, j) is the Euclidean distance between
#                 the signatures of sentences i and j, scaled by 1/sqrt(n)
# Every value is followed by a single space.
# NOTE(review): the directory name is tied to the word "work"; rename it when
# target_word_index selects a different word.
output_dir = "work_embeddings"
# Create the directory if needed so that open(..., "w") cannot fail on it.
os.makedirs(output_dir, exist_ok=True)

for layer in range(len(signatures)):
    # os.path.join picks the right separator on every platform (a literal
    # backslash would become part of the file name on Linux/macOS).
    layer_path = os.path.join(output_dir, f"layer_r_{layer}.in")
    with open(layer_path, "w") as f:
        print("Started layer " + str(layer) + "\n")
        # matrix dimension
        f.write(str(len(word_meaning_header)))
        f.write("\n")

        # word meanings
        f.write("".join("{} ".format(m) for m in word_meaning_header))
        f.write("\n")

        # word as part of speech
        f.write("".join("{} ".format(p) for p in word_part_of_speech))
        f.write("\n")

        # distance matrix between signatures, one vectorised row at a time
        if len(signatures[layer]) > 0:
            signature_matrix = torch.stack(list(signatures[layer]))
            scale = 1 / signature_matrix.shape[1] ** 0.5
            for sig1 in signature_matrix:
                row = (torch.norm(signature_matrix - sig1, dim=1) * scale).tolist()
                f.write("".join("{:.5f} ".format(d) for d in row))
                f.write("\n")
    print("Finished with layer " + str(layer) + "\n")