In [119]:
import pickle
from tqdm import tqdm
from word_forms.word_forms import get_word_forms
from transformers import BertTokenizer, BertModel
import torch
import os
import numpy as np

In [120]:
# Load the data
# make sure to get the .pkl from the .zip file
with open("sampled_SemCore+OMSTI.pkl", "rb") as f:
    data = pickle.load(f)
data_list = list(data.items())

print("Number of words in the dataset: ", len(data_list))

Number of words in the dataset:  130


In [121]:
# Load the BERT-base-uncased model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

In [217]:
# YOU ONLY NEED TO CHANGE THE WORD INDEX HERE 
target_word_index = 30

# Structure of data_list
# data_list[word index][0-word, 1-all the info][meaning key][sentence No]['sentence'/'lematized'/'word'/'pos'/'id'/'sent_pos']
# 'sentence' = the whole sentence
# 'lematized' = lemmatization of words in sentence
# 'word' = target word
# 'pos' = NOUN/VERB/ADJ...
# 'id' = some id
# 'sent_pos' = position of target word in lemmatized sentence
keys = list(data_list[target_word_index][1].keys())

target_word = data_list[target_word_index][0]
print(target_word)

figure


In [209]:
word_meaning_header = []
i = 1
for k in keys:
    word_meaning_header = word_meaning_header + [i]*len(data_list[target_word_index][1][k])

    i += 1

data_list[target_word_index][1]

{'field%1:09:00::': [{'sentence': 'Stresses with deep concern that the HIV/AIDS emergency, with its devastating scale and impact, requires urgent actions in all fields and at all levels;',
   'lemmatized': [['stress', 'NOUN'],
    ['with', 'ADP'],
    ['deep', 'ADJ'],
    ['concern', 'NOUN'],
    ['that', 'ADP'],
    ['the', 'DET'],
    ['hiv/aids', 'NOUN'],
    ['emergency', 'NOUN'],
    [',', '.'],
    ['with', 'ADP'],
    ['its', 'PRON'],
    ['devastate', 'VERB'],
    ['scale', 'NOUN'],
    ['and', 'CONJ'],
    ['impact', 'NOUN'],
    [',', '.'],
    ['require', 'VERB'],
    ['urgent', 'ADJ'],
    ['action', 'NOUN'],
    ['in', 'ADP'],
    ['all', 'DET'],
    ['field', 'NOUN'],
    ['and', 'CONJ'],
    ['at', 'ADP'],
    ['all', 'DET'],
    ['level', 'NOUN'],
    [';', '.']],
   'word': 'field',
   'pos': 'NOUN',
   'id': 'd000000.s000503.t000000',
   'sent_pos': 21},
  {'sentence': 'Similarly, the Assembly should reassert its role in the field of disarmament and non-proliferation,

In [210]:
part_of_speech = {'NOUN': 1, 'ADJ': 2, 'VERB': 3}

word_part_of_speech = []
for k in keys:
    for sen in data_list[target_word_index][1][k]:
        word_part_of_speech = word_part_of_speech + [part_of_speech[sen['pos']]]

In [211]:
only_sentence = []  # contains only the sentences
only_lem = []  # contains only the lemmatized sentences
for k in keys:
    for i in data_list[target_word_index][1][k]:
        only_sentence.append(i['sentence'])
        only_lem.append(i['lemmatized'])

only_sentence

['Stresses with deep concern that the HIV/AIDS emergency, with its devastating scale and impact, requires urgent actions in all fields and at all levels;',
 'Similarly, the Assembly should reassert its role in the field of disarmament and non-proliferation, replacing the Security Council Committee established pursuant to Council resolution 1540 (2004) with a comparable Assembly committee.',
 'At the same time, Member States and other partners must do their part and together take concrete steps to achieving more global coherence in the rule of law field.',
 'Functional commissions also continued to cooperate with relevant United Nations funds and programs, specialized agencies and the Bretton Woods institutions, allowing them to tap into the lessons learned and examples of good practices from the field.',
 "I am confident that Mr. Al-Bader's competence and valuable experience in the field of disarmament and international security will greatly contribute to the efficient and productive w

In [212]:
# tokenize the list of sentences and get the hidden states
hidden_states = []

inputs = tokenizer(text=only_sentence, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)

hidden_states = outputs.hidden_states

In [None]:
# get all the forms that the target word can take in the sentence
# similarity threshold is set to 0.99 to get only the most similar forms
# we couldn't find a better way to get the forms of the target word
# so we are using this library to get the forms
_forms = get_word_forms(target_word, 0.99)
forms = set()
for key in _forms:
    forms = forms.union(_forms[key])

print(forms)

{'field', 'fields', 'fielded', 'fielding'}


In [214]:
from tqdm import tqdm

target_word_hidden_states = []
for layer in tqdm(range(13), desc="Layers"):
    layer_hidden_states = []
    for i, sentence_data in tqdm(enumerate(only_sentence), desc="Embeddings layer " + str(layer)):
        for form in forms:
            try:
                # get the position of the target word's token in the tokenized sentence
                # this is needed to accurately get the hidden states of the target word
                # as the tokenized sentence usually contains extra tokens like [CLS], [SEP], etc.
                target_word_position = list(tokenizer(text=sentence_data, return_tensors="pt")['input_ids'][0]).index(int(tokenizer(form, return_tensors="pt")['input_ids'][0][1]))
                break
            except:
                pass
        sentence_hidden_states = hidden_states[layer][i][target_word_position]
        layer_hidden_states.append(sentence_hidden_states)

    target_word_hidden_states.append(layer_hidden_states)

Embeddings layer 0: 707it [00:01, 538.05it/s]]
Embeddings layer 1: 707it [00:01, 555.78it/s]1.32s/it]
Embeddings layer 2: 707it [00:01, 552.65it/s]1.29s/it]
Embeddings layer 3: 707it [00:01, 562.33it/s]1.29s/it]
Embeddings layer 4: 707it [00:01, 552.77it/s]1.28s/it]
Embeddings layer 5: 707it [00:01, 553.12it/s]1.28s/it]
Embeddings layer 6: 707it [00:01, 532.15it/s]1.28s/it]
Embeddings layer 7: 707it [00:01, 530.76it/s]1.30s/it]
Embeddings layer 8: 707it [00:01, 554.67it/s]1.31s/it]
Embeddings layer 9: 707it [00:01, 560.41it/s]1.30s/it]
Embeddings layer 10: 707it [00:01, 555.38it/s]1.29s/it]
Embeddings layer 11: 707it [00:01, 535.20it/s]1.28s/it]
Embeddings layer 12: 707it [00:01, 536.39it/s]1.30s/it]
Layers: 100%|██████████| 13/13 [00:16<00:00,  1.30s/it]


In [215]:
# Calculate signatures of embeddings for each layer

signatures = []
for layer in range(13):
    A = np.asarray([list(x.numpy()) for x in target_word_hidden_states[layer]])
    dot_prods = A @ A.T
    norms = np.sqrt(dot_prods.diagonal())
    cos_dist = np.ones(dot_prods.shape) - (dot_prods / np.outer(norms, norms))
    signatures.append(cos_dist.T)

signatures[0].shape

(707, 707)

In [216]:
# Save the signature distances to files

for layer in tqdm(range(13)):
    filename = f"signature_dists\{target_word}\layer_{layer}.in"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w") as f:
        # matrix dimension
        f.write(str(len(word_meaning_header)))
        f.write("\n")

        # word meanings
        for m in word_meaning_header:
            f.write("{} ".format(m))
        f.write("\n")

        # word as part of speech
        for p in word_part_of_speech:
            f.write("{} ".format(p))
        f.write("\n")

        signatures_layer = signatures[layer]

        n = len(signatures_layer)
        chunk_size = 500  # Adjust the chunk size based on available memory

        # Initialize an empty list to store the norms
        norms = np.zeros((n, n))

        # Process vectors in chunks
        for i in range(0, n, chunk_size):
            for j in range(0, n, chunk_size):
                # Get the current chunks
                chunk_i = signatures_layer[i:i + chunk_size]
                chunk_j = signatures_layer[j:j + chunk_size]
                
                # Compute the pairwise differences for the current chunks
                diffs = chunk_i[:, np.newaxis, :] - chunk_j[np.newaxis, :, :]
                
                # Compute the norms of the differences
                chunk_norms = np.linalg.norm(diffs, axis=2)
                
                # Normalize the norms
                chunk_norms = chunk_norms * (1 / n**0.5)
                
                # Store the norms in the appropriate positions
                norms[i:i + chunk_size, j:j + chunk_size] = chunk_norms

        for i in range(len(signatures[layer])):
            for j in range(len(signatures[layer])):
                f.write("{:.5f} ".format(norms[i][j]))
            f.write("\n")

100%|██████████| 13/13 [00:41<00:00,  3.21s/it]
