In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load tokenizer and model from one shared checkpoint name so the two can never drift apart
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True is required: get_embedding reads outputs.hidden_states
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

# Function to get embeddings for words
def get_embedding(word):
    """Return one embedding vector for ``word`` as a NumPy array.

    The text is run through the module-level ``tokenizer`` and ``model`` with
    gradient tracking disabled. The last entry of the model's
    ``hidden_states`` is averaged over dimension 1 (presumably the token
    axis - confirm against the model's output layout), squeezed, and
    converted to NumPy.
    """
    encoded = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**encoded)
    last_layer = model_output.hidden_states[-1]
    pooled = last_layer.mean(dim=1)
    return pooled.squeeze().numpy()

# Precompute embeddings for vocabulary
vocab = tokenizer.get_vocab()
# Order tokens by their id: the iteration order of the dict returned by
# get_vocab() is not guaranteed to follow token ids, and row i of the saved
# array must correspond to a reproducible token for the file to be reusable.
vocab_words = sorted(vocab, key=vocab.get)
embeddings = []

print("Precomputing vocabulary embeddings...")
for word in vocab_words:
    embeddings.append(get_embedding(word))

# Stack explicitly so a vector of unexpected shape fails here, loudly,
# instead of producing a ragged array at save time.
embeddings = np.stack(embeddings)

# Save precomputed embeddings
np.save('vocab_embeddings.npy', embeddings)
print("Vocabulary embeddings saved to 'vocab_embeddings.npy'")

# Load precomputed embeddings (row i belongs to vocab_words[i])
embeddings = np.load('vocab_embeddings.npy')

Precomputing vocabulary embeddings...
Vocabulary embeddings saved to 'vocab_embeddings.npy'


In [None]:
# Check the shape of the precomputed vocabulary embeddings array
print(embeddings.shape)

(30522, 768)


In [None]:

# Function to find the top_k closest words to a given embedding
def find_closest_words_to_embedding(embedding, embeddings, vocab_words, top_k=5):
    """Rank vocabulary entries by cosine similarity to ``embedding``.

    Args:
        embedding: Query vector; flattened to 1-D before comparison.
        embeddings: 2-D array-like with one row per entry of ``vocab_words``.
        vocab_words: Sequence of words; ``vocab_words[i]`` labels row ``i``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(word, similarity)`` tuples, most similar first. The list
        is empty when ``top_k`` is zero or negative and shorter than
        ``top_k`` when there are fewer rows than requested.

    Raises:
        ValueError: If ``embeddings`` is not 2-D, or if the query length does
            not match the row length.
    """
    # A slice of [-0:] selects everything, so a non-positive top_k must be
    # handled before slicing.
    if top_k <= 0:
        return []

    matrix = np.asarray(embeddings)
    if matrix.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array, got {matrix.ndim} dimension(s)"
        )
    query = np.asarray(embedding).reshape(-1)

    # Cosine similarity computed with NumPy only, so the function does not
    # depend on a name that is never imported in this notebook.
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # Zero-length vectors get similarity 0 instead of a division by zero.
    denom[denom == 0] = 1
    similarities = (matrix @ query) / denom

    closest_indices = similarities.argsort()[-top_k:][::-1]
    closest_words = [(vocab_words[i], similarities[i]) for i in closest_indices]
    return closest_words

# Function to combine embeddings
def combine_embeddings(word1, word2):
    """Return the sum of the embeddings of ``word1`` and ``word2``.

    Each word is embedded with ``get_embedding`` and the two resulting
    NumPy arrays are added together.
    """
    return get_embedding(word1) + get_embedding(word2)


# Combine two words and list the vocabulary entries nearest to their summed embedding
word1 = "hate"
word2 = "love"
result_vector = combine_embeddings(word1, word2)
closest_words = find_closest_words_to_embedding(result_vector, embeddings, vocab_words)
print(f"The five closest words to '{word1}' and '{word2}' are: {closest_words}")



Traceback (most recent call last):
  File "/Users/jakobbramming/.vscode/extensions/ms-python.python-2024.22.0-darwin-x64/python_files/python_server.py", line 133, in exec_user_input
    retval = callable_(user_input, user_globals)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 21, in <module>
  File "<string>", line 4, in find_closest_words_to_embedding
  File "/Users/jakobbramming/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jakobbramming/anaconda3/lib/python3.11/site-packages/sklearn/metrics/pairwise.py", line 1577, in cosine_similarity
    X, Y = check_pairwise_arrays(X, Y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jakobbramming/anaconda3/lib/python3.11/site-packages/sklearn/metrics/pairwise.py", line 165, in check_pairwise_arrays
    X = check_array(
        ^^^^^^^^^^^^
  File "/Users/jakobbramming/anaconda3/lib/

In [None]:

def combine_and_find_closest_words(word1, word2, embeddings, vocab_words, top_k=1):
    """Find the vocabulary words closest to the sum of two word embeddings.

    Args:
        word1: First word to embed with ``get_embedding``.
        word2: Second word to embed with ``get_embedding``.
        embeddings: 2-D array-like with one row per entry of ``vocab_words``.
        vocab_words: Sequence of words; ``vocab_words[i]`` labels row ``i``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(word, similarity)`` tuples ordered from most to least
        similar. Entries equal to ``word1`` or ``word2`` are skipped. The list
        is empty when ``top_k`` is zero or negative.

    Raises:
        ValueError: If ``embeddings`` is not 2-D, or if the combined vector's
            length does not match the row length.
    """
    # Without this guard the `len(...) == top_k` stop condition is never met
    # for top_k <= 0 and every candidate would be returned.
    if top_k <= 0:
        return []

    combined_embedding = get_embedding(word1) + get_embedding(word2)

    matrix = np.asarray(embeddings)
    if matrix.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array, got {matrix.ndim} dimension(s)"
        )
    query = np.asarray(combined_embedding).reshape(-1)

    # Cosine similarity between the combined embedding and all embeddings,
    # computed with NumPy only so no un-imported name is required.
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # Zero-length vectors get similarity 0 instead of a division by zero.
    denom[denom == 0] = 1
    similarities = (matrix @ query) / denom

    # Sort indices by similarity in descending order
    sorted_indices = similarities.argsort()[::-1]

    # Walk the ranking, skipping the two input words, until top_k are collected
    excluded = (word1, word2)
    closest_word = []
    for i in sorted_indices:
        candidate_word = vocab_words[i]
        if candidate_word in excluded:
            continue
        closest_word.append((candidate_word, similarities[i]))
        if len(closest_word) == top_k:
            break

    return closest_word

    
# Try the combination on a pair of element words
word1, word2 = "earth", "water"
closest_word = combine_and_find_closest_words(
    word1,
    word2,
    embeddings,
    vocab_words,
)
print(f"The closest word to the combination of '{word1}' and '{word2}' is: {closest_word}")

Traceback (most recent call last):
  File "/Users/jakobbramming/.vscode/extensions/ms-python.python-2024.22.0-darwin-x64/python_files/python_server.py", line 133, in exec_user_input
    retval = callable_(user_input, user_globals)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 27, in <module>
  File "<string>", line 8, in combine_and_find_closest_words
  File "/Users/jakobbramming/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jakobbramming/anaconda3/lib/python3.11/site-packages/sklearn/metrics/pairwise.py", line 1577, in cosine_similarity
    X, Y = check_pairwise_arrays(X, Y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jakobbramming/anaconda3/lib/python3.11/site-packages/sklearn/metrics/pairwise.py", line 165, in check_pairwise_arrays
    X = check_array(
        ^^^^^^^^^^^^
  File "/Users/jakobbramming/anaconda3/lib/p

In [None]:
# Initiate list of discovered items
discovered_items = ['earth', 'water', 'fire', 'wind']

# Choose 2 items to combine
item1 = "earth"
item2 = "water"

closest_word = combine_and_find_closest_words(item1, item2, embeddings, vocab_words)
# Report the items that were actually combined (item1/item2), not the
# word1/word2 globals left over from an earlier cell.
print(f"The closest word to the combination of '{item1}' and '{item2}' is: {closest_word}")
# closest_word is a list of (word, similarity) tuples; keep the word of the first match
discovered_items.append(closest_word[0][0])

print(discovered_items)

The closest word to the combination of 'hate' and 'love' is: [('resentment', 0.9706785)]
['earth', 'water', 'fire', 'wind', 'resentment']
