In [None]:
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel, BertModel, BertTokenizer

# Load the DistilBERT tokenizer and TF model from a single checkpoint id,
# so the two can never drift apart.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = TFAutoModel.from_pretrained(MODEL_NAME)

# Supervised triples: (word1, word2, target_word).
# Training pushes embedding(word1) + embedding(word2) towards embedding(target_word).
# The last row repeats the ("fire", "wind") pair with the words swapped.
labeled_data = [
    ("fire", "water", "steam"),
    ("fire", "earth", "lava"),
    ("fire", "wind", "smoke"),
    ("water", "earth", "plant"),
    ("water", "wind", "wave"),
    ("earth", "wind", "dust"),
    ("wind", "fire", "smoke"),
]

def get_embedding(word):
    """Return a mean-pooled embedding tensor for `word`.

    The word is tokenized with the module-level `tokenizer`, passed through
    the module-level `model`, and the final hidden states are averaged over
    axis 1. Every position the tokenizer emits takes part in the average
    (no masking is applied).
    """
    encoded = tokenizer(word, return_tensors="tf")
    hidden_states = model(**encoded).last_hidden_state
    # Collapse axis 1 so a single vector per input remains
    return tf.reduce_mean(hidden_states, axis=1)

def cosine_similarity_loss(combined_embedding, target_embedding):
    """Return 1 minus the mean cosine similarity between the two inputs.

    Both inputs are L2-normalized along axis 1, the row-wise dot product
    gives the cosine similarity, and the result is averaged. The loss is 0
    when every row pair points in the same direction.
    """
    unit_combined, unit_target = (
        tf.nn.l2_normalize(vectors, axis=1)
        for vectors in (combined_embedding, target_embedding)
    )
    # Dot product of unit vectors == cosine similarity, one value per row
    row_similarity = tf.reduce_sum(tf.multiply(unit_combined, unit_target), axis=1)
    return 1.0 - tf.reduce_mean(row_similarity)

# Adam optimizer shared by every update step
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

# Number of full passes over labeled_data
epochs = 3


def train_step(word1, word2, target_word):
    """Apply one optimizer update for a single (word1, word2, target_word) triple.

    The loss compares embedding(word1) + embedding(word2) with
    embedding(target_word); all three embeddings are computed inside the
    gradient tape. Returns the scalar loss tensor from before the update.
    """
    with tf.GradientTape() as tape:
        combined_embedding = get_embedding(word1) + get_embedding(word2)
        target_embedding = get_embedding(target_word)
        loss = cosine_similarity_loss(combined_embedding, target_embedding)

    trainable = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))
    return loss


# Training loop: one update per triple; the reported value is the SUM of the
# per-triple losses for the epoch (not the mean).
for epoch in range(epochs):
    step_losses = [train_step(*triple).numpy() for triple in labeled_data]
    total_loss = sum(step_losses)

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

# Export one embedding per tokenizer vocabulary entry.
# vocab_words fixes the row order: row i of the saved file belongs to vocab_words[i].
vocab = tokenizer.get_vocab()
vocab_words = list(vocab)

print("Saving fine-tuned embeddings...")
# One forward pass per entry; squeeze() drops the size-1 leading axis.
fine_tuned_embeddings = [
    get_embedding(entry).numpy().squeeze() for entry in vocab_words
]

# Persist to disk (np.save converts the list of vectors to a single array)
np.save('fine_tuned_vocab_embeddings.npy', fine_tuned_embeddings)
print("Fine-tuned embeddings saved to 'fine_tuned_vocab_embeddings.npy'")

Epoch 1/3, Loss: 0.3321
Epoch 2/3, Loss: 0.1204
Epoch 3/3, Loss: 0.0478
Saving fine-tuned embeddings...
Fine-tuned embeddings saved to 'fine_tuned_vocab_embeddings.npy'


In [None]:
# Checking the size of the fine-tuned embeddings: reload the saved file and
# print its shape (one row per vocabulary entry).
embeddings_path = 'fine_tuned_vocab_embeddings.npy'
fine_tuned_embeddings = np.load(embeddings_path)
print(fine_tuned_embeddings.shape)


(30522, 768)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def combine_and_find_closest_words1(word1, word2, fine_tuned_embeddings, vocab_words, top_k=5):
    """Rank vocabulary words by cosine similarity to embedding(word1) + embedding(word2).

    Args:
        word1: First word to combine; embedded with `get_embedding`.
        word2: Second word to combine; embedded with `get_embedding`.
        fine_tuned_embeddings: 2-D array-like of candidate embeddings, where
            row i is the embedding of `vocab_words[i]`.
        vocab_words: Sequence of candidate words aligned index-for-index
            with `fine_tuned_embeddings`.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most `top_k` `(word, similarity)` tuples ordered from
        most to least similar. `word1` and `word2` themselves are never
        included. The list is empty when `top_k` is zero or negative.
    """
    # Guard: with a non-positive top_k the "collected enough" check below
    # could never fire, and every vocabulary word would be returned.
    if top_k <= 0:
        return []

    combined_embed = (
        get_embedding(word1).numpy().squeeze()
        + get_embedding(word2).numpy().squeeze()
    )

    # Similarity of the combined vector against every candidate row
    similarities = cosine_similarity([combined_embed], fine_tuned_embeddings)[0]

    closest_word = []
    # Walk candidates from most to least similar
    for i in similarities.argsort()[::-1]:
        candidate_word = vocab_words[i]
        # The inputs themselves are not valid answers
        if candidate_word == word1 or candidate_word == word2:
            continue
        closest_word.append((candidate_word, similarities[i]))
        if len(closest_word) >= top_k:
            break

    return closest_word

    
# Demo query: which vocabulary words sit nearest to the sum of these two?
word1, word2 = "fire", "steam"
closest_word = combine_and_find_closest_words1(
    word1, word2, fine_tuned_embeddings, vocab_words
)
print(f"The closest word to the combination of '{word1}' and '{word2}' is: {closest_word}")

The closest word to the combination of 'fire' and 'steam' is: [('smoke', 0.9974934), ('water', 0.9959872), ('wind', 0.99542344), ('jet', 0.9954127), ('sail', 0.99539626)]


In [None]:
# Starting inventory of discovered items
discovered_items = ['earth', 'water', 'fire', 'wind']

# The pair of items to combine
item1, item2 = "tree", "candy"

closest_word = combine_and_find_closest_words1(
    item1, item2, fine_tuned_embeddings, vocab_words
)
print(f"The closest word to the combination of '{item1}' and '{item2}' is: {closest_word}")

# The top-ranked candidate becomes the new item; it is only added if it is
# not already in the inventory.
new_item = closest_word[0][0]
if new_item not in discovered_items:
    discovered_items.append(new_item)

print(discovered_items)

The closest word to the combination of 'tree' and 'candy' is: [('trees', 0.99488986), ('juice', 0.9948405), ('monkey', 0.99459296), ('cake', 0.9945701), ('doll', 0.9944013)]
['earth', 'water', 'fire', 'wind', 'trees']
