In [17]:
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

In [18]:
# Load the model and tokenizer from a single checkpoint id so the two can
# never be pointed at different checkpoints by accident.
MODEL_NAME = "allenai/OLMoE-1B-7B-0924"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Set the model to eval mode; as the last expression of the cell, the value
# returned by eval() is what the notebook displays below.
model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OlmoeForCausalLM(
  (model): OlmoeModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoeDecoderLayer(
        (self_attn): OlmoeSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_norm): OlmoeRMSNorm((2048,), eps=1e-05)
          (k_norm): OlmoeRMSNorm((2048,), eps=1e-05)
        )
        (mlp): OlmoeSparseMoeBlock(
          (gate): Linear(in_features=2048, out_features=64, bias=False)
          (experts): ModuleList(
            (0-63): 64 x OlmoeMLP(
              (gate_proj): Linear(in_features=2048, out_features=1024, bias=False)
              (up_proj): Linear(in_features=2048, out_features=1024, bias=False)
              (down_proj): Linear(in

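#### Vector embedding

Look up the model's input-embedding vectors for the tokens of each word and save them to a JSON file.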

In [19]:
def vector_embedding(text, model, tokenizer):
    """Return the input-embedding vectors for the tokens of ``text``.

    The text is tokenized with ``tokenizer`` and the resulting ids are looked
    up in ``model.model.embed_tokens``. Only the embedding table is used; no
    other part of the model is called.

    Args:
        text: String to tokenize and embed.
        model: Object exposing ``model.embed_tokens`` as a callable that maps
            an id tensor to embedding vectors.
        tokenizer: Callable that, given ``text`` and ``return_tensors="pt"``,
            returns a mapping containing ``"input_ids"``.

    Returns:
        The tensor produced by the embedding lookup. Callers in this notebook
        index it as ``[0][0]``, so it is presumably shaped
        (1, num_tokens, hidden_dim) -- TODO confirm for the tokenizer in use.
        The tensor is created under ``torch.no_grad()`` and therefore does not
        track gradients.
    """
    ids = tokenizer(text, return_tensors="pt")["input_ids"]
    # This is a read-only lookup for analysis: disable autograd so the result
    # does not hold a graph back to the embedding weights.
    with torch.no_grad():
        embedding = model.model.embed_tokens(ids)
    return embedding

In [20]:
def save_embeddings(input_filename, output_filename, model, tokenizer):
    """Embed every word listed in a text file and write the vectors to JSON.

    Reads ``data/{input_filename}.txt`` (one word per line), embeds each line
    with ``vector_embedding`` and writes ``data/{output_filename}.json``, a
    mapping from each word to its embedding converted to nested lists.

    Args:
        input_filename: Stem of the ``.txt`` word list inside ``data/``.
        output_filename: Stem of the ``.json`` file to create inside ``data/``.
        model: Passed through to ``vector_embedding``.
        tokenizer: Passed through to ``vector_embedding``.

    Returns:
        None. The result is written to disk.
    """
    # Read input file
    with open(f'data/{input_filename}.txt', 'r', encoding='utf-8') as f:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line).
        # An empty "word" presumably tokenizes to nothing, and the averaging
        # and similarity code index the first token vector of every entry.
        words = [line for line in f.read().splitlines() if line.strip()]

    # Get embeddings for each word
    embeddings_dict = {}
    for word in words:
        embedding = vector_embedding(word, model, tokenizer)
        # Convert tensor to nested lists for JSON serialization
        embeddings_dict[word] = embedding.detach().numpy().tolist()

    # Save embeddings to JSON file
    with open(f'data/{output_filename}.json', 'w', encoding='utf-8') as f:
        json.dump(embeddings_dict, f, indent=2)


In [21]:
# Embed every word in data/input_words.txt and store the vectors in
# data/word_embeddings.json.
save_embeddings(
    input_filename='input_words',
    output_filename='word_embeddings',
    model=model,
    tokenizer=tokenizer,
)

#### Average embedding

Average the saved per-word vectors (the first token vector of each word) into a single vector.

In [22]:
def avg_embedding(file):
    """Average the stored word embeddings and save the mean vector.

    Reads ``data/{file}.json``, a mapping from word to a nested list that is
    indexed as ``[0][0]`` to obtain one vector per word (i.e. only the first
    token vector of each entry is used). The element-wise mean of those
    vectors is written to ``data/{file}_average.json`` under the key
    ``"average_embedding"``.

    Args:
        file: Stem of the ``.json`` embeddings file inside ``data/``.

    Returns:
        The average embedding as a list of floats.

    Raises:
        ValueError: If the file contains no entries, if an entry holds no
            token vectors, or if the vectors do not all have the same length.
    """
    # Read the embeddings from JSON file
    with open(f'data/{file}.json', 'r', encoding='utf-8') as f:
        embeddings_dict = json.load(f)

    # An empty mapping has no mean; fail with a clear message instead of an
    # IndexError from probing the first key.
    if not embeddings_dict:
        raise ValueError(f"no embeddings found in 'data/{file}.json'; cannot average")

    # Collect one vector per word: the first token vector of the first row.
    first_token_vectors = []
    for word, stored in embeddings_dict.items():
        if not stored or not stored[0]:
            raise ValueError(f"no token vectors stored for {word!r}")
        first_token_vectors.append(stored[0][0])

    # Stack into (num_words, embedding_dim) and average over the word axis.
    stacked = np.asarray(first_token_vectors, dtype=np.float64)
    average_embedding = stacked.mean(axis=0).tolist()

    # Save average embedding to new JSON file
    output_dict = {"average_embedding": average_embedding}
    output_filename = f'{file}_average'
    with open(f'data/{output_filename}.json', 'w', encoding='utf-8') as f:
        json.dump(output_dict, f, indent=2)

    return average_embedding


In [23]:
# Compute the mean vector (this also writes data/word_embeddings_average.json)
# and show it.
avg = avg_embedding(file='word_embeddings')
print(avg)

[-0.0005644336076402266, -0.0005334796155308938, 0.0014718937220249472, -0.0009396604048054892, -0.0005747200621467381, 0.0008146011218410852, -0.00016128034713867878, -2.819541242324411e-05, -0.0022776868545113928, -0.00026577760835086065, -0.0011828355979578429, 0.0021250782194808214, -0.0014232998986920527, 0.0010988357098366847, 0.0025175900794610078, -0.0003123243291628091, -0.0013570171004967126, -0.0017061382709725964, -0.002836316255619472, -0.0006659981462753714, -0.0013107624753013129, 0.0035744185101005545, -0.00147070560750859, 0.0014176237995206812, -0.002896963164689856, 0.00229026151641191, 0.0003750011307121886, 0.0006673199627481854, 0.00028483741900065516, 0.002049440031824578, 0.0009242985330974957, -0.0020013306294351424, 0.002802724420527025, 0.0012122135480623847, -0.002033785109866646, -0.002811701277868114, 0.001615218809307403, -0.0013538314844342891, -0.0004729389149753357, -0.0038931864592401007, -0.0035637104184465555, 0.0004653903135099627, -0.0013932631893

#### Cosine similarity

Compare a single word's embedding (its first token vector) against the average vector.

In [24]:
def cosine_similarity(word, avg_embedding, model, tokenizer):
    """Cosine similarity between a word's embedding and an average vector.

    The word is embedded with ``vector_embedding`` and only its first token
    vector (index ``[0][0]`` of the result) is compared with ``avg_embedding``.

    Args:
        word: Text to embed.
        avg_embedding: Sequence of floats (e.g. the list returned by the
            averaging step) with the same length as the word vector.
        model: Passed through to ``vector_embedding``.
        tokenizer: Passed through to ``vector_embedding``.

    Returns:
        ``dot(avg, word) / (||avg|| * ||word||)``. If either vector has zero
        norm the similarity is undefined and ``0.0`` is returned instead of
        NaN.

    Raises:
        ValueError: If ``word`` produces no token vectors.
    """
    # Get embedding for the specific word
    word_embedding = vector_embedding(word, model, tokenizer)
    token_vectors = word_embedding.detach().numpy()[0]
    if len(token_vectors) == 0:
        raise ValueError(f"{word!r} produced no tokens; cannot compute similarity")
    # Only the first token vector represents the word here.
    word_vector = token_vectors[0]

    # Convert average embedding to numpy array (word_vector already is one)
    avg_vector = np.asarray(avg_embedding)

    # Calculate cosine similarity, guarding the undefined zero-norm case
    denominator = np.linalg.norm(avg_vector) * np.linalg.norm(word_vector)
    if denominator == 0:
        return 0.0

    return np.dot(avg_vector, word_vector) / denominator

In [28]:
# Probe: how close is this word's embedding to the averaged vector?
test_word = "plastic"
similarity = cosine_similarity(
    word=test_word,
    avg_embedding=avg,
    model=model,
    tokenizer=tokenizer,
)
print(f"Cosine similarity between '{test_word}' and average embedding: {similarity}")

Cosine similarity between 'plastic' and average embedding: 0.08309577579960596


In [26]:
# Probe: how close is this word's embedding to the averaged vector?
test_word = "network"
similarity = cosine_similarity(
    word=test_word,
    avg_embedding=avg,
    model=model,
    tokenizer=tokenizer,
)
print(f"Cosine similarity between '{test_word}' and average embedding: {similarity}")

Cosine similarity between 'network' and average embedding: 0.16846000158603971
