# Problem 1

In [8]:
import torch
from transformers import RobertaTokenizer, RobertaModel
from collections import defaultdict

In [9]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# read assingment4-dataset.txt
with open("assignment4-dataset.txt", "r", errors='ignore', encoding='utf-8') as file:
    text = file.read()

# tokenize assingment4-dataset.txt
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    output = model(**encoded_input)
    embeddings = output.last_hidden_state  # (batch_size, sequence_length, hidden_size)

token_sum = defaultdict(lambda: torch.zeros(embeddings.size(-1)))
token_count = defaultdict(int)

for batch_index, input_ids in enumerate(encoded_input['input_ids']):
    for token_index, token_id in enumerate(input_ids):
        token_id = token_id.item()
        token_sum[token_id] += embeddings[batch_index, token_index]
        token_count[token_id] += 1

# output
average_embeddings = {token_id: token_sum[token_id] / token_count[token_id] for token_id in token_sum}
average_token_embeddings = {
    tokenizer.decode([token_id]): avg_emb
    for token_id, avg_emb in average_embeddings.items()
}
print("Average embeddings for tokens:")
for token, embedding in average_token_embeddings.items():
    print(f"Token: {token}, Embedding: {embedding.tolist()[:5]}...")  # Print first 5 dimensions of the embedding

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average embeddings for tokens:
Token: <s>, Embedding: [-0.06278745085000992, 0.0923120304942131, 0.023924440145492554, -0.09761037677526474, 0.10966449230909348]...
Token: The, Embedding: [-0.07268483191728592, 0.07396253943443298, 0.06347452849149704, -0.19215558469295502, 0.467999130487442]...
Token:  White, Embedding: [0.04904630035161972, 0.21464592218399048, -0.026654377579689026, -0.00598438223823905, 0.7937624454498291]...
Token:  Monkey, Embedding: [-0.06331782788038254, 0.17491233348846436, -0.2458176463842392, -0.48307374119758606, 0.6809879541397095]...
Token:  is, Embedding: [0.13222387433052063, 0.22494344413280487, 0.0073118084110319614, -0.09792547672986984, 0.2448006421327591]...
Token:  a, Embedding: [0.022679712623357773, 0.05687248706817627, 0.15807029604911804, -0.32929080724716187, -0.13671213388442993]...
Token:  1925, Embedding: [0.09546397626399994, -0.06229622662067413, 0.06189507618546486, -0.2579684257507324, 0.8974025249481201]...
Token:  American, Embedding

# Problem 2

In [10]:
# Load glove_vocabulary.txt
with open("glove.6B.300d-vocabulary.txt", "r", encoding="utf-8") as vocab_file:
    glove_vocab = vocab_file.read().splitlines()

# Tokenize each word in the vocabulary
word_token_mapping = {}
for word in glove_vocab:
    word_token_mapping[word] = tokenizer(word, return_tensors='pt', add_special_tokens=False)['input_ids'][0]

In [None]:
token_sum = defaultdict(lambda: torch.zeros(embeddings.size(-1)))
token_count = defaultdict(int)

word_embeddings = {}
for word, token_ids in word_token_mapping.items():
    token_embeddings = [token_sum[token_id] / token_count[token_id] for token_id in token_ids]
    word_embeddings[word] = torch.stack(token_embeddings).mean(dim=0)  # Average over token embeddings


In [None]:
def most_similar(query_word, word_embeddings, topn=10):
    if query_word not in word_embeddings:
        raise ValueError(f"'{query_word}' not found in the word embeddings.")
    
    query_emb = word_embeddings[query_word]
    similarities = {}
    for word, emb in word_embeddings.items():
        if word == query_word:
            continue
        # 9.1.1 Word Similarity
        similarity = torch.nn.functional.cosine_similarity(query_emb, emb, dim=0).item()
        similarities[word] = similarity
        
    return sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:topn]

In [None]:
## 6 Examples from Notebook

In [None]:
most_similar("cactus", word_embeddings)

In [None]:
most_similar("cake", word_embeddings)

In [None]:
most_similar("angry", word_embeddings)

In [None]:
most_similar("quickly", word_embeddings)

In [None]:
most_similar("between", word_embeddings)

In [None]:
most_similar("the", word_embeddings)