# Problem 1

This took around 2 hours. The embeddings are averaged and stored in token_embeds.pt. This file is attached as a link in the README of the repository.

In [5]:
import os
import math
from collections import defaultdict
from tqdm import tqdm

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [6]:
MODEL_NAME = "FacebookAI/roberta-base"

DATASET = "assignment4-dataset.txt"
GLOVE_VOCAB = "glove.6B.300d-vocabulary.txt"

OUTPUT_TOKEN_EMB = "token_embeds.pt"
OUTPUT_WORD_EMB = "word_embs.npz"

BATCH_SIZE = 64
# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    np.random.seed(seed)
    torch.manual_seed(seed)

random seed: 1234


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [8]:
print("Loading tokenizer and model:", MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()
model.to(device)

hidden_size = model.config.hidden_size
hidden_size

Loading tokenizer and model: FacebookAI/roberta-base


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


768

In [9]:
def load_sentences():
    sentences = []
    with open(DATASET, "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()
            if s:
                sentences.append(s)
    return sentences

sentences = load_sentences()
len(sentences)

3980290

In [10]:
token_sums = defaultdict(lambda: torch.zeros(hidden_size, dtype=torch.float32))
token_counts = defaultdict(int)

def model_forward(batch):
    with torch.no_grad():
        return model(**batch).last_hidden_state

In [11]:
for i in tqdm(range(0, len(sentences), BATCH_SIZE)):
    batch = sentences[i:i+BATCH_SIZE]

    encoded = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=True
    )

    encoded = {k: v.to(device) for k, v in encoded.items()}
    last_hidden = model_forward(encoded)

    input_ids = encoded["input_ids"]

    for sent_ids, sent_embs in zip(input_ids, last_hidden):
        tok_ids = sent_ids.cpu().tolist()
        tokens = tokenizer.convert_ids_to_tokens(tok_ids)
        sent_embs_cpu = sent_embs.cpu()

        for tok, emb in zip(tokens, sent_embs_cpu):
            if tok == tokenizer.pad_token:
                continue
            token_sums[tok] += emb
            token_counts[tok] += 1

100%|██████████| 62193/62193 [3:05:17<00:00,  5.59it/s]  


In [13]:
token_embs = {}
for tok in token_sums:
    token_embs[tok] = token_sums[tok] / token_counts[tok]

torch.save(token_embs, OUTPUT_TOKEN_EMB)
print("Saved:", OUTPUT_TOKEN_EMB)

Saved: token_embeds.pt


### Problem 2

This problem took around 5 hours. I had the same problem as Kevin Pratt in Piazza, where the similarities are wrong compared to the example in Chapter 9. I couldn't find a solution to this issue, which took most of the time spent on this problem.

In [14]:
vocab_words = []
with open(GLOVE_VOCAB, "r", encoding="utf-8") as f:
    for line in f:
        w = line.strip()
        if w:
            vocab_words.append(w)
token_embs_np = {tok: v.numpy() for tok, v in token_embs.items()}

len(vocab_words)

400000

In [15]:
word_embs = {}
missing_word_tokens = 0
missing_token_hits = 0

for word in tqdm(vocab_words):
    enc = tokenizer(word, add_special_tokens=False)
    ids = enc["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(ids)

    vecs = []
    for tok in tokens:
        if tok in token_embs_np:
            vecs.append(token_embs_np[tok])
        else:
            missing_token_hits += 1

    if len(vecs) == 0:
        missing_word_tokens += 1
        continue

    word_embs[word] = np.mean(vecs, axis=0)

len(word_embs), missing_word_tokens, missing_token_hits

100%|██████████| 400000/400000 [01:31<00:00, 4384.04it/s]


(399937, 63, 165)

In [16]:
index_to_word = list(word_embs)
word_matrix = np.stack([word_embs[w] for w in index_to_word])

norms = np.linalg.norm(word_matrix, axis=1, keepdims=True)
norms[norms == 0] = 1.0
word_matrix_normed = word_matrix / norms

word_to_index = {w: i for i, w in enumerate(index_to_word)}

In [17]:
def most_similar(word, matrix, idx2word, word2idx, topn=10):
    if word not in word2idx:
        raise KeyError(f"Word '{word}' not found.")

    wid = word2idx[word]
    emb = matrix[wid]

    sims = matrix @ emb
    sims[wid] = -np.inf

    best = np.argsort(-sims)[:topn]
    return [(idx2word[i], float(sims[i])) for i in best]

In [18]:
queries = ["cactus", "cake", "angry", "quickly", "between", "the"]

for w in queries:
    print("\n----", w, "----")
    for word, sim in most_similar(w, word_matrix_normed, index_to_word, word_to_index):
        print(f"{word}\t\t{sim:.6f}")


---- cactus ----
cavalcanti		0.985301
cercocarpus		0.985115
cavalcante		0.985098
candel		0.984870
candelas		0.984796
carcasses		0.984774
camelus		0.984727
cinecitta		0.984726
civicus		0.984680
crescenzio		0.984606

---- cake ----
cakebread		0.989619
fruitcake		0.985877
cakewalk		0.984381
cupcake		0.982911
mooncake		0.982516
cakey		0.980541
cakes		0.977770
fruitcakes		0.977161
beefcake		0.976381
oatcakes		0.976072

---- angry ----
ryang		1.000000
ryanggang		0.994415
mlanghenry		0.987599
yungang		0.986811
yarang		0.986418
yanchang		0.986303
ryokan		0.986291
ryokans		0.986116
zangara		0.985850
ryong		0.985762

---- quickly ----
cleanly		0.985914
closely		0.985546
quietly		0.985276
solidly		0.985273
coldly		0.984548
wildly		0.984343
smartly		0.984117
shortly		0.983966
safely		0.983713
sweetly		0.983289

---- between ----
inbetween		0.976984
betweenness		0.972364
inbetweeners		0.963131
in-between		0.958393
go-between		0.956233
below-average		0.939994
below		0.939312
near-future		0.937697
n