In [1]:
# Requirements (uncomment the next line if running in Colab or fresh env)
# !pip install -q transformers torch tqdm

import os
import math
from collections import defaultdict
from tqdm import tqdm

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel


In [2]:
# Hugging Face model identifier passed to the tokenizer and model loaders.
MODEL_NAME = "FacebookAI/roberta-base"

# Input files: the corpus is read line by line (blank lines are skipped), and
# the vocabulary file is likewise read as one entry per non-blank line.
DATASET = "assignment4-dataset.txt"
GLOVE_VOCAB = "glove_vocabulary.txt"

# Output files: averaged per-token vectors (saved with torch.save) and the
# normalised word matrix (saved with np.savez_compressed).
OUTPUT_TOKEN_EMB = "token_embeds.pt"
OUTPUT_WORD_EMB = "word_embs.npz"

# Number of sentences encoded per forward pass.
BATCH_SIZE = 64
# Set to True to run on the CPU even when CUDA is available.
FORCE_CPU = False


In [3]:
# CUDA is used only when it is both permitted (FORCE_CPU is False) and
# available; the `and` short-circuits, so availability is not probed at all
# when the CPU is forced.
use_cuda = (not FORCE_CPU) and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

print("Using device:", device)


Using device: cuda


In [4]:
print("Loading tokenizer and model:", MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The model is only used for forward passes: move it to the selected device
# and put it in evaluation mode in one chained expression.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

# Width of the hidden states; sizes the per-token accumulators further down.
hidden_size = model.config.hidden_size
hidden_size


Loading tokenizer and model: FacebookAI/roberta-base


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


768

In [7]:
def load_sentences(path=None):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    Args:
        path: File to read. When omitted, the notebook-level ``DATASET`` path
            is used, so existing ``load_sentences()`` calls behave as before.

    Returns:
        list[str]: One entry per non-blank line, in file order, with leading
        and trailing whitespace removed.
    """
    if path is None:
        # Resolved at call time so the function can be defined before, or
        # independently of, the configuration cell.
        path = DATASET

    with open(path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [s for s in stripped if s]

# Read the whole corpus into memory as a list of strings; the trailing
# expression displays how many sentences were loaded.
sentences = load_sentences()
len(sentences)


3980290

In [8]:
def _zero_vector():
    """Fresh float32 zero vector of length ``hidden_size`` for an unseen token."""
    return torch.zeros(hidden_size, dtype=torch.float32)


# Running totals keyed by token string: summed hidden states and occurrence
# counts. Missing keys start at a zero vector / zero respectively.
token_sums = defaultdict(_zero_vector)
token_counts = defaultdict(int)

@torch.no_grad()
def model_forward(batch):
    """Run the global ``model`` on an encoded batch with gradients disabled.

    Args:
        batch: Mapping of keyword arguments that is unpacked into the model
            call.

    Returns:
        The ``last_hidden_state`` attribute of the model output.
    """
    outputs = model(**batch)
    return outputs.last_hidden_state


In [None]:
# Accumulate per-token hidden-state sums and occurrence counts over the corpus.
#
# The reduction runs on `device` in dense buffers indexed by vocabulary id, so
# each batch costs two `index_add_` calls instead of a device-to-host copy per
# sentence plus a Python-level dict update per token. The buffers are folded
# into `token_sums` / `token_counts` once, after the last batch, so those
# dictionaries keep their token-string keys.
#
# Note: the summation order differs from a sequential per-token loop, so the
# float32 sums can differ from such a loop in the last bits.
pad_id = tokenizer.pad_token_id
# Every id fed to the model indexes its embedding table, so the configured
# vocabulary size bounds the ids that can appear here.
vocab_size = model.config.vocab_size
sum_buffer = torch.zeros(vocab_size, hidden_size, dtype=torch.float32, device=device)
count_buffer = torch.zeros(vocab_size, dtype=torch.long, device=device)

for i in tqdm(range(0, len(sentences), BATCH_SIZE)):
    batch = sentences[i:i+BATCH_SIZE]

    encoded = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=True
    )

    encoded = {k: v.to(device) for k, v in encoded.items()}
    last_hidden = model_forward(encoded)

    input_ids = encoded["input_ids"]

    # Only padding positions are dropped; the special tokens added by
    # add_special_tokens=True are still accumulated.
    keep = input_ids != pad_id
    flat_ids = input_ids[keep]
    flat_embs = last_hidden[keep].to(sum_buffer.dtype)

    sum_buffer.index_add_(0, flat_ids, flat_embs)
    count_buffer.index_add_(0, flat_ids, torch.ones_like(flat_ids))

# Fold the dense buffers into the string-keyed accumulators, visiting only the
# ids that actually occurred.
sums_cpu = sum_buffer.cpu()
counts_cpu = count_buffer.cpu()
seen_ids = torch.nonzero(counts_cpu).flatten().tolist()
for tok_id, tok in zip(seen_ids, tokenizer.convert_ids_to_tokens(seen_ids)):
    token_sums[tok] += sums_cpu[tok_id]
    token_counts[tok] += int(counts_cpu[tok_id])


  0%|          | 13/62193 [00:06<9:04:38,  1.90it/s]

In [None]:
# Mean hidden state per token: accumulated sum divided by occurrence count.
token_embs = {
    tok: total / token_counts[tok]
    for tok, total in token_sums.items()
}

torch.save(token_embs, OUTPUT_TOKEN_EMB)
print("Saved:", OUTPUT_TOKEN_EMB)


In [None]:
# Vocabulary: every non-blank line of the file, stripped of surrounding
# whitespace, in file order.
with open(GLOVE_VOCAB, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    vocab_words = [entry for entry in stripped_lines if entry]

# NumPy versions of the averaged token vectors, for the NumPy-based steps below.
token_embs_np = {tok: vec.numpy() for tok, vec in token_embs.items()}

len(vocab_words)


In [None]:
# Build one vector per vocabulary word as the mean of the averaged vectors of
# its sub-word tokens. Tokens that never occurred in the corpus are skipped and
# tallied in `missing_token_hits`; words with no known token at all are left
# out and tallied in `missing_word_tokens`.
#
# NOTE(review): each word is tokenized on its own, without a leading space.
# Presumably this yields different sub-word tokens than the same word gets in
# the middle of a corpus sentence - confirm this is the intended lookup.
word_embs = {}
missing_word_tokens = 0
missing_token_hits = 0

for word in tqdm(vocab_words):
    piece_ids = tokenizer(word, add_special_tokens=False)["input_ids"]
    pieces = tokenizer.convert_ids_to_tokens(piece_ids)

    known = [token_embs_np[piece] for piece in pieces if piece in token_embs_np]
    missing_token_hits += len(pieces) - len(known)

    if known:
        word_embs[word] = np.mean(known, axis=0)
    else:
        missing_word_tokens += 1

len(word_embs), missing_word_tokens, missing_token_hits



In [None]:
# Fix a row order for the words and build the reverse lookup.
index_to_word = list(word_embs)
word_to_index = dict(zip(index_to_word, range(len(index_to_word))))

# Stack the word vectors row-wise and scale every row to unit L2 norm, so a
# dot product between rows is their cosine similarity.
word_matrix = np.stack([word_embs[entry] for entry in index_to_word])
norms = np.linalg.norm(word_matrix, axis=1, keepdims=True)
zero_rows = norms == 0
norms[zero_rows] = 1.0  # leave all-zero rows unchanged instead of dividing by zero
word_matrix_normed = word_matrix / norms

np.savez_compressed(OUTPUT_WORD_EMB, index_to_word=index_to_word, matrix=word_matrix_normed)
OUTPUT_WORD_EMB


In [None]:
def most_similar(word, matrix, idx2word, word2idx, topn=10):
    """Return the ``topn`` rows of ``matrix`` most similar to ``word``.

    Similarity is the dot product between the query row and every row of
    ``matrix``; with unit-norm rows this is the cosine similarity.

    Args:
        word: Query word; must be a key of ``word2idx``.
        matrix: 2-D array with one row per word.
        idx2word: Sequence mapping a row index to its word.
        word2idx: Mapping from word to its row index in ``matrix``.
        topn: Maximum number of neighbours to return. Values <= 0 yield an
            empty list.

    Returns:
        list[tuple[str, float]]: ``(word, similarity)`` pairs in descending
        similarity order. The query word itself is never included, so fewer
        than ``topn`` pairs are returned when the vocabulary is small.

    Raises:
        KeyError: If ``word`` is not in ``word2idx``.
    """
    if word not in word2idx:
        raise KeyError(f"Word '{word}' not found.")
    if topn <= 0:
        return []

    wid = word2idx[word]
    sims = matrix @ matrix[wid]

    # Stable sort keeps tied scores in row order, making the output deterministic.
    order = np.argsort(-sims, kind="stable")
    # Drop the query by index rather than overwriting its score with -inf:
    # the query can then never leak into the result, and integer-typed
    # matrices (which cannot hold -inf) work too.
    best = order[order != wid][:topn]
    return [(idx2word[i], float(sims[i])) for i in best]


In [None]:
# Spot-check the embeddings: print the nearest neighbours of a few probe words.
queries = ["cactus", "cake", "angry", "quickly", "between", "the"]

for query in queries:
    print("\n----", query, "----")
    try:
        neighbours = most_similar(query, word_matrix_normed, index_to_word, word_to_index)
    except KeyError as e:
        # The probe word has no vector; report it and move on.
        print("SKIPPED:", e)
    else:
        for word, sim in neighbours:
            print(f"{word}\t{sim:.6f}")
