In [1]:
!pip install torch
!pip install transformers
!pip install datasets



In [19]:
# Importing packages

import numpy as np
import torch
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.metrics.pairwise import cosine_similarity
print("Packages imported successfully.")

Packages imported successfully.


In [3]:
# Loading tokenizer

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base",
                                         use_fast = True)
print("Tokenizer loaded successfully.")

Tokenizer loaded successfully.


In [4]:
# Loading model

device = torch.device("cpu")
if (torch.cuda.is_available()):
    device = torch.device("cuda:0")
model = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base").to(device)
model.eval().gradient_checkpointing_enable()
print("Model loaded successfully.")

Model loaded successfully.


In [10]:
# Loading dataset

with open("glove.6B.300d-vocabulary.txt", "r", encoding = "utf-8") as file:
    texts = file.read().strip().split("\n")
print("Dataset loaded successfully.")

Dataset loaded successfully.


In [14]:
# Tokenzing texts

words = {}
for word in tqdm(texts, desc = "Tokenizing words"):
    words[word] = tokenizer(word,
                            return_tensors = "pt",
                            truncation = True,
                            padding = True).to(device)
print("Data tokenized successfully.")

Tokenizing words: 100%|██████████| 400000/400000 [01:01<00:00, 6466.15it/s]

Data tokenized successfully.





In [17]:
# Getting embeddings

embeddings = {}
# Using half of the words in the data to see the results.
for word, tokenized_word in tqdm(list(words.items())[:len(words) // 2], 
                                 desc = "Processing words"):
    input_ids = tokenized_word["input_ids"].to(device)
    attention_mask = tokenized_word["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(input_ids,
                        attention_mask,
                        output_hidden_states = True)
        embedding = outputs.hidden_states[-1].cpu()
        embeddings[word] = embedding.mean(dim = 1).squeeze().numpy()
print("Embeddings extracted successfully.")

Processing words: 100%|██████████| 200000/200000 [22:57<00:00, 145.21it/s]

Embeddings extracted successfully.





'\nembeddings = defaultdict(list)\nbatch_size = 32\nbatch_number = (len(inputs["input_ids"]) + batch_size - 1) // batch_size\n# batch_number = 12500\n# Iterating by batch_number - 12500 (10) so that results can actually be seen.\nfor batch in tqdm(range(batch_number - 12490), desc = "Processing batches"):\n    start = batch * batch_size\n    end = min((batch + 1) * batch_size, len(inputs["input_ids"]))\n    batch_input = {\n        "input_ids": inputs["input_ids"][start: end],\n        "attention_mask": inputs["attention_mask"][start: end]\n    }\n    for key, value in batch_input.items():\n        batch_input[key] = torch.tensor(value).to(device)\n    with torch.no_grad():\n        outputs = model(**batch_input,\n                       output_hidden_states = True)\n        batch_embeddings = outputs.hidden_states[-1].detach().cpu() \n    input_ids = batch_input["input_ids"].cpu()  \n    for i in range(input_ids.shape[0]): \n        for j in range(input_ids.shape[1]):  \n            to

In [20]:
# Running most_similar() on the six examples

def most_similar(word, embeddings, topn):
    if (word not in embeddings):
        return []
    embedding = embeddings[word]
    all_embeddings = np.array(list(embeddings.values()))
    similarities = cosine_similarity([embedding], all_embeddings)[0]
    indices = similarities.argsort()[::-1]
    top_words = []
    for index in indices[:topn]:
        candidate = list(embeddings.keys())[index]
        if (candidate != word):
            score = similarities[index]
            top_words.append((candidate, score))
    return top_words
query_words = ["cactus", "cake", "angry", "quickly", "between", "the"]
for word in query_words:
    print(f"Words similar to: {word}")
    similar_words = most_similar(word,
                                 embeddings,
                                 topn = 10)
    print(similar_words)
print("Calculated similar words successfully.")

Words similar to: cactus
[('corns', 0.99159783), ('cotton', 0.99111134), ('cabbage', 0.9900382), ('cumin', 0.9900115), ('camel', 0.9899581), ('cinnamon', 0.98961157), ('casca', 0.98909783), ('cigs', 0.98891), ('poultry', 0.98889935)]
Words similar to: cake
[('cakes', 0.9991082), ('coat', 0.9977131), ('cream', 0.99749666), ('flower', 0.997431), ('fed', 0.9974273), ('farm', 0.9973926), ('forest', 0.9973541), ('fighter', 0.9973248), ('leaf', 0.99730283)]
Words similar to: angry
[('hungry', 0.99093544), ('wry', 0.98955023), ('masry', 0.98884994), ('awry', 0.9887522), ('garry', 0.98856807), ('derry', 0.988369), ('chantry', 0.9883069), ('landry', 0.9879894), ('merry', 0.98767143)]
Words similar to: quickly
[('lastly', 0.99270844), ('reportedly', 0.9919561), ('secretly', 0.9919225), ('purely', 0.9915697), ('shortly', 0.9913832), ('greatly', 0.9913656), ('solidly', 0.99136484), ('quietly', 0.9912144), ('tightly', 0.9912058)]
Words similar to: between
[('almost', 0.9985246), ('sur', 0.99842894)