In [1]:
!pip install torch
!pip install transformers
!pip install datasets



In [2]:
# Importing packages

import numpy as np
import torch
from tqdm import tqdm
from collections import defaultdict
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
print("Packages imported successfully.")

Packages imported successfully.


In [3]:
# Loading tokenizer

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base",
                                         use_fast = True)
print("Tokenizer loaded successfully.")

Tokenizer loaded successfully.


In [4]:
# Loading model

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda:0")
model = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base").to(device)
model.eval().gradient_checkpointing_enable()
print("Model loaded successfully.")

Model loaded successfully.


In [5]:
# Loading dataset

with open("dataset.txt", "r", encoding = "utf-8") as file:
    texts = file.read().strip().split("\n\n")
texts = Dataset.from_dict({"text": texts})
print("Dataset loaded successfully.")

Dataset loaded successfully.


In [6]:
# Tokenzing texts

def tokenizer_function(batch):
    texts = []
    for text in batch["text"]:
        if text is not None:
            texts.append(str(text))
        else:
            texts.append("")
    return tokenizer(texts,
                     return_tensors = "pt",
                     truncation = True,
                     padding = True).to(device)
inputs = texts.map(tokenizer_function, 
                   batched = True, 
                   batch_size = 32)
print("Data tokenized successfully.")

Map:   0%|          | 0/449919 [00:00<?, ? examples/s]

Data tokenized successfully.


In [7]:
# Checking dataset's column names

print(f"Column names:\n{inputs.column_names}") 

Column names:
['text', 'input_ids', 'attention_mask']


In [8]:
# Getting embeddings

embeddings = defaultdict(list)
batch_size = 32
batch_number = (len(inputs["input_ids"]) + batch_size - 1) // batch_size
# batch_number = 14060
# Iterating by batch_number - 14060 (10) so that results can actually be seen.
for batch in tqdm(range(batch_number - 14050), desc = "Processing batches"):
    start = batch * batch_size
    end = min((batch + 1) * batch_size, len(inputs["input_ids"]))
    batch_input = {
        "input_ids": inputs["input_ids"][start: end],
        "attention_mask": inputs["attention_mask"][start: end]
    }
    for key, value in batch_input.items():
        batch_input[key] = torch.tensor(value).to(device)
    with torch.no_grad():
        outputs = model(**batch_input,
                       output_hidden_states = True)
        batch_embeddings = outputs.hidden_states[-1].detach().cpu() 
    input_ids = batch_input["input_ids"].cpu()  
    for i in range(input_ids.shape[0]): 
        for j in range(input_ids.shape[1]):  
            token = input_ids[i, j].item()
            embeddings[token].append(batch_embeddings[i, j].numpy())  
    torch.cuda.empty_cache()
print("Embeddings extracted successfully.")

Processing batches: 100%|██████████| 10/10 [26:02<00:00, 156.29s/it]

Embeddings extracted successfully.





In [12]:
# Calculating average embeddings

average_embeddings = {}
for token, embedding in embeddings.items():
    array = np.array(embedding)
    average_embedding = np.mean(array, axis = 0)
    average_embeddings[token] = average_embedding
print("Average embeddings calculated successfully.")
print(f"First average embedding in the list:\n{average_embedding[0]}")

Average embeddings calculated successfully.
First average embedding in the list:
-0.07563850283622742
