In [1]:
# Importing packages

import numpy as np
import torch
from collections import defaultdict

# Run "pip install transformers" in terminal shell
from transformers import AutoTokenizer, AutoModelForMaskedLM


In [2]:
# Loading tokenizer

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
print("Tokenizer loaded successfully.")


Tokenizer loaded successfully.


In [3]:
# Loading model

model = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base")
model.eval()
print("Model loaded successfully.")


Model loaded successfully.


In [4]:
# Loading dataset

with open("dataset.txt", "r") as file:
    texts = file.readlines()
print("Dataset loaded successfully.")


Dataset loaded successfully.


In [5]:
# Tokenizing data and getting embeddings

batch_size = 8
batched_texts = [
    texts[i: i + batch_size]
    for i in range(0, len(texts), batch_size)]
embeddings = defaultdict(list)
loop_counter = 0
for batch in batched_texts[:10]:
    inputs = tokenizer(batch, 
                       return_tensors = "pt", 
                       truncation = True, 
                       padding = True)
    with torch.no_grad():
        outputs = model(**inputs)
    temp_embeddings = outputs[0]
    for i, token_id in enumerate(inputs["input_ids"][0]):
        embedding = temp_embeddings[0][i].cpu().numpy()
        embeddings[token_id.item()].append(embedding)
    loop_counter += 1
    print(f"Loop: {loop_counter} completed.")
print("Tokenized text and extracted embeddings successfully.")


Loop: 1 completed.
Loop: 2 completed.
Loop: 3 completed.
Loop: 4 completed.
Loop: 5 completed.
Loop: 6 completed.
Loop: 7 completed.
Loop: 8 completed.
Loop: 9 completed.
Loop: 10 completed.
Tokenized text and extracted embeddings successfully.


In [6]:
# Calculating average embeddings

average_embeddings = {}
for token_id, embedding in embeddings.items():
    embedding_tensor = torch.tensor(np.array(embedding))
    average_embeddings[token_id] = torch.mean(embedding_tensor, 
                                              dim = 0)
    token = tokenizer.decode([token_id])
    print(f"Token: {token}, Average Embedding: {embedding}")
print("Average embeddings calculated successfully.")


Token: <s>, Average Embedding: [array([35.697895 , -3.9423823, 18.967236 , ...,  2.9752254,  5.819928 ,
       13.336922 ], dtype=float32), array([35.865936 , -3.9654431, 19.072805 , ...,  2.5222478,  4.747809 ,
       13.333325 ], dtype=float32), array([35.52256  , -3.8823156, 22.372154 , ...,  2.7324758,  4.7902536,
       13.736387 ], dtype=float32), array([34.024628 , -4.2077756, 21.456978 , ...,  1.8517741,  3.9280975,
       13.183044 ], dtype=float32), array([34.50096  , -4.1729765, 21.683067 , ...,  1.2937487,  4.040618 ,
       13.031284 ], dtype=float32), array([32.95398  , -3.9394863, 18.980804 , ...,  2.7160072,  5.108945 ,
       12.600609 ], dtype=float32), array([33.215996 , -4.427697 , 20.751516 , ...,  0.9277969,  3.5222843,
       12.835725 ], dtype=float32), array([33.80972  , -3.9657261, 19.701767 , ...,  2.7942185,  5.222457 ,
       12.612955 ], dtype=float32), array([32.983593 , -4.0299335, 18.087776 , ...,  2.74647  ,  4.4981256,
       12.074823 ], dtype=float3