In [1]:
# Importing packages

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from collections import defaultdict

In [2]:
# Loading tokenizer and model

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
print("Tokenizer loaded successfully.")

model = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base")
model.eval()
print("Model loaded successfully.")

Tokenizer loaded successfully.
Model loaded successfully.


In [3]:
# Loading dataset

with open("dataset.txt", "r") as file:
    texts = file.readlines()
print("Dataset loaded successfully.")

Dataset loaded successfully.


In [4]:
# Tokenizing data and getting embeddings

embeddings = defaultdict(list)
for text in texts[:10]:
    inputs = tokenizer(text, 
                       return_tensors = "pt", 
                       truncation = True, 
                       padding = True)
    print("Tokenized sentence successfully.")
    with torch.no_grad():
        outputs = model(**inputs)
    temp_embeddings = outputs[0]
    for i, token_id in enumerate(inputs["input_ids"][0]):
        embeddings[token_id.item()].append(temp_embeddings[0][i].cpu().numpy())
    print("An embedding extracted successfully.")
print("Tokenized text and extracted embeddings successfully.")


Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized sentence successfully.
An embedding extracted successfully.
Tokenized text and extracted embeddings successfully.


In [6]:
# Calculating average embeddings

average_embeddings = {}
for token_id, embedding in embeddings.items():
    average_embeddings[token_id] = torch.mean(torch.tensor(embedding), dim = 0)
    token = tokenizer.decode([token_id])
    print(f"Token: {token}, Average Embedding: {embedding}")
print("Average embeddings calculated successfully.")

Token: <s>, Average Embedding: [array([35.69795  , -3.9423828, 18.967249 , ...,  2.975231 ,  5.8199387,
       13.336944 ], dtype=float32), array([35.42749  , -4.182551 , 19.454487 , ...,  2.4002128,  5.3630667,
       13.0762005], dtype=float32), array([35.30715  , -3.9725704, 22.668962 , ...,  2.72445  ,  4.850296 ,
       14.123692 ], dtype=float32), array([34.451824 , -3.9416547, 18.212934 , ...,  2.5928931,  4.9096193,
       12.408315 ], dtype=float32), array([34.75273  , -3.7405148, 19.737251 , ...,  3.0717683,  5.222084 ,
       12.915192 ], dtype=float32), array([33.42205  , -3.8156157, 19.922392 , ...,  2.2937562,  4.5539827,
       12.432556 ], dtype=float32), array([34.84042  , -3.813566 , 20.339752 , ...,  2.5580707,  5.0078673,
       13.302027 ], dtype=float32), array([34.27335  , -4.057126 , 19.088108 , ...,  1.6854018,  4.2115316,
       12.697863 ], dtype=float32), array([35.865936 , -3.9654422, 19.072811 , ...,  2.5222502,  4.747805 ,
       13.333325 ], dtype=float3