In [1]:
# Importing necessary packages.

import numpy as np
from transformers import RobertaTokenizer, RobertaModel
import torch

In [2]:
# Loading tokenizer and model.

tokenizer = RobertaTokenizer.from_pretrained("facebookai/roberta-base")
print("RobertaTokenizer loaded successfully.")

model = RobertaModel.from_pretrained("facebookai/roberta-base")
print("RobertaModel loaded successfully.")

RobertaTokenizer loaded successfully.


Some weights of RobertaModel were not initialized from the model checkpoint at facebookai/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel loaded successfully.


In [3]:
# Accessing text file.

file_path = "dataset.txt"
with open(file_path, "r") as file:
    data = file.readlines()
print("Data set loaded successfully.")

Data set loaded successfully.


In [4]:
# Tokenizing the data.

input = tokenizer(data[0:11], 
                  return_tensors = "pt",
                  padding = True,
                  truncation = True)

In [5]:
# Getting embedddings from tokenized data.

with torch.no_grad():
    output = model(**input)
    token_embedding = output.last_hidden_state

In [6]:
# Creating dictionary to store each token's embeddings.

token_to_embeddings = {}

for sentence_idx, sentence_embeddings in enumerate(token_embedding):
    for token_idx, token_emebdding in enumerate(sentence_embeddings):
        token_id = input.input_ids[sentence_idx][token_idx].item()
        token_embedding = token_embedding.numpy() if isinstance(token_embedding, torch.Tensor) else token_embedding

        if token_id not in token_to_embeddings:
            token_to_embeddings[token_id] = []
        token_to_embeddings[token_id].append(token_embedding)

In [7]:
# Calculating the average embedding for each token.

token_to_avg_embedding = {token: np.mean(embeddings, axis = 0) for token, embeddings in token_to_embeddings.items()}

id_to_token = {token_id: tokenizer.decode([token_id]) for token_id in token_to_avg_embedding.keys()}
for token_id, avg_embedding in token_to_avg_embedding.items():
    token = id_to_token[token_id]
    print(f"Token: {token}, Average Embedding: {avg_embedding[:5]}...")

Token: <s>, Average Embedding: [[[-7.93446898e-02  1.06238768e-01 -5.11441426e-03 ... -8.89533684e-02
   -4.09069397e-02 -8.65147449e-03]
  [-1.00156620e-01  9.11359712e-02  1.38755038e-01 ... -1.70367852e-01
    7.13143311e-03 -2.46689897e-02]
  [ 1.16417622e-02  1.50476232e-01  4.99908701e-02 ... -1.06294915e-01
   -9.82792452e-02  1.39068171e-01]
  ...
  [ 4.63088863e-02  1.21982887e-01  6.05006255e-02 ...  1.58556867e-02
   -8.00196081e-02  1.67404667e-01]
  [ 4.63088863e-02  1.21982887e-01  6.05006255e-02 ...  1.58556867e-02
   -8.00196081e-02  1.67404667e-01]
  [ 4.63088863e-02  1.21982887e-01  6.05006255e-02 ...  1.58556867e-02
   -8.00196081e-02  1.67404667e-01]]

 [[-5.65370470e-02  8.76866579e-02  5.93850808e-03 ... -9.00561884e-02
   -4.22145687e-02 -5.36320582e-02]
  [ 1.56357102e-02 -1.23610623e-01 -1.07604675e-01 ... -3.20418626e-01
    7.47087672e-02 -1.43391356e-01]
  [ 5.32989725e-02 -1.95911512e-01  3.48706916e-02 ... -3.99805337e-01
   -7.47543722e-02 -1.50837675e-01