In [1]:
!pip install torch
!pip install transformers



In [2]:
# Importing packages

import numpy as np
import torch
from collections import defaultdict
import csv
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM


In [3]:
# Tokenizer setup: fetch the tokenizer published with the
# "FacebookAI/roberta-base" checkpoint, requesting the fast implementation.

tokenizer = AutoTokenizer.from_pretrained(
    "FacebookAI/roberta-base",
    use_fast=True,
)
print("Tokenizer loaded successfully.")


Tokenizer loaded successfully.


In [4]:
# Model setup

# Use the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the masked-LM checkpoint, move it onto the selected device and switch
# it to evaluation mode.
model = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base")
model = model.to(device)
model.eval()
print("Model loaded successfully.")


Model loaded successfully.


In [None]:
# Loading dataset
#
# dataset.txt holds sections separated by blank lines. Each section is written
# to dataset.csv as one row with one column per line (shorter sections are
# padded with empty strings), and the CSV is then loaded into the `texts`
# DataFrame.

# Read explicitly as UTF-8 so decoding matches the UTF-8 CSV written below
# instead of depending on the platform's default encoding.
with open("dataset.txt", "r", encoding="utf-8") as file:
    content = file.read()

# Split content into sections by two newlines. A run of more than one blank
# line would otherwise yield empty sections or sections that start with an
# empty line, so stray newlines are stripped and empty sections are skipped.
sections = [
    section.strip("\n")
    for section in content.strip().split("\n\n")
    if section.strip()
]
if not sections:
    raise ValueError("dataset.txt does not contain any text sections.")

# Split every section into its lines once and reuse the result below
section_lines = [section.split("\n") for section in sections]

# Find the maximum number of lines in any section
max_columns = max(len(lines) for lines in section_lines)

# Process each section into rows with padding
rows = [lines + [""] * (max_columns - len(lines)) for lines in section_lines]

# Save to CSV
with open("dataset.csv", "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(rows)

print("Dataset successfully converted to CSV.")

# The CSV written above has no header row: header=None keeps the first section
# as data instead of consuming it as column names. dtype=str together with
# keep_default_na=False keeps every cell as literal text, so lines such as
# "NA", "null" or "123" are neither turned into NaN nor parsed as numbers;
# padding cells stay empty strings.
texts = pd.read_csv(
    "dataset.csv",
    header=None,
    dtype=str,
    keep_default_na=False,
    encoding="utf-8",
)
print("Dataset loaded successfully.")


In [None]:
# Tokenizing data and getting embeddings
#
# For every token id, collect the final-layer hidden state of each of its
# non-padding occurrences across the whole dataset. The result is
# `embeddings`: a dict mapping token id -> list of 1-D numpy vectors.

# The tokenizer accepts strings / lists of strings, not DataFrame slices.
# `texts` holds one section per row with one line per column, so each row's
# non-empty cells are merged into a single string (lines joined by a space);
# rows that end up empty are dropped.
if isinstance(texts, pd.DataFrame):
    section_texts = [
        " ".join(
            str(cell) for cell in row
            if pd.notna(cell) and str(cell).strip()
        )
        for row in texts.itertuples(index=False, name=None)
    ]
else:
    section_texts = [str(text) for text in texts]
section_texts = [text for text in section_texts if text]

batch_size = 16
batched_texts = [
    section_texts[i: i + batch_size]
    for i in range(0, len(section_texts), batch_size)]
embeddings = defaultdict(list)
loop_counter = 0
for loop_counter, batch in enumerate(batched_texts, start = 1):
    inputs = tokenizer(batch,
                       return_tensors = "pt",
                       truncation = True,
                       padding = True).to(device)
    with torch.no_grad():
        # A masked-LM model returns vocabulary logits as its first output;
        # the contextual embeddings are the hidden states, which have to be
        # requested explicitly.
        outputs = model(**inputs, output_hidden_states = True)
    # Last entry of hidden_states = output of the final encoder layer
    temp_embeddings = outputs.hidden_states[-1]
    # Keep every real token of every sequence in the batch; positions whose
    # attention mask is 0 are padding and are skipped.
    keep = inputs["attention_mask"].bool()
    token_ids = inputs["input_ids"][keep].cpu().tolist()
    # One device-to-host transfer per batch instead of one per token
    token_vectors = temp_embeddings[keep].cpu().numpy()
    for token_id, embedding in zip(token_ids, token_vectors):
        embeddings[token_id].append(embedding)
    print(f"Loop: {loop_counter} completed.")
print("Tokenized text and extracted embeddings successfully.")


In [None]:
# Calculating average embeddings
#
# Reduce the list of vectors collected for each token id to their element-wise
# mean. The result is `average_embeddings`: a dict mapping token id -> 1-D
# torch tensor.

average_embeddings = {}
for token_id, token_vectors in embeddings.items():
    # Stack all occurrences into one (n_occurrences, dim) tensor
    embedding_tensor = torch.tensor(np.array(token_vectors))
    average = torch.mean(embedding_tensor, dim = 0)
    average_embeddings[token_id] = average
    token = tokenizer.decode([token_id])
    # Print the mean that was just computed, not the raw list of occurrences
    print(f"Token: {token}, Average Embedding: {average}")
print("Average embeddings calculated successfully.")
