In [1]:
!pip install torch
!pip install transformers
!pip install datasets



In [2]:
# Importing packages

import numpy as np
import torch
import csv
from tqdm import tqdm
from collections import defaultdict
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
print("Packages imported successfully.")

Packages imported successfully.


In [3]:
# Loading tokenizer

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base",
                                         use_fast = True)
print("Tokenizer loaded successfully.")

Tokenizer loaded successfully.


In [4]:
# Loading model

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base").to(device)
model.eval()
print("Model loaded successfully.")

Model loaded successfully.


In [5]:
# Converting datatset.txt to dataset.csv

with open("dataset.txt", "r", encoding = "utf-8") as txt_file:
    rows = txt_file.read().strip().split("\n\n")
with open("dataset.csv", "w", newline = "", encoding = "utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["text"])
    for row in rows:
        writer.writerow([row.strip()])
print("CSV file created successfully.")

CSV file created successfully.


In [6]:
# Loading dataset

texts = load_dataset("csv", data_files = "dataset.csv")
print("Dataset loaded successfully.")

Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully.


In [7]:
# Tokenzing texts

def tokenizer_function(batch):
    texts = [str(text) if text is not None else "" for text in batch["text"]]
    return tokenizer(texts,
                     return_tensors = "pt",
                     truncation = True,
                     padding = "max_length").to(device)
inputs = texts.map(tokenizer_function, 
                   batched = True, 
                   batch_size = 32)
print("Data tokenized successfully.")

Map:   0%|          | 0/449919 [00:00<?, ? examples/s]

Data tokenized successfully.


In [8]:
# Checking dataset's column names

print(inputs.column_names) 

{'train': ['text', 'input_ids', 'attention_mask']}


In [None]:
# Getting embeddings

embeddings = defaultdict(list)
batch_size = 32
batch_number = (len(inputs["train"]["input_ids"]) + batch_size - 1) // batch_size
for batch in tqdm(range(batch_number), desc = "Processing batches"):
    start = batch * batch_size
    end = min((batch + 1) * batch_size, 
              len(inputs["train"]["input_ids"]))
   
    # batch_input = {key: val[start: end] for key, val in inputs["train"]}.to(device)
    batch_input = {
        "input_ids": inputs["train"]["input_ids"][start: end],
        "attention_mask": inputs["train"]["attention_mask"][start: end]
    }
    batch_input = {key: torch.tensor(val).to(device) for key, val in batch_input.items()}
    
    with torch.no_grad():
        outputs = model(**batch_input,
                       output_hidden_states = True)
        temporary_embedding = outputs.hidden_states[-1]
    for index, token in enumerate(batch_input["input_ids"].view(-1)):
        embedding = temporary_embedding.view(-1, 
                                             temporary_embedding.size(-1))[index].cpu().numpy()
        embeddings[token.item()].append(embedding)
print("Tokenized text and extracted embeddings successfully.")

Processing batches:   0%|          | 1/14060 [02:48<659:00:29, 168.75s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7705bc989210>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [None]:
# Calculating average embeddings

average_embeddings = {}
for token, embedding in embeddings.items():
    embedding_tensor = torch.tensor(np.array(embedding))
    average_embeddings[token] = torch.mean(embedding_tensor, 
                                              dim = 0).numpy()
    token = tokenizer.decode([token])
    print(f"Token: {token}, Average Embedding: {embedding}")
print("Average embeddings calculated successfully.")