In [1]:
from tqdm import tqdm
import gensim
import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader
import numpy as np

# Checkpoint identifier handed to the `from_pretrained` loaders in later cells.
pretrained_repo = "sentence-transformers/all-roberta-large-v1"

# Number of samples per DataLoader batch; tune as needed.
batch_size = 256

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer loaded from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(pretrained_repo)

input_texts = ['I am a sentence for which I would like to get its embedding']

# Tokenize the input texts into PyTorch tensors, with padding and truncation
# enabled and a maximum length of 128.
encoding = tokenizer(
    input_texts,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
print(encoding)

{'input_ids': tensor([[    0,   100,   524,    10,  3645,    13,    61,    38,    74,   101,
             7,   120,    63, 33183, 11303,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}




In [19]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized tensors.

    Every sample is a dict holding, for each tensor that was supplied, the
    entry at the requested index. Tensors left as ``None`` are omitted from
    the sample.

    Args:
        input_ids: Tensor indexed by sample along dimension 0, or ``None``.
            Exposed under the key ``"input_ids"``.
        attention_mask: Tensor indexed by sample along dimension 0, or
            ``None``. Exposed under the key ``"att_mask"``.
    """

    def __init__(self, input_ids=None, attention_mask=None):
        super().__init__()

        self.data = {
            "input_ids": input_ids,
            "att_mask": attention_mask,
        }

    def __len__(self):
        """Return the number of samples.

        The length is the size of dimension 0 of the first tensor that was
        supplied (``input_ids`` takes precedence). An instance built without
        any tensor is empty and reports 0 instead of raising on ``None``.
        """
        for tensor in self.data.values():
            if tensor is not None:
                return tensor.size(0)
        return 0

    def __getitem__(self, index):
        """Return the sample at ``index`` as a ``{key: tensor[index]}`` dict.

        Args:
            index: Position of the sample; a one-element tensor is converted
                to a plain Python number first.
        """
        if isinstance(index, torch.Tensor):
            index = index.item()
        return {
            key: tensor[index]
            for key, tensor in self.data.items()
            if tensor is not None
        }

In [20]:
# Wrap the tokenizer output tensors in the notebook's Dataset class.
dataset = Dataset(
    input_ids=encoding['input_ids'],
    attention_mask=encoding['attention_mask'],
)
print(f"Dataset: {dataset}")

# Batches are served in dataset order (no shuffling).
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
print(f"DataLoader: {dataloader}")

Dataset: <__main__.Dataset object at 0x7fb048e8e1c0>
DataLoader: <torch.utils.data.dataloader.DataLoader object at 0x7fb048e8e670>


In [21]:
class Sentence_Bert(nn.Module):
    """Sentence encoder: pretrained backbone, masked mean pooling, L2 normalization.

    Args:
        pretrained_repo: Identifier passed to ``AutoModel.from_pretrained``.
        verbose: When true (the default, which keeps the previous behavior),
            ``forward`` prints its inputs and every intermediate result.
            Pass ``False`` to run silently.
    """

    def __init__(self, pretrained_repo, verbose=True):
        super(Sentence_Bert, self).__init__()
        self.bert_model = AutoModel.from_pretrained(pretrained_repo)
        self.verbose = verbose

    def _log(self, label, value):
        """Print ``label`` and ``value`` unless the instance was silenced."""
        # getattr keeps instances that predate the `verbose` attribute printing.
        if getattr(self, "verbose", True):
            print(label, value)

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over the positions selected by the mask.

        Args:
            model_output: Indexable whose element 0 holds the token
                embeddings.
            attention_mask: Mask with one entry per token; it is expanded
                over the trailing embedding dimension and used as weights.

        Returns:
            The mask-weighted sum over dimension 1 divided by the (clamped)
            sum of the mask over dimension 1.
        """
        token_embeddings = model_output[0]
        data_type = token_embeddings.dtype
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(data_type)

        # The denominator floor guards against an all-zero mask. 1e-9 is below
        # the smallest float16 value and would round to zero there, so use the
        # dtype's smallest normal number whenever that is larger. For
        # float32/float64/bfloat16 the floor stays exactly 1e-9.
        floor = 1e-9
        if data_type.is_floating_point:
            floor = max(floor, torch.finfo(data_type).tiny)

        weighted_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
        mask_total = torch.clamp(input_mask_expanded.sum(1), min=floor)
        return weighted_sum / mask_total

    def forward(self, input_ids, att_mask):
        """Return one L2-normalized embedding per input row.

        Args:
            input_ids: Passed to the backbone as ``input_ids``.
            att_mask: Passed to the backbone as ``attention_mask`` and reused
                as the pooling mask.
        """
        bert_out = self.bert_model(input_ids=input_ids, attention_mask=att_mask)
        self._log("input_ids: ", input_ids)
        self._log("bert_out: ", bert_out)
        sentence_embeddings = self.mean_pooling(bert_out, att_mask)
        self._log("bert_out after mean pooling: ", sentence_embeddings)

        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        self._log("bert_out after normalization: ", sentence_embeddings)
        return sentence_embeddings

In [22]:
# Per-batch embedding tensors, concatenated once the loop is done.
embedding_batches = []

model = Sentence_Bert(pretrained_repo)
model.to(device)
# Inference only: put the wrapper and all of its submodules in evaluation mode
# explicitly instead of relying on how the backbone was loaded.
model.eval()

# No gradients are needed to extract embeddings.
with torch.no_grad():
    for batch in dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        print("batch: ", batch)

        embeddings = model(input_ids=batch['input_ids'], att_mask=batch['att_mask'])
        embedding_batches.append(embeddings)

# Concatenate the per-batch tensors along dimension 0.
all_embeddings = torch.cat(embedding_batches)

print("All embeddings:", all_embeddings)

batch:  {'input_ids': tensor([[    0,   100,   524,    10,  3645,    13,    61,    38,    74,   101,
             7,   120,    63, 33183, 11303,     2]], device='cuda:0'), 'att_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
input_ids:  tensor([[    0,   100,   524,    10,  3645,    13,    61,    38,    74,   101,
             7,   120,    63, 33183, 11303,     2]], device='cuda:0')
bert_out:  BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.4691,  0.3588, -0.3893,  ...,  0.9207,  0.3005, -0.6692],
         [ 1.1010,  2.2223, -0.7313,  ...,  1.1773, -0.2689, -1.0141],
         [ 1.3093,  2.2479, -0.0480,  ...,  1.0683,  0.2182,  0.1668],
         ...,
         [-0.5873, -0.0315,  0.2894,  ...,  1.2412,  0.6698, -0.5678],
         [-0.7918, -0.3601, -0.9375,  ...,  1.2360,  0.7064, -0.8657],
         [ 0.1682,  0.1122, -1.0189,  ...,  0.9203,  0.1400, -1.0907]]],
       device='cuda:0'), pooler_output=tensor([[-0.1987, -0.16