## Create Embeddings for the entire corpus

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from tqdm import tqdm

# Load tokenizer and model
model_name = "Narrativa/legal-longformer-base-4096-spanish"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: loading this checkpoint warns that the pooler weights are newly
# initialized, so only the encoder's hidden states are meaningful without
# further training.
model = AutoModel.from_pretrained(model_name)

# Replicate the model across GPUs when more than one is visible.
# NOTE(review): DataParallel scatters inputs along the batch dimension, so it
# only helps when a forward pass receives more than one item at a time.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = torch.nn.DataParallel(model)

# Prefer the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Ensure the model is in evaluation mode. The trailing semicolon stops the
# notebook from echoing the entire module repr as the cell's output.
model.eval();


Some weights of RobertaModel were not initialized from the model checkpoint at Narrativa/legal-longformer-base-4096-spanish and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 2 GPUs


DataParallel(
  (module): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

### Load Dataset

In [None]:
# Read the corpus CSV and collect the contents of its 'text' column as a list
df = pd.read_csv('corpus/corpus.csv')  # Replace with your dataset path
texts = df['text'].to_list()

### Create Embeddings

In [None]:
# Function to get embeddings
def get_embedding(text, model, tokenizer, device, max_length=4096):
    """Embed ``text`` as the final hidden state of its first token.

    Args:
        text: Input handed to ``tokenizer``.
        model: Callable invoked as ``model(**inputs)``; its result must expose
            a ``last_hidden_state`` tensor.
        tokenizer: Callable that accepts ``return_tensors``, ``max_length`` and
            ``truncation`` and returns a mapping of tensors.
        device: Device every tokenizer output tensor is moved to before the
            forward pass.
        max_length: Token limit forwarded to the tokenizer (truncation is
            always enabled). Defaults to 4096, the value that was previously
            hard-coded.

    Returns:
        A NumPy array taken from ``last_hidden_state[:, 0, :]``. For a single
        text this has shape ``(1, hidden_size)``; callers that need a 1-D
        vector must flatten it themselves.
    """
    # Tokenize the input text and convert to tensors
    inputs = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True)
    # Move inputs to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}
    # Inference only: skip gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the first-token (CLS) representation as the text embedding;
    # move it to the CPU before converting to NumPy
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()

  0%|          | 0/5000 [00:00<?, ?it/s]2024-09-25 21:35:13.344187: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-25 21:35:13.357378: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-25 21:35:13.374121: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-25 21:35:13.379053: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-25 21:35:13.392026: I t

ValueError: Must pass 2-d input. shape=(5000, 1, 768)

In [None]:
# List to store embeddings (one array per document, in corpus order)
embeddings = []

# Create embeddings for each document
for text in tqdm(texts):
    embedding = get_embedding(text, model, tokenizer, device)
    embeddings.append(embedding)

# Convert to DataFrame for easy handling.
# Each stored embedding has shape (1, hidden_size), so passing the list
# straight to pd.DataFrame yields 3-D data and raises
# "ValueError: Must pass 2-d input". Flatten every embedding to 1-D first so
# the frame has one row per document; `embeddings` itself is left unchanged.
embeddings_df = pd.DataFrame([embedding.reshape(-1) for embedding in embeddings])

### Save Embeddings to file for future use

In [None]:
# Flatten each (1, 768) embedding into a 1-D (768,) vector
embeddings_2d = []
for embedding in embeddings:
    embeddings_2d.append(embedding.reshape(-1))

# Build a DataFrame with one row per flattened embedding
embeddings_df = pd.DataFrame(data=embeddings_2d)

In [None]:
# Persist the embeddings as CSV, omitting the DataFrame index column
embeddings_df.to_csv(path_or_buf='corpus/corpus_embeddings.csv', index=False)

In [None]:
# Sanity check: show the Python type of the first stored embedding
first_embedding = embeddings[0]
print(type(first_embedding))

<class 'numpy.ndarray'>
