## Retrieve

### Import libraries and load model

In [8]:
import faiss
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint used for both the tokenizer and the encoder
model_name = "Narrativa/legal-longformer-base-4096-spanish"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Prefer the GPU when CUDA reports one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Place the weights on the chosen device and switch to inference mode
# (as the cell's last expression, this also displays the module summary)
model.to(device).eval()

Some weights of RobertaModel were not initialized from the model checkpoint at Narrativa/legal-longformer-base-4096-spanish and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(52000, 768, padding_idx=1)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropo

### Create index with FAISS

In [11]:
# Read the pre-computed document embeddings; each CSV row is treated as one vector
embeddings_df = pd.read_csv('corpus/corpus_embeddings.csv')  # Replace with your path
embedding_matrix = embeddings_df.values

# Scale every row to unit L2 norm
row_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
document_embeddings = embedding_matrix / row_norms

# Faiss is given a C-contiguous float32 array
document_embeddings = np.ascontiguousarray(document_embeddings, dtype=np.float32)

# Flat inner-product index sized to the embedding width, filled with all rows
embedding_dim = document_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)  # IP = Inner Product (dot product)
index.add(document_embeddings)

### Function to get embedding for text

In [12]:
# Function to get query embedding
def get_embedding(text, model, tokenizer, device):
    """Encode ``text`` with ``model`` and return its first-position hidden state.

    Args:
        text: Input handed to ``tokenizer``; it is truncated to at most 4096 tokens.
        model: Callable accepting the tokenizer's fields as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing a mapping of PyTorch tensors.
        device: Device the tokenized tensors are moved to before the forward pass.

    Returns:
        A flattened (1-D) NumPy array holding ``last_hidden_state[:, 0, :]``.
        NOTE(review): with more than one input text the per-text vectors would be
        concatenated by the final ``reshape(-1)`` — callers here pass one string.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=4096,
        truncation=True,
        padding=True,
    )

    # Every tensor must live on the same device as the model
    model_inputs = {}
    for field_name, tensor in encoded.items():
        model_inputs[field_name] = tensor.to(device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        outputs = model(**model_inputs)

    first_position = outputs.last_hidden_state[:, 0, :]
    return first_position.cpu().numpy().reshape(-1)


### Get query embedding

In [25]:
# Text to search the corpus for
query = "SR. HANS FRIEDICH SCHUCHARDT"  # Replace with your query

# Embed the query with the same model used in get_embedding
query_vector = get_embedding(query, model, tokenizer, device)

# Rescale to unit L2 norm
unit_query_vector = query_vector / np.linalg.norm(query_vector)

# Faiss expects a 2D array, so add a leading axis of length 1
query_embedding = unit_query_vector[np.newaxis, ...]

### Search and retrieve top k documents

In [26]:
# Ask the index for the 5 nearest documents; D holds the scores, I the row indices
D, I = index.search(query_embedding, k=5)  # k is the number of top results to return

# Only one query was submitted, so row 0 carries all of its results
top_ids, top_scores = I[0], D[0]
print("Top 5 similar documents (indices):", top_ids)
print("Similarity scores:", top_scores)

Top 5 similar documents (indices): [2276 3679 1979 2727 4854]
Similarity scores: [0.6976906  0.57066333 0.55561125 0.5490911  0.54366434]


In [27]:
import pandas as pd

# Read the original corpus
df = pd.read_csv('corpus/corpus.csv')  # Replace with your actual dataset path

# Collect the contents of the 'text' column as a plain Python list
texts = df.loc[:, 'text'].to_list()

In [28]:
# Row positions of the nearest neighbours for the first (and only) query;
# `I` is the index matrix returned by the Faiss search above.
top_indices = I[0]

# "Codigo" identifiers in corpus row order.
# NOTE(review): this positional lookup assumes row i of corpus/corpus.csv is the
# document behind embedding i in the Faiss index — confirm the two files align.
codigos = df['Codigo'].tolist()

# Translate row positions into "Codigo" values. Faiss fills unused result slots
# with -1 when it finds fewer than k neighbours; a negative index would silently
# pick a document from the end of the list, so those slots are skipped.
top_codigos = [codigos[idx] for idx in top_indices if idx >= 0]

# Print the mapped "Codigo" values
print("Top 5 similar documents (Codigo values):", top_codigos)

Top 5 similar documents (Codigo values): [40300, 87051, 78677, 72793, 40790]


In [29]:
# Select the rows whose "Codigo" equals the one of interest
matches_codigo = df["Codigo"] == 87051

# Take the 'text' of the first matching row and show it
text = df.loc[matches_codigo, "text"].values[0]
print(text)

EXPEDIENTE: RECURSO EXTRAORDINARIO DE
CORTE REVISION INTERPUFSTO POR FL SR. CELSO
SUPREMA RODRIGUEZ CABELLO POR DERECHO PROPIO Y
peJUSTICIA BAJO PATROCINIO DEL ABG. ANDRES RAMON
RECALDE EN EL EXPTE MINISTERIO PUBLICO C 
CELSO RODRIGUEZ CABELLO S S.H.P. C LA
A PROPIEDAD - ESTAFA , w ene eneeoneesneesweenneeneeeneesneene
 
a ) ACUERDO Y SENTENCIA i ata y Jiele. 
la ciudad de Asuncion, capital de la Republica del Paraguay, alos ... UY...
dias del mes de Novien! .. del afio dos mil veintiuno, estando presentes en la Sala de
Acuerdos, los sefiores ministros de la Excma. Corte Suprema de Justicia, Sala Penal, Doctores
LUIS MARIA BENITEZ RIERA, MANUEL DEJESUS RAMIREZ CANDIA y MARIA
CAROLINA LLANES, ante mi la Secretaria Autorizante, se trajo a estudio el expediente
caratulado: RECURSO EXTRAORDINARIO DE REVISION INTERPUESTO POR EL
SR. CELSO RODRIGUEZ CABELLO POR DERECHO PROPIO Y BAJO PATROCINIO
DEL ABG. ANDRES RAMON RECALDE EN EL EXPTE MINISTERIO PUBLICO C 
CELSO RODRIGUEZ CABELLO S S.H.P. C L