# BENCHMARKING



Carga de Variables de entorno

In [60]:
from dotenv import load_dotenv
load_dotenv()

True

### 1.- Pipeline RAG Baseline

Carga de Documento

In [22]:
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv

load_dotenv()

file_path = (
    "data/biblioteca-de-alimentos.pdf"
)
loader = PyPDFLoader(file_path)
docs = loader.load()

Chunking

In [23]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
splits = text_splitter.split_documents(docs)

Creación de Embeddings

In [24]:
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
import os
#embeddings = OllamaEmbeddings(model="llama3")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small",
                            )

Creación de Base de Datos Vectorial y Método de Retrieval

In [25]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode

qdrant = QdrantVectorStore.from_documents(
    splits,
    embedding=embeddings,
    location=":memory:",
    collection_name="my_documents",
    retrieval_mode=RetrievalMode.DENSE,
)

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
retriever = qdrant.as_retriever()

In [47]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_ollama import OllamaLLM
from langchain_openai import ChatOpenAI


#llm = ChatOpenAI(model="gpt-4-turbo-preview") 

llm = ChatOpenAI(model="gpt-3.5-turbo") 

# Define prompt template
template = """Utilize the retrieved context below to answer the question.
If you're unsure of the answer, simply state you don't know and apologies
Keep your response concise, limited to two sentences.
Question: {question}
Context: {context}
"""

prompt = ChatPromptTemplate.from_template(template)

# Setup RAG pipeline
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [48]:
rag_chain.invoke("What is the purpose of the regulation?")

'The purpose of the regulation is to review and update the regulations concerning the registration exemption for certain food products.'

### 2.- Pasos previos a la Evaluación

a) Creación del Dataset de preuntas y respuestas

In [49]:
QA_generation_prompt = ChatPromptTemplate.from_template("""
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}
Output:::""")

# Create a chain to create a question
question_chain = (
    {"context": RunnablePassthrough()}
    | QA_generation_prompt
    | llm
    | StrOutputParser()
)

b) Obtener una Muestra Aleatoria de documentos

In [50]:
import random
from tqdm import tqdm


# Sample 15 documents to generate questions

sampled_docs = random.sample(docs, 15)

# Generate questions for each document
sampled_docs_processed = [doc.page_content for doc in sampled_docs]

c) Generar Preguntas y respuestas

In [51]:
# Generate questions

questions = [question_chain.invoke({"context": sampled_context}) for sampled_context in tqdm(sampled_docs_processed)]

100%|██████████| 15/15 [00:09<00:00,  1.52it/s]


Visualización de preguntas y respuestas

In [52]:
questions

['Factoid question: What RDC altered the RDCs 43, 44, and 45 from 2011 and improved legislative techniques?\nAnswer: RDC 729/2022',
 'Factoid question: What is the document related to the restriction of use of industrial trans fats in foods?\nAnswer: Perguntas e Respostas – Requisitos para uso de Gorduras Trans Industriais em Alimentos.',
 'Factoid question: What is the document related to the conservation of food in the phases of transportation, commercialization, and consumption of perishable foods?\nAnswer: Resolução CISA/MA/MS nº 10, de 31/07/1984',
 'Factoid question: When was the Biblioteca de Alimentos last updated?\nAnswer: Atualizada em 06.11.2024',
 'Factoid question: What document establishes the requirements for nutritional labeling on packaged foods?\nAnswer: IN 75/2020 - Estabelece os requisitos técnicos para declaração da rotulagem nutricional nos alimentos embalados.',
 'Factoid question: What is the section number for Boas Práticas de Fabricação (BPF) para estabelecime

Parsing de la Preguntas

In [53]:
questions_processed = []
ground_truth = []
for question in questions:
    questions_processed.append(question.split("Factoid question: ")[-1].split("Answer: ")[0])
    ground_truth.append(question.split("Factoid question: ")[-1].split("Answer: ")[1])

In [54]:
contexts = []
answers = []
# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


d) Generación de Diccionario

In [39]:
data = {
    "question": questions,
    "answer": answers,
    "reference": ground_truth,
    "retrieved_contexts": contexts
}

In [None]:
#!pip install datasets

In [55]:
from datasets import Dataset

# Convert dict to dataset
dataset = Dataset.from_dict(data)

### 3. Evaluación de Métricas on RAGAS

In [None]:
# !pip intall ragas

In [56]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness, # Measures how well the model generates answers that are faithful to the context
    answer_relevancy, # Measures how well the model generates answers that are relevant to the question
    context_recall, # Context recall measures how well the model retrieves relevant context
    context_precision, # Context precision measures how well the model retrieves only relevant context
)

In [57]:
result = evaluate(
    dataset = dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[
        context_recall,
        faithfulness,
        answer_relevancy,
        context_precision,
    ],)

df = result.to_pandas()
df

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 60/60 [00:22<00:00,  2.67it/s]


Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,faithfulness,answer_relevancy,context_precision
0,Factoid question: What is the purpose of RDC 6...,[RDC 604/2022 - Enriquecimento obrigatório do ...,The purpose of RDC 604/2022 regulation is to m...,Enriquecimento obrigatório do sal com iodo e d...,1.0,0.5,0.591306,1.0
1,Factoid question: What is the purpose of RDC 2...,"[RDC 24/2010 – Dispõe sobre a oferta, propagan...",The purpose of RDC 24/2010 is to regulate the ...,"It regulates the offer, advertising, publicity...",1.0,0.5,0.64373,0.0
2,Factoid question: What regulatory item address...,[Tema Regulatório 3.20 da Agenda Regulatória 2...,The regulatory item that addresses the review ...,Tema Regulatório 3.20 da Agenda Regulatória 20...,1.0,,0.773108,0.75
3,Factoid question: What is the purpose of RDC 8...,[RDC 839/2023 - Comprovação de segurança e au...,The purpose of RDC 839/2023 is to ensure the s...,Comprovação de segurança e autorização de uso ...,1.0,1.0,0.677022,1.0
4,Factoid question: What substances are mandated...,[RDC 604/2022 - Enriquecimento obrigatório do ...,Iodo (iodine) is mandated to be added to salt ...,Iodo,1.0,1.0,0.669123,1.0
5,Factoid question: What is the purpose of IN 16...,[Ato relacionado: \nIN 162/2022 - Ingestão diá...,The purpose of IN 162/2022 is to establish the...,To establish the acceptable daily intake (IDA)...,1.0,1.0,0.5913,1.0
6,Factoid question: What is the purpose of Lei 1...,[3.5. Regularização da doação de alimentos com...,The purpose of Lei 14.016/2020 is to combat fo...,To combat food waste and donate surplus food f...,1.0,1.0,0.730867,1.0
7,Factoid question: What regulation updates the ...,[Alterada por: \nRDC 429/2020 - Rotulagem nutr...,The regulation that updates the requirements f...,RDC 429/2020,1.0,1.0,0.79394,1.0
8,Factoid question: What law defines the Nationa...,"[1. Regularização, avaliação de risco e padrõe...",The law that defines the National System of Sa...,Lei 9.782/1999,1.0,1.0,0.834102,0.833333
9,Factoid question: What law requires food produ...,[de 2003. \n \nLei 10.674/2003 – Obriga a que...,The law that requires food products to inform ...,Lei 10.674/2003,1.0,1.0,0.755992,1.0


Generación de Archivo CSV con resultados

In [58]:
df.to_csv("baseline_ragas_results.csv", index=False)

Promedio de Métricas

In [59]:
# get mean of the metrics column by column
print("Mean Faithfulness: ", round(df["faithfulness"].mean(), 4))
print("Mean Answer relevancy: ", round(df["answer_relevancy"].mean(), 4))
print("Mean Context recall: ", round(df["context_recall"].mean(), 4))
print("Mean Context precision: ", round(df["context_precision"].mean(), 4))


Mean Faithfulness:  0.8214
Mean Answer relevancy:  0.7223
Mean Context recall:  1.0
Mean Context precision:  0.8889


### 4. Agregando un paso de 'Rerankeo'

Esto permite reordenar los documentos de acuerdo a su relevancia semantica

In [61]:
query = "What is the purpose of the regulation?"

retrieved_docs = retriever.get_relevant_documents(query, kwargs={"k": 10})

In [19]:
retrieved_docs

[Document(metadata={'source': '../practicos-rag/data/benchmark_data/Reglamento 1333 2008.pdf', 'page': 3, '_id': '009810801d8a4da28d42c752624e9e3e', '_collection_name': 'my_documents'}, page_content='Regulation and to adopt appropriate transitional measures. Since those measures are\nof general scope and are designed to amend non-essential elements of this Regulation,'),
 Document(metadata={'source': '../practicos-rag/data/benchmark_data/Reglamento 1333 2008.pdf', 'page': 5, '_id': '8abb64a3aa7f4e768e993b256ed47a3d', '_collection_name': 'my_documents'}, page_content='HAVE ADOPTED THIS REGULATION:\nCHAPTER I\nSUBJECT MATTER, SCOPE AND DEFINITIONS\nArticle 1\nSubject matter\nThis Regulation lays down rules on food additives used in foods with a view to ensuring'),
 Document(metadata={'source': '../practicos-rag/data/benchmark_data/Reglamento 1333 2008.pdf', 'page': 6, '_id': '12467865f29c4033b63bcca25e844817', '_collection_name': 'my_documents'}, page_content='Regulation (EC) No 1333/200

Mediante Cohere (no utilizado)

In [99]:
# import cohere as co
# cohere_client = co.Client(os.getenv("COHERE_API_KEY"))
# def rerank_docs(query, retrieved_docs):
#     reranked_docs = cohere_client.rerank(
#         model="rerank-english-v3.0",
#         query=query,
#         documents=retrieved_docs,
#         rank_fields=["page_content"],
#         return_documents=True
#     )
#     return reranked_docs


Mediante Rerankers

In [None]:

from rerankers import Reranker

def open_source_reranker(query, retrieved_docs):
    #reranker = Reranker('cross-encoder', verbose=0,model_type='cross-encoder')
    reranker = Reranker("colbert") # colber model used for reranking
    retrieved_docs = [doc.page_content for doc in retrieved_docs]
    reranked_docs = reranker.rank(query, retrieved_docs)
    return reranked_docs


In [100]:
reranked_docs = open_source_reranker(query, retrieved_docs)

Loading default colbert model for language en
Default Model: colbert-ir/colbertv2.0
Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device mps
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting


In [101]:
reranked_docs.results

[Result(document=Document(document_type='text', text='E 170 Calcium carbonate\nE 260 Acetic acid\n[F64E 261 Potassium acetates]\nE 262 Sodium acetates\nE 263 Calcium acetate\nE 270 Lactic acid', base64=None, image_path=None, doc_id=2, metadata={}), score=0.8182083368301392, rank=1),
 Result(document=Document(document_type='text', text='E 968 Erythritol\n[F60E 969 Advantame]\n3. Additives other than colours and sweeteners\nE-number Name\nE 170 Calcium carbonate\n[F45E 172 Iron oxides and hydroxides]\nE 200 Sorbic acid\nE 202 Potassium sorbate\nF62\nE 210 Benzoic acid \na', base64=None, image_path=None, doc_id=3, metadata={}), score=0.8180067539215088, rank=2),
 Result(document=Document(document_type='text', text='E 529 Calcium oxide\nE 530 Magnesium oxide\n[F70E 534 Iron tartrate]\nE 535 Sodium ferrocyanide\nE 536 Potassium ferrocyanide\nE 538 Calcium ferrocyanide\nE 541 Sodium aluminium phosphate acidic\nE 551 Silicon dioxide\nE 552 Calcium silicate', base64=None, image_path=None, doc_

In [102]:
contexts = []
answers = []
# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    retrieved_docs = retriever.get_relevant_documents(query)
    reranked_docs = open_source_reranker(query, retrieved_docs)
    if reranked_docs.results:  # Check if there are any results
        contexts.append([reranked_docs.results[0].document.text])

data = {
    "question": questions,
    "answer": answers,
    "reference": ground_truth,
    "retrieved_contexts": contexts
}

Loading default colbert model for language en
Default Model: colbert-ir/colbertv2.0
Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device mps
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting
Loading default colbert model for language en
Default Model: colbert-ir/colbertv2.0
Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device mps
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting
Loading default colbert model for language en
Default Model: colbert-ir/colbertv2.0
Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device mps
No dtype set
Using dtype torch.float32
Loading model c

In [103]:
reranked_dataset = Dataset.from_dict(data)
result = evaluate(
    dataset = reranked_dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[
        context_recall,
        faithfulness,
        answer_relevancy,
        context_precision,
    ],)
reranked_df = result.to_pandas()
reranked_df.to_csv("reranked_ragas_results.csv", index=False)


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

In [104]:
# get mean of the metrics column by column
print("Mean Faithfulness: ", round(reranked_df["faithfulness"].mean(), 4))
print("Mean Answer relevancy: ", round(reranked_df["answer_relevancy"].mean(), 4))
print("Mean Context recall: ", round(reranked_df["context_recall"].mean(), 4))
print("Mean Context precision: ", round(reranked_df["context_precision"].mean(), 4))

Mean Faithfulness:  0.75
Mean Answer relevancy:  0.8118
Mean Context recall:  0.9333
Mean Context precision:  1.0
