# BENCHMARKING



Carga de Variables de entorno

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

### 1.- Pipeline RAG Baseline

Carga de Documento

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv

load_dotenv()

file_path = (
    "../../data/biblioteca-de-alimentos.pdf"
)
loader = PyPDFLoader(file_path)
docs = loader.load()

Chunking

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
splits = text_splitter.split_documents(docs)

Creación de Embeddings

In [5]:
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
import os
#embeddings = OllamaEmbeddings(model="llama3")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small",
                            )

Creación de Base de Datos Vectorial y Método de Retrieval

In [6]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode

qdrant = QdrantVectorStore.from_documents(
    splits,
    embedding=embeddings,
    location=":memory:",
    collection_name="my_documents",
    retrieval_mode=RetrievalMode.DENSE,
)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
retriever = qdrant.as_retriever()

In [8]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_ollama import OllamaLLM
from langchain_openai import ChatOpenAI


#llm = ChatOpenAI(model="gpt-4-turbo-preview") 

llm = ChatOpenAI(model="gpt-3.5-turbo") 

# Define prompt template
template = """Utilize the retrieved context below to answer the question.
If you're unsure of the answer, simply state you don't know and apologies
Keep your response concise, limited to two sentences.
Question: {question}
Context: {context}
"""

prompt = ChatPromptTemplate.from_template(template)

# Setup RAG pipeline
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [9]:
rag_chain.invoke("What is the purpose of the regulation?")

'The purpose of the regulation is to review and revise the regulations governing the registration exemptions for certain foods.'

### 2.- Pasos previos a la Evaluación

a) Creación del Dataset de preuntas y respuestas

In [10]:
QA_generation_prompt = ChatPromptTemplate.from_template("""
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}
Output:::""")

# Create a chain to create a question
question_chain = (
    {"context": RunnablePassthrough()}
    | QA_generation_prompt
    | llm
    | StrOutputParser()
)

b) Obtener una Muestra Aleatoria de documentos

In [11]:
import random
from tqdm import tqdm


# Sample 15 documents to generate questions

sampled_docs = random.sample(docs, 15)

# Generate questions for each document
sampled_docs_processed = [doc.page_content for doc in sampled_docs]

c) Generar Preguntas y respuestas

In [12]:
# Generate questions

questions = [question_chain.invoke({"context": sampled_context}) for sampled_context in tqdm(sampled_docs_processed)]

100%|██████████| 15/15 [00:10<00:00,  1.48it/s]


Visualización de preguntas y respuestas

In [13]:
questions

['Factoid question: What is the theme of Regulatory Theme 3.5 in the 2024/2025 Regulatory Agenda?\nAnswer: Reavaliação da autorização de uso do aditivo alimentar dióxido de titânio em alimentos.',
 'Factoid question: What sector does the Anvisa coordinate, supervise, and control activities in?\nAnswer: In the sector of foods',
 'Factoid question: What is the law that mandates the iodization of salt for human consumption?\nAnswer: Lei 6.150/1974 – Dispõe sobre a obrigatoriedade da iodação do sal, destinado ao consumo humano, seu controle pelos órgãos sanitários e dá outras providências',
 'Factoid question: When was the Biblioteca de Alimentos last updated?\nAnswer: Atualizada em 06.11.2024',
 'Factoid question: What is the theme of the regulatory agenda 2024/2025 related to contaminants in food?\nAnswer: Theme Regulatório 3.17 da Agenda Regulatória 2024/2025: Revisão da regulamentação de contaminantes em alimentos.',
 'Factoid question: What is the regulation number that prohibits the 

Parsing de la Preguntas

In [14]:
questions_processed = []
ground_truth = []
for question in questions:
    questions_processed.append(question.split("Factoid question: ")[-1].split("Answer: ")[0])
    ground_truth.append(question.split("Factoid question: ")[-1].split("Answer: ")[1])

In [15]:
contexts = []
answers = []
# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


d) Generación de Diccionario

In [16]:
data = {
    "question": questions,
    "answer": answers,
    "reference": ground_truth,
    "retrieved_contexts": contexts
}

In [17]:
#!pip install datasets

In [18]:
from datasets import Dataset

# Convert dict to dataset
dataset = Dataset.from_dict(data)

### 3. Evaluación de Métricas on RAGAS

In [19]:
# !pip intall ragas

In [20]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness, # Measures how well the model generates answers that are faithful to the context
    answer_relevancy, # Measures how well the model generates answers that are relevant to the question
    context_recall, # Context recall measures how well the model retrieves relevant context
    context_precision, # Context precision measures how well the model retrieves only relevant context
)

In [21]:
result = evaluate(
    dataset = dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[
        context_recall,
        faithfulness,
        answer_relevancy,
        context_precision,
    ],)

df = result.to_pandas()
df

Evaluating:  23%|██▎       | 14/60 [00:05<00:14,  3.09it/s]No statements were generated from the answer.
Evaluating:  48%|████▊     | 29/60 [00:11<00:11,  2.70it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 60/60 [00:22<00:00,  2.69it/s]


Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,faithfulness,answer_relevancy,context_precision
0,Factoid question: What is the theme of Regulat...,[1.5. Procedimentos para autorização de uso de...,"I don't know, apologies.",Reavaliação da autorização de uso do aditivo a...,1.0,0.0,0.0,0.833333
1,Factoid question: What sector does the Anvisa ...,"[No setor de alimentos, a Anvisa coordena, sup...",In the sector of foods.,In the sector of foods,1.0,1.0,0.0,0.916667
2,Factoid question: What is the law that mandate...,[Lei 6.150/1974 – Dispõe sobre a obrigatorieda...,Lei 6.150/1974 mandates the iodization of salt...,Lei 6.150/1974 – Dispõe sobre a obrigatoriedad...,1.0,1.0,0.55305,1.0
3,Factoid question: When was the Biblioteca de A...,[BIBLIOTECA DE \nALIMENTOS \nAtualizada em 06....,The Biblioteca de Alimentos was last updated o...,Atualizada em 06.11.2024,1.0,1.0,0.827114,1.0
4,Factoid question: What is the theme of the reg...,[IN 297/2024 \nIN 303/2024 \nIN 306/2024 \n1.6...,The theme of the regulatory agenda 2024/2025 r...,Theme Regulatório 3.17 da Agenda Regulatória 2...,1.0,1.0,0.849043,1.0
5,Factoid question: What is the regulation numbe...,"[benzopireno, com identificação do lote e ou d...",RE 5052/2011,RE 5052/2011,1.0,,0.0,0.833333
6,Factoid question: What law obliges products to...,[de 2003. \n \nLei 10.674/2003 – Obriga a que...,Lei 10.674/2003.,Lei 10.674/2003,1.0,1.0,0.0,1.0
7,Factoid question: What is the theme of Regulat...,[1.8. Resíduos de medicamentos veterinários em...,"I don't know, apologies.","Atualização periódica da lista de LMR, IDA, DR...",1.0,0.0,0.0,1.0
8,Factoid question: What are the procedures for ...,[1.1. Procedimentos para regularização de alim...,"I'm sorry, I don't have the specific procedure...",Procedimentos para regularização de alimentos,1.0,0.0,0.0,0.5
9,Factoid question: What RDC established the req...,[RDC 241/2018 – Requisitos para comprovação da...,RDC 241/2018,RDC 241/2018,1.0,,0.0,1.0


Generación de Archivo CSV con resultados

In [22]:
df.to_csv("results/baseline_ragas_results.csv", index=False)

Promedio de Métricas

In [23]:
# get mean of the metrics column by column
print("Mean Faithfulness: ", round(df["faithfulness"].mean(), 4))
print("Mean Answer relevancy: ", round(df["answer_relevancy"].mean(), 4))
print("Mean Context recall: ", round(df["context_recall"].mean(), 4))
print("Mean Context precision: ", round(df["context_precision"].mean(), 4))


Mean Faithfulness:  0.5385
Mean Answer relevancy:  0.2429
Mean Context recall:  1.0
Mean Context precision:  0.8944


### 4. Agregando un paso de 'Rerankeo'

Esto permite reordenar los documentos de acuerdo a su relevancia semantica

In [24]:
query = "What is the purpose of the regulation?"

retrieved_docs = retriever.get_relevant_documents(query, kwargs={"k": 10})

In [25]:
retrieved_docs

[Document(metadata={'source': '../../data/biblioteca-de-alimentos.pdf', 'page': 4, '_id': '818add6f810445228121ec513e731d5d', '_collection_name': 'my_documents'}, page_content='Tema Regulatório 3.22 da Agenda Regulatória 2024/2025: Revisão da regulamentação sobre regularização  \nde alimentos dispensados de registro.'),
 Document(metadata={'source': '../../data/biblioteca-de-alimentos.pdf', 'page': 17, '_id': '41923215d87b498eb6d86ed069c4ee8a', '_collection_name': 'my_documents'}, page_content='Equivalente (AREE) \n \nTema Regulatório 3.1 da Agenda Regulatória 2024/2025 - A regulamentar. \n1.26. Regulamentação dos alimentos para fins médicos \n \nTema Regulatório 3.10 da Agenda Regulatória 2024/2025 - A regulamentar.'),
 Document(metadata={'source': '../../data/biblioteca-de-alimentos.pdf', 'page': 17, '_id': 'cee34825316546229774724634f14861', '_collection_name': 'my_documents'}, page_content='Tema Regulatório 3.16 da Agenda Regulatória 2024/2025: Revisão da regulamentação de autoriza

Mediante Cohere (no utilizado)

In [26]:
# import cohere as co
# cohere_client = co.Client(os.getenv("COHERE_API_KEY"))
# def rerank_docs(query, retrieved_docs):
#     reranked_docs = cohere_client.rerank(
#         model="rerank-english-v3.0",
#         query=query,
#         documents=retrieved_docs,
#         rank_fields=["page_content"],
#         return_documents=True
#     )
#     return reranked_docs


Mediante Rerankers

In [27]:
from rerankers import Reranker

def open_source_reranker(query, retrieved_docs):
    reranker = Reranker('cross-encoder', verbose=0,model_type='cross-encoder')
    #reranker = Reranker("colbert") # colber model used for reranking
    retrieved_docs = [doc.page_content for doc in retrieved_docs]
    reranked_docs = reranker.rank(query, retrieved_docs)
    return reranked_docs


In [28]:
reranked_docs = open_source_reranker(query, retrieved_docs)

Loading default cross-encoder model for language en


In [29]:
reranked_docs.results

[Result(document=Document(document_type='text', text='Tema Regulatório 3.16 da Agenda Regulatória 2024/2025: Revisão da regulamentação de autorização de uso \ne de rotulagem de aditivos edulcorantes em alimentos.', base64=None, image_path=None, doc_id=2, metadata={}), score=-1.9532668590545654, rank=1),
 Result(document=Document(document_type='text', text='Tema Regulatório 3.16 da Agenda Regulatória 2024/2025: Revisão da regulamentação de autorização de uso \ne de rotulagem de aditivos edulcorantes em alimentos.', base64=None, image_path=None, doc_id=3, metadata={}), score=-1.9532668590545654, rank=2),
 Result(document=Document(document_type='text', text='Tema Regulatório 3.22 da Agenda Regulatória 2024/2025: Revisão da regulamentação sobre regularização  \nde alimentos dispensados de registro.', base64=None, image_path=None, doc_id=0, metadata={}), score=-2.389258623123169, rank=3),
 Result(document=Document(document_type='text', text='Equivalente (AREE) \n \nTema Regulatório 3.1 da A

In [30]:
contexts = []
answers = []
# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    retrieved_docs = retriever.get_relevant_documents(query)
    reranked_docs = open_source_reranker(query, retrieved_docs)
    if reranked_docs.results:  # Check if there are any results
        contexts.append([reranked_docs.results[0].document.text])

data = {
    "question": questions,
    "answer": answers,
    "reference": ground_truth,
    "retrieved_contexts": contexts
}

Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en
Loading default cross-encoder model for language en


In [None]:
reranked_dataset = Dataset.from_dict(data)
result = evaluate(
    dataset = reranked_dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[
        context_recall,
        faithfulness,
        answer_relevancy,
        context_precision,
    ],)
reranked_df = result.to_pandas()
reranked_df.to_csv("results/reranked_baseline_ragas_results.csv", index=False)


Evaluating:  52%|█████▏    | 31/60 [00:09<00:08,  3.51it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 60/60 [00:19<00:00,  3.12it/s]


In [32]:
# get mean of the metrics column by column
print("Mean Faithfulness: ", round(reranked_df["faithfulness"].mean(), 4))
print("Mean Answer relevancy: ", round(reranked_df["answer_relevancy"].mean(), 4))
print("Mean Context recall: ", round(reranked_df["context_recall"].mean(), 4))
print("Mean Context precision: ", round(reranked_df["context_precision"].mean(), 4))

Mean Faithfulness:  0.6429
Mean Answer relevancy:  0.3811
Mean Context recall:  1.0
Mean Context precision:  0.8667
