# BENCHMARKING



Carga de Variables de entorno

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

### 1.- Pipeline RAG Baseline

Carga de Documento

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv

load_dotenv()

file_path = (
    "../data/biblioteca-de-alimentos.pdf"
)
loader = PyPDFLoader(file_path)
docs = loader.load()

Chunking

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
splits = text_splitter.split_documents(docs)

Creación de Embeddings

In [7]:
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
import os
#embeddings = OllamaEmbeddings(model="llama3")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small",
                            )

Creación de Base de Datos Vectorial y Método de Retrieval

In [8]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode

qdrant = QdrantVectorStore.from_documents(
    splits,
    embedding=embeddings,
    location=":memory:",
    collection_name="my_documents",
    retrieval_mode=RetrievalMode.DENSE,
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
retriever = qdrant.as_retriever()

In [10]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_ollama import OllamaLLM
from langchain_openai import ChatOpenAI


#llm = ChatOpenAI(model="gpt-4-turbo-preview") 

llm = ChatOpenAI(model="gpt-3.5-turbo") 

# Define prompt template
template = """Utilize the retrieved context below to answer the question.
If you're unsure of the answer, simply state you don't know and apologies in portuguese.
Keep your response concise, limited to two sentences.
Question: {question}
Context: {context}
"""

prompt = ChatPromptTemplate.from_template(template)

# Setup RAG pipeline
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [43]:
rag_chain.invoke("What is the purpose of the regulation?")

'The purpose of the regulation is to review and revise the regulations concerning the regularization of food products exempt from registration.'

### 2.- Pasos previos a la Evaluación

a) Creación del Dataset de preuntas y respuestas

In [12]:
QA_generation_prompt = ChatPromptTemplate.from_template("""
Your task is to write a factoid question and an answer given a context in portuguese.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}
Output:::""")

# Create a chain to create a question
question_chain = (
    {"context": RunnablePassthrough()}
    | QA_generation_prompt
    | llm
    | StrOutputParser()
)

b) Obtener una Muestra Aleatoria de documentos

In [13]:
import random
from tqdm import tqdm


# Sample 15 documents to generate questions

sampled_docs = random.sample(docs, 15)

# Generate questions for each document
sampled_docs_processed = [doc.page_content for doc in sampled_docs]

c) Generar Preguntas y respuestas

In [14]:
# Generate questions

questions = [question_chain.invoke({"context": sampled_context}) for sampled_context in tqdm(sampled_docs_processed)]

100%|██████████| 15/15 [00:11<00:00,  1.28it/s]


Visualización de preguntas y respuestas

In [15]:
questions

['Factoid question: Quais são os limites máximos tolerados (LMT) de contaminantes em alimentos estabelecidos pela RDC 722/2022?\nAnswer: Os limites máximos tolerados (LMT) de contaminantes em alimentos são estabelecidos pela RDC 722/2022.',
 'Factoid question: Qual é o tema regulatório 3.30 da Agenda Regulatória 2024/2025?\nAnswer: Atualização periódica da lista de espécies vegetais autorizadas, as designações, a composição de ácidos graxos e os valores máximos de acidez e de índice de peróxidos para óleos e gorduras vegetais.',
 'Factoid question: Quais são os procedimentos para autorização de uso de aditivos alimentares e coadjuvantes de tecnologia?\nAnswer: Procedimentos para autorização de uso de aditivos alimentares e coadjuvantes de tecnologia são descritos na seção 1.5.',
 'Factoid question: Qual é o ato que estabelece os requisitos técnicos para declaração da rotulagem nutricional nos alimentos embalados?\nAnswer: IN 75/2020',
 'Factoid question: Quais são os documentos relacio

Parsing de la Preguntas

In [16]:
questions_processed = []
ground_truth = []
for question in questions:
    questions_processed.append(question.split("Factoid question: ")[-1].split("Answer: ")[0])
    ground_truth.append(question.split("Factoid question: ")[-1].split("Answer: ")[1])

In [17]:
contexts = []
answers = []
# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


d) Generación de Diccionario

In [18]:
data = {
    "question": questions,
    "answer": answers,
    "reference": ground_truth,
    "retrieved_contexts": contexts
}

In [17]:
#!pip install datasets

In [19]:
from datasets import Dataset

# Convert dict to dataset
dataset = Dataset.from_dict(data)

### 3. Evaluación de Métricas on RAGAS

In [20]:
# !pip intall ragas

In [21]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness, # Measures how well the model generates answers that are faithful to the context
    answer_relevancy, # Measures how well the model generates answers that are relevant to the question
    context_recall, # Context recall measures how well the model retrieves relevant context
    context_precision, # Context precision measures how well the model retrieves only relevant context
)

In [22]:
result = evaluate(
    dataset = dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[
        context_recall,
        faithfulness,
        answer_relevancy,
        context_precision,
    ],)

df = result.to_pandas()
df

Evaluating: 100%|██████████| 60/60 [00:24<00:00,  2.41it/s]


Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,faithfulness,answer_relevancy,context_precision
0,Factoid question: Quais são os limites máximos...,[Documento de Perguntas e Respostas sobre Cont...,"Não sei, peço desculpas.",Os limites máximos tolerados (LMT) de contamin...,1.0,0.0,0.0,1.0
1,Factoid question: Qual é o tema regulatório 3....,[1.18. Requisitos sanitários para óleos e gord...,"I don't know, apologies.",Atualização periódica da lista de espécies veg...,1.0,0.0,0.0,1.0
2,Factoid question: Quais são os procedimentos p...,[1.5. Procedimentos para autorização de uso de...,Procedimentos para autorização de uso de aditi...,Procedimentos para autorização de uso de aditi...,1.0,1.0,0.843838,1.0
3,Factoid question: Qual é o ato que estabelece ...,[IN 75/2020 - Estabelece os requisitos técnic...,IN 75/2020.,IN 75/2020,1.0,1.0,0.0,1.0
4,Factoid question: Quais são os documentos rela...,[Documentos relacionados: \nPerguntas e Respos...,Os documentos relacionados aos requisitos sani...,Perguntas e Respostas sobre Suplementos Alimen...,0.5,0.5,0.823786,1.0
5,Factoid question: Qual é a lei que dispõe sobr...,[3.5. Regularização da doação de alimentos com...,Lei 14.016/2020.,Lei 14.016/2020,1.0,1.0,0.560817,1.0
6,Factoid question: Quando foi a última atualiza...,[BIBLIOTECA DE \nALIMENTOS \nAtualizada em 06....,Atualizada em 06.11.2024.,Atualizada em 06.11.2024,1.0,1.0,0.0,0.805556
7,Factoid question: O que é o Programa de contro...,[4.2. Programa de controle de alergênicos em a...,"Desculpe, não sei a resposta.",Um programa que visa controlar a presença de s...,1.0,0.0,0.0,0.416667
8,Factoid question: Qual a lei que define o Sist...,"[1. Regularização, avaliação de risco e padrõe...",Lei 9.782/1999.,Lei 9.782/1999,1.0,1.0,0.0,0.75
9,Factoid question: Quais normas foram alteradas...,[RDC 48/2014 – Altera a RDC 45/2011. \nRDC 241...,"RDC 43, 44, and 45/2011 were altered by RDC 42...","RDC 43, 44, e 45/2011 foram alteradas pela RDC...",1.0,1.0,0.704515,1.0


Generación de Archivo CSV con resultados

In [23]:
df.to_csv("results/baseline_ragas_results.csv", index=False)

Promedio de Métricas

In [24]:
# get mean of the metrics column by column
print("Mean Faithfulness: ", round(df["faithfulness"].mean(), 4))
print("Mean Answer relevancy: ", round(df["answer_relevancy"].mean(), 4))
print("Mean Context recall: ", round(df["context_recall"].mean(), 4))
print("Mean Context precision: ", round(df["context_precision"].mean(), 4))


Mean Faithfulness:  0.7667
Mean Answer relevancy:  0.2745
Mean Context recall:  0.9667
Mean Context precision:  0.9074


### 4. Agregando un paso de 'Rerankeo'

Esto permite reordenar los documentos de acuerdo a su relevancia semantica

In [25]:
query = "What is the purpose of the regulation?"

retrieved_docs = retriever.get_relevant_documents(query, kwargs={"k": 10})

In [26]:
retrieved_docs

[Document(metadata={'source': '../data/biblioteca-de-alimentos.pdf', 'page': 4, '_id': 'a2df4122737347ceb8eaf82a332a176b', '_collection_name': 'my_documents'}, page_content='Tema Regulatório 3.22 da Agenda Regulatória 2024/2025: Revisão da regulamentação sobre regularização  \nde alimentos dispensados de registro.'),
 Document(metadata={'source': '../data/biblioteca-de-alimentos.pdf', 'page': 17, '_id': 'ee4f21b81fcd406fb20670cd62b132af', '_collection_name': 'my_documents'}, page_content='Equivalente (AREE) \n \nTema Regulatório 3.1 da Agenda Regulatória 2024/2025 - A regulamentar. \n1.26. Regulamentação dos alimentos para fins médicos \n \nTema Regulatório 3.10 da Agenda Regulatória 2024/2025 - A regulamentar.'),
 Document(metadata={'source': '../data/biblioteca-de-alimentos.pdf', 'page': 17, '_id': '99d0e4de7a124e748db4de89ee8cb7a2', '_collection_name': 'my_documents'}, page_content='Tema Regulatório 3.16 da Agenda Regulatória 2024/2025: Revisão da regulamentação de autorização de us

Mediante Cohere (no utilizado)

In [26]:
# import cohere as co
# cohere_client = co.Client(os.getenv("COHERE_API_KEY"))
# def rerank_docs(query, retrieved_docs):
#     reranked_docs = cohere_client.rerank(
#         model="rerank-english-v3.0",
#         query=query,
#         documents=retrieved_docs,
#         rank_fields=["page_content"],
#         return_documents=True
#     )
#     return reranked_docs


Mediante Rerankers

In [36]:
from rerankers import Reranker

def open_source_reranker(query, retrieved_docs):
    reranker = Reranker('cross-encoder', verbose=0,model_type='cross-encoder',lang='pt')	
    #reranker = Reranker("colbert") # colber model used for reranking
    retrieved_docs = [doc.page_content for doc in retrieved_docs]
    reranked_docs = reranker.rank(query, retrieved_docs)
    return reranked_docs


In [37]:
reranked_docs = open_source_reranker(query, retrieved_docs)

Loading default cross-encoder model for language pt


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [38]:
reranked_docs.results

[Result(document=Document(document_type='text', text='Tema Regulatório 3.6 da Agenda Regulatória 2024/2025: Regulamentação da declaração quantitativa de  \ningredientes na rotulagem de alimentos embalados.', base64=None, image_path=None, doc_id=2, metadata={}), score=-0.21018770337104797, rank=1),
 Result(document=Document(document_type='text', text='Tema Regulatório 3.23 da Agenda Regulatória 2024/2025: Revisão da regulamentação sobre rotulagem  dos \nprincipais alimentos alergênicos. \nTema Regulatório 3.24 da Agenda Regulatória 2024/2025: Revisão da regulamentação sobre rotulagem geral', base64=None, image_path=None, doc_id=1, metadata={}), score=-1.5647499561309814, rank=2),
 Result(document=Document(document_type='text', text='Tema Regulatório 3.13 da Agenda Regulatória 2024/2025: Revisão da lista positiva de aditivos destinados à  \nelaboração de materiais plásticos e revestimentos poliméricos em contato com alimentos.', base64=None, image_path=None, doc_id=3, metadata={}), score

In [39]:
contexts = []
answers = []
# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    retrieved_docs = retriever.get_relevant_documents(query)
    reranked_docs = open_source_reranker(query, retrieved_docs)
    if reranked_docs.results:  # Check if there are any results
        contexts.append([reranked_docs.results[0].document.text])

data = {
    "question": questions,
    "answer": answers,
    "reference": ground_truth,
    "retrieved_contexts": contexts
}

Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt
Loading default cross-encoder model for language pt


In [40]:
reranked_dataset = Dataset.from_dict(data)
result = evaluate(
    dataset = reranked_dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[
        context_recall,
        faithfulness,
        answer_relevancy,
        context_precision,
    ],)
reranked_df = result.to_pandas()
reranked_df.to_csv("results/reranked_baseline_ragas_results.csv", index=False)


Evaluating:  15%|█▌        | 9/60 [00:03<00:14,  3.46it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 60/60 [00:22<00:00,  2.68it/s]


In [41]:
# get mean of the metrics column by column
print("Mean Faithfulness: ", round(reranked_df["faithfulness"].mean(), 4))
print("Mean Answer relevancy: ", round(reranked_df["answer_relevancy"].mean(), 4))
print("Mean Context recall: ", round(reranked_df["context_recall"].mean(), 4))
print("Mean Context precision: ", round(reranked_df["context_precision"].mean(), 4))

Mean Faithfulness:  0.5714
Mean Answer relevancy:  0.2812
Mean Context recall:  1.0
Mean Context precision:  0.8667
