# 3. Automatic Evaluation Dataset Generation for Ragas

In this notebook, we automatically generate a synthetic evaluation dataset that we can use to evaluate our RAG pipeline with Ragas. Ragas is a framework for evaluating Retrieval-Augmented Generation (RAG) pipelines.

Authors:
- Luis Bernardo Hernandez Salinas
- Juan R. Terven

In [None]:
import os
import random

import pandas as pd
from datasets import Dataset
from langchain.document_loaders import PyPDFLoader
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from tqdm import tqdm

In [None]:
# Name of the OpenAI chat model used by the generation cells below
llm_name = "gpt-4o"

# Look up the API key in the environment; a missing variable raises KeyError here.
# NOTE: despite its name, `client` holds the raw key string, not an API client object.
client = os.environ["OPENAI_API_KEY"]

print(f"Using model {llm_name}")

## Get the document splits

In [None]:
# Read the PDF; the number of loaded items is reported as the page count
loader = PyPDFLoader("chevrolet-spark.pdf")
pages = loader.load()
print(f"The document has {len(pages)} pages")

# Break the pages into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,   # chunk size in characters
    chunk_overlap=150  # characters of overlap between consecutive chunks
)
splits = text_splitter.split_documents(pages)
print(f"Generated {len(splits)} splits")

In [None]:
# first split with info
first_split = 46
# last split with info
last_split = 200

splits[46]

## Generating a synthetic question for testing the question generator

In [None]:
# Chat model used to generate questions. It is created here so the notebook
# runs on a fresh kernel: `llm` was used below without ever being defined.
# Default sampling settings are used -- TODO confirm the intended temperature.
llm = ChatOpenAI(model=llm_name)

# ResponseSchema describes one named field expected in the structured output.
question_schema = ResponseSchema(
    name='question',
    description='a question about the context.'
)
question_response_schemas = [question_schema]

# StructuredOutputParser parses the model's text reply into a dict with the
# schema's keys, and provides the formatting instructions for the prompt.
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

# Prompt template for the question generator. The {format_instructions}
# placeholder is required: without it the instructions passed to
# format_messages() never reach the model.
qa_templates = """\
You are a car expert creating a test for car users. For each context, create a question that is specific to the context.
Avoid creating generic or general questions. All the questions must be in english.

question: a question about the context.

Format the output as JSON with the following keys:
question

{format_instructions}

context: {context}
"""
prompt_template = ChatPromptTemplate.from_template(template=qa_templates)

# Smoke test: generate one question from a single split.
messages = prompt_template.format_messages(
    # Send only the chunk text; formatting the Document object itself would also
    # inject its metadata, which is not part of the context saved in the dataset.
    context=splits[50].page_content,
    format_instructions=format_instructions
)
response = llm.invoke(messages)  # same call style as the answer-generation cells
output_dict = question_output_parser.parse(response.content)  # dict with the 'question' key

In [None]:

# Show every parsed field: its key, an empty line, then its value
for field, value in output_dict.items():
    print(field, "", value, sep="\n")

## Generate 20 synthetic questions

In [None]:
# Sampling configuration: number of questions to generate and the seed that
# makes the choice of splits repeatable across "Restart & Run All".
N_QUESTIONS = 20
SAMPLE_SEED = 42

# Candidate splits are limited to the range that holds useful content.
candidate_splits = splits[first_split:last_split]

# A dedicated Random instance gives a reproducible draw without touching the
# global RNG state. The sample size is capped so that a document with fewer
# candidate splits than N_QUESTIONS does not make sample() raise ValueError.
sampled_splits = random.Random(SAMPLE_SEED).sample(
    candidate_splits, min(N_QUESTIONS, len(candidate_splits))
)

# Records holding the question and its context; answers are added in a later cell.
qac_triples = []

# Generate one question per randomly sampled split, with a visible progress bar.
for text in tqdm(sampled_splits):
    # Send only the chunk text to the model; the Document itself is stored below.
    messages = prompt_template.format_messages(
        context=text.page_content,
        format_instructions=format_instructions
    )

    response = llm.invoke(messages)

    try:
        # Extract the structured fields from the model reply.
        output_dict = question_output_parser.parse(response.content)
    except Exception as e:
        # Best effort: report the failure and move on to the next split.
        tqdm.write(f"Skipping split: could not parse the model output ({e})")
        continue

    # Keep the original Document as the context; its page_content is read
    # when the evaluation DataFrame is built.
    output_dict['context'] = text
    qac_triples.append(output_dict)

print(f"Generated {len(qac_triples)} questions from {len(sampled_splits)} sampled splits")

In [None]:
# Inspect the second generated record (a dict with the parsed question and its context)
qac_triples[1]

## Add an answer to each question

In [None]:
# Model used to answer the generated questions; temperature=0 requests the
# least random sampling.
answer_generation_llm = ChatOpenAI(model=llm_name, temperature=0)

# The structured output has a single field: the answer.
answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)
answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)

# NOTE: from this cell on, `format_instructions` and `prompt_template` refer to
# the answer step and replace the question-generation objects of the same name.
# Re-running the question cells after this one requires re-running their setup first.
format_instructions = answer_output_parser.get_format_instructions()

# The {format_instructions} placeholder is required: without it the
# instructions passed to format_messages() never reach the model.
qa_template = """\
You are a car expert creating a test for car users. For each question and context, create an answer.
answer: an answer about the context.
Format the output as JSON with the following keys:
answer
{format_instructions}
question: {question}
context: {context}
"""
prompt_template = ChatPromptTemplate.from_template(template=qa_template)

# Alias kept so later cells can call `answer_generation_chain.invoke(...)`.
answer_generation_chain = answer_generation_llm

### Let's first try with a single one and check the results

In [None]:
# Try the answer prompt on the first generated question before running all of them.
messages = prompt_template.format_messages(
    # Send only the chunk text; formatting the Document object itself would also
    # inject its metadata, which is not part of the context saved in the dataset.
    context=qac_triples[0]["context"].page_content,
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)
response = answer_generation_chain.invoke(messages)
output_dict = answer_output_parser.parse(response.content)  # dict with the 'answer' key

In [None]:
# Show every parsed field: its key, a separator line, then its value
for field, value in output_dict.items():
    print(field, "-----", value, sep="\n")

### Now get the answers for all the questions

In [None]:
# Ask the model for an answer to every generated question.
n_requested = len(qac_triples)
for triple in tqdm(qac_triples):
    # Send only the chunk text to the model; the stored context is a Document.
    messages = prompt_template.format_messages(
        context=triple['context'].page_content,
        question=triple['question'],
        format_instructions=format_instructions
    )
    response = answer_generation_chain.invoke(messages)

    try:
        output_dict = answer_output_parser.parse(response.content)
    except Exception as e:
        # Best effort: report the failure instead of skipping silently, and
        # discard any answer left over from a previous run of this cell.
        triple.pop('answer', None)
        tqdm.write(f"No answer for {triple['question']!r}: could not parse the model output ({e})")
        continue

    # Store the generated answer on the current record.
    triple['answer'] = output_dict['answer']

# Drop records that never received an answer so the evaluation set does not
# contain rows with a missing ground truth. The list is updated in place so
# the name `qac_triples` keeps pointing at the same object.
qac_triples[:] = [triple for triple in qac_triples if 'answer' in triple]
print(f"Answered {len(qac_triples)} of {n_requested} questions")

## Combine questions, contexts, and answers for evaluation dataset

In [None]:
# To pandas
ground_truth_qac_set = pd.DataFrame(qac_triples)

# Make sure context is string
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x.page_content))

# rename answer to groundtruth
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})

# Convert to Hugging Face Dataset
eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

In [None]:
# Display the dataset object built in the previous cell
eval_dataset

In [None]:
# Inspect one example row (index 9); this assumes the dataset has at least 10 rows
eval_dataset[9]

In [None]:
# Save the evaluation set as CSV; the path is relative, so it resolves against the current working directory
eval_dataset.to_csv('ground_truth_qac_set_spark_2.csv')