# RAG Evaluation

To evaluate a RAG pipeline, we need:
- a RAG pipeline (retriever + LLM)
- a vector database (with its associated embedding model)
- a test dataset of question/answer pairs.

In [None]:
# download the vector database (Chroma)
!mc cp s3/projet-llm-insee-open-data/data/chroma_database/chroma_db  ./src/data --recursive
#load test dataset 
!mc cp s3/projet-llm-insee-open-data/data/eval_data/eval_dataset.csv ./src/data

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain.vectorstores import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
import torch

# Directory of the persisted Chroma vector database.
DB_DIR = 'src/data/chroma_db'
# Device on which the embedding model is loaded.
EMB_DEVICE = "cuda"

# Hugging Face name of the sentence-embedding model.
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# LLM settings.
# NOTE(review): MODEL_DEVICE is not referenced in the visible part of this notebook —
# confirm whether it was meant to be passed as `device_map` when loading the model.
MODEL_DEVICE = {"": 0}
# Alternative model kept for reference:
#MODEL_NAME = "tiiuae/falcon-7b"  #use flash attention (faster Attention computation) and Quantization (smaller model memory usage)

# Hugging Face name of the generator LLM actually used below.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2" 
# Other alternatives kept for reference:
#MODEL_NAME = "EleutherAI/gpt-neo-1.3B"
#MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

def build_llm_model():
    """Load the 4-bit quantized causal LM named by ``MODEL_NAME`` and wrap it for LangChain.

    Returns:
        A ``HuggingFacePipeline`` wrapping a Hugging Face ``text-generation``
        pipeline built from the loaded model and its tokenizer.
    """
    # Release cached GPU memory before loading the model.
    torch.cuda.empty_cache()

    # Model configuration, with the maximum number of position embeddings overridden.
    # NOTE(review): 8096 is not a power of two — confirm 8192 was not intended.
    model_config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model_config.max_position_embeddings = 8096

    # 4-bit NF4 quantization, float16 compute, double quantization disabled.
    bnb_settings = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": "float16",
        "bnb_4bit_use_double_quant": False,
    }
    bnb_config = BitsAndBytesConfig(**bnb_settings)

    # NOTE(review): `device_map` is a model-placement option; it presumably has
    # no effect on a tokenizer — confirm before removing.
    llm_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, device_map='auto')

    # Fall back to the end-of-sequence token when the tokenizer defines no padding token.
    if llm_tokenizer.pad_token is None:
        llm_tokenizer.pad_token = llm_tokenizer.eos_token

    causal_lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        config=model_config,
        quantization_config=bnb_config,
    )

    # Generation settings: sampling at low temperature, only the newly
    # generated text is returned (the prompt is not echoed back).
    generation_settings = {
        "max_new_tokens": 2000,
        "temperature": 0.2,
        "return_full_text": False,
        "device_map": "auto",
        "do_sample": True,
    }
    text_generator = pipeline(
        task="text-generation",
        model=causal_lm,
        tokenizer=llm_tokenizer,
        **generation_settings,
    )

    # Expose the Hugging Face pipeline as a LangChain LLM.
    return HuggingFacePipeline(pipeline=text_generator)

def format_docs(docs) -> str:
    """Merge the text of the retrieved documents into a single context string.

    Every element of ``docs`` must expose a ``page_content`` attribute. The
    contents are kept in order and separated by one blank line.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)


def build_chain(hf_embeddings, vectorstore, retriever, prompt, llm):
    """Assemble the LangChain generation chain used on already-retrieved documents.

    The chain expects an input mapping with a ``"context"`` entry holding the
    retrieved documents. It replaces that entry with their formatted text,
    fills ``prompt``, calls ``llm`` and parses the output to a string.

    ``hf_embeddings``, ``vectorstore`` and ``retriever`` are accepted but not
    used by this function.
    """
    # Replace the raw documents under "context" with their concatenated text.
    stringify_context = RunnablePassthrough.assign(
        context=(lambda inputs: format_docs(inputs["context"]))
    )
    output_parser = StrOutputParser()
    return stringify_context | prompt | llm | output_parser


In [None]:
""" 
<s>[INST]
En utilisant les informations contenues dans le contexte,
fournissez une réponse complète à la question.
Répondez uniquement à la question posée, la réponse doit être concise et pertinente par rapport à la question.
Fournissez le numéro du document source lorsque cela est pertinent.
Si la réponse ne peut pas être déduite du contexte, ne donnez pas de réponse.</s>

Contexte :
{contexte}
---
Maintenant, voici la question à laquelle vous devez répondre.

Question : {question}
[/INST]
"""

In [None]:
# Prompt template for the RAG generator, wrapped in `<s>[INST] ... [/INST]` instruction markers.
# `{context}` and `{question}` are the two placeholders filled in for each query.
# NOTE(review): the template hard-codes `<s>`; if the tokenizer also prepends its own
# beginning-of-sequence token the prompt would start with two of them — confirm.
RAG_PROMPT_TEMPLATE = """
<s>[INST] 
Tu es un assistant spécialisé dans la statistique publique répondant aux questions d'agent de l'INSEE. 
Réponds en Français seulement.
Utilise les informations obtenues dans le contexte, réponds de manière argumentée à la question posée.
La réponse doit être développée et citer ses sources.

Si tu ne peux pas induire ta réponse du contexte, ne réponds pas. 
Voici le contexte sur lequel tu dois baser ta réponse : 
Contexte: {context}
        ---
Voici la question à laquelle tu dois répondre : 
Question: {question}
[/INST]
"""

# Load the embedding model on the configured device.
hf_embeddings = HuggingFaceEmbeddings(model_name=EMB_MODEL_NAME, model_kwargs={"device": EMB_DEVICE})
# Open the persisted Chroma collection "insee_data", using the same embedding model for queries.
vectorstore = Chroma(collection_name="insee_data", embedding_function=hf_embeddings, persist_directory=str(DB_DIR))
# Retriever over the vector store: MMR search returning 10 documents.
# NOTE(review): answer_with_rag queries the vector store directly with similarity_search,
# so this retriever is not used by the evaluation loop in the visible notebook.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs = {"k":10})
# LangChain prompt object built from the template string (variables: context, question).
prompt = PromptTemplate(input_variables=["context", "question"], template=RAG_PROMPT_TEMPLATE)

# Build the generator LLM (Hugging Face text-generation pipeline wrapped for LangChain).
llm = build_llm_model()

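The LangChain Chroma class supports batch querying: several questions can be asked at once and several documents are received for each of them. In the function below, retrieval is done one question at a time and only the LLM generation is batched.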

In [None]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.documents import Document
from typing import Optional, Tuple, List

def answer_with_rag(
    questions: list[str],
    llm_model,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[List[str], List[List[str]]]:
    """Answer a batch of questions using RAG with the given knowledge index.

    For each question, documents are retrieved from ``knowledge_index``,
    optionally reranked, truncated to ``num_docs_final`` and inserted into
    ``RAG_PROMPT_TEMPLATE``. All prompts are then sent to the LLM in a single
    ``batch`` call.

    Args:
        questions: Questions to answer.
        llm_model: Object exposing ``batch(list_of_prompts)`` and returning one
            string per prompt.
        knowledge_index: Vector store queried with ``similarity_search``.
        reranker: Optional reranker; when given, its ``rerank`` method reorders
            the retrieved texts and the ``"content"`` of each result is kept.
        num_retrieved_docs: Number of documents fetched from the vector store.
        num_docs_final: Number of documents kept in the prompt.

    Returns:
        A pair ``(answers, documents)``: ``answers[i]`` is the generated answer
        to ``questions[i]`` and ``documents[i]`` is the list of document texts
        placed in its prompt. Both lists are empty when ``questions`` is empty.
    """
    # Nothing to answer: skip retrieval and do not call the LLM at all.
    if not questions:
        return [], []

    batch_final_prompt = []
    batch_relevant_documents = []
    for q in questions:
        # Gather documents from the vector store and keep only their text.
        retrieved = knowledge_index.similarity_search(query=q, k=num_retrieved_docs)
        relevant_docs = [doc.page_content for doc in retrieved]

        # Optionally rerank results.
        if reranker:
            reranked = reranker.rerank(q, relevant_docs, k=num_docs_final)
            relevant_docs = [doc["content"] for doc in reranked]

        relevant_docs = relevant_docs[:num_docs_final]

        # Build the final prompt.
        # NOTE(review): documents are concatenated without a separator, so each
        # "Document i:::" header directly follows the previous document's text.
        # Left unchanged so that prompts stay identical to earlier runs.
        context = "\nExtracted documents:\n"
        context += "".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs))

        final_prompt = RAG_PROMPT_TEMPLATE.format(question=q, context=context)

        batch_final_prompt.append(final_prompt)
        batch_relevant_documents.append(relevant_docs)

    # Generate all answers in one batched LLM call, then strip the "\nA: " prefix
    # that the model may emit.
    batch_answer = llm_model.batch(batch_final_prompt)
    batch_answer = [out.replace("\nA: ", "") for out in batch_answer]
    return batch_answer, batch_relevant_documents

**Generating Answers**

In [None]:
import datasets
from tqdm import tqdm
import json
import time 
import numpy as np

def run_rag_tests(
    eval_dataset: datasets.DatasetDict,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
    batch_size: int = 2,
):
    """Run the RAG pipeline on the evaluation set and save the results as JSON.

    The ``"train"`` split of ``eval_dataset`` is processed in batches of
    ``batch_size`` rows. Questions already present in ``output_file`` are
    skipped, so an interrupted run can be resumed. The output file is
    rewritten after every batch that produced new answers.

    Args:
        eval_dataset: Dataset whose ``"train"`` split has the columns
            ``question``, ``answer`` and ``source_doc``.
        llm: Generator model forwarded to ``answer_with_rag``.
        knowledge_index: Vector store forwarded to ``answer_with_rag``.
        output_file: Path of the JSON file holding the list of results.
        reranker: Optional reranker forwarded to ``answer_with_rag``.
        verbose: When true, print each question with its generated and
            reference answers.
        test_settings: Optional label stored with every result.
        batch_size: Number of dataset rows processed per LLM batch.
    """
    # Load previous generations if they exist; a missing or unreadable JSON
    # file simply means starting from scratch.
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        outputs = []

    train_split = eval_dataset["train"]
    num_rows = len(train_split)

    # Questions answered so far; updated as new results are appended so that
    # later batches see the answers produced by earlier ones.
    already_answered = {output["question"] for output in outputs}

    for batch_start in tqdm(range(0, num_rows, batch_size)):

        # Adjust batch end to handle the last incomplete batch.
        batch_end = min(batch_start + batch_size, num_rows)
        # Slicing a Hugging Face dataset returns a dict of columns
        # ({'question': [...], ...}), not a list of rows.
        batch_examples = train_split[batch_start:batch_end]

        # Keep only the rows whose question has never been answered.
        kept_rows = [
            i
            for i, question in enumerate(batch_examples["question"])
            if question not in already_answered
        ]
        if not kept_rows:
            # Whole batch already answered: skip retrieval and generation.
            continue

        batch_examples = {
            key: [values[i] for i in kept_rows]
            for key, values in batch_examples.items()
        }
        batch_questions = batch_examples["question"]

        batch_answers, batch_relevant_docs = answer_with_rag(
            batch_questions, llm, knowledge_index, reranker=reranker
        )
        for i, (question, answer, relevant_docs) in enumerate(
            zip(batch_questions, batch_answers, batch_relevant_docs)
        ):
            if verbose:
                print("=======================================================")
                print(f"Question: {question}")
                print(f"Answer: {answer}")
                print(f'True answer: {batch_examples["answer"][i]}')

            result = {
                "question": question,
                "true_answer": batch_examples["answer"][i],
                "source_doc": batch_examples["source_doc"][i],
                "generated_answer": answer,
                "retrieved_docs": list(relevant_docs),
            }
            if test_settings:
                result["test_settings"] = test_settings
            outputs.append(result)
            already_answered.add(question)

        # Checkpoint once per batch: all answers of a batch are generated
        # together, so rewriting the file per example adds no safety.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(outputs, f)

In [None]:
# The evaluation CSV can be fetched with the following command:
#! mc cp s3/projet-llm-insee-open-data/data/eval_data/eval_dataset.csv ./src/data/
# Load the evaluation set from the local CSV file; run_rag_tests reads its "train" split.
eval_dataset = datasets.load_dataset('csv', data_files="src/data/eval_dataset.csv",) # load the evaluation dataset

In [None]:
# Generate answers for the whole evaluation set, without reranking, in batches of 5 questions.
# Results are written to test_generated_ans.json (relative to the current working directory)
# and each result is tagged with the generator model name.
run_rag_tests(eval_dataset = eval_dataset,
                llm = llm,
                knowledge_index = vectorstore,
                output_file = "test_generated_ans.json",
                reranker = None,
                verbose=True,
                test_settings = MODEL_NAME,
                batch_size = 5
                )

**Get access to test_generated_ans.json** : mc cp s3/projet-llm-insee-open-data/data/eval_data/test_generated_ans.json ./src/data/

**Write an Evaluation prompt for a Critique LLM.**

In [None]:
# Prompt given to the evaluator ("critique") LLM, wrapped in <|user|> / <|end|> / <|assistant|> markers.
# Placeholders filled with str.format: {instruction}, {response}, {reference_answer}.
# The doubled braces on the output-format line are escaped so that they stay literal braces after formatting.
# The prompt ends with "Feedback: " so the model continues the feedback and is asked to
# finish with the [RESULTAT] tag followed by an integer score between 1 and 5.
EVALUATION_PROMPT = """
<s><|user|>
###Description de la tâche :
Une instruction (pouvant inclure une Entrée à l'intérieur), une réponse à évaluer, une réponse de référence qui obtient un score de 5, et une grille de notation représentant un critère d'évaluation sont fournis.

###L'instruction à évaluer :
{instruction}

###Réponse à évaluer :
{response}

###Réponse de référence (Score 5) :
{reference_answer}

###Grille de notation :
[La réponse est-elle correcte, précise et factuelle par rapport à la réponse de référence ?]
Score 1 : La réponse est complètement incorrecte, imprécise et/ou non factuelle.
Score 2 : La réponse est principalement incorrecte, imprécise et/ou non factuelle.
Score 3 : La réponse est quelque peu correcte, précise et/ou factuelle.
Score 4 : La réponse est principalement correcte, précise et factuelle.
Score 5 : La réponse est complètement correcte, précise et factuelle.

1. Rédigez un feedback détaillé évaluant la qualité de la réponse strictement en fonction de la grille de notation donnée, sans évaluation générale.
2. Après avoir rédigé un feedback, attribuez un score qui est un entier entre 1 et 5. Vous devez vous référer à la grille de notation.
3. Le format de sortie devrait ressembler à ce qui suit : "Feedback: {{écrire un feedback pour le critère}} [RESULTAT] {{un nombre entier entre 1 et 5}}"
4. Veuillez ne pas générer d'autres ouvertures, fermetures et explications. Assurez-vous d'inclure la balise [RESULTAT] dans votre sortie.
<|end|>
<|assistant|>
Feedback: """

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage


# Chat-style version of the evaluation prompt: a fixed system message followed by
# EVALUATION_PROMPT as the human message.
# NOTE(review): the evaluate_answers(...) call in this notebook passes the raw
# EVALUATION_PROMPT string rather than this object — confirm whether it is still needed.
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="Tu es un modèle de langue évaluateur juste"),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

In [None]:
# Display the assembled chat prompt template for a visual check.
print(evaluation_prompt_template)

**Evaluate the generated answer**

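To assess the answers generated by our RAG pipeline (using Mistral 7B), we can use an open-source alternative to GPT-4 evaluation called Prometheus-13b-v1.0, part of the prometheus-eval LLM family. It has been fine-tuned on 100K feedback messages. Note: the cell below currently loads `microsoft/Phi-3-mini-128k-instruct` as the evaluator; the Prometheus model name is left commented out.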

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig

# Name of the evaluator ("teacher") model; the Prometheus alternative is kept for reference.
#teacher_name = "prometheus-eval/prometheus-7b-v2.0"
teacher_name = "microsoft/Phi-3-mini-128k-instruct" 

# Load the evaluator's configuration (remote code from the model repository is trusted).
teacher_config = AutoConfig.from_pretrained(teacher_name, trust_remote_code=True)

# 4-bit NF4 quantization, float16 compute, double quantization disabled.
teacher_quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=False,
    )
# Load the evaluator's tokenizer.
# NOTE(review): `device_map` is a model-placement option; it presumably has no
# effect on a tokenizer — confirm before removing.
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_name, use_fast=True, device_map='auto')

# Load the quantized evaluator model, letting `device_map="auto"` choose its placement.
teacher_model = AutoModelForCausalLM.from_pretrained(teacher_name,
        config=teacher_config,
        device_map="auto",
        quantization_config = teacher_quantization_config ,
        trust_remote_code=True
    )


In [None]:
# Build a Hugging Face text-generation pipeline around the evaluator model.
from transformers import pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
# Sampling at low temperature; only the newly generated text is returned.
pipeline_HF = pipeline(task="text-generation", # TextGenerationPipeline HF pipeline
                model=teacher_model, 
                tokenizer=teacher_tokenizer,
                temperature=0.2, 
                return_full_text=False, 
                device_map="auto",
                do_sample=True,
            )
# Wrap the pipeline as a LangChain LLM.
# NOTE(review): evaluate_answers is called with `pipeline_HF` directly;
# `evaluator_model` is not used in the visible part of the notebook.
evaluator_model = HuggingFacePipeline(pipeline=pipeline_HF, model_kwargs={"max_length": 4000})

In [None]:
import os 
from tqdm import tqdm 
import json 
import random

# Keyword arguments forwarded to the evaluator pipeline on every call
# (evaluate_answers calls `eval_chat_model(eval_prompt, **pipeline_args)`).
# NOTE(review): 'max_length' is 10000 here but 4000 in the HuggingFacePipeline
# wrapper's model_kwargs — confirm which limit is intended.
generation_args = {
    "return_full_text": False,
    "do_sample": True,
    'max_length': 10000
}

def evaluate_answers(
    answer_path: str,
    eval_chat_model: pipeline,
    evaluator_name: str,
    evaluation_prompt_template: str,
    pipeline_args: dict = generation_args
) -> None:
    """Score generated answers with an evaluator model, updating the file in place.

    Every entry of the JSON list stored at ``answer_path`` that has no
    ``eval_score_<evaluator_name>`` key yet is sent to the evaluator. The
    file is rewritten after each successful evaluation so that an interrupted
    run can be resumed. If the file does not exist, nothing is done.

    Args:
        answer_path: Path of the JSON file produced by the generation step;
            each entry needs the keys ``question``, ``generated_answer`` and
            ``true_answer``.
        eval_chat_model: Callable taking the prompt plus ``pipeline_args`` and
            returning a list whose first item has a ``"generated_text"`` key.
        evaluator_name: Suffix of the ``eval_score_`` / ``eval_feedback_``
            keys written to each entry.
        evaluation_prompt_template: Format string with the placeholders
            ``instruction``, ``response`` and ``reference_answer``.
        pipeline_args: Keyword arguments forwarded to ``eval_chat_model``.
    """
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        with open(answer_path, "r") as f:
            answers = json.load(f)

    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    for experiment in tqdm(answers):
        # Already evaluated by this evaluator in a previous run.
        if score_key in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )

        # Stays None when the model call itself fails, so the error report
        # below never refers to an unbound (or stale) variable.
        eval_result = None
        try:
            eval_result = eval_chat_model(eval_prompt, **pipeline_args)
            # The output must contain the tag exactly once; anything else
            # raises ValueError and the entry is left for a later run.
            feedback, score = eval_result[0]["generated_text"].split("[RESULTAT]")
        except Exception as e:
            # Best effort: report the failure and move on to the next entry.
            print('Error:', e)
            print(eval_result)
            continue

        # Surrounding whitespace carries no information.
        experiment[score_key] = score.strip()
        experiment[feedback_key] = feedback.strip()

        # Checkpoint after every evaluation (each one is an expensive LLM call).
        with open(answer_path, "w") as f:
            json.dump(answers, f)

In [None]:
# Score every generated answer with the evaluator pipeline; the score/feedback keys
# are suffixed with the evaluator name, with "/" replaced by "-".
# NOTE(review): the path is absolute and machine-specific — adjust it to your environment.
evaluate_answers(
        answer_path = "/home/onyxia/work/llm-open-data-insee/src/data/test_generated_ans.json", 
        eval_chat_model = pipeline_HF,
        evaluator_name= teacher_name.replace("/","-"),
        evaluation_prompt_template = EVALUATION_PROMPT, 
        pipeline_args = generation_args
)

**inspect the results**

In [None]:
import glob
import pandas as pd
# Gather every JSON result file of the data directory into a single DataFrame;
# the "settings" column records which file each row comes from.
outputs = []
for file in glob.glob("/home/onyxia/work/llm-open-data-insee/src/data/*.json"):
    # Close each file as soon as it is parsed instead of leaking the handle.
    with open(file, "r") as f:
        output = pd.DataFrame(json.load(f))
    output["settings"] = file
    outputs.append(output)
result = pd.concat(outputs)

In [None]:
# Convert the raw score strings to integers; any non-string value
# (presumably a missing score) is counted as the minimum score 1.
# NOTE(review): int(x) raises ValueError if the evaluator wrote anything other than an integer.
# NOTE(review): this cell is not idempotent — on a second run the scores are floats,
# so every row falls into the `else 1` branch and ends up at 0.0.
result["eval_score_microsoft-Phi-3-mini-128k-instruct"] = result["eval_score_microsoft-Phi-3-mini-128k-instruct"] .apply(lambda x: int(x) if isinstance(x, str) else 1)
# Rescale the scores from the 1-5 range to [0, 1].
result["eval_score_microsoft-Phi-3-mini-128k-instruct"]  = (result["eval_score_microsoft-Phi-3-mini-128k-instruct"] - 1 )/ 4

In [None]:
# Mean normalised score per result file (the "settings" column), displayed in ascending order.
average_scores = result.groupby("settings")["eval_score_microsoft-Phi-3-mini-128k-instruct"].mean()
average_scores.sort_values()

In [None]:
result