# Retrieval Information Evaluation

Goal: evaluate the retrieval part of the RAG pipeline.

- Use appropriate non-LLM evaluation metrics from Information Retrieval (IR), such as recall (LLM-based metrics will be used later on).
- Allow hyperparameter tuning (chunk size, embedding model, number of retrieved documents, whether to use a reranker, whether to add metadata to the chunks).
- Create clear visuals.
- Focus on reproducibility with a small dataset.

### Extract data sample

From **data_complete.csv** (raw data) OR **insee_documents.csv** (Extracted metadata and paragraphs)

In [None]:
# python insee_data_processing.py
import pandas as pd

# Raw INSEE export; low_memory=False makes pandas read the file in one pass
# instead of inferring dtypes chunk by chunk.
data = pd.read_csv("data_complete.csv", low_memory=False)
# Seeded sampling so the evaluation sample is reproducible across runs, and
# capped at the number of available rows so small files do not raise.
data_sample = data.sample(n=min(1000, len(data)), random_state=42)
data_sample.head()

from utils import extract_paragraphs

# NOTE(review): `extract_paragraphs` is a project helper not visible here;
# presumably it extracts paragraphs and metadata from the raw rows -- confirm.
data_sample = extract_paragraphs(data_sample)

In [None]:
from langchain.docstore.document import Document as LangchainDocument
from tqdm import tqdm

# Alternative kept for reference: build the frame from the in-memory sample.
# pd.DataFrame.from_dict(data_sample)
# Load the pre-extracted documents from disk; `ds` is what the following cell
# iterates over to build the LangChain documents.
ds = pd.read_csv("insee_documents.csv", low_memory=False)

In [None]:
# One LangChain document per row of `ds`: the paragraph text becomes the page
# content and the remaining columns are carried along as metadata.
langchain_docs = [
    LangchainDocument(
        page_content=doc["paragraphs"],
        metadata={
            "source": doc["url_source"],
            "title": doc["titles_para"],
            "insee_id": doc["id_origin"],
            "categories": doc["categories"],
            "date_diffusion": doc["dateDiffusion"],
            "themes": doc["themes"],
            "collections": doc["collections"],
            "libelleAffichageGeo": doc["libelleAffichageGeo"],
            "intertitres": doc["intertitres"],
            "authors": doc["authors"],
            "subtitle": doc["subtitle"],
        },
    )
    # `iterrows()` is a generator with no length: pass `total` so tqdm can
    # display a real progress bar and ETA.
    for _, doc in tqdm(ds.iterrows(), total=len(ds))
]

In [None]:
# Display how many LangChain documents were built in the previous cell.
len(langchain_docs)

### Get the top N documents with largest contexts

In [None]:
from typing import List


def get_top_n_documents_with_largest_content(documents: List[LangchainDocument], n=1000):
    """
    Return the `n` documents whose content has the most whitespace-separated words.

    documents : documents exposing a `page_content` string
    n : maximum number of documents to return

    The result is ordered from longest to shortest; documents with the same
    word count keep their original relative order (the sort is stable).
    """

    def word_count(document):
        return len(document.page_content.split())

    longest_first = sorted(documents, key=word_count, reverse=True)
    return longest_first[:n]


# Keep the 100 documents with the most words as the reference sample.
sample_langchain_docs = get_top_n_documents_with_largest_content(langchain_docs, n=100)

In [None]:
import numpy as np


def stats(documents: List[LangchainDocument], type="categories"):
    """
    Summarise document lengths (in words) per value of one metadata field.

    documents : documents exposing `page_content` and a `metadata` dict
    type : metadata key used to group the documents

    Returns a dict mapping each metadata value to
    (number of documents, max length, mean length rounded to 2 decimals, min length).
    For an empty input the function returns the tuple (None, None, None).
    """
    if not documents:
        return None, None, None

    # Group the word counts by metadata value, in order of first appearance.
    word_counts_by_group = {}
    for document in documents:
        group = document.metadata[type]
        word_counts_by_group.setdefault(group, []).append(len(document.page_content.split()))

    return {
        group: (
            len(word_counts),
            max(word_counts),
            np.round(sum(word_counts) / len(word_counts), 2),
            min(word_counts),
        )
        for group, word_counts in word_counts_by_group.items()
    }


# Display per-category length statistics (default grouping key: "categories").
stats(sample_langchain_docs)

In [None]:
def store_langchain_document(documents: List[LangchainDocument]):
    """
    Flatten documents into a DataFrame with one row per document.

    The text goes into a "content" column and every metadata key becomes its
    own column. A metadata key named "content" overrides the text, because
    metadata is merged after the text.
    """
    rows = [{"content": document.page_content, **document.metadata} for document in documents]
    return pd.DataFrame(rows)


# Flatten the reference sample (text + metadata) into a DataFrame.
df = store_langchain_document(sample_langchain_docs)
# Save the DataFrame to a CSV file without the index column; the experiment
# cells below load a file with this name as the reference corpus.
output_csv_path = "insee_documents_sample_ref_retrieval_evaluation.csv"
df.to_csv(output_csv_path, index=False)

In [None]:
from collections import Counter

import matplotlib.pyplot as plt

# Count how many sampled documents have each word count.
counters_para = Counter()

for doc in sample_langchain_docs:
    counters_para[len(doc.page_content.split())] += 1

# Sort by word count and split into parallel (x, y) sequences for plotting.
# NOTE: zip(*...) raises ValueError on unpacking if the sample is empty.
lengths_sorted = sorted(counters_para.items())
lengths, counts = zip(*lengths_sorted)
# Create bar plot
plt.figure(figsize=(10, 6))
plt.bar(lengths, counts, color="skyblue")
plt.xlabel("Paragraph Length (number of words)")
plt.ylabel("Number of Paragraphs")
plt.title(f"Paragraph Length Distribution ({len(sample_langchain_docs)} docs)")
plt.xticks(rotation=45)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

## Generate Questions

In [None]:
import pandas as pd
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import DataFrameLoader

# load reference dataset :


def load_sample_data(path_data, content_column="content"):
    """
    Load a CSV file as a list of LangChain documents.

    path_data : path of the CSV file to read
    content_column : column used as the document text; the loader receives the
        whole DataFrame, so the other columns are available to it as well
    """
    frame = pd.read_csv(path_data)
    return DataFrameLoader(frame, page_content_column=content_column).load()

In [None]:
from typing import Dict, List, Optional

import pandas as pd
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from transformers import AutoTokenizer


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str],
    params: Optional[Dict] = None,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the unique chunks.

    chunk_size : maximum chunk length, counted with the `tokenizer_name` tokenizer;
        consecutive chunks overlap by `chunk_size / 10` tokens (rounded down)
    knowledge_base : documents to split
    tokenizer_name : Hugging Face tokenizer identifier used to count tokens
    params : optional hyperparameters; only "markdown_separator" (list of
        separators tried in order) is read. When `params` or that key is
        missing, or its value is None, the splitter's own default separators
        are used.

    Chunks with identical text are returned only once (first occurrence wins).
    """
    # `params` is optional so callers that do not tune the separators
    # (e.g. the reference-DB cell) can omit it.
    separators = (params or {}).get("markdown_separator")

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=separators,
    )

    seen_texts = set()
    unique_chunks = []
    # Split one document at a time so tqdm reports per-document progress.
    for doc in tqdm(knowledge_base):
        for chunk in text_splitter.split_documents([doc]):
            if chunk.page_content not in seen_texts:
                seen_texts.add(chunk.page_content)
                unique_chunks.append(chunk)

    return unique_chunks

In [None]:
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

# Reference embedding model: its chunks are the contexts the test questions
# are generated from.
REF_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
MARKDOWN_SEPARATORS = ["\n\n", "\n", ".", " ", ""]

print(f"----------Loading Embedding model {REF_EMBEDDING_MODEL_NAME}------------")

embedding_model = HuggingFaceEmbeddings(  # load sentence transformers
    model_name=REF_EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
    show_progress=False,
)

print("---------Chunking LangChain documents-----------")

# Chunks are sized to the embedder's maximum input length.
embedder_max_token_length = SentenceTransformer(REF_EMBEDDING_MODEL_NAME).max_seq_length
print(f"max sequence of token for {REF_EMBEDDING_MODEL_NAME} : {embedder_max_token_length}")

# Load the reference corpus written by the export cell above; without this
# line `raw_ref_database` is undefined when the notebook runs top to bottom.
raw_ref_database = load_sample_data("insee_documents_sample_ref_retrieval_evaluation.csv")

docs_processed_sample = split_documents(
    knowledge_base=raw_ref_database,
    chunk_size=embedder_max_token_length,
    tokenizer_name=REF_EMBEDDING_MODEL_NAME,
    # `split_documents` reads its separators from this dict.
    params={"markdown_separator": MARKDOWN_SEPARATORS},
)

print("----------Building Chroma DB------------")
collection_name = "insee_sample_" + str(REF_EMBEDDING_MODEL_NAME.split("/")[-1])
db_chroma = Chroma.from_documents(
    collection_name=collection_name,
    persist_directory="./chroma_db",
    documents=docs_processed_sample,
    embedding=embedding_model,
)

In [None]:
import torch
from transformers import pipeline


def remove_extra_spaces(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    words = text.split()
    return " ".join(words)


def extract_question(text):
    """
    Keep the text up to the first "?" and terminate it with a "?".

    When `text` contains no "?", the whole text is returned with "?" appended.
    """
    before_mark, _, _ = text.partition("?")
    return f"{before_mark}?"


# French prompt template used to generate one factual question per context.
# `{context}` is the only placeholder and is filled with `str.format` in
# `generate_test_question`; the "Question factuelle : " marker is what the
# output parser splits on.
# NOTE(review): the role tags are used as both opening and closing markers and
# the context sits in the assistant turn -- confirm this matches the chat
# template expected by the generation model.
Q_generation_prompt = """
<|user|>
Ta tâche consiste à écrire une question factuelle en te basant sur un contexte donné.
Ta question factuelle doit pouvoir être répondue par une information factuelle spécifique et concise tirée du contexte.
Ta question factuelle doit être formulée dans le même style que les questions que les utilisateurs pourraient poser dans un moteur de recherche.
Cela signifie que ta question factuelle NE DOIT PAS mentionner des phrases comme "selon le passage" ou "le contexte".
La question doit avoir pour sujet une thématiques d'un institut de statistique public. 
Tu DOIS respecter faire apparaitre "Question factuelle : " avant ta réponse. 
<|user|>
<|assistant|>
Voici maintenant le contexte.

Contexte : {context}

Question factuelle : (ta question factuelle)
<|assistant|>
"""


def generate_test_question(
    vector_database: Chroma, pipeline: pipeline, nb_documents=50, batch_size=5, generation_args=None
):
    """
    Generate one question per randomly sampled chunk of `vector_database`.

    vector_database : vector store whose `get()` returns "ids", "documents" and
        "metadatas" (each metadata must contain a "source" key)
    pipeline : text-generation callable invoked as `pipeline(prompts, **generation_args)`
    nb_documents : number of chunks to sample; capped at the collection size
    batch_size : used as the pipeline batch size unless `generation_args`
        already defines "batch_size"
    generation_args : optional generation keyword arguments (not modified)

    Returns a dict with three parallel lists: "question", "content", "source".
    Chunks are sampled without replacement so no context appears twice.
    """
    # Fetch the collection once: every `get()` call returns the whole store.
    collection = vector_database.get()
    nb_available = len(collection["ids"])
    nb_sampled = min(nb_documents, nb_available)

    # sample indices (without replacement: duplicates would bias the metrics)
    indices = np.random.choice(nb_available, nb_sampled, replace=False)

    test_data = {
        "question": [],
        "content": [collection["documents"][idx] for idx in indices],
        "source": [collection["metadatas"][idx]["source"] for idx in indices],
    }
    if nb_sampled == 0:
        return test_data

    torch.cuda.empty_cache()
    print(f"Generating {nb_sampled} Questions ...")
    # generate final prompts
    batch_prompts = [Q_generation_prompt.format(context=ctx) for ctx in test_data["content"]]

    # Work on a copy so the caller's dict is untouched; this also handles None.
    generation_args = dict(generation_args or {})
    generation_args.setdefault("batch_size", batch_size)

    with torch.no_grad():
        generated_out = pipeline(batch_prompts, **generation_args)

    for out in generated_out:
        output_Q = out[0]["generated_text"]
        # Keep only what follows the last answer marker, up to the first "?".
        answer = output_Q.split("Question factuelle : ")[-1]
        test_data["question"].append(extract_question(remove_extra_spaces(answer)))

    return test_data

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

# LLM used to generate the evaluation questions.
model_name = "microsoft/Phi-3-mini-128k-instruct"

# load LLM config
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

# load quantization config
# 4-bit NF4 weights, float16 compute dtype, double quantization disabled.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# NOTE: `AutoTokenizer` is imported in the `split_documents` cell above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    quantization_config=quantization_config,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
# Last expression of the cell: displays the model in the notebook.
model

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# The model was loaded with `device_map="cuda"`, so no device is passed here.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Greedy decoding, returning only the newly generated text (not the prompt).
generation_args = {
    "max_new_tokens": 300,
    "return_full_text": False,
    "do_sample": False,
}
test_data = generate_test_question(
    vector_database=db_chroma,
    pipeline=pipe,
    nb_documents=100,
    batch_size=10,
    generation_args=generation_args,
)
q_and_s_df = pd.DataFrame(test_data)
# Single quotes inside the f-string: reusing the outer quote character is a
# SyntaxError before Python 3.12.
# NOTE(review): the experiment cells read "q_and_s_ref_retrieval_evaluation_*.csv";
# confirm the file is renamed when it is uploaded.
q_and_s_df.to_csv(f"Q&S_ref_retrieval_evaluation_{model_name.split('/')[-1]}.csv")

## Evaluate embedding models on the retrieval task

embedding model to test : 
- sentence-transformers/all-MiniLM-L6-v2 (multi)
- manu/sentence_croissant_alpha_v0.4
- OrdalieTech/Solon-embeddings-large-0.1
- sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
- intfloat/multilingual-e5-large

IR hyperparameters:
- chunk size: integer
- embedding model: str
- overlap size: integer
- reranker: str (name + hyperparameters if any => BM25 or ColBERT)
- quantization: bool (+ hyperparameters if any)
- database format: str (ChromaDB, ...)
- max token length: integer (maximum number of tokens in the context window of the embedding model)
- number of chunks produced when splitting the corpus (depends on the chunk size)
- embedding fine-tuning
- filter k: integer (number of documents kept after post-processing such as reranking)
- use embedding metadata (creating another index based on metadata, like the title): bool (if True, which metadata)

We retrieve 20 or 30 documents but only keep the top 15 (with or without a reranking model).

### Evaluate Retrieval 

MLflow is used to track the experiments.

mc cp s3/projet-llm-insee-open-data/data/eval_data/eval_retrieval . --recursive

In [None]:
from typing import Dict

import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.vectorstores import VectorStore
from ragatouille import RAGPretrainedModel
from rank_bm25 import BM25L, BM25Okapi, BM25Plus
from sklearn.metrics import f1_score, precision_score
from tqdm import tqdm


def recall(retrieved, relevant):
    """
    Fraction of the distinct relevant items that appear in `retrieved`.

    retrieved : iterable of hashable retrieved items
    relevant : iterable of hashable ground-truth items

    Returns 0.0 when `relevant` is empty (mirrors `precision`, which returns 0
    for an empty `retrieved`) instead of raising ZeroDivisionError.
    """
    # Deduplicate so repeated ground-truth items do not inflate the denominator.
    relevant_items = set(relevant)
    if not relevant_items:
        return 0.0
    return len(relevant_items & set(retrieved)) / len(relevant_items)


def precision(retrieved, relevant):
    """
    Share of `retrieved` that is relevant.

    The numerator counts distinct items found in both inputs; the denominator
    is `len(retrieved)` (duplicates included). Returns 0 for an empty `retrieved`.
    """
    hits = set(retrieved) & set(relevant)
    nb_retrieved = len(retrieved)
    if nb_retrieved > 0:
        return len(hits) / nb_retrieved
    return 0


def compute_hit_rate(predictions, labels):
    """
    Hit rate metric is equivalent to the accuracy.

    predictions : sequence of predicted items
    labels : sequence of expected items, compared position by position

    Returns the fraction of predictions equal to their label, or 0.0 when
    `predictions` is empty (instead of raising ZeroDivisionError).
    """
    total_predictions = len(predictions)
    if total_predictions == 0:
        return 0.0
    # `zip` stops at the shorter input; the denominator stays len(predictions).
    correct_predictions = sum(pred == label for pred, label in zip(predictions, labels))
    return correct_predictions / total_predictions


def compute_mmr(source, retrieved_sources):
    """
    Reciprocal rank of `source` in `retrieved_sources` (order-aware metric).

    Returns 1 / (1-based position of the first occurrence), or 0.0 when
    `source` was not retrieved. Averaging these values over queries gives the
    Mean Reciprocal Rank.
    """
    if source in retrieved_sources:
        position = retrieved_sources.index(source) + 1
        return 1 / position
    return 0.0


def remove_duplicates(docs):
    """Return `docs` without repeated `page_content`, keeping the first occurrence of each text."""
    first_by_content = {}
    for doc in docs:
        # setdefault keeps the first document seen for a given text;
        # dicts preserve insertion order, so the original order is kept.
        first_by_content.setdefault(doc.page_content, doc)
    return list(first_by_content.values())


def rerank_with_ColBERT(reranker, query, retrieved_docs, filter_k):
    """
    Rerank `retrieved_docs` with `reranker` and return the documents in the new order.

    reranker : object whose `rerank(query=..., documents=..., k=...)` returns
        a sequence of dicts carrying the ranked text under "content"
    query : question text
    retrieved_docs : documents exposing `page_content`
    filter_k : passed to the reranker as `k`

    Ranked texts are mapped back to documents by text; when several documents
    share the same text, the last one is used for every match.
    """
    texts = [doc.page_content for doc in retrieved_docs]  # keep only text
    ranked_hits = reranker.rerank(query=query, documents=texts, k=filter_k)

    doc_by_text = {}
    for doc in retrieved_docs:
        if isinstance(doc.page_content, str):
            doc_by_text[doc.page_content] = doc

    return [doc_by_text[hit["content"]] for hit in ranked_hits]


def _bm25_tokenize(text):
    """Lower-case `text` and split it on whitespace (tokenisation shared by corpus and query)."""
    return text.lower().split()


def rerank_with_BM25(model_class, query, retrieved_docs, filter_k):
    """
    Rerank `retrieved_docs` against `query` with a BM25 model.

    model_class : BM25 class built from a tokenised corpus (list of token
        lists) and exposing `get_top_n(tokenized_query, documents, n=...)`
    query : question text
    retrieved_docs : documents exposing `page_content`
    filter_k : number of documents to return

    Returns at most `filter_k` documents from `retrieved_docs`, best match
    first, or an empty list when there is nothing to rank.
    """
    if not retrieved_docs:
        # BM25 cannot be fitted on an empty corpus.
        return []
    # The corpus must be tokenised: passing raw strings makes the model treat
    # every character as a token, so word-level query tokens never match.
    tokenized_corpus = [_bm25_tokenize(doc.page_content) for doc in retrieved_docs]
    bm25 = model_class(tokenized_corpus)
    return bm25.get_top_n(_bm25_tokenize(query), retrieved_docs, n=filter_k)


def rerank_with_metadata(reranker, query, retrieved_docs, filter_k, params):
    """
    Rerank `retrieved_docs` by scoring one of their metadata fields against `query`.

    reranker : ColBERT-style reranker forwarded to `rerank_with_ColBERT`
    query : question text
    retrieved_docs : documents exposing `page_content` and a `metadata` dict
    filter_k : maximum number of documents to return
    params : hyperparameters; `params["use_metadata"]` names the metadata field to score

    Note: if the metadata is missing (or is not a non-empty string) the
    document's own content, which always exists, is used as a fallback.

    Returns the original documents, each at most once, ordered by the rank of
    their metadata text. Documents sharing the same metadata text (e.g. chunks
    of one page ranked on its title) keep their retrieval order.
    """
    metadata_field = params.get("use_metadata")

    # Group the originals by the text that will be scored. Mapping back through
    # this grouping (rather than through `source`) keeps every chunk distinct:
    # several chunks of one source would otherwise collapse into a single one.
    originals_by_text = {}
    for doc in retrieved_docs:
        text = doc.metadata.get(metadata_field)
        if not isinstance(text, str) or not text.strip():
            text = doc.page_content
        originals_by_text.setdefault(text, []).append(doc)

    if not originals_by_text:
        return []

    # One proxy document per distinct text to score.
    proxy_docs = [
        LangchainDocument(
            page_content=text,
            metadata={"source": docs[0].metadata.get("source", "unknown")},
        )
        for text, docs in originals_by_text.items()
    ]

    ranked_proxies = rerank_with_ColBERT(
        reranker=reranker,
        query=query,
        retrieved_docs=proxy_docs,
        # never ask for more results than there are texts to rank
        filter_k=min(filter_k, len(proxy_docs)),
    )

    # reorder the original retrieved documents based on the reranked proxies
    reordered_docs = []
    for proxy in ranked_proxies:
        reordered_docs.extend(originals_by_text[proxy.page_content])

    return reordered_docs[:filter_k]


def test_retriever(
    knowledge_index: VectorStore,
    eval_data: pd.DataFrame,
    embedding_model: HuggingFaceEmbeddings = None,
    re_ranker_config: Dict = None,
    params=None,
):
    """
    Evaluate a retriever on (question, source) pairs with recall@k and MRR@k.

    knowledge_index : vector database for evaluation
    eval_data : dataframe with at least the columns "question" and "source"
    embedding_model : embedding model used to embed the questions
    re_ranker_config : optional dict with keys "type" ("ColBERT", "BM25" or
        None), "name" and "model_class" describing the reranking step
    params : other hyperparameters. Keys read: "nb_retrieved_doc" (default 20),
        "filter_k" (default 15) and "use_metadata" (metadata field used for an
        extra metadata-based reranking, or None). Defaults and the reranker
        name are written back into the dict so the caller can log them.

    Returns (results, metrics): a DataFrame with one row per question and a
    dict of aggregated metrics. A question for which nothing is retrieved is
    scored 0 and gets an empty-string prediction, so all columns stay aligned.
    """
    # Local import: in the notebook `logging` is only imported by a later cell.
    import logging

    if params is None:
        params = {}
    if params.get("nb_retrieved_doc") is None:
        params["nb_retrieved_doc"] = 20
    if params.get("filter_k") is None:
        params["filter_k"] = 15
    nb_retrieved_doc = int(params["nb_retrieved_doc"])
    filter_k = int(params["filter_k"])
    use_metadata = params.get("use_metadata") is not None

    recall_ks = (1, 2, 3, 5, 10, 15)
    mrr_ks = (5, 10)
    results = {"question": [], "source": [], "pred": []}
    results.update({f"recall_at_{k}": [] for k in recall_ks})
    results.update({f"mrr_at_{k}": [] for k in mrr_ks})

    # Load the reranking models once, not once per question.
    reranker_type = None
    colbert_reranker = None
    if re_ranker_config is not None:
        params["reranker"] = re_ranker_config["name"]
        reranker_type = re_ranker_config["type"]
        if reranker_type == "ColBERT":
            colbert_reranker = RAGPretrainedModel.from_pretrained(
                re_ranker_config["name"], verbose=0
            )
    metadata_reranker = None
    if use_metadata:
        metadata_reranker = RAGPretrainedModel.from_pretrained(
            "antoinelouis/colbertv2-camembert-L4-mmarcoFR", verbose=0
        )  # use by default the best french ColBERT model

    # pre-compute the query embeddings in one batch
    queries = list(eval_data["question"])
    embeddings_queries = embedding_model.embed_documents(queries)

    # Pair rows and embeddings by position: DataFrame index labels are not
    # guaranteed to be 0..n-1 (e.g. after filtering rows).
    rows = zip(queries, eval_data["source"], embeddings_queries)
    for q, source, embedded_query in tqdm(rows, total=len(queries)):
        results["question"].append(q)
        results["source"].append(source)

        # retrieved documents from the vector database
        retrieved_docs = knowledge_index.similarity_search_by_vector(
            embedding=embedded_query, k=nb_retrieved_doc
        )

        if reranker_type == "ColBERT":
            retrieved_docs = rerank_with_ColBERT(
                reranker=colbert_reranker, query=q, retrieved_docs=retrieved_docs, filter_k=filter_k
            )
        if reranker_type == "BM25":
            retrieved_docs = rerank_with_BM25(
                re_ranker_config["model_class"],
                query=q,
                retrieved_docs=retrieved_docs,
                filter_k=filter_k,
            )

        if use_metadata:  # rerank on a metadata field of the retrieved documents
            retrieved_docs = rerank_with_metadata(
                reranker=metadata_reranker,
                query=q,
                retrieved_docs=retrieved_docs,
                filter_k=filter_k,
                params=params,
            )

        retrieved_docs = retrieved_docs[:filter_k]  # only keep the first filter_k documents
        retrieved_sources = [doc.metadata.get("source") for doc in retrieved_docs]

        if not retrieved_sources:
            logging.warning(f"No documents retrieved for query: {q}")

        # Top-1 source, compared to `source` by precision / F1 / hit rate.
        # An empty string (never None) keeps the label column homogeneous.
        results["pred"].append(retrieved_sources[0] if retrieved_sources else "")
        # compute recall at k
        for k in recall_ks:
            results[f"recall_at_{k}"].append(recall(retrieved_sources[:k], [source]))
        # compute the reciprocal rank (order-aware metric)
        for k in mrr_ks:
            results[f"mrr_at_{k}"].append(compute_mmr(source, retrieved_sources[:k]))

    metrics = {f"recall_at_{k}": np.mean(results[f"recall_at_{k}"]) for k in recall_ks}
    metrics["precision_score"] = precision_score(
        y_true=results["source"], y_pred=results["pred"], average="micro"
    )
    metrics["f1_score"] = f1_score(
        y_true=results["source"], y_pred=results["pred"], average="micro"
    )
    metrics.update({f"mrr_at_{k}": np.mean(results[f"mrr_at_{k}"]) for k in mrr_ks})
    metrics["hit_rate"] = compute_hit_rate(predictions=results["pred"], labels=results["source"])

    return pd.DataFrame.from_dict(results), metrics

In [None]:
# MLFlow experiment
import warnings

# Silence the `resume_download` FutureWarning; the filter is restricted to
# that category and message.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.",
)
import logging
import os

import mlflow
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Hyperparameters logged to MLflow for every run. The experiment cells fill
# the None entries in place; this dict is shared notebook state, so values
# set by one experiment cell remain visible to the next one.
params = {
    "chunck_size": None,
    "embedding_model": None,
    "overlap_size": None,
    "reranker": None,
    "quantization": False,
    "database_format": "ChromaDB",
    "max_token_length": None,
    "nb_chunks": None,
    "fine_tuned_embedding": False,
    "nb_retrieved_doc": 30,
    "filter_k": 15,
    "use_metadata": None,
    "markdown_separator": None,
    "word_embedding_dim": None,
}

# Embedding model used by the reranking and quantization experiments.
EMBEDDING_MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"
# Separators handed to the text splitter through params["markdown_separator"].
MARKDOWN_SEPARATORS = ["\n\n", "\n", ".", " ", ""]
# since the questions have been generated on "sentence-transformers/all-MiniLM-L6-v2" chunks, it seems that there is a bias in the results (see the 30/05 exp)
# Embedding models compared in the embedding experiment.
embedding_model_test = [
    "sentence-transformers/all-mpnet-base-v2",
    "manu/sentence_croissant_alpha_v0.4",
    "OrdalieTech/Solon-embeddings-large-0.1",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "intfloat/multilingual-e5-large",
]
# Reranker configurations compared in the reranking experiment. "type" selects
# the branch taken in `test_retriever` ("BM25", "ColBERT" or None for no
# reranking); "model_class" is only read for BM25 entries.
re_ranker_model_test = [
    {"type": "BM25", "model_class": BM25Okapi, "name": "BM25Okapi"},
    {"type": "BM25", "name": "BM25L", "model_class": BM25L},
    {"type": "BM25", "name": "BM25Plus", "model_class": BM25Plus},
    {"type": None, "name": None, "model_class": None},
    {
        "type": "ColBERT",
        "name": "antoinelouis/colbertv2-camembert-L4-mmarcoFR",
        "model_class": None,
    },
    {"type": "ColBERT", "name": "bclavie/FraColBERTv2", "model_class": None},
    {"type": "ColBERT", "name": "colbert-ir/colbertv2.0", "model_class": None},
]

# "No reranking" configuration used by the embedding experiment.
default_reranker = {"type": None, "name": None, "model_class": None}

### Reranking Experiment 

In [None]:
# define logging display level
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# set MLFlow exp name
EXPERIMENT_NAME = "RETRIEVER_rerank"
# check that the MLFlow URI is set (explicit raise: `assert` is stripped under `python -O`)
if "MLFLOW_TRACKING_URI" not in os.environ:
    raise RuntimeError("Please set the MLFLOW_TRACKING_URI environment variable.")

# Set our tracking server uri for logging
mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
# Create a new MLflow Experiment
mlflow.set_experiment(EXPERIMENT_NAME)

# ---- Setup shared by every run: only the reranker changes between runs, so
# ---- the corpus, embedder, chunks, vector DB and eval data are built once.
params["embedding_model"] = EMBEDDING_MODEL_NAME
params["markdown_separator"] = MARKDOWN_SEPARATORS

# Load ref Corpus
logging.info("---------- Loading test Corpus data")
path_data = (
    "/home/onyxia/work/eval_retrieval/insee_documents_sample_ref_retrieval_evaluation.csv"
)
raw_ref_database = load_sample_data(path_data)

# Load the embedding model to get the max token length
logging.info(f"---------- Loading Embedding model {EMBEDDING_MODEL_NAME}")
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set True for cosine similarity
    show_progress=False,
)

# get the max token length
embedder_max_token_length = embedding_model.client.get_max_seq_length()
params["max_token_length"] = embedder_max_token_length
params["chunck_size"] = embedder_max_token_length
params["overlap_size"] = int(params["chunck_size"] / 10)

logging.info("--------- Chunking LangChain documents")
logging.info(f"Max sequence length for {EMBEDDING_MODEL_NAME}: {embedder_max_token_length}")

# build test corpus
documents = split_documents(
    knowledge_base=raw_ref_database,
    chunk_size=embedder_max_token_length,
    tokenizer_name=EMBEDDING_MODEL_NAME,
    params=params,
)
params["nb_chunks"] = len(documents)

logging.info("---------- Building Chroma DB")
# Dedicated collection name. Without one, every `from_documents` call targets
# the default collection; the in-memory Chroma client appears to be shared
# within the process, so documents would accumulate across calls.
collection_name = "insee_rerank_" + EMBEDDING_MODEL_NAME.split("/")[-1]
db_chroma = Chroma.from_documents(  # no persistence
    documents=documents, embedding=embedding_model, collection_name=collection_name
)

# load eval data
eval_data_path = "/home/onyxia/work/eval_retrieval/q_and_s_ref_retrieval_evaluation_Phi-3-mini-128k-instruct.csv"
eval_data = pd.read_csv(eval_data_path)

try:
    for re_ranker_config in re_ranker_model_test:
        with mlflow.start_run():
            # Bound to a local first: reusing the outer quote character inside
            # an f-string is a SyntaxError before Python 3.12.
            reranker_name = re_ranker_config["name"]
            logging.info(f"---------- Exp {EXPERIMENT_NAME} model : {reranker_name} ")
            params["reranker"] = reranker_name

            # run experiment
            logging.info(f"---------- Evaluating Retrieval Performances for {EMBEDDING_MODEL_NAME}")
            results, metrics = test_retriever(
                knowledge_index=db_chroma,
                eval_data=eval_data,
                embedding_model=embedding_model,
                re_ranker_config=re_ranker_config,
                params=params,
            )

            # log hyperparameters
            mlflow.log_params(params)

            # log metrics
            mlflow.log_metrics(metrics)

            # log prediction
            preds = mlflow.data.from_pandas(results, targets="source", predictions="pred")
            mlflow.log_input(preds, context="testing")
            logging.info("---------- End of Evaluation")
finally:
    # Drop the collection so re-running the cell starts from an empty index.
    db_chroma.delete_collection()

### Embedding Experiment 

In [None]:
# define logging display level
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# set MLFlow exp name
EXPERIMENT_NAME = "RETRIEVER_embedding_model"
# check that the MLFlow URI is set (explicit raise: `assert` is stripped under `python -O`)
if "MLFLOW_TRACKING_URI" not in os.environ:
    raise RuntimeError("Please set the MLFLOW_TRACKING_URI environment variable.")

# Set our tracking server uri for logging
mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
# Create a new MLflow Experiment
mlflow.set_experiment(EXPERIMENT_NAME)

# ---- Inputs shared by every run (independent of the embedding model) ----
# Load ref Corpus
logging.info("---------- Loading test Corpus data")
path_data = (
    "/home/onyxia/work/eval_retrieval/insee_documents_sample_ref_retrieval_evaluation.csv"
)
raw_ref_database = load_sample_data(path_data)

# load eval data
# eval_data_path = "/home/onyxia/work/eval_retrieval/q_and_s_ref_retrieval_evaluation_Phi-3-mini-128k-instruct.csv"
eval_data_path = (
    "/home/onyxia/work/eval_retrieval/q_and_a_scored_filtered_Phi-3-mini-128k-instruct.csv"
)
eval_data = pd.read_csv(eval_data_path)

# A dedicated loop variable is used so the module-level EMBEDDING_MODEL_NAME
# constant (read by the other experiment cells) is not overwritten.
for candidate_model_name in embedding_model_test:
    with mlflow.start_run():
        logging.info(f"---------- Exp {EXPERIMENT_NAME} model : {candidate_model_name}")

        params["embedding_model"] = candidate_model_name
        params["reranker"] = default_reranker["name"]
        params["markdown_separator"] = MARKDOWN_SEPARATORS

        # Load the embedding model to get the max token length
        logging.info(f"---------- Loading Embedding model {candidate_model_name}")
        embedding_model = HuggingFaceEmbeddings(
            model_name=candidate_model_name,
            multi_process=True,
            model_kwargs={"device": "cuda"},
            encode_kwargs={"normalize_embeddings": True},  # Set True for cosine similarity
            show_progress=False,
        )

        # get the max token length
        embedder_max_token_length = embedding_model.client.get_max_seq_length()
        params["max_token_length"] = embedder_max_token_length
        params["chunck_size"] = embedder_max_token_length
        params["overlap_size"] = int(params["chunck_size"] / 10)

        logging.info("--------- Chunking LangChain documents")
        logging.info(f"Max sequence length for {candidate_model_name}: {embedder_max_token_length}")

        # build test corpus
        documents = split_documents(
            knowledge_base=raw_ref_database,
            chunk_size=embedder_max_token_length,
            tokenizer_name=candidate_model_name,
            params=params,
        )
        params["nb_chunks"] = len(documents)

        logging.info("---------- Building Chroma DB")
        # build vector database (one collection per embedding model)
        collection_name = "insee_sample_" + candidate_model_name.split("/")[-1]

        db_chroma = Chroma.from_documents(  # no persistence
            documents=documents, embedding=embedding_model, collection_name=collection_name
        )

        try:
            # run experiment
            logging.info(
                f"---------- Evaluating Retrieval Performances for {candidate_model_name}"
            )
            results, metrics = test_retriever(
                knowledge_index=db_chroma,
                eval_data=eval_data,
                embedding_model=embedding_model,
                re_ranker_config=default_reranker,
                params=params,
            )
        finally:
            # Drop the collection so re-running the cell does not add the same
            # chunks a second time to an existing collection.
            db_chroma.delete_collection()

        # log hyperparameters
        mlflow.log_params(params)

        # log metrics
        mlflow.log_metrics(metrics)

        # log prediction
        preds = mlflow.data.from_pandas(results, targets="source", predictions="pred")
        mlflow.log_input(preds, context="testing")
        logging.info("---------- End of Evaluation")

### Quantization benchmark

In [None]:
# Quantization benchmark: one MLflow run per embedding precision, each evaluated
# on its own throw-away Chroma collection.
# define logging display level
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# set MLFlow exp name
EXPERIMENT_NAME = "RETRIEVER_quantization"
# check that the MLFlow tracking URI is set up.
assert (
    "MLFLOW_TRACKING_URI" in os.environ
), "Please set the MLFLOW_TRACKING_URI environment variable."

# Set our tracking server uri for logging
mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
# Create a new MLflow Experiment
mlflow.set_experiment(EXPERIMENT_NAME)

for precision in ["float32", "int8", "uint8", "binary", "ubinary"]:
    with mlflow.start_run():
        logging.info(f"---------- Exp {EXPERIMENT_NAME} quantization : {precision}")

        params["embedding_model"] = EMBEDDING_MODEL_NAME
        params["reranker"] = None
        params["markdown_separator"] = MARKDOWN_SEPARATORS
        params["quantization"] = precision

        # Load ref Corpus
        logging.info("---------- Loading test Corpus data")
        path_data = (
            "/home/onyxia/work/eval_retrieval/insee_documents_sample_ref_retrieval_evaluation.csv"
        )
        raw_ref_database = load_sample_data(path_data)

        # Load the embedding model to get the max token length
        # NOTE(review): depending on the installed LangChain version, `encode_kwargs`
        # may not be forwarded to the encoder when `multi_process=True`; confirm that
        # `precision` (and normalization) are really applied in this configuration.
        logging.info(f"---------- Loading Embedding model {EMBEDDING_MODEL_NAME}")
        embedding_model = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME,
            multi_process=True,
            model_kwargs={"device": "cuda"},
            encode_kwargs={
                "normalize_embeddings": True,  # Set True for cosine similarity
                "precision": precision,
            },
            show_progress=False,
        )

        # get the max token length
        embedder_max_token_length = embedding_model.client.get_max_seq_length()
        params["max_token_length"] = embedder_max_token_length
        params["chunck_size"] = embedder_max_token_length
        params["overlap_size"] = int(params["chunck_size"] / 10)

        logging.info("--------- Chunking LangChain documents")
        logging.info(f"Max sequence length for {EMBEDDING_MODEL_NAME}: {embedder_max_token_length}")

        # build test corpus
        documents = split_documents(
            knowledge_base=raw_ref_database,
            chunk_size=embedder_max_token_length,
            tokenizer_name=EMBEDDING_MODEL_NAME,
            params=params,
        )
        params["nb_chunks"] = len(documents)

        logging.info("---------- Building Chroma DB")
        # Build the vector database in a collection dedicated to this precision.
        # Without an explicit name every run would write into the same default
        # in-process collection, so chunks embedded with a previous precision
        # would leak into the index evaluated here.
        # NOTE(review): Chroma restricts collection-name length/characters; confirm
        # this still holds for long embedding model names.
        collection_name = "insee_sample_" + EMBEDDING_MODEL_NAME.split("/")[-1] + "_" + precision

        db_chroma = Chroma.from_documents(  # no persistance
            documents=documents,
            embedding=embedding_model,
            collection_name=collection_name,
        )

        # load eval data
        eval_data_path = "/home/onyxia/work/eval_retrieval/q_and_s_ref_retrieval_evaluation_Phi-3-mini-128k-instruct.csv"
        eval_data = pd.read_csv(eval_data_path)

        # run experiement
        logging.info(f"---------- Evaluating Retrieval Performances for {EMBEDDING_MODEL_NAME}")
        try:
            results, metrics = test_retriever(
                knowledge_index=db_chroma,
                eval_data=eval_data,
                embedding_model=embedding_model,
                re_ranker_config=None,
                params=params,
            )
        finally:
            # Drop the throw-away collection so that re-running the cell (or the
            # next precision) always starts from an empty index.
            db_chroma.delete_collection()

        # log hyperparameters
        mlflow.log_params(params)

        # log metrics
        mlflow.log_metrics(metrics)

        # log prediction
        preds = mlflow.data.from_pandas(results, targets="source", predictions="pred")
        mlflow.log_input(preds, context="testing")
        logging.info("---------- End of Evaluation")

### Use Metadata

Documents' metadata could have a positive impact on retrieval performance. \
Expected result: metadata helps the retriever refine its choices, e.g. by using a reranker (BM25 or ColBERT model) OR an embedding model to query the retrieved documents. 

In [None]:
# Display the metadata field names carried by the first document of the reference corpus.
raw_ref_database[0].metadata.keys()

In [None]:
# Metadata experiment: one MLflow run per metadata setting, each evaluated on its
# own throw-away Chroma collection.
# define logging display level
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# set MLFlow exp name
EXPERIMENT_NAME = "RETRIEVER_metadata"
# check that the MLFlow tracking URI is set up.
assert (
    "MLFLOW_TRACKING_URI" in os.environ
), "Please set the MLFLOW_TRACKING_URI environment variable."

# Set our tracking server uri for logging
mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
# Create a new MLflow Experiment
mlflow.set_experiment(EXPERIMENT_NAME)

# NOTE(review): the documents built at the top of the notebook expose an
# "intertitres" metadata key; confirm that "intertitre" is the value that
# `split_documents` expects in params["use_metadata"].
for metadata in [None, "title", "intertitre"]:
    with mlflow.start_run():
        logging.info(f"---------- Exp {EXPERIMENT_NAME} Metadata : {metadata}")
        params["embedding_model"] = EMBEDDING_MODEL_NAME
        params["reranker"] = None
        params["markdown_separator"] = MARKDOWN_SEPARATORS
        params["use_metadata"] = metadata

        # Load ref Corpus
        logging.info("---------- Loading test Corpus data")
        path_data = (
            "/home/onyxia/work/eval_retrieval/insee_documents_sample_ref_retrieval_evaluation.csv"
        )
        raw_ref_database = load_sample_data(path_data=path_data, content_column="content")

        # Load the embedding model to get the max token length
        logging.info(f"---------- Loading Embedding model {EMBEDDING_MODEL_NAME}")
        embedding_model = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME,
            multi_process=True,
            model_kwargs={"device": "cuda"},
            encode_kwargs={"normalize_embeddings": True},  # Set True for cosine similarity
            show_progress=False,
        )

        # get the max token length
        embedder_max_token_length = embedding_model.client.get_max_seq_length()
        params["max_token_length"] = embedder_max_token_length
        params["chunck_size"] = embedder_max_token_length
        params["overlap_size"] = int(params["chunck_size"] / 10)

        logging.info("--------- Chunking LangChain documents")
        logging.info(f"Max sequence length for {EMBEDDING_MODEL_NAME}: {embedder_max_token_length}")

        # build test corpus
        documents = split_documents(
            knowledge_base=raw_ref_database,
            chunk_size=embedder_max_token_length,
            tokenizer_name=EMBEDDING_MODEL_NAME,
            params=params,
        )
        params["nb_chunks"] = len(documents)

        logging.info("---------- Building Chroma DB")
        # Build the vector database in a collection dedicated to this metadata
        # setting. Without an explicit name every run would write into the same
        # default in-process collection, so chunks indexed by a previous run
        # would leak into the index evaluated here.
        # NOTE(review): Chroma restricts collection-name length/characters; confirm
        # this still holds for long embedding model names.
        collection_name = (
            "insee_sample_" + EMBEDDING_MODEL_NAME.split("/")[-1] + "_" + (metadata or "none")
        )

        db_chroma = Chroma.from_documents(  # no persistance
            documents=documents,
            embedding=embedding_model,
            collection_name=collection_name,
        )

        # load eval data
        eval_data_path = "/home/onyxia/work/eval_retrieval/q_and_s_ref_retrieval_evaluation_Phi-3-mini-128k-instruct.csv"
        eval_data = pd.read_csv(eval_data_path)

        # run experiement
        logging.info(f"---------- Evaluating Retrieval Performances for {EMBEDDING_MODEL_NAME}")
        try:
            results, metrics = test_retriever(
                knowledge_index=db_chroma,
                eval_data=eval_data,
                embedding_model=embedding_model,
                re_ranker_config=None,
                params=params,
            )
        finally:
            # Drop the throw-away collection so that re-running the cell (or the
            # next metadata setting) always starts from an empty index.
            db_chroma.delete_collection()

        # log hyperparameters
        mlflow.log_params(params)

        # log metrics
        mlflow.log_metrics(metrics)

        # log prediction
        preds = mlflow.data.from_pandas(results, targets="source", predictions="pred")
        mlflow.log_input(preds, context="testing")
        logging.info("---------- End of Evaluation")

### Context Recall and Context precision

#### Context Recall

In [None]:
import pandas as pd

# Load the question/answer evaluation set used for the context-recall check below.
# This rebinds `data`, replacing the raw dataframe loaded at the top of the notebook.
# NOTE(review): the path is relative to the working directory, unlike the absolute
# /home/onyxia/work/... paths used in the experiment cells; confirm it resolves.
data = pd.read_csv("eval_retrieval/q_and_a_scored_filtered_Phi-3-mini-128k-instruct.csv")

How to evaluate Context Recall?
1. Break the ground-truth answer into individual statements.
2. Verify whether each statement can be attributed to the given contexts.

In [None]:
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# Checkpoint used as the judge LLM for the context-recall evaluation.
model_name = "microsoft/Phi-3-mini-128k-instruct"  # (to download)

# 4-bit NF4 quantization settings (float16 compute, no double quantization)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# model configuration and tokenizer for the checkpoint
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# load the quantized causal LM on the GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
    quantization_config=quantization_config,
    config=config,
)

# text-generation pipeline wrapping the model and its tokenizer
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Context-recall judge prompt (Phi-3 chat format).
# Placeholders: {expected_output} is the ground-truth answer, {retrieval_context}
# the retrieved context; doubled braces render as literal JSON braces after
# `str.format`.
# The expected answer and the context are part of the *user* turn: the
# <|end|> / <|assistant|> markers therefore come last, so that the model starts
# its own turn right after the "JSON :" cue.
prompt_statement_generation = """ 
<|user|>
Pour CHAQUE phrase de la réponse attendue ci-dessous, déterminez si la phrase peut être attribuée aux contextes. Veuillez générer une liste de JSON avec deux clés : « verdict » et « raison ».
La clé « verdict » doit STRICTEMENT être un « oui » ou un « non ». Répondez « oui » si la phrase peut être attribuée à n'importe quelle partie du contexte de récupération, sinon répondez « non ».
La clé « raison » doit fournir une raison pour le verdict. Dans la raison, vous devez viser à inclure le nombre de nœuds dans le contexte de récupération (par exemple, le 1er nœud et le 2ème nœud dans le contexte de récupération) qui sont attribués à ladite phrase. Vous devez également viser à citer la partie spécifique du contexte de récupération pour justifier votre verdict, mais restez extrêmement concis et raccourcissez la citation avec des points de suspension si possible.
**
IMPORTANT : assurez-vous de renvoyer uniquement au format JSON, avec la clé "verdicts" sous forme de liste d'objets JSON, chacun avec deux clés : "verdict" et "raison".

{{
 "verdicts": [
 {{
 "raison": "...",
 "verdict": "oui"
 
 }},
 ...
 ]
}}

Puisque vous allez générer un verdict pour chaque phrase, le nombre de « verdicts » DEVRAIT ÊTRE STRICTEMENT ÉGAL au nombre de phrases du « résultat attendu ».
**

Réponse attendue:
{expected_output}

Contexte :
{retrieval_context}

JSON :
<|end|>
<|assistant|>
"""

In [None]:
def calculate_score(verdicts):
    """Return the share of verdicts whose "verdict" field is "oui".

    Args:
        verdicts: sequence of mappings, each holding a string under the
            "verdict" key (compared case-insensitively).

    Returns:
        0 when `verdicts` is empty, otherwise the fraction (float between
        0 and 1) of entries whose verdict equals "oui".
    """
    total = len(verdicts)
    if total == 0:
        return 0

    supported = sum(1 for item in verdicts if item["verdict"].lower() == "oui")
    return supported / total

In [None]:
import json

# Generation settings for the judge LLM: no sampling, only the newly generated
# text is returned (the prompt is not echoed back).
generation_args = {
    "max_new_tokens": 2000,
    "return_full_text": False,
    "do_sample": False,
    "batch_size": 1,
}

verdicts = []  # every verdict object collected over the successfully parsed rows
results = []  # one context-recall score per successfully parsed row
# Only the first 5 rows are evaluated.
for _, row in data.head(5).iterrows():
    print(f"context : {row.context} \n question : {row.question} \n answer : {row.answer}")
    final_prompt = prompt_statement_generation.format(
        expected_output=row.answer, retrieval_context=row.context
    )
    generated_answer = pipe(final_prompt, **generation_args)
    answer = generated_answer[0]["generated_text"]
    try:
        # Parsing happens inside the try block: a malformed generation must be
        # reported and skipped, not abort the whole loop.
        d = json.loads(answer)
        # The prompt asks for {"verdicts": [...]}; a bare list is tolerated too.
        row_verdicts = d["verdicts"] if isinstance(d, dict) else d
        score = calculate_score(row_verdicts)
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
        print("error :", e)
        print("issue at parsing : ", answer)
    else:
        # Record the row only once it has been parsed and scored successfully.
        verdicts += row_verdicts
        results.append(score)

### Questions generated by GPT4o 

In [None]:
# Sample questions (stated above to be generated by GPT4o) for a single INSEE
# source page; each entry pairs the page URL ("source") with its "questions".
# NOTE(review): the first and fifth questions are near-duplicates.
[
    {
        "source": "https://www.insee.fr/fr/statistiques/6047789",
        "questions": [
            "Comment les nouvelles mesures sociales et fiscales de 2018 ont-elles globalement affecté le niveau de vie des différentes tranches de population ?",
            "Quels sont les impacts spécifiques de la bascule des cotisations sociales vers la CSG pour les travailleurs par rapport aux retraités ?",
            "Pourquoi l'indice de Gini et d'autres indicateurs d'inégalité varient-ils faiblement malgré les réformes de 2018 ?",
            "En quoi la transformation de l’ISF en IFI et la mise en place du prélèvement forfaitaire unique sur les revenus du patrimoine ont-elles influencé les inégalités économiques en France ?",
            "Comment les nouvelles mesures fiscales et sociales de 2018 ont-elles globalement affecté le niveau de vie des ménages en France ?",
            "Quelles méthodes de microsimulation ont été utilisées pour évaluer les effets des réformes socio-fiscales de 2018, et quels sont leurs avantages et limites ?",
            "Comment les réformes fiscales et sociales de 2018 ont-elles contribué à la réduction (ou à l’augmentation) des inégalités en France ?",
        ],
    }
]