## Test 12/07/2024 - Update Retrieval Evaluation process

Goal:
The previous tests were conducted using a dataset comprising the 100 largest available documents from the raw database. As a result, this high-quality content may not accurately reflect the distribution of data in the entire vector database. While we can observe differences between the configurations, it is challenging to determine which combination is the best choice for our use case among the top configurations.

Two experiments are run:
- add random documents (sampled uniformly) to the base corpus
- add further large documents to the base corpus (using the same extraction procedure, i.e. the top N documents by content size)

What to observe:
- how the retrieval metrics evolve as this noise is added.

Prerequisite: `python src/db_building/insee_data_processing.py` has been run in a terminal before executing this notebook.

In [None]:
import sys

sys.path.append("../src")

import os 
import pandas as pd
import numpy as np
from typing import List 
from collections import Counter
import matplotlib.pyplot as plt 
from typing import List, Dict 
import math 
from tqdm import tqdm 
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from db_building import extract_paragraphs
from doc_building import compute_autokonenizer_chunk_size, build_documents_from_dataframe
from evaluation import RetrievalConfiguration, hist_results, plot_results
from config import MARKDOWN_SEPARATORS, DB_DIR_LOCAL, EMB_DEVICE


In [None]:
# Load the complete dataset; the textual information is assumed to have been extracted already.
# NOTE(review): absolute, machine-specific path — adjust it when running elsewhere.
data = pd.read_csv("/home/onyxia/work/llm-open-data-insee/data_complete.csv", low_memory=False)

In [None]:
# Run the project's paragraph extraction helper on the raw dataframe.
results = extract_paragraphs(data) 

In [None]:
# Turn the extraction output into a DataFrame and save it to disk as CSV (no index column).
ds = pd.DataFrame.from_dict(results)
ds.to_csv("insee_documents.csv", index=False)

In [None]:
# Read the documents back from the CSV written in the previous cell
# (presumably so the notebook can be resumed here without re-running the extraction).
ds = pd.read_csv("insee_documents.csv")

In [None]:
# Preview the first rows of the documents table.
ds.head()

In [None]:
# Question/answer evaluation set, loaded from CSV.
path_qa = "../data/q_and_a_scored_filtered_Phi-3-mini-128k-instruct.csv"
test = pd.read_csv(path_qa)

# Evaluation datasets keyed by name; the evaluation results are later read
# back under the same "the_df_dataset" key.
df_dict = {"the_df_dataset": test}

In [None]:
# Wrap every row of `ds` in a LangChain document: the "paragraphs" column becomes
# the page content and the other listed columns are carried over as metadata.
# `iterrows()` is a generator without a length, so the row count is handed to
# tqdm explicitly; otherwise the progress bar cannot show a total or an ETA.
langchain_docs = [
    LangchainDocument(
        page_content=doc["paragraphs"],
        metadata={
            "source": doc["url_source"],
            "title": doc["title"],
            "insee_id": doc["id_origin"],
            "categories": doc["categories"],
            "date_diffusion": doc["dateDiffusion"],
            "themes": doc["themes"],
            "collections": doc["collections"],
            "libelleAffichageGeo": doc["libelleAffichageGeo"],
            "intertitres": doc["intertitres"],
            "authors": doc["authors"],
            "subtitle": doc["subtitle"],
        },
    )
    for _, doc in tqdm(ds.iterrows(), total=len(ds))
]

In [None]:
def get_top_n_documents_with_largest_content(documents : List[LangchainDocument], n=1000):
    """Return the `n` documents whose `page_content` contains the most words.

    Word counts come from whitespace splitting. The result is ordered from the
    largest document to the smallest; documents with the same word count keep
    their input order (the sort is stable). When fewer than `n` documents are
    supplied, all of them are returned.
    """
    def word_count(document):
        return len(document.page_content.split())

    ranked = sorted(documents, key=word_count, reverse=True)
    return ranked[:n]

In [None]:
sample_langchain_docs = get_top_n_documents_with_largest_content(langchain_docs, n=1000)

# Histogram: word count -> number of sampled documents with exactly that many words.
counters_para = Counter(
    len(document.page_content.split()) for document in sample_langchain_docs
)

# Split the (length, count) pairs, sorted by length, into the two bar-plot series.
lengths_sorted = sorted(counters_para.items())
lengths, counts = zip(*lengths_sorted)

# Bar plot of the distribution on log-log axes.
plt.figure(figsize=(10, 6))
plt.bar(lengths, counts, color='skyblue')
plt.xlabel('Paragraph Length (number of words)')
plt.ylabel('Number of Paragraphs')
plt.title(f'Paragraph Length Distribution ({len(sample_langchain_docs)} docs)')
plt.xticks(rotation=45)
plt.xscale("log")
plt.yscale("log")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import chromadb


def _select_documents(langchain_docs, mode, top_n, base_n):
    """Choose the documents to index for one experiment run."""
    if mode == "top":
        return get_top_n_documents_with_largest_content(langchain_docs, n=top_n)
    if mode != "random":
        raise ValueError(f"Unknown mode {mode!r}: expected 'random' or 'top'.")

    # Base corpus: the largest documents (never more than the requested total).
    base_documents = get_top_n_documents_with_largest_content(
        langchain_docs, n=max(min(base_n, top_n), 0)
    )

    # Draw the extra documents only among those NOT already in the base corpus,
    # and without replacement, so the sample holds no duplicated document.
    base_ids = {id(doc) for doc in base_documents}
    candidates = [doc for doc in langchain_docs if id(doc) not in base_ids]
    n_extra = min(top_n - len(base_documents), len(candidates))
    if n_extra <= 0:
        return base_documents

    # Local generator with a fixed seed: reproducible draws without touching
    # NumPy's global random state.
    rng = np.random.RandomState(42)
    picked = rng.choice(len(candidates), size=n_extra, replace=False)
    return base_documents + [candidates[i] for i in picked]


def from_docs_to_vector_database(langchain_docs: List[LangchainDocument], mode: str = "random", top_n = 100, config: RetrievalConfiguration = None, base_n: int = 100):
    """Chunk, embed and store a sample of documents in a persistent Chroma collection.

    Args:
        langchain_docs: Full list of documents to sample from.
        mode: "top" keeps the `top_n` documents with the most words; "random"
            keeps the `base_n` largest documents and completes the sample with
            documents drawn uniformly (seed 42, without replacement) from the
            remaining ones, up to `top_n` documents in total.
        top_n: Total number of documents to index.
        config: Retrieval configuration; its "embedding_model_name" and
            "collection" entries are read through `config.get`.
        base_n: Size of the fixed base corpus used in "random" mode.

    Returns:
        The Chroma vector store returned for the last inserted batch, or None
        when there was nothing to insert.

    Raises:
        ValueError: If `config` is None or `mode` is not "random" or "top".

    NOTE(review): the collection lives in a persistent directory; presumably
    re-running with the same collection name adds to what is already stored —
    confirm before re-running a cell.
    """
    if config is None:
        raise ValueError("A RetrievalConfiguration must be provided (got config=None).")

    sample_langchain_docs = _select_documents(langchain_docs, mode, top_n, base_n)

    autokenizer, chunk_size, chunk_overlap = compute_autokonenizer_chunk_size(config.get("embedding_model_name"))

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        autokenizer,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=MARKDOWN_SEPARATORS,
    )
    docs_processed = text_splitter.split_documents(sample_langchain_docs)

    # Remove duplicated chunks, keeping the first occurrence of each text.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    embedding_model = HuggingFaceEmbeddings(  # load from sentence transformers
        model_name=config.get("embedding_model_name"),
        model_kwargs={"device": EMB_DEVICE, "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
        show_progress=False,
    )

    # A client instance is required to query the batch limit (the class object
    # itself has no such attribute). The same client is handed to Chroma below
    # so a single client is opened on the persistence directory.
    chroma_client = chromadb.PersistentClient(path=str(DB_DIR_LOCAL))
    max_batch_size = chroma_client.get_max_batch_size()

    # Insert in batches no larger than what the client accepts in one call.
    db = None
    for i in range(0, len(docs_processed_unique), max_batch_size):
        batch_docs = docs_processed_unique[i:i + max_batch_size]
        db = Chroma.from_documents(
            collection_name=config.get("collection"),
            documents=batch_docs,
            persist_directory=DB_DIR_LOCAL,
            embedding=embedding_model,
            client=chroma_client,
        )
    return db
    

In [None]:
# Corpus sizes (number of documents) used for the random-injection experiment.
list_nb_docs = [10000, 15000, 20000, 30000, 35000, 40000]
# One retrieval configuration per corpus size: same embedding model, no reranker,
# and a dedicated collection whose name carries the corpus size.
# NOTE(review): k_values has no 40, unlike the reranker configurations defined
# further down in this notebook — confirm whether that is intentional.
list_config = [ 
    RetrievalConfiguration(
        name=f'test_docs_{nb_docs}',
        database="chromadb",
        collection=f"Solon-embeddings-large-0.1_docs_{nb_docs}",
        database_path=None,
        embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
        reranker_type=None,
        reranker_name=None,
        rerank_k=50,
        k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 45, 50]
        )
        for nb_docs in list_nb_docs
]

In [None]:
# Build one vector collection per configuration in "random" mode, pairing each
# configuration with its target corpus size.
for (config, nb_doc) in zip(list_config, list_nb_docs):
    from_docs_to_vector_database(langchain_docs=langchain_docs, mode="random", top_n=nb_doc, config=config)

In [None]:
from evaluation import RetrievalEvaluator

# Evaluate every configuration on the datasets in df_dict; the results are
# read back below under the dataset name ("the_df_dataset").
results = RetrievalEvaluator.run(
    eval_configurations=list_config,
    eval_dict=df_dict,  # Ensure 'df_dict' is a dictionary containing pandas DataFrames with the required structure
)

### Experiment : Random Documents Injection (seed 42)

In [None]:
# Bar-style comparison of the configurations (one entry per configuration name)
# on recall, MRR, nDCG and runtime — presumably evaluated at cutoff k = 5.
hist_results(
    list_config, 
    results["the_df_dataset"], 
    ir_metrics=['recall', 'mrr', 'ndcg','runtime'], 
    focus="name", 
    title = "Evaluating Retriever Performance on Multiple Dataset Sizes Using Solon Embedding Model and Random Document Selection (Seed 42)",
    k = 5)

In [None]:
# Metric curves per configuration name for recall, MRR, nDCG and precision —
# presumably plotted over the cutoffs up to k = 50.
plot_results(
    list_config, 
    results["the_df_dataset"], 
    ir_metrics=['recall', 'mrr', 'ndcg', 'precision'], 
    focus="name", 
    title = "Retriever performances on multiple size dataset",
    k = 50)

### Experiment : On Full Dataset

In [None]:
# Reranker checkpoints to compare, grouped by reranker family.
cross_encoders = [
    "BAAI/bge-reranker-v2-m3",
    "antoinelouis/crossencoder-camembert-large-mmarcoFR",
    "BAAI/bge-reranker-base",
]
colberts = [
    "bclavie/FraColBERTv2",
    "antoinelouis/colbertv2-camembert-L4-mmarcoFR",
]


def _reranker_configuration(name, reranker_type, reranker_name):
    """Build a configuration on the "insee_data" collection; only the reranker settings vary."""
    return RetrievalConfiguration(
        name=name,
        database="chromadb",
        collection="insee_data",
        database_path=None,
        embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
        reranker_type=reranker_type,
        reranker_name=reranker_name,
        k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
        rerank_k=50,
    )


# Order: cross-encoders, then ColBERT models, then the reranker-free baseline.
colbert_vs_cross_encoder_eval_config = [
    *(
        _reranker_configuration(f'cross_encoder_{i}', "Cross-encoder", model_name)
        for i, model_name in enumerate(cross_encoders)
    ),
    *(
        _reranker_configuration(f'colbert_{i}', "ColBERT", model_name)
        for i, model_name in enumerate(colberts)
    ),
    _reranker_configuration("baseline", None, None),
]
        

In [None]:
# Run the evaluator
results_colbert_vs_cross_encoder = RetrievalEvaluator.run(
    eval_configurations = colbert_vs_cross_encoder_eval_config,
    eval_dict=df_dict,  # Ensure 'df_dict' is a dictionary containing pandas DataFrames with the required structure
)

In [None]:
# Run the evaluator
plot_results(
    colbert_vs_cross_encoder_eval_config, 
    results_colbert_vs_cross_encoder["the_df_dataset"], 
    ir_metrics=['recall', 'mrr', 'ndcg', 'precision'], 
    focus= "reranker_name", 
    title = "Performance of Rerankers on Full INSEE Dataset (~40,000 Docs) using Solon-Embeddings-Large-0.1 model embedding model",
    k = 50,
    cmap_name = "tab10"
    )

In [None]:
# Run the evaluator
hist_results(
    colbert_vs_cross_encoder_eval_config, 
    results_colbert_vs_cross_encoder["the_df_dataset"], 
    ir_metrics=['recall', 'mrr', 'ndcg',"runtime"], 
    focus = "reranker_name", 
    title = "Performance of Rerankers on Full INSEE Dataset (~40,000 Docs) using Solon-Embeddings-Large-0.1 model embedding model",
    k = 5,
    cmap_name = "tab10",
    x_min=0.6
    )

In [None]:
import copy
import numbers


def compute_performance(results):
    """Express every configuration's metrics as a percentage change over the baseline.

    Args:
        results: Mapping ``config_name -> {metric -> value}`` where each value
            is either a number or a ``{k -> number}`` mapping. It must contain
            a "baseline" entry to compare against.

    Returns:
        A deep copy of `results` in which every numeric value of the
        non-baseline configurations is replaced by
        ``(value - baseline) / baseline * 100``. The "baseline" entry itself is
        copied unchanged (raw values, not percentages). A comparison against a
        baseline value of 0 is undefined and yields NaN instead of raising
        ZeroDivisionError. Returns None when `results` has no "baseline" entry.
        The input mapping is never modified.
    """
    if "baseline" not in results:
        return None

    def relative_gain(value, reference):
        # Percentage change is undefined against a zero reference.
        if reference == 0:
            return float("nan")
        return ((value - reference) / reference) * 100

    def is_number(value):
        # Accept ints and NumPy scalars as well as floats; bool is excluded
        # because it is a flag, not a metric.
        return isinstance(value, numbers.Real) and not isinstance(value, bool)

    result_baseline = results["baseline"]
    results_percentages = copy.deepcopy(results)  # keep the caller's data intact

    for config_name, config_res in results.items():
        if config_name == "baseline":
            continue  # the baseline is the reference, not a candidate
        for metric, value in config_res.items():
            if isinstance(value, dict):
                # Per-cutoff metric: compare each k with the baseline at the same k.
                for k, res in value.items():
                    results_percentages[config_name][metric][k] = relative_gain(
                        res, result_baseline[metric][k]
                    )
            elif is_number(value):
                results_percentages[config_name][metric] = relative_gain(
                    value, result_baseline[metric]
                )

    return results_percentages


In [None]:
# Convert the reranker results into percentage changes relative to the "baseline" configuration.
results_percentages_over_baseline = compute_performance(results=results_colbert_vs_cross_encoder["the_df_dataset"])

In [None]:
# Same reranker plot as before, but on the values relative to the baseline.
# NOTE(review): compute_performance leaves the "baseline" entry as raw values,
# so its curve is not on the same percentage scale as the others — confirm how
# plot_results is expected to display it.
plot_results(
    colbert_vs_cross_encoder_eval_config, 
    results_percentages_over_baseline, 
    ir_metrics=['recall', 'mrr', 'ndcg', 'precision'], 
    focus= "reranker_name", 
    title = "Performance of Rerankers on Full INSEE Dataset (~40,000 Docs) using Solon-Embeddings-Large-0.1 model embedding model \n(% over the baseline)",
    k = 50,
    cmap_name = "tab10"
    )

### Experiment : Longest Documents Injection

In [None]:
# Corpus sizes for the longest-documents experiment: 100 to 1000 in steps of 100.
list_nb_docs = list(range(100, 1001, 100))

In [None]:
# One retrieval configuration per corpus size: same embedding model, no reranker,
# and a dedicated collection whose name carries the corpus size.
# NOTE(review): k_values has no 40, unlike the reranker configurations defined
# earlier in this notebook — confirm whether that is intentional.
list_config = [ 
    RetrievalConfiguration(
        name=f'test_docs_{nb_docs}',
        database="chromadb",
        collection=f"Solon-embeddings-large-0.1_docs_{nb_docs}",
        database_path=None,
        embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
        reranker_type=None,
        reranker_name=None,
        rerank_k=50,
        k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 45, 50]
        )
        for nb_docs in list_nb_docs
]

# Build one vector collection per configuration in "top" mode, i.e. from the
# nb_doc documents with the most words.
for (config, nb_doc) in zip(list_config, list_nb_docs):
    from_docs_to_vector_database(langchain_docs=langchain_docs, mode="top", top_n=nb_doc, config=config)

In [None]:
from evaluation import RetrievalEvaluator

# Evaluate the longest-documents configurations on the datasets in df_dict.
# Note that this rebinds `results`, replacing the earlier evaluation output.
results = RetrievalEvaluator.run(
    eval_configurations=list_config,
    eval_dict=df_dict,  # Ensure 'df_dict' is a dictionary containing pandas DataFrames with the required structure
)

In [None]:
# Bar-style comparison of the configurations (one entry per configuration name)
# on recall, MRR, nDCG and runtime — presumably evaluated at cutoff k = 5.
hist_results(
    list_config, 
    results["the_df_dataset"], 
    ir_metrics=['recall', 'mrr', 'ndcg','runtime'], 
    focus="name", 
    title = "Retriever performances on multiple size dataset (longest documents in Insee Data)",
    k = 5)

In [None]:
# Metric curves per configuration name for recall, MRR, nDCG and precision —
# presumably plotted over the cutoffs up to k = 50.
plot_results(
    list_config, 
    results["the_df_dataset"], 
    ir_metrics=['recall', 'mrr', 'ndcg','precision'], 
    focus="name", 
    title = "Retriever performances on multiple size dataset \n (longest documents in Insee Data)",
    k = 50)