In [None]:
import sys

# Make the project's ../src directory importable (the notebook later imports from `evaluation`).
# Guarded so that re-running this cell does not keep appending duplicate entries to sys.path.
if "../src" not in sys.path:
    sys.path.append("../src")

### Load Q&A test data

In [None]:
# Copy the scored and filtered Q&A evaluation CSV from the S3 bucket into ../data using the `mc` CLI.
!mc cp s3/projet-llm-insee-open-data/data/eval_data/eval_retrieval/q_and_a_scored_filtered_Phi-3-mini-128k-instruct.csv ../data/q_and_a_ref_retrieval_evaluation_Phi-3-mini-128k-instruct.csv

### Load Knowledge Data

In [None]:
# Copy the sampled INSEE documents CSV (the knowledge data) from the S3 bucket into ../data using the `mc` CLI.
!mc cp s3/projet-llm-insee-open-data/data/eval_data/eval_retrieval/insee_documents_sample_ref_retrieval_evaluation.csv ../data/insee_documents_sample_ref_retrieval_evaluation.csv

In [None]:
import pandas as pd 

# Question/answer pairs used as the retrieval evaluation set.
path_qa = "../data/q_and_a_ref_retrieval_evaluation_Phi-3-mini-128k-instruct.csv"
test = pd.read_csv(path_qa)

# Evaluation datasets keyed by name; the same key is used later to index the evaluator's results.
df_dict = {"the_df_dataset": test}

# Knowledge data: the sampled documents the retriever searches.
path_knowledge = "../data/insee_documents_sample_ref_retrieval_evaluation.csv"
data = pd.read_csv(path_knowledge)

In [None]:
# Preview the first rows of the Q&A set, then of the knowledge data.
display(test.head(), data.head())

### About test data 

In [None]:
# Number of distinct values in each column of the knowledge data.
data.nunique()

In [None]:
# Number of missing values in each column of the knowledge data.
data.isna().sum()

In [None]:
# Run the evaluator
# NOTE(review): in the notebook as visible here, `RetrievalEvaluator` is only imported and
# `eval_configs` only assigned in the "Test Cross-encoder" section further down, so this cell
# raises NameError on a fresh kernel (Restart & Run All). Move it below that section or remove it.
results = RetrievalEvaluator.run(
    eval_configurations=eval_configs,
    eval_dict=df_dict,  # Ensure 'df_dict' is a dictionary containing pandas DataFrames with the required structure
)


In [None]:
# NOTE(review): `eval_configs` is first assigned in the "Test Cross-encoder" section below,
# so this inspection cell only works after out-of-order execution.
eval_configs

## Test 03/07/2024 - Update Retrieval Evaluation process

In [None]:
import matplotlib.pyplot as plt 
from typing import List, Dict 
import math
from evaluation import RetrievalConfiguration

def plot_results(eval_configs: List[RetrievalConfiguration], results: Dict[str, Dict[str, Dict[int, float]]],
                    ir_metrics: List[str] = ['recall', 'precision', 'mrr', 'ndcg'], focus: str = None,
                    title: str = "", k: int = 15, cmap_name: str = "tab10"):
    """
    Plot IR metrics against k for several retrieval configurations.

    One subplot is drawn per metric, laid out on a near-square grid; each
    configuration contributes one line per subplot. Grid cells that are not
    needed (e.g. the 4th cell when 3 metrics are requested) are removed.

    Parameters:
    - eval_configs: List of RetrievalConfiguration objects. Each one is read for
                    `name`, `k_values` and `get(focus)`.
    - results: Dictionary of results where the keys are configuration names and values are
               dictionaries mapping a metric name to a {k: value} dictionary.
    - ir_metrics: List of metrics to plot (default is ['recall', 'precision', 'mrr', 'ndcg']).
                  Nothing is drawn when the list is empty.
    - focus: The name of the config parameter to show in the legend. When `config.get(focus)`
             is None the configuration name is used. Only the part after the last "/" is shown.
    - title: The title of the figure.
    - k: The maximum value of k to plot.
    - cmap_name: Name of the matplotlib colormap the line colors are taken from.
    """
    num_metrics = len(ir_metrics)
    if num_metrics == 0:
        # Nothing to plot; also avoids a division by zero in the grid computation below.
        return

    # Near-square grid that is large enough for all metrics.
    num_cols = math.ceil(math.sqrt(num_metrics))
    num_rows = math.ceil(num_metrics / num_cols)

    # squeeze=False always returns a 2-D array of Axes, so a single metric needs no special case.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, 10), squeeze=False)
    axes = axes.flatten()

    cmap = plt.get_cmap(cmap_name)

    for i, metric in enumerate(ir_metrics):
        ax = axes[i]
        # Union of the k values actually drawn on this subplot, used for the x ticks.
        plotted_k = set()

        for j, config in enumerate(eval_configs):
            config_results = results.get(config.name, {})
            metric_values = config_results.get(metric, {})

            k_values = [key for key in config.k_values if key <= k]
            # Missing k values stay None, which matplotlib draws as a gap in the line.
            values = [metric_values.get(ki, None) for ki in k_values]
            plotted_k.update(k_values)

            label = config.get(focus)
            if label is None:
                label = config.name

            # Wrap the index so that more configurations than palette entries cycle through
            # the palette instead of all being clipped to its last color.
            ax.plot(k_values, values, marker='o', label=str(label).split("/")[-1],
                    color=cmap(j % cmap.N))

        ax.set_xlabel('k')
        ax.set_ylabel(metric.capitalize())
        ax.set_title(f'{metric.upper()} vs k')
        ax.set_xticks(sorted(plotted_k))
        ax.grid(True)
        if eval_configs:
            # Skip the legend when no line was drawn (matplotlib would only emit a warning).
            ax.legend()

    # Hide any unused subplots
    for unused_ax in axes[num_metrics:]:
        fig.delaxes(unused_ax)

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.suptitle(title, fontsize=16, y=1.02)
    plt.show()
    
def hist_results(eval_configs: List[RetrievalConfiguration], results: Dict[str, Dict[str, Dict[int, float]]],
                 ir_metrics: List[str] = ['recall', 'precision', 'mrr', 'ndcg'], focus: str = None,
                 title: str = "", k: int = 15, cmap_name : str = 'tab10', x_min=0.6):
    """
    Draw horizontal bar charts of IR metrics for several retrieval configurations at a given k.

    One subplot is drawn per metric, with one bar per configuration. A metric whose stored
    value is a {k: value} dictionary is read at `k`; any other stored value is plotted as is.
    Configurations with no value for the metric are skipped.

    Parameters:
    - eval_configs: List of RetrievalConfiguration objects. Each one is read for
                    `name` and `get(focus)`.
    - results: Dictionary of results where the keys are configuration names and values are
               dictionaries of metrics.
    - ir_metrics: List of metrics to plot (default is ['recall', 'precision', 'mrr', 'ndcg']).
                  Nothing is drawn when the list is empty.
    - focus: The name of the config parameter used as bar label. When `config.get(focus)`
             is None the configuration name is used. Only the part after the last "/" is shown.
    - title: The title of the figure.
    - k: The value of k to plot the bars for.
    - cmap_name: Name of the matplotlib colormap the bar colors are taken from.
    - x_min: Left limit of the x axis, applied to every subplot.
    """
    num_metrics = len(ir_metrics)
    if num_metrics == 0:
        # Nothing to plot; also avoids a division by zero in the grid computation below.
        return

    num_cols = math.ceil(math.sqrt(num_metrics))
    num_rows = math.ceil(num_metrics / num_cols)

    # squeeze=False always returns a 2-D array of Axes, so a single metric needs no special case.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(14, 10), squeeze=False)
    axes = axes.flatten()

    cmap = plt.get_cmap(cmap_name)

    for i, metric in enumerate(ir_metrics):
        ax = axes[i]

        for j, config in enumerate(eval_configs):
            config_results = results.get(config.name, {})
            metric_values = config_results.get(metric, {})
            if isinstance(metric_values, dict):
                value = metric_values.get(k, None)
            else:
                # Metric stored as a single value rather than per k.
                value = metric_values

            if value is None:
                continue

            label = config.get(focus)
            if label is None:
                label = config.name

            # Wrap the index so that more configurations than palette entries cycle through
            # the palette instead of all being clipped to its last color.
            ax.barh(str(label).split("/")[-1], value, color=cmap(j % cmap.N))

        ax.set_xlabel('Value')
        ax.set_ylabel('Configuration')
        # The same left limit is used for every metric, including ones not bounded by 1.
        ax.set_xlim(left=x_min)
        ax.set_title(f'{metric.upper()} at k={k}')
        ax.grid(True)

    # Hide any unused subplots
    for unused_ax in axes[num_metrics:]:
        fig.delaxes(unused_ax)

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.suptitle(title, fontsize=16, y=1.02)
    plt.show()
    

### Test Cross-encoder

In [None]:
from evaluation import RetrievalConfiguration, RetrievalEvaluator

# Cross-encoder rerankers to compare.
cross_encoders = [
    "BAAI/bge-reranker-v2-m3",
    "antoinelouis/crossencoder-camembert-large-mmarcoFR",
    "dangvantuan/CrossEncoder-camembert-large",
    "antoinelouis/crossencoder-electra-base-french-mmarcoFR",
    "BAAI/bge-reranker-base",
]

# One configuration per cross-encoder, followed by a baseline without any reranker.
eval_configs = [
    *(
        RetrievalConfiguration(
            name=f'test_cross_encoder_{idx}',
            database="chromadb",
            collection="Solon-embeddings-large-0.1_512_51",
            database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
            embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
            reranker_type="Cross-encoder",
            reranker_name=reranker,
            rerank_k=50,
            k_values=[1, 2, 3, 5, 10, 20, 30, 50],
        )
        for idx, reranker in enumerate(cross_encoders)
    ),
    RetrievalConfiguration(
        name='test_baseline',
        database="chromadb",
        collection="Solon-embeddings-large-0.1_512_51",
        database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
        embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
        reranker_type=None,
        reranker_name=None,
        rerank_k=50,
        k_values=[1, 2, 3, 5, 10, 20, 30, 50],
    ),
]

# Run the evaluator; the result is later indexed by dataset key, then by configuration name.
results = RetrievalEvaluator.run(
    eval_configurations=eval_configs,
    eval_dict=df_dict,  # dictionary of pandas DataFrames keyed by dataset name
)

In [None]:
# Metric-vs-k curves for the cross-encoder configurations and the baseline, for k up to 30.
plot_results(
    eval_configs,
    results["the_df_dataset"],
    ir_metrics=['recall', 'precision', 'mrr', 'ndcg'],
    focus="reranker_name",
    title="Rerankers",
    k=30,
)

In [None]:
# Bar charts at k=5 for the cross-encoder configurations and the baseline, runtime included.
hist_results(
    eval_configs,
    results["the_df_dataset"],
    ir_metrics=['recall', 'mrr', 'ndcg', 'runtime'],
    focus="reranker_name",
    title="Rerankers",
    k=5,
)

### Test ColBERTs

In [None]:
# ColBERT rerankers to compare.
colberts = [
    "colbert-ir/colbertv2.0",
    "bclavie/FraColBERTv2",
    "antoinelouis/colbertv2-camembert-L4-mmarcoFR",
    "antoinelouis/colbertv1-camembert-base-mmarcoFR",
]

# One configuration per ColBERT model, followed by a baseline without any reranker.
eval_configs_colbert = [
    *(
        RetrievalConfiguration(
            name=f'test_colBERT_{idx}',
            database="chromadb",
            collection="Solon-embeddings-large-0.1_512_51",
            database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
            embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
            reranker_type="ColBERT",
            reranker_name=reranker,
            rerank_k=50,
            k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 45, 50],
        )
        for idx, reranker in enumerate(colberts)
    ),
    RetrievalConfiguration(
        name='test_baseline',
        database="chromadb",
        collection="Solon-embeddings-large-0.1_512_51",
        database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
        embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
        reranker_type=None,
        reranker_name=None,
        rerank_k=50,
        k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 45, 50],
    ),
]

# Run the evaluator; the result is later indexed by dataset key, then by configuration name.
results_colbert = RetrievalEvaluator.run(
    eval_configurations=eval_configs_colbert,
    eval_dict=df_dict,  # dictionary of pandas DataFrames keyed by dataset name
)

In [None]:
# Metric-vs-k curves for the ColBERT configurations and the baseline, for k up to 10.
plot_results(
    eval_configs_colbert,
    results_colbert["the_df_dataset"],
    ir_metrics=['recall', 'precision', 'mrr', 'ndcg'],
    focus="reranker_name",
    title="Rerankers (Solon-embeddings-large-0.1)",
    k=10,
)

In [None]:
# Inspect the ColBERT configuration objects that were evaluated.
eval_configs_colbert

In [None]:
# Inspect the raw results of one ColBERT configuration (index 2 in `colberts`) on the first dataset.
results_colbert["the_df_dataset"]["test_colBERT_2"]

In [None]:
# Bar charts at k=5 for the ColBERT configurations and the baseline, runtime included.
hist_results(
    eval_configs_colbert,
    results_colbert["the_df_dataset"],
    ir_metrics=['recall', 'mrr', 'ndcg', 'runtime'],
    focus="reranker_name",
    title="Rerankers (Solon-embeddings-large-0.1)",
    k=5,
)

### Test BM25

In [None]:
# BM25 reranking, the baseline without reranker, and one metadata-based reranking per metadata field.
eval_configs_bm25_metadata = [
    RetrievalConfiguration(
        name='test_BM25',
        database="chromadb",
        collection="Solon-embeddings-large-0.1_512_51",
        database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
        embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
        reranker_type="BM25",
        reranker_name=None,
        rerank_k=50,
        k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 45, 50]
    ),
    RetrievalConfiguration(
        name='test_baseline',
        database="chromadb",
        collection="Solon-embeddings-large-0.1_512_51",
        database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
        embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
        reranker_type=None,
        reranker_name=None,
        rerank_k=50,
        k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 45, 50]
    )
    ] + [
        RetrievalConfiguration(
            # The name must differ per metadata field: the plotting helpers look results up by
            # configuration name, so a shared name would make all five metadata runs collide.
            name=f'test_metadata_{meta}',
            database="chromadb",
            collection="Solon-embeddings-large-0.1_512_51",
            database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
            embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
            reranker_type="Metadata",
            use_metadata=meta,
            reranker_name=None,
            rerank_k=50,
            k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 45, 50])
         for meta in ['title', "intertitres", "themes","subtitle", "libelleAffichageGeo"]
    ]

# Run the evaluator; the result is later indexed by dataset key, then by configuration name.
results_bm25_metadata = RetrievalEvaluator.run(
    eval_configurations=eval_configs_bm25_metadata,
    eval_dict=df_dict,  # Ensure 'df_dict' is a dictionary containing pandas DataFrames with the required structure
)

In [None]:
# Metric-vs-k curves for BM25, the baseline and the metadata rerankers, for k up to 10.
plot_results(
    eval_configs_bm25_metadata,
    results_bm25_metadata["the_df_dataset"],
    ir_metrics=['recall', 'precision', 'mrr', 'ndcg'],
    focus="use_metadata",
    title="Rerankers (Solon-embeddings-large-0.1)",
    k=10,
)

In [None]:
# Bar charts at k=5 for BM25, the baseline and the metadata rerankers, runtime included.
hist_results(
    eval_configs_bm25_metadata,
    results_bm25_metadata["the_df_dataset"],
    ir_metrics=['recall', 'precision', 'mrr', 'ndcg', 'runtime'],
    focus="use_metadata",
    title="Rerankers (Solon-embeddings-large-0.1)",
    k=5,
)

### Test Embeddings 

In [None]:
from evaluation import RetrievalConfiguration, RetrievalEvaluator
# Embedding models to compare, without any reranking.
model_embeddings = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "manu/sentence_croissant_alpha_v0.4",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "OrdalieTech/Solon-embeddings-large-0.1",
    "intfloat/multilingual-e5-large",
    "Lajavaness/bilingual-embedding-large",
]

# One configuration per embedding model; the collection is named after the part of the
# model name that follows the last "/".
eval_configs_embeddings = [
    RetrievalConfiguration(
        name=f'test_embeddings_{idx}',
        database="chromadb",
        collection=embedding_model.split("/")[-1],
        database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
        embedding_model_name=embedding_model,
        reranker_type=None,
        reranker_name=None,
        rerank_k=50,
        k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 45, 50],
    )
    for idx, embedding_model in enumerate(model_embeddings)
]

# Run the evaluator; the result is later indexed by dataset key, then by configuration name.
results_embeddings = RetrievalEvaluator.run(
    eval_configurations=eval_configs_embeddings,
    eval_dict=df_dict,  # dictionary of pandas DataFrames keyed by dataset name
)

In [None]:
# Metric-vs-k curves for each embedding model, for k up to 50.
plot_results(
    eval_configs_embeddings,
    results_embeddings["the_df_dataset"],
    ir_metrics=['recall', 'precision', 'mrr', 'ndcg'],
    focus="embedding_model_name",
    title="Embedding models",
    k=50,
)

In [None]:
# Bar charts at k=5 for each embedding model, runtime included.
hist_results(
    eval_configs_embeddings,
    results_embeddings["the_df_dataset"],
    ir_metrics=['recall', 'precision', 'mrr', 'ndcg', 'runtime'],
    focus="embedding_model_name",
    title="Embedding models",
    k=5,
)

### Ensemble methods 

In [None]:
def generate_sequence(start, step, count, ndigits=10):
    """
    Generate a sequence of tuples (x, y) such that x + y = 1.

    x takes the values start, start + step, ..., start + (count - 1) * step and
    y is 1 - x. Both values are rounded to `ndigits` decimal places so that binary
    floating-point noise (e.g. 3 * 0.1 == 0.30000000000000004) does not leak into
    the pairs or into any label built from them.

    Parameters:
    start (float): Starting value for the sequence
    step (float): Increment value for each step in the sequence
    count (int): Number of tuples to generate
    ndigits (int): Number of decimal places kept for x and y (default 10)

    Returns:
    list: A list of tuples (x, y); empty when count is 0
    """
    sequence = []
    for i in range(count):
        # Multiply rather than accumulate so rounding errors do not build up across steps.
        x = round(start + i * step, ndigits)
        y = round(1 - x, ndigits)
        sequence.append((x, y))
    return sequence

# Example usage: ten (x, y) pairs, with x starting at 0.0 and increasing by 0.1.
start, step, count = 0.0, 0.1, 10
list_tuples = generate_sequence(start, step, count)
print(list_tuples)

In [None]:

# Baseline without reranker, followed by one ensemble configuration per weight pair in
# `list_tuples`: the first weight goes to the cross-encoder, the second to BM25.
ensemble_eval_config = [
    RetrievalConfiguration(
        name='baseline',
        database="chromadb",
        collection="all-MiniLM-L6-v2",
        database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
        embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
        reranker_type=None,
        reranker_name=None,
        k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
        rerank_k=50,
    ),
    *(
        RetrievalConfiguration(
            name=f'ensemble_{cross_weight}_{bm25_weight}',
            database="chromadb",
            collection="all-MiniLM-L6-v2",
            database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
            embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
            reranker_type="Ensemble",
            param_ensemble=[
                {
                    "reranker_type": "Cross-encoder",
                    "reranker_name": "BAAI/bge-reranker-base",
                    "reranker_weight": cross_weight,
                },
                {
                    "reranker_type": "BM25",
                    "reranker_name": None,
                    "reranker_weight": bm25_weight,
                },
            ],
            k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
            rerank_k=50,
        )
        for cross_weight, bm25_weight in list_tuples
    ),
]

# Run the evaluator; the result is later indexed by dataset key, then by configuration name.
results_ensemble = RetrievalEvaluator.run(
    eval_configurations=ensemble_eval_config,
    eval_dict=df_dict,  # dictionary of pandas DataFrames keyed by dataset name
)


In [None]:
# Metric-vs-k curves for the baseline and every ensemble weighting, for k up to 15.
plot_results(
    ensemble_eval_config,
    results_ensemble["the_df_dataset"],
    ir_metrics=['recall', 'precision', 'mrr', 'ndcg'],
    focus="reranker_name",
    title="Embedding (all-MiniLM-L6-v2) Ensemble (bge-reranker-base + BM25)",
    k=15,
    cmap_name="tab20",
)

In [None]:
# Bar charts at k=5 for the baseline and every ensemble weighting.
hist_results(
    ensemble_eval_config,
    results_ensemble["the_df_dataset"],
    ir_metrics=['recall', 'mrr', 'ndcg'],
    focus="reranker_name",
    title="Ensemble",
    k=5,
    cmap_name="tab20",
)

### Multiple datasets

In [None]:
# Second question/answer evaluation set, registered in df_dict under its own key.
# `test` is rebound here; df_dict["the_df_dataset"] still refers to the first DataFrame.
# NOTE(review): this file name reads "q_and_s" whereas the first dataset uses "q_and_a", and no
# cell visible in this notebook downloads it — confirm the path is not a typo.
path_qa = "../data/q_and_s_ref_retrieval_evaluation_Phi-3-mini-128k-instruct.csv"
test = pd.read_csv(path_qa)

df_dict["the_df_dataset_big"] = test

In [None]:
# Shortlisted rerankers of each family for the comparison across datasets.
cross_encoders = [
    "BAAI/bge-reranker-v2-m3",
    "antoinelouis/crossencoder-camembert-large-mmarcoFR",
    "BAAI/bge-reranker-base",
]
colberts = [
    "bclavie/FraColBERTv2",
    "antoinelouis/colbertv2-camembert-L4-mmarcoFR",
]

# Cross-encoder configurations first, then ColBERT configurations.
colbert_vs_cross_encoder_eval_config = [
    *(
        RetrievalConfiguration(
            name=f'cross_encoder_{idx}',
            database="chromadb",
            collection="Solon-embeddings-large-0.1",
            database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
            embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
            reranker_type="Cross-encoder",
            reranker_name=reranker,
            k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
            rerank_k=50,
        )
        for idx, reranker in enumerate(cross_encoders)
    ),
    *(
        RetrievalConfiguration(
            name=f'colbert_{idx}',
            database="chromadb",
            collection="Solon-embeddings-large-0.1",
            database_path="../data/insee_documents_sample_ref_retrieval_evaluation.csv",
            embedding_model_name="OrdalieTech/Solon-embeddings-large-0.1",
            reranker_type="ColBERT",
            reranker_name=reranker,
            k_values=[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
            rerank_k=50,
        )
        for idx, reranker in enumerate(colberts)
    ),
]

# Run the evaluator; the result is later indexed by dataset key, then by configuration name.
results_colbert_vs_cross_encoder = RetrievalEvaluator.run(
    eval_configurations=colbert_vs_cross_encoder_eval_config,
    eval_dict=df_dict,  # dictionary of pandas DataFrames keyed by dataset name
)

In [None]:
# Metric-vs-k curves on the first dataset ("the_df_dataset"), for k up to 5.
plot_results(
    colbert_vs_cross_encoder_eval_config,
    results_colbert_vs_cross_encoder["the_df_dataset"],
    ir_metrics=['recall', 'mrr', 'ndcg'],
    focus="reranker_name",
    title="Reranker Dataset 77",
    k=5,
    cmap_name="tab10",
)

In [None]:
# Metric-vs-k curves on the second dataset ("the_df_dataset_big"), for k up to 5.
plot_results(
    colbert_vs_cross_encoder_eval_config,
    results_colbert_vs_cross_encoder["the_df_dataset_big"],
    ir_metrics=['recall', 'mrr', 'ndcg'],
    focus="reranker_name",
    title="Embedding (Solon-embeddings-large-0.1) Reranker",
    k=5,
    cmap_name="tab10",
)

In [None]:
# Bar charts at k=5 on the first dataset ("the_df_dataset"), with the x axis starting at 0.8.
hist_results(
    colbert_vs_cross_encoder_eval_config,
    results_colbert_vs_cross_encoder["the_df_dataset"],
    ir_metrics=['recall', 'mrr', 'ndcg', "runtime"],
    focus="reranker_name",
    title="Reranker Dataset 77",
    k=5,
    cmap_name="tab10",
    x_min=0.8,
)

In [None]:
# Bar charts at k=5 on the second dataset ("the_df_dataset_big"), with the x axis starting at 0.8.
hist_results(
    colbert_vs_cross_encoder_eval_config,
    results_colbert_vs_cross_encoder["the_df_dataset_big"],
    ir_metrics=['recall', 'mrr', 'ndcg', "runtime"],
    focus="reranker_name",
    title="Reranker Dataset 100",
    k=5,
    cmap_name="tab10",
    x_min=0.8,
)

## Test 12/07/2024 - Update Retrieval Evaluation process

Goal:
The previous tests were conducted using a dataset comprising the 100 largest available documents from the raw database. As a result, this high-quality content may not accurately reflect the distribution of data in the entire vector database. While we can observe differences between the configurations, it is challenging to determine which combination is the best choice for our use case among the top configurations.

Several experiments will be run:
- add random documents (drawn uniformly) to the base corpus
- add other large documents to the base corpus (using the same extraction procedure, i.e. the top N documents with the largest content)

What to observe:
- how the retrieval metrics evolve as this noise is added.
- 