In [20]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Document

# LLM
from llama_index.llms.databricks import Databricks

# Embeddings
from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

# Retrievers
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
)

# Rerankers
from llama_index.core.indices.query.schema import QueryBundle, QueryType
from llama_index.core.schema import NodeWithScore
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# Evaluator
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.core.evaluation import RetrieverEvaluator

from typing import List
import pandas as pd

import nest_asyncio

nest_asyncio.apply()


In [65]:
from datasets import load_dataset

# Location of the training split, relative to this notebook
TRAIN_PARQUET_PATH = "../../data/parquet/train.parquet"
dataset = pd.read_parquet(TRAIN_PARQUET_PATH)


In [67]:
dataset

Unnamed: 0,question,answer,support
0,Quel rôle la question alimentaire et les polit...,La question alimentaire et les politiques de s...,La question alimentaire et son corollaire les ...
1,Quelle est la répartition des subventions tota...,"Pour la période 2000-2019, les subventions tot...",Les subventions totales (Consommateurs + Produ...
2,Quels sont les principaux objectifs et méthode...,L’objectif est d’éclairer le débat sur les pol...,Si la nécessité d’assurer la sécurité alimenta...
3,Pourquoi les méthodes et outils de la prospect...,Les méthodes et outils de la prospective sont ...,Le recours aux méthodes et outils de la prospe...
4,Quel est le rôle de l’innovation dans le chang...,L’innovation est considérée comme le vecteur p...,"L’innovation au sens large : politique, organi..."
...,...,...,...
167,Quels sont les axes des orientations stratégiq...,Les orientations stratégiques de la politique ...,Orientations stratégiques de la politique agri...
168,Quels sont les axes des orientations stratégiq...,Les orientations stratégiques de la politique ...,Orientations stratégiques de la politique hali...
169,Quels sont les axes des orientations stratégiq...,Les orientations stratégiques du SNI à 2035 in...,Orientations stratégiques du SNI à 2035 présen...
170,Quelle est la dimension transversale proposée ...,La dimension transversale proposée est la mise...,"Enfin, une dimension transversale celle d’une ..."


In [68]:
# Node parser configured with a chunk size of 512
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)

corpus = []
filtered_queries = []
counter = 0
for row_label, record in dataset.iterrows():
    support_text = record["support"]
    # Skip documents whose support text is empty or whitespace-only
    if not support_text.strip():
        continue
    # Keep a document for evaluation only when the parser yields exactly one node for it
    parsed_nodes = node_parser.get_nodes_from_documents([Document(text=support_text)])
    if len(parsed_nodes) == 1:
        corpus.append(support_text)
        filtered_queries.append(record["question"])
        counter += 1
    # Stop once 500 documents have been collected
    if counter == 500:
        break


In [70]:
# Wrap every retained support text in a Document and parse them into nodes
documents = [Document(text=support_text) for support_text in corpus]
nodes = node_parser.get_nodes_from_documents(documents)
# Give each node a predictable id ("corpus_<position>") so the evaluation
# dataset can refer to it
for position in range(len(nodes)):
    nodes[position].id_ = f"corpus_{position}"


In [71]:
# Build the three mappings expected by EmbeddingQAFinetuneDataset:
# query id -> question, corpus id -> text, query id -> list of relevant corpus ids.
# Query i is paired with corpus entry i.
queries_dict = {}
corpus_dict = {}
relevant_docs_dict = {}
for index in range(counter):
    query_id = f"query_{index}"
    corpus_id = f"corpus_{index}"
    queries_dict[query_id] = filtered_queries[index]
    corpus_dict[corpus_id] = corpus[index]
    relevant_docs_dict[query_id] = [corpus_id]

# Assemble the QA dataset used by the retriever evaluations
qa_dataset = EmbeddingQAFinetuneDataset(
    queries=queries_dict,
    corpus=corpus_dict,
    relevant_docs=relevant_docs_dict,
)


In [72]:
qa_dataset

EmbeddingQAFinetuneDataset(queries={'query_0': 'Quel rôle la question alimentaire et les politiques de soutien jouent-elles dans le débat politique et sociétal de l’Algérie indépendante ?', 'query_1': 'Quelle est la répartition des subventions totales entre consommateurs et producteurs pour la période 2000-2019, et quelle part des subventions est allouée aux céréales et au lait ?', 'query_2': 'Quels sont les principaux objectifs et méthodes de l’analyse rétrospective comparative et de l’évaluation des politiques de soutien en matière de sécurité alimentaire en Algérie ?', 'query_3': 'Pourquoi les méthodes et outils de la prospective sont-ils nécessaires pour aborder la problématique de la sécurité alimentaire, et quel est l’horizon de la plupart des travaux en la matière ?', 'query_4': 'Quel est le rôle de l’innovation dans le changement et la résilience des systèmes productifs agricoles et halieutiques, et quel type d’innovation est privilégié pour la sécurité alimentaire durable ?', 

In [82]:
import os 
from dotenv import load_dotenv
load_dotenv()

# Cohere key comes from the environment (None when the variable is not set)
COHERE_API_KEY = os.environ.get('COHERE_API_KEY')

# Embedding models, keyed by display name.
# NOTE(review): the key says "all-MiniLM" but the model loaded is
# BAAI/bge-small-en-v1.5 - confirm which label is intended.
# You can use mean pooling by adding the pooling='mean' parameter.
bge_small_embedding = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
)
EMBEDDINGS = {"all-MiniLM": bge_small_embedding}

# Rerankers, keyed by display name. The string "None" (not the None object)
# is the marker CustomRetriever compares against to skip reranking.
cohere_reranker = CohereRerank(api_key=COHERE_API_KEY, top_n=5)
RERANKERS = {
    "WithoutReranker": "None",
    "cohere-rerank": cohere_reranker,
}


In [81]:
from llama_index.core import Settings

# No LLM is needed for retrieval-only evaluation; the output below shows
# LlamaIndex substitutes a MockLLM when the LLM is explicitly disabled.
Settings.llm = None
# Global embedding model: BAAI/bge-small-en-v1.5 wrapped for LlamaIndex.
# NOTE(review): this creates a second instance of the model already built for
# EMBEDDINGS - presumably one could be reused; confirm.
Settings.embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5"))

LLM is explicitly disabled. Using MockLLM.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [159]:
from llama_index.core import SimpleKeywordTableIndex, VectorStoreIndex

# Build the vector store index from the evaluation nodes (ids "corpus_<n>")
vector_index = VectorStoreIndex(nodes)

In [160]:

# Dense retriever over the vector index, returning the 5 most similar nodes per query
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)

In [161]:
from llama_index.retrievers.bm25 import BM25Retriever

# vector_retriever = index.as_retriever(similarity_top_k=5)

# BM25 retriever built on the vector index's docstore, so it searches the same
# nodes as the dense retriever; also limited to the top 5 results.
bm25_retriever = BM25Retriever.from_defaults(
    docstore=vector_index.docstore, similarity_top_k=5
)

In [169]:
from llama_index.core.retrievers import QueryFusionRetriever

# Hybrid retriever: fuses the dense and BM25 result lists and keeps the top 5.
# NOTE(review): num_queries=1 presumably disables generation of extra query
# variants (only the original query is used) - confirm against the
# QueryFusionRetriever documentation.
hybrid_retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=5,
    num_queries=1, 
    mode='reciprocal_rerank',
    use_async=True,
    verbose=True,
    # query_gen_prompt="...",  # we could override the query generation prompt here
)

In [154]:
class HybridRetriever(BaseRetriever):
    """Custom retriever that combines semantic (vector) search with keyword search.

    Both underlying retrievers are queried and their results are merged
    according to ``mode``:

    * ``"AND"`` - keep only nodes returned by both retrievers.
    * ``"OR"``  - keep nodes returned by either retriever.
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        keyword_retriever: BaseRetriever,
        mode: str = "AND",
    ) -> None:
        """Store the two retrievers and the merge mode.

        Args:
            vector_retriever: retriever used for the semantic search.
            keyword_retriever: retriever used for the keyword search. Annotated
                with the base class because the more specific keyword retriever
                type is not imported in this notebook, and the annotation is
                evaluated when the class is defined.
            mode: ``"AND"`` or ``"OR"``.

        Raises:
            ValueError: if ``mode`` is neither ``"AND"`` nor ``"OR"``.
        """
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")
        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        self._mode = mode
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query.

        Returns the merged nodes in a deterministic order: nodes found by the
        vector retriever first (in its ranking order), followed by nodes found
        only by the keyword retriever.
        """
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)

        vector_ids = {n.node.node_id for n in vector_nodes}
        keyword_ids = {n.node.node_id for n in keyword_nodes}

        # Dicts keep insertion order. When both retrievers return the same node,
        # the keyword copy replaces the vector copy but keeps the vector position.
        combined_dict = {n.node.node_id: n for n in vector_nodes}
        combined_dict.update({n.node.node_id: n for n in keyword_nodes})

        if self._mode == "AND":
            retrieve_ids = vector_ids & keyword_ids
        else:
            retrieve_ids = vector_ids | keyword_ids

        # Iterate the ordered dict rather than the id set: set iteration order is
        # arbitrary, which would make rank-sensitive metrics irreproducible.
        return [
            node for node_id, node in combined_dict.items() if node_id in retrieve_ids
        ]

In [170]:
# Define Retriever
class CustomRetriever(BaseRetriever):
    """Retriever that queries a wrapped retriever and then optionally reranks.

    The reranker can be supplied at construction time. When it is not, the
    notebook-level variable ``reranker`` is read at query time, which is how
    the evaluation cells in this notebook switch rerankers.

    A reranker of ``None`` or the string ``"None"`` means "no reranking": the
    wrapped retriever's results are simply truncated to ``top_n``.
    """

    # Marker meaning "no reranker was passed; fall back to the notebook-level variable".
    _USE_GLOBAL_RERANKER = object()

    def __init__(
        self,
        vector_retriever: BaseRetriever,
        reranker=_USE_GLOBAL_RERANKER,
        top_n: int = 5,
    ) -> None:
        """Init params.

        Args:
            vector_retriever: the retriever whose results are post-processed
                (a plain vector retriever or a fusion retriever).
            reranker: optional reranker exposing ``postprocess_nodes``; ``None``
                or ``"None"`` disables reranking. If omitted, the notebook-level
                ``reranker`` variable is used.
            top_n: number of nodes kept when no reranker is active.
        """
        self._vector_retriever = vector_retriever
        self._reranker = reranker
        self._top_n = top_n
        # Initialize the base retriever so the instance is fully constructed.
        super().__init__()

    def _postprocess(
        self, retrieved_nodes: List[NodeWithScore], query_bundle: QueryBundle
    ) -> List[NodeWithScore]:
        """Rerank ``retrieved_nodes``, or truncate them when reranking is off."""
        active_reranker = self._reranker
        if active_reranker is CustomRetriever._USE_GLOBAL_RERANKER:
            # Raises NameError if the notebook never assigned `reranker`.
            active_reranker = reranker  # noqa: F821 - notebook-level variable

        reranking_disabled = active_reranker is None or (
            isinstance(active_reranker, str) and active_reranker == "None"
        )
        if reranking_disabled:
            return retrieved_nodes[: self._top_n]
        return active_reranker.postprocess_nodes(retrieved_nodes, query_bundle)

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""
        retrieved_nodes = self._vector_retriever.retrieve(query_bundle)
        return self._postprocess(retrieved_nodes, query_bundle)

    async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Asynchronously retrieve nodes given query.

        Awaits the wrapped retriever's async entry point instead of calling its
        blocking ``retrieve`` from inside the running event loop.
        """
        retrieved_nodes = await self._vector_retriever.aretrieve(query_bundle)
        return self._postprocess(retrieved_nodes, query_bundle)

    async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
        """Public async entry point: accept a plain string or a QueryBundle."""
        if isinstance(str_or_query_bundle, str):
            str_or_query_bundle = QueryBundle(str_or_query_bundle)
        return await self._aretrieve(str_or_query_bundle)

# Reranking pipeline on top of the plain vector retriever; the reranker
# comparison cells below evaluate this object, so it must be defined here.
custom_retriever = CustomRetriever(vector_retriever)
# Same reranking pipeline on top of the fused vector + BM25 retriever
hybrid_custom_retriever = CustomRetriever(hybrid_retriever)


In [85]:
# Select the Cohere reranker. CustomRetriever looks up the notebook-level
# `reranker` variable while retrieving, so it must be assigned before evaluating.
reranker = RERANKERS["cohere-rerank"] 
# Score `custom_retriever` with MRR and hit rate over every query in qa_dataset
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=custom_retriever
)
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)



In [86]:
# Switch to the "no reranker" marker (the string "None"); CustomRetriever then
# only truncates the retrieved nodes instead of reranking them.
reranker = RERANKERS["WithoutReranker"]
# Score `custom_retriever` again with the same metrics for comparison
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=custom_retriever
)
eval_results2 = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [164]:
# Baseline: MRR and hit rate of the dense vector retriever alone
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=vector_retriever
)
vector_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [165]:
# Baseline: MRR and hit rate of the BM25 retriever alone
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=bm25_retriever
)
bm25_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [171]:
# MRR and hit rate of the fused vector + BM25 retriever, without reranking
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=hybrid_retriever
)
hybrid_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [172]:
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=hybrid_custom_retriever
)
hybrid_custom_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [117]:
import pandas as pd

def display_results(*eval_sets):
    """Summarize retrieval evaluation runs as one table row per run.

    Args:
        *eval_sets: Any number of ``(name, eval_results)`` tuples passed as
            separate positional arguments, e.g. ``("name1", eval_results1)``.
            ``name`` labels the row; ``eval_results`` is an iterable of objects
            exposing a ``metric_vals_dict`` mapping with ``"hit_rate"`` and
            ``"mrr"`` entries.

    Returns:
        A pandas DataFrame with the columns ``Search``, ``Hit Rate`` and ``MRR``
        holding the mean hit rate and mean MRR of each run. A run without any
        results gets NaN for both metrics; calling the function without
        arguments returns an empty frame that still has the three columns.
    """
    metric_columns = ["hit_rate", "mrr"]
    summary_columns = ["Search", "Hit Rate", "MRR"]

    def calculate_metrics(name, eval_results):
        # Fixing the columns up front means an empty run (or a missing metric)
        # produces NaN instead of a KeyError on column lookup.
        per_query = pd.DataFrame(
            [eval_result.metric_vals_dict for eval_result in eval_results],
            columns=metric_columns,
            dtype=float,
        )
        return {
            "Search": name,
            "Hit Rate": per_query["hit_rate"].mean(),
            "MRR": per_query["mrr"].mean(),
        }

    # One summary row per evaluation set, in the order given
    metric_data = [
        calculate_metrics(name, eval_results) for name, eval_results in eval_sets
    ]
    return pd.DataFrame(metric_data, columns=summary_columns)



In [114]:
display_results(("Without Reranker", eval_results2), ("Cohere Reranker", eval_results))

Unnamed: 0,Reranker,Hit Rate,MRR
0,Without Reranker,0.855422,0.758936
1,Cohere Reranker,0.855422,0.838353


In [173]:
display_results(("BM25", bm25_results), ('Vector Search', vector_results), ('Hybrid Search', hybrid_results), ('Hybrid Search Cohere', hybrid_custom_results))


Unnamed: 0,Search,Hit Rate,MRR
0,BM25,0.674699,0.575703
1,Vector Search,0.855422,0.758936
2,Hybrid Search,0.891566,0.785743
3,Hybrid Search Cohere,0.891566,0.879518


In [91]:
import pandas as pd

# Scores recorded by hand from two separate runs of this notebook, one per
# embedding model (vector search, no reranker)
model_names = ["bge-small-en-1.5", "all-MiniLM-L6-v2"]
hit_rates = [0.855422, 0.771084]
mrr_scores = [0.758936, 0.640462]

data = {"model": model_names, "hit rate": hit_rates, "MRR": mrr_scores}

# Side-by-side comparison table
df = pd.DataFrame(data)
df

Unnamed: 0,model,hit rate,MRR
0,bge-small-en-1.5,0.855422,0.758936
1,all-MiniLM-L6-v2,0.771084,0.640462
