# Information Retrieval Evaluation Pipeline

This notebook implements a comprehensive evaluation pipeline for information retrieval systems using FAISS indices and BGE-M3 embeddings. The pipeline includes:
- Retrieval evaluation with multiple metrics
- Visualization of results
- Detailed performance analysis by categories
- Report generation

First, let's set up our environment and install required dependencies.

## Add Reranking Support

Start by installing the packages used for reranking and hybrid retrieval (cross-encoder reranker, BM25, and tokenization).

In [1]:

!pip install torch transformers sentence-transformers rank_bm25 nltk

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [2]:

!pip install faiss-cpu FlagEmbedding tqdm numpy matplotlib seaborn


from google.colab import drive
drive.mount('/content/drive')

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting FlagEmbedding
  Downloading FlagEmbedding-1.3.5.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ir-datasets (from FlagEmbedding)
  Downloading ir_datasets-0.5.11-py3-none-any.whl.metadata (12 kB)
Collecting inscriptis>=2.2.0 (from ir-datasets->FlagEmbedding)
  Downloading inscriptis-2.6.0-py3-none-any.whl.metadata (25 kB)
Collecting trec-car-tools>=2.5.4 (from ir-datasets->FlagEmbedding)
  Downloading trec_car_tools-2.6-py3-none-any.whl.metadata (640 bytes)
Collecting lz4>=3.1.10 (from ir-datasets->FlagEmbedding)
  Downloading lz4-4.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting warc3-wet>=0.2.3 (from ir-datasets->FlagEmbedding)
  Downloading warc

## Import Required Libraries

Let's import all the necessary Python libraries and configure the environment.

In [3]:

import multiprocessing
# Force the "spawn" start method for any worker processes created later;
# force=True replaces a start method that may already have been selected.
# NOTE(review): presumably needed to avoid fork-related problems with the
# model / FAISS libraries -- confirm.
multiprocessing.set_start_method("spawn", force=True)
import faiss
# Restrict FAISS to a single OpenMP thread.
faiss.omp_set_num_threads(1)


import json
import numpy as np
from typing import List, Dict, Tuple, Any
from dataclasses import dataclass
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

## Define Data Classes and Helper Functions

First, let's define our evaluation metrics dataclass and implement the core evaluation class.

In [4]:
@dataclass
class EvaluationMetrics:
    """Aggregate retrieval metrics for one evaluation run.

    The ``*_at_k`` fields are dictionaries keyed by the rank cutoff ``k``;
    ``mrr`` and ``map_score`` are single values for the whole run.
    """
    recall_at_k: Dict[int, float]      # cutoff k -> mean Recall@k
    precision_at_k: Dict[int, float]   # cutoff k -> mean Precision@k
    mrr: float                         # mean reciprocal rank
    map_score: float                   # mean average precision
    ndcg_at_k: Dict[int, float]        # cutoff k -> mean NDCG@k
    hit_rate_at_k: Dict[int, float]    # cutoff k -> mean Hit Rate@k

class RetrievalEvaluator:
    """Evaluate a FAISS-backed retriever against a QA dataset.

    The constructor loads the FAISS index, the QA pairs and the document
    texts, and (optionally) builds a BM25 index for hybrid retrieval.
    Retrieval, metric and plotting methods are attached to this class by
    later notebook cells.
    """

    def __init__(self,
                 faiss_index_path: str,
                 qa_dataset_path: str,
                 documents_path: str,
                 embeddings_model=None,
                 reranker_model=None,
                 k_values: List[int] = [1, 3, 5, 10, 20],
                 metric_type: str = 'ip',
                 rerank_cutoff: int = 100,
                 use_hybrid: bool = False,
                 hybrid_weights: Tuple[float, float] = (0.7, 0.3)):
        """Load the index, QA pairs and documents.

        Args:
            faiss_index_path: Path of the FAISS index file to read.
            qa_dataset_path: JSON file with ``qa_pairs`` and ``metadata`` keys.
            documents_path: JSON file holding the nested article structure.
            embeddings_model: Model used to embed queries (may be ``None``).
            reranker_model: Optional cross-encoder used for reranking.
            k_values: Rank cutoffs at which the ``@k`` metrics are computed.
            metric_type: ``'ip'`` or ``'l2'`` (case-insensitive).
            rerank_cutoff: Number of candidates fetched before reranking.
            use_hybrid: Build a BM25 index and enable hybrid retrieval.
            hybrid_weights: ``(dense_weight, sparse_weight)`` for hybrid scores.

        Raises:
            ValueError: If ``metric_type`` is invalid, or no documents (or no
                BM25 corpus) could be extracted from ``documents_path``.
        """
        # Validate the cheap argument before reading anything from disk.
        self.metric_type = metric_type.lower()
        if self.metric_type not in ['ip', 'l2']:
            raise ValueError("metric_type must be either 'ip' or 'l2'")

        # Copy the list: the default argument is a single shared object, so
        # storing it directly would let one instance's edits leak into others.
        self.k_values = list(k_values)
        self.index = faiss.read_index(faiss_index_path)
        self.embeddings_model = embeddings_model
        self.reranker_model = reranker_model
        self.rerank_cutoff = rerank_cutoff
        self.use_hybrid = use_hybrid
        self.dense_weight, self.sparse_weight = hybrid_weights

        print("Loading QA dataset...")
        with open(qa_dataset_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.qa_pairs = data.get('qa_pairs', [])
        self.metadata = data.get('metadata', {})

        print(f"Loaded {len(self.qa_pairs)} QA pairs")

        print(f"Loading documents from {documents_path}...")
        with open(documents_path, 'r', encoding='utf-8') as f:
            docs_data = json.load(f)

        self.doc_texts = {}
        # Number of documents stored so far; doubles as the fallback id for
        # articles that carry no 'id' field.
        stored_count = 0
        for article in self._iter_articles(docs_data):
            article_id = article.get('id', stored_count)
            article_text = article.get('Article_Text', '')
            article_title = article.get('Article_Title', '')

            full_text = f"{article_title}\n{article_text}".strip()
            if full_text:
                self.doc_texts[article_id] = full_text
                stored_count += 1

        if not self.doc_texts:
            raise ValueError(f"No documents found in {documents_path}! Could not extract articles from nested structure.")

        print(f"Loaded {len(self.doc_texts)} documents from nested structure")

        if self.use_hybrid:
            print("Initializing BM25 for hybrid retrieval...")
            from rank_bm25 import BM25Okapi
            from nltk.tokenize import word_tokenize
            import nltk
            nltk.download('punkt', quiet=True)
            # The notebook also downloads 'punkt_tab' before building the
            # hybrid evaluator; fetch it here so the class is self-sufficient.
            nltk.download('punkt_tab', quiet=True)

            # BM25 scores are positional: position i corresponds to doc_ids[i].
            self.doc_ids = sorted(self.doc_texts.keys())
            self.corpus = [self.doc_texts[doc_id] for doc_id in self.doc_ids]

            print(f"Tokenizing {len(self.corpus)} documents for BM25...")
            self.tokenized_corpus = [word_tokenize(doc.lower()) for doc in self.corpus]

            if not self.tokenized_corpus:
                raise ValueError("No documents available for BM25 indexing!")

            self.bm25 = BM25Okapi(self.tokenized_corpus)
            print("BM25 initialization complete!")

        print("Evaluator initialized successfully!")

    @staticmethod
    def _iter_articles(docs_data):
        """Yield each article dict found under category -> subcategory -> system -> 'parts' -> part -> list."""
        if not isinstance(docs_data, dict):
            return
        for subcats in docs_data.values():
            if not isinstance(subcats, dict):
                continue
            for systems in subcats.values():
                if not isinstance(systems, dict):
                    continue
                for system_data in systems.values():
                    if not isinstance(system_data, dict):
                        continue
                    parts = system_data.get('parts', {})
                    if not isinstance(parts, dict):
                        continue
                    for articles in parts.values():
                        if not isinstance(articles, list):
                            continue
                        for article in articles:
                            if isinstance(article, dict):
                                yield article

In [5]:
def embed_query(self, query: str) -> np.ndarray:
    """Embed a single query with the configured embeddings model.

    The model's ``encode`` result is expected to expose its dense vectors
    under the ``"dense_vecs"`` key.

    Args:
        query: Query text.

    Returns:
        A float32 array with one row for the query. For the ``'ip'`` metric
        the row is L2-normalised.

    Raises:
        ValueError: If no embeddings model was provided.
    """
    if self.embeddings_model is None:
        raise ValueError("Embeddings model not provided")

    encoded = self.embeddings_model.encode([query])
    embedding = np.array(encoded["dense_vecs"], dtype='float32')

    # Always hand back a 2-D (rows x dim) array.
    if len(embedding.shape) == 1:
        embedding = embedding.reshape(1, -1)

    if self.metric_type == 'ip':
        norms = np.linalg.norm(embedding, axis=1, keepdims=True)
        # A zero vector cannot be normalised; dividing by its norm would fill
        # the row with NaNs, so leave such rows as zeros instead.
        norms[norms == 0] = 1.0
        embedding = embedding / norms

    return embedding

def min_max_normalize(self, scores: np.ndarray) -> np.ndarray:
    """Rescale ``scores`` linearly so the minimum maps to 0 and the maximum to 1.

    An empty input is returned unchanged; when every score is identical an
    array of ones with the same shape and dtype is returned.
    """
    if len(scores) == 0:
        return scores

    lowest, highest = np.min(scores), np.max(scores)
    if highest == lowest:
        # No spread to rescale.
        return np.ones_like(scores)

    value_range = highest - lowest
    return (scores - lowest) / value_range

def hybrid_search(self, query: str, k: int = 10) -> Tuple[List[int], List[float]]:
    """Score the dense top-k candidates with a weighted dense + BM25 mix.

    The FAISS index supplies the candidate set; each candidate's min-max
    normalised dense score is combined with its min-max normalised BM25
    score using ``dense_weight`` and ``sparse_weight``.

    Args:
        query: Query text.
        k: Number of candidates to fetch and the maximum number returned.

    Returns:
        ``(doc_ids, scores)`` sorted by descending combined score, with the
        scores min-max normalised. Both lists are empty when the index
        returns no valid hit.
    """
    from nltk.tokenize import word_tokenize

    query_embedding = self.embed_query(query)
    if len(query_embedding.shape) == 1:
        query_embedding = query_embedding.reshape(1, -1)

    dense_distances, dense_indices = self.index.search(query_embedding, k)
    dense_indices = np.asarray(dense_indices[0])
    dense_scores = np.asarray(dense_distances[0])

    # FAISS pads missing results with id -1 and a sentinel distance; drop
    # them so they neither skew normalisation nor alias a real document.
    valid = dense_indices >= 0
    dense_indices = dense_indices[valid].tolist()
    dense_scores = dense_scores[valid]
    if not dense_indices:
        return [], []

    tokenized_query = word_tokenize(query.lower())
    sparse_scores = self.bm25.get_scores(tokenized_query)

    # Turn L2 distances into similarities (larger is better).
    if self.metric_type == 'l2':
        dense_scores = 1.0 / (1.0 + dense_scores)

    dense_scores = self.min_max_normalize(dense_scores)
    sparse_scores = self.min_max_normalize(sparse_scores)

    # BM25 scores are positional over self.doc_ids (the sorted document ids),
    # so a document id must be translated to its corpus position before
    # indexing. The rest of the class treats FAISS ids as document ids.
    bm25_position = getattr(self, '_bm25_position', None)
    if bm25_position is None:
        bm25_position = {doc_id: pos for pos, doc_id in enumerate(self.doc_ids)}
        self._bm25_position = bm25_position

    final_ids = []
    final_scores = []
    seen_ids = set()
    for doc_id, dense_score in zip(dense_indices, dense_scores):
        if doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)
        position = bm25_position.get(doc_id)
        # Candidates without a BM25 entry contribute no sparse evidence.
        sparse_score = sparse_scores[position] if position is not None else 0.0
        final_ids.append(doc_id)
        final_scores.append(self.dense_weight * dense_score +
                            self.sparse_weight * sparse_score)

    final_scores = self.min_max_normalize(np.array(final_scores))

    sorted_pairs = sorted(zip(final_ids, final_scores),
                          key=lambda pair: pair[1], reverse=True)
    final_ids, final_scores = zip(*sorted_pairs)

    return list(final_ids)[:k], list(final_scores)[:k]

def rerank_results(self, query: str, doc_ids: List[int], scores: List[float]) -> Tuple[List[int], List[float]]:
    """Reorder candidate documents by cross-encoder relevance.

    When no reranker is configured, or there are no candidates, the inputs
    are returned untouched. Otherwise the incoming ``scores`` are discarded
    and replaced by the reranker's min-max normalised scores, and both lists
    come back sorted from most to least relevant.
    """
    if self.reranker_model is None or not doc_ids:
        return doc_ids, scores

    # Accept a nested (batched) id structure by flattening one level.
    if isinstance(doc_ids[0], (list, np.ndarray)):
        flattened = []
        for group in doc_ids:
            flattened.extend(group)
        doc_ids = flattened

    # One [query, document text] pair per candidate; unknown ids get "".
    query_doc_pairs = [[query, self.doc_texts.get(doc_id, "")] for doc_id in doc_ids]

    normalized = self.min_max_normalize(self.reranker_model.predict(query_doc_pairs))

    ordered = sorted(zip(doc_ids, normalized), key=lambda pair: pair[1], reverse=True)
    reranked_ids, reranked_scores = zip(*ordered)

    return list(reranked_ids), list(reranked_scores)

def retrieve(self, query: str, k: int = 10) -> Tuple[np.ndarray, np.ndarray]:
    """Retrieve (and optionally rerank) the top-k documents for a query.

    Args:
        query: Query text.
        k: Maximum number of results to return.

    Returns:
        ``(scores, ids)`` -- note the order. Scores are min-max normalised
        similarities (larger is better), not raw FAISS distances. Fewer than
        ``k`` results are returned when the index yields fewer valid hits.
    """
    # Over-fetch when a reranker is configured so it has candidates to promote.
    initial_k = max(k, self.rerank_cutoff) if self.reranker_model else k

    if self.use_hybrid:
        retrieved_ids, scores = self.hybrid_search(query, k=initial_k)
        scores = np.array(scores)
        indices = np.array(retrieved_ids)
        # Drop FAISS padding ids (-1) if any slipped through.
        valid = indices >= 0
        scores, indices = scores[valid], indices[valid]
    else:
        query_embedding = self.embed_query(query)
        if len(query_embedding.shape) == 1:
            query_embedding = query_embedding.reshape(1, -1)

        distances, indices = self.index.search(query_embedding, initial_k)
        distances = np.asarray(distances)
        indices = np.asarray(indices)

        # A single query was searched; take its row.
        if indices.ndim > 1:
            indices = indices[0]
        if distances.ndim > 1:
            distances = distances[0]

        # FAISS pads missing results with id -1 and a sentinel distance.
        # Remove them BEFORE normalising, otherwise the sentinel becomes the
        # minimum and squashes every real score towards 1.
        valid = indices >= 0
        indices, distances = indices[valid], distances[valid]

        if self.metric_type == 'l2':
            scores = 1.0 / (1.0 + distances)
        else:
            scores = (distances + 1) / 2
        scores = self.min_max_normalize(scores)

    if self.reranker_model:
        reranked_ids, reranked_scores = self.rerank_results(
            query, indices.tolist(), scores.tolist())

        reranked_ids = np.array(reranked_ids[:k])
        reranked_scores = np.array(reranked_scores[:k])

        return reranked_scores, reranked_ids

    # Truncate both arrays so they always stay aligned.
    return scores[:k], indices[:k]

# Attach the retrieval functions defined in this cell as methods of
# RetrievalEvaluator, so they can be called as evaluator.<name>(...).
RetrievalEvaluator.embed_query = embed_query
RetrievalEvaluator.min_max_normalize = min_max_normalize
RetrievalEvaluator.hybrid_search = hybrid_search
RetrievalEvaluator.rerank_results = rerank_results
RetrievalEvaluator.retrieve = retrieve

In [6]:

def calculate_recall_at_k(self, retrieved_ids: List[int], relevant_ids: List[int], k: int) -> float:
    """Return the fraction of distinct relevant documents found in the top ``k``.

    Yields 0.0 when there are no relevant documents.
    """
    if not relevant_ids:
        return 0.0

    relevant = set(relevant_ids)
    top_k = set(retrieved_ids[:k])
    found = sum(1 for doc_id in relevant if doc_id in top_k)
    return found / len(relevant)

def calculate_precision_at_k(self, retrieved_ids: List[int], relevant_ids: List[int], k: int) -> float:
    """Return the number of distinct relevant documents in the top ``k``, divided by ``k``.

    The denominator is always ``k`` (even if fewer documents were retrieved);
    ``k == 0`` yields 0.0.
    """
    if k == 0:
        return 0.0

    relevant = set(relevant_ids)
    top_k = set(retrieved_ids[:k])
    return len(top_k & relevant) / k

def calculate_reciprocal_rank(self, retrieved_ids: List[int], relevant_ids: List[int]) -> float:
    """Return 1/rank of the first relevant retrieved document (ranks start at 1), or 0.0 if none."""
    relevant = set(relevant_ids)
    first_hit_rank = next(
        (position + 1 for position, doc_id in enumerate(retrieved_ids) if doc_id in relevant),
        None,
    )
    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank

def calculate_average_precision(self, retrieved_ids: List[int], relevant_ids: List[int]) -> float:
    """Return the average precision of a ranked list.

    Precision is accumulated at every rank holding a relevant document and
    the total is divided by the number of distinct relevant documents.
    Yields 0.0 when there are no relevant documents.
    """
    if not relevant_ids:
        return 0.0

    relevant = set(relevant_ids)
    if not relevant:
        return 0.0

    found_so_far = 0
    accumulated = 0.0
    for position, doc_id in enumerate(retrieved_ids):
        if doc_id not in relevant:
            continue
        found_so_far += 1
        accumulated += found_so_far / (position + 1)

    return accumulated / len(relevant)

def calculate_ndcg_at_k(self, retrieved_ids: List[int], relevant_ids: List[int], k: int) -> float:
    """Calculate NDCG@k with binary relevance.

    Args:
        retrieved_ids: Ranked list of retrieved document ids.
        relevant_ids: Ids of the relevant documents (duplicates are ignored).
        k: Rank cutoff.

    Returns:
        DCG of the top ``k`` results divided by the ideal DCG, or 0.0 when
        the ideal DCG is zero (no relevant documents or ``k == 0``).
    """
    def dcg(relevances, k):
        relevances = relevances[:k]
        return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevances))

    # Deduplicate the relevant ids: the ideal ranking can contain each
    # relevant document only once, which matches how recall counts them, and
    # set membership avoids a linear scan per retrieved document.
    relevant_set = set(relevant_ids)

    retrieved_k = retrieved_ids[:k]
    relevances = [1 if doc_id in relevant_set else 0 for doc_id in retrieved_k]
    dcg_score = dcg(relevances, k)
    ideal_relevances = [1] * min(len(relevant_set), k)
    idcg_score = dcg(ideal_relevances, k)
    return dcg_score / idcg_score if idcg_score > 0 else 0.0

def calculate_hit_rate_at_k(self, retrieved_ids: List[int], relevant_ids: List[int], k: int) -> float:
    """Return 1.0 if any of the top ``k`` retrieved documents is relevant, otherwise 0.0."""
    relevant = set(relevant_ids)
    top_k = set(retrieved_ids[:k])
    if top_k.isdisjoint(relevant):
        return 0.0
    return 1.0


# Attach the metric functions defined in this cell as methods of
# RetrievalEvaluator.
RetrievalEvaluator.calculate_recall_at_k = calculate_recall_at_k
RetrievalEvaluator.calculate_precision_at_k = calculate_precision_at_k
RetrievalEvaluator.calculate_reciprocal_rank = calculate_reciprocal_rank
RetrievalEvaluator.calculate_average_precision = calculate_average_precision
RetrievalEvaluator.calculate_ndcg_at_k = calculate_ndcg_at_k
RetrievalEvaluator.calculate_hit_rate_at_k = calculate_hit_rate_at_k

In [7]:

def evaluate_single_query(self, qa_pair: Dict[str, Any], max_k: int = 20) -> Dict[str, Any]:
    """Retrieve documents for one QA pair and compute its metrics.

    Args:
        qa_pair: Dict with at least ``'id'``, ``'question'`` and
            ``'references_ids'`` keys.
        max_k: Retrieval depth. ``@k`` metrics are only computed for the
            cutoffs in ``self.k_values`` that do not exceed it.

    Returns:
        Dict with the query, relevant and retrieved ids, the retrieval
        scores and a ``'metrics'`` dict (``recall@k``, ``precision@k``,
        ``ndcg@k``, ``hit_rate@k``, ``reciprocal_rank``,
        ``average_precision``).
    """
    question = qa_pair['question']
    relevant_ids = qa_pair['references_ids']

    # retrieve() returns (scores, ids). The scores are already normalised
    # similarities for both metric types, so they must not be pushed through
    # 1 / (1 + x) again: that would invert their ordering for 'l2'.
    retrieved_scores, retrieved_indices = self.retrieve(question, k=max_k)

    retrieved_ids = retrieved_indices.tolist()
    score_values = retrieved_scores.tolist()

    results = {
        'qa_id': qa_pair['id'],
        'question': question,
        'relevant_ids': relevant_ids,
        'retrieved_ids': retrieved_ids,
        # Key kept for backward compatibility with existing result files;
        # it holds the same values retrieve() returned, as before.
        'distances': list(score_values),
        'scores': score_values,
        'metrics': {}
    }

    for k in self.k_values:
        if k <= max_k:
            results['metrics'][f'recall@{k}'] = self.calculate_recall_at_k(
                retrieved_ids, relevant_ids, k)
            results['metrics'][f'precision@{k}'] = self.calculate_precision_at_k(
                retrieved_ids, relevant_ids, k)
            results['metrics'][f'ndcg@{k}'] = self.calculate_ndcg_at_k(
                retrieved_ids, relevant_ids, k)
            results['metrics'][f'hit_rate@{k}'] = self.calculate_hit_rate_at_k(
                retrieved_ids, relevant_ids, k)

    results['metrics']['reciprocal_rank'] = self.calculate_reciprocal_rank(
        retrieved_ids, relevant_ids)
    results['metrics']['average_precision'] = self.calculate_average_precision(
        retrieved_ids, relevant_ids)

    return results

# Attach evaluate_single_query as a method of RetrievalEvaluator.
RetrievalEvaluator.evaluate_single_query = evaluate_single_query

In [8]:

def evaluate_all(self, sample_size: int = None, sample_indices_path: str = None) -> Tuple[EvaluationMetrics, List[Dict]]:
    """Evaluate every query (or a sample of them) and average the metrics.

    Args:
        sample_size: If given and smaller than the dataset, evaluate only
            that many QA pairs.
        sample_indices_path: Optional JSON file of sample indices. When
            sampling, indices are loaded from it if it exists; otherwise a
            random sample is drawn and saved there for reuse.

    Returns:
        ``(metrics, detailed_results)``: the averaged metrics and the
        per-query result dicts from ``evaluate_single_query``.
    """
    qa_pairs = self.qa_pairs
    if sample_size and sample_size < len(qa_pairs):
        import random

        if sample_indices_path and os.path.exists(sample_indices_path):
            with open(sample_indices_path, 'r') as f:
                sample_indices = json.load(f)
            qa_pairs = [self.qa_pairs[i] for i in sample_indices]
            print(f"Loaded {len(qa_pairs)} samples from {sample_indices_path}")
        else:
            sample_indices = random.sample(range(len(self.qa_pairs)), sample_size)
            qa_pairs = [self.qa_pairs[i] for i in sample_indices]
            print(f"Generated {sample_size} random samples")

            if sample_indices_path:
                with open(sample_indices_path, 'w') as f:
                    json.dump(sample_indices, f)
                print(f"Saved sample indices to {sample_indices_path}")

    # evaluate_single_query only fills in '@k' metrics for k <= max_k, so the
    # retrieval depth must cover the largest requested cutoff; otherwise the
    # lookups below raise KeyError. 20 is evaluate_single_query's default
    # depth and is kept as the floor so results for the usual cutoffs are
    # unchanged.
    max_k = max(20, max(self.k_values, default=0))

    detailed_results = []
    recall_scores = {k: [] for k in self.k_values}
    precision_scores = {k: [] for k in self.k_values}
    ndcg_scores = {k: [] for k in self.k_values}
    hit_rate_scores = {k: [] for k in self.k_values}
    rr_scores = []
    ap_scores = []

    print(f"Evaluating {len(qa_pairs)} queries...")
    for qa_pair in tqdm(qa_pairs):
        result = self.evaluate_single_query(qa_pair, max_k=max_k)
        detailed_results.append(result)

        query_metrics = result['metrics']
        for k in self.k_values:
            recall_scores[k].append(query_metrics[f'recall@{k}'])
            precision_scores[k].append(query_metrics[f'precision@{k}'])
            ndcg_scores[k].append(query_metrics[f'ndcg@{k}'])
            hit_rate_scores[k].append(query_metrics[f'hit_rate@{k}'])

        rr_scores.append(query_metrics['reciprocal_rank'])
        ap_scores.append(query_metrics['average_precision'])

    metrics = EvaluationMetrics(
        recall_at_k={k: np.mean(recall_scores[k]) for k in self.k_values},
        precision_at_k={k: np.mean(precision_scores[k]) for k in self.k_values},
        mrr=np.mean(rr_scores),
        map_score=np.mean(ap_scores),
        ndcg_at_k={k: np.mean(ndcg_scores[k]) for k in self.k_values},
        hit_rate_at_k={k: np.mean(hit_rate_scores[k]) for k in self.k_values}
    )

    return metrics, detailed_results

# Attach evaluate_all as a method of RetrievalEvaluator.
RetrievalEvaluator.evaluate_all = evaluate_all

## Visualization Functions

Let's implement functions to visualize our evaluation results.

In [9]:
def plot_metrics(self, metrics: EvaluationMetrics, save_path: str = None):
    """Draw a 2x2 grid of Recall@k, Precision@k, NDCG@k and Hit Rate@k curves.

    MRR and MAP are shown in a summary box and every point is annotated
    with its value.

    Args:
        metrics: Aggregated metrics to plot.
        save_path: If given, the figure is written there (300 dpi) and
            closed; otherwise it is shown.
    """
    line_width = 3
    marker_size = 12
    font_size = 14
    title_size = 18
    k_vals = sorted(metrics.recall_at_k.keys())

    # One entry per panel:
    # (grid position, legend label, axis/title text, per-k values, marker, colour, legend location)
    panels = [
        ((0, 0), 'Recall', 'Recall@k', metrics.recall_at_k, 'o', '#2E86C1', 'lower right'),
        ((0, 1), 'Precision', 'Precision@k', metrics.precision_at_k, 's', '#E67E22', 'upper right'),
        ((1, 0), 'NDCG', 'NDCG@k', metrics.ndcg_at_k, '^', '#27AE60', 'lower right'),
        ((1, 1), 'Hit Rate', 'Hit Rate@k', metrics.hit_rate_at_k, 'd', '#C0392B', 'lower right'),
    ]

    # rc_context restores the global rcParams on exit, so the styling below
    # does not leak into plots made elsewhere in the notebook.
    with plt.rc_context():
        plt.style.use('default')
        plt.rcParams.update({
            'figure.facecolor': 'white',
            'axes.facecolor': '#f0f0f0',
            'axes.grid': True,
            'grid.alpha': 0.3,
            'grid.color': '#cccccc',
            'axes.spines.top': False,
            'axes.spines.right': False,
        })

        fig, axes = plt.subplots(2, 2, figsize=(20, 16))
        fig.suptitle('Retrieval Model Evaluation Metrics', fontsize=20, y=1.02, fontweight='bold')
        fig.patch.set_facecolor('white')
        for ax in axes.flat:
            ax.set_facecolor('#f8f9fa')

        plotted = []
        for position, label, title, per_k, marker, color, legend_loc in panels:
            ax = axes[position]
            values = [per_k[k] for k in k_vals]
            ax.plot(k_vals, values, marker=marker, linewidth=line_width,
                    markersize=marker_size, color=color, label=label)
            ax.set_xlabel('k', fontsize=font_size, fontweight='bold')
            ax.set_ylabel(title, fontsize=font_size, fontweight='bold')
            ax.set_title(title, fontsize=title_size, pad=20, fontweight='bold')
            # Zoom onto the data with a 0.1 margin, clamped to [0, 1].
            ax.set_ylim(max(0, min(values) - 0.1), min(1.0, max(values) + 0.1))
            ax.legend(fontsize=font_size, loc=legend_loc)
            plotted.append((ax, values))

        fig.tight_layout()

        summary = f'Summary Metrics:\nMRR: {metrics.mrr:.3f}\nMAP: {metrics.map_score:.3f}'
        fig.text(0.92, 0.5, summary, fontsize=14,
                 bbox=dict(facecolor='white', alpha=0.8, edgecolor='gray'))

        # Label every point with its value (added after the layout pass, as before).
        for ax, values in plotted:
            for x, y in zip(k_vals, values):
                ax.annotate(f'{y:.3f}', (x, y), textcoords="offset points",
                            xytext=(0, 10), ha='center', fontsize=10)

        if save_path:
            # Operate on this figure explicitly rather than on pyplot's
            # implicit "current figure".
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
            plt.close(fig)
        else:
            plt.show()

# Attach plot_metrics as a method of RetrievalEvaluator.
RetrievalEvaluator.plot_metrics = plot_metrics

## Run Evaluation Pipeline

Now let's run the evaluation pipeline. First, we need to:
1. Initialize the BGE-M3 embedding model and the cross-encoder reranker
2. Set up paths for the FAISS index, the QA dataset, and the documents file
3. Create the evaluator instances (dense-only and hybrid)
4. Run the evaluation
5. Generate visualizations and reports

In [10]:
# --- Models: dense retriever and cross-encoder reranker ---
from FlagEmbedding import BGEM3FlagModel
retriever_model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

from sentence_transformers import CrossEncoder
reranker_model = CrossEncoder('BAAI/bge-reranker-large', max_length=512)

# --- Run configuration; the suffixes are used in output file names ---
metric_type = 'ip'
metric_suffix = 'V2' if metric_type == 'l2' else ''
rerank_suffix = '_reranked'

# --- Input locations on the mounted Drive ---
index_path = '/content/drive/MyDrive/m3_legal_faiss.index'
qa_dataset_path = '/content/drive/MyDrive/law_qa_dataset_validated.json'
documents_path = '/content/drive/MyDrive/saudi_laws_scraped.json'

print(f"Using {metric_type.upper()} metric with index: {index_path}")

# Settings shared by both evaluators.
shared_evaluator_kwargs = dict(
    faiss_index_path=index_path,
    qa_dataset_path=qa_dataset_path,
    documents_path=documents_path,
    embeddings_model=retriever_model,
    reranker_model=reranker_model,
    metric_type=metric_type,
    rerank_cutoff=100,
)

# Dense-only retrieval + reranking.
evaluator_regular = RetrievalEvaluator(
    k_values=[1, 3, 5, 10, 20],
    use_hybrid=False,
    **shared_evaluator_kwargs
)

# Tokenizer data downloaded before the hybrid evaluator tokenizes the corpus.
import nltk
nltk.download('punkt_tab', quiet=True)

# Dense + BM25 hybrid retrieval + reranking.
evaluator_hybrid = RetrievalEvaluator(
    k_values=[1, 3, 5, 10, 20],
    use_hybrid=True,
    hybrid_weights=(0.7, 0.3),
    **shared_evaluator_kwargs
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

onnx/model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

onnx/model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

onnx/tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Using IP metric with index: /content/drive/MyDrive/m3_legal_faiss.index
Loading QA dataset...
Loaded 4156 QA pairs
Loading documents from /content/drive/MyDrive/saudi_laws_scraped.json...
Loaded 16371 documents from nested structure
Evaluator initialized successfully!
Loading QA dataset...
Loaded 4156 QA pairs
Loading documents from /content/drive/MyDrive/saudi_laws_scraped.json...
Loaded 16371 documents from nested structure
Initializing BM25 for hybrid retrieval...
Tokenizing 16371 documents for BM25...
BM25 initialization complete!
Evaluator initialized successfully!


In [11]:

# Quick look at the QA dataset layout: top-level keys, the keys of the first
# QA pair and, if present, the shape of its 'references' entry.
print("Examining QA dataset structure...")
with open(qa_dataset_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

print("\nDataset keys:", list(data.keys()))
# Guard against a missing or empty 'qa_pairs' list before indexing into it.
if data.get('qa_pairs'):
    sample_qa = data['qa_pairs'][0]
    print("\nSample QA pair keys:", list(sample_qa.keys()))
    if 'references' in sample_qa:
        references = sample_qa['references']
        print("Sample reference structure:", type(references))
        if isinstance(references, list):
            # Only show keys when there is a first element and it is a dict.
            if references and isinstance(references[0], dict):
                print("Sample reference keys:", list(references[0].keys()))
        elif isinstance(references, dict):
            if references:
                print("Sample reference (dict) first item:", next(iter(references.items())))

Examining QA dataset structure...

Dataset keys: ['metadata', 'qa_pairs']

Sample QA pair keys: ['id', 'law_name', 'phase', 'category', 'question', 'answer', 'references_ids', 'type', 'selected_articles']


In [None]:

def _build_report(metrics, evaluator, approach, extra_metadata=None):
    """Assemble the JSON-serialisable summary report for one evaluator."""
    metadata = {'approach': approach}
    if extra_metadata:
        metadata.update(extra_metadata)
    metadata['reranking'] = {
        'model': 'BAAI/bge-reranker-large',
        'cutoff': evaluator.rerank_cutoff
    }
    return {
        'overall_metrics': {
            'recall_at_k': metrics.recall_at_k,
            'precision_at_k': metrics.precision_at_k,
            'ndcg_at_k': metrics.ndcg_at_k,
            'hit_rate_at_k': metrics.hit_rate_at_k,
            'mrr': metrics.mrr,
            'map': metrics.map_score
        },
        'metadata': metadata
    }


def _dump_json(payload, path):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def _relative_change(baseline, candidate):
    """Return the percentage change from baseline, or None when the baseline is 0."""
    if baseline == 0:
        return None
    return ((candidate - baseline) / baseline) * 100


def _print_comparison_row(label, regular, hybrid):
    """Print one table row using the same column widths as the header."""
    change = _relative_change(regular, hybrid)
    change_text = f"{change:+.1f}%" if change is not None else "n/a"
    print(f"{label:<15} {regular:<10.3f} {hybrid:<10.3f} {change_text:<10}")


print("Starting evaluation for regular retrieval...")
metrics_regular, detailed_results_regular = evaluator_regular.evaluate_all()

print("\nStarting evaluation for hybrid retrieval...")
metrics_hybrid, detailed_results_hybrid = evaluator_hybrid.evaluate_all()

# Common suffix of every output file written by this cell.
run_tag = f'm3{metric_suffix}{rerank_suffix}'

print("\nGenerating visualizations...")
evaluator_regular.plot_metrics(metrics_regular, save_path=f'evaluation_metrics_regular_{run_tag}.png')
evaluator_hybrid.plot_metrics(metrics_hybrid, save_path=f'evaluation_metrics_hybrid_{run_tag}.png')

print("\nGenerating reports...")

report_regular = _build_report(metrics_regular, evaluator_regular, 'regular')
report_hybrid = _build_report(
    metrics_hybrid, evaluator_hybrid, 'hybrid',
    extra_metadata={
        'dense_weight': evaluator_hybrid.dense_weight,
        'sparse_weight': evaluator_hybrid.sparse_weight,
    })

_dump_json(report_regular, f'evaluation_report_regular_{run_tag}.json')
_dump_json(report_hybrid, f'evaluation_report_hybrid_{run_tag}.json')
_dump_json(detailed_results_regular, f'detailed_results_regular_{run_tag}.json')
_dump_json(detailed_results_hybrid, f'detailed_results_hybrid_{run_tag}.json')

print("\nEvaluation completed successfully!")

print("\nComparison of Regular vs Hybrid Retrieval:")
print(f"{'Metric':<15} {'Regular':<10} {'Hybrid':<10} {'Improvement':<10}")
print("-" * 45)
for k in metrics_regular.recall_at_k.keys():
    _print_comparison_row(f"Recall@{k}", metrics_regular.recall_at_k[k], metrics_hybrid.recall_at_k[k])

_print_comparison_row("MRR", metrics_regular.mrr, metrics_hybrid.mrr)
_print_comparison_row("MAP", metrics_regular.map_score, metrics_hybrid.map_score)