## Load the data

In [1]:
import json

# Embeddings file for this benchmark. It is read for two top-level keys:
# "chunks" and "question_answer_pairs".
file_path = "../data/embeddings/rog_strix_gaming_notebook_pc_unscanned_file_chunks_embeddings.json"

# Pass the encoding explicitly: without it, open() falls back to the platform
# locale (e.g. cp1252 on Windows), which can fail or mis-decode non-ASCII text.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

chunks = data["chunks"]
questions = data["question_answer_pairs"]

print(f"Number of chunks    : {len(chunks)}")
print(f"Number of questions : {len(questions)}")

Number of chunks    : 43
Number of questions : 377


## Evaluation metrics

There are several metrics commonly used to benchmark embedding models. In this notebook, you will use the following metrics:

- **Mean Reciprocal Rank (MRR)**
- **Recall@K**
- **Normalized Discounted Cumulative Gain (NDCG)**

For more details and additional metrics, see the [ranx metrics documentation](https://amenra.github.io/ranx/metrics/).

### Mean Reciprocal Rank (MRR)

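MRR measures how quickly the first relevant result appears in the ranked list. For a single question, the reciprocal rank (RR) is:

$$
\text{RR} = \frac{1}{\text{rank}}
$$

where **rank** is the 1-based position of the first relevant document (RR is 0 if the relevant document is not retrieved). MRR is the mean of the reciprocal ranks over all $|Q|$ questions:

$$
\text{MRR} = \frac{1}{|Q|} \sum_{i=1}^{|Q|} \frac{1}{\text{rank}_i}
$$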

In [2]:
import math


def calculate_mrr(rank: int | None) -> float:
    if rank is None:
        return 0.0
    return 1.0 / rank

### Recall@K

Recall@K indicates whether the relevant document appears within the top-$K$ results.  
For datasets with one relevant document ($R=1$):

$$
\text{Recall@K} = \frac{r}{R}
$$

where $R$ is the total number of relevant documents and $r$ is the number of them found in the top-$K$: here, 1 if the relevant document is in the top-$K$, otherwise 0.


In [3]:
def calculate_recall_at_k(rank: int | None, k: int) -> float:
    if rank is None:
        return 0.0
    return 1.0 if rank <= k else 0.0

### Normalized Discounted Cumulative Gain (NDCG@K)

[NDCG](https://en.wikipedia.org/wiki/Discounted_cumulative_gain)@K evaluates ranking quality by taking the position of the relevant document into account.  
For a single relevant document:

$$
\text{DCG@K} = 
\begin{cases}
\frac{1}{\log_2(\text{rank} + 1)}, & \text{if rank} \leq K \\
0, & \text{otherwise}
\end{cases}
$$

$$
\text{IDCG@K} = 1
$$

$$
\text{NDCG@K} = \frac{\text{DCG@K}}{\text{IDCG@K}} = \text{DCG@K}
$$

In [4]:
def calculate_ndcg_at_k(rank: int | None, k: int) -> float:
    if rank is None:
        return 0.0

    if rank <= k:
        return 1.0 / math.log2(rank + 1)
    else:
        return 0.0

## Prepare data for evaluation

Get the list of embedding models.

In [5]:
# Model names are read from the first chunk only.
# NOTE(review): assumes every chunk and question carries the same model keys; confirm.
models_to_benchmark = [*chunks[0]["embeddings"]]
print(models_to_benchmark)

['all-minilm-l6-v2', 'qwen3-embedding-0.6b', 'gemini-embedding-001', 'qwen3-embedding-4b', 'qwen3-embedding-8b', 'text-embedding-3-small', 'text-embedding-3-large']


Create a mapping from each question to its correct chunk.

In [6]:
import numpy as np

# Map each question's positional index to the ID of the chunk that answers it.
ground_truth = dict(enumerate(question["chunk_id"] for question in questions))
ground_truth

{0: 0,
 1: 0,
 2: 0,
 3: 0,
 4: 1,
 5: 1,
 6: 1,
 7: 1,
 8: 1,
 9: 1,
 10: 1,
 11: 1,
 12: 2,
 13: 2,
 14: 2,
 15: 2,
 16: 2,
 17: 2,
 18: 2,
 19: 2,
 20: 3,
 21: 3,
 22: 3,
 23: 4,
 24: 4,
 25: 4,
 26: 4,
 27: 4,
 28: 4,
 29: 4,
 30: 4,
 31: 4,
 32: 4,
 33: 4,
 34: 4,
 35: 4,
 36: 4,
 37: 4,
 38: 4,
 39: 4,
 40: 4,
 41: 4,
 42: 4,
 43: 4,
 44: 4,
 45: 4,
 46: 4,
 47: 4,
 48: 4,
 49: 4,
 50: 4,
 51: 4,
 52: 4,
 53: 4,
 54: 4,
 55: 4,
 56: 4,
 57: 4,
 58: 4,
 59: 5,
 60: 5,
 61: 5,
 62: 5,
 63: 5,
 64: 5,
 65: 5,
 66: 6,
 67: 6,
 68: 6,
 69: 6,
 70: 6,
 71: 6,
 72: 6,
 73: 7,
 74: 7,
 75: 7,
 76: 7,
 77: 7,
 78: 7,
 79: 7,
 80: 7,
 81: 8,
 82: 8,
 83: 8,
 84: 8,
 85: 8,
 86: 8,
 87: 8,
 88: 8,
 89: 9,
 90: 9,
 91: 9,
 92: 9,
 93: 9,
 94: 9,
 95: 10,
 96: 10,
 97: 10,
 98: 10,
 99: 10,
 100: 10,
 101: 10,
 102: 10,
 103: 10,
 104: 10,
 105: 10,
 106: 10,
 107: 10,
 108: 11,
 109: 11,
 110: 11,
 111: 11,
 112: 11,
 113: 11,
 114: 11,
 115: 11,
 116: 11,
 117: 11,
 118: 12,
 119: 12,
 120:

Build a list of chunk IDs in the same order as the chunks, and display it so you can confirm by inspection that the IDs are sequential.

In [7]:
# IDs in the same order as `chunks`, so position i here matches chunk i.
chunk_ids_list = list(map(lambda chunk: chunk["id"], chunks))
chunk_ids_list

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42]

Pre-load all chunk and question embeddings into NumPy arrays to make similarity calculations efficient.

In [8]:
# One array per model, keyed by model name. Row order follows `chunks` and
# `questions` respectively.
chunk_embeddings = {}
question_embeddings = {}

for model in models_to_benchmark:
    chunk_vectors = [chunk["embeddings"][model] for chunk in chunks]
    chunk_embeddings[model] = np.array(chunk_vectors)

    question_vectors = [question["embeddings"][model] for question in questions]
    question_embeddings[model] = np.array(question_vectors)

In [9]:
# Sanity check for one model: one row per chunk, one column per embedding dimension.
chunk_embeddings["gemini-embedding-001"].shape

(43, 3072)

In [10]:
# Sanity check for one model: one row per question, one column per embedding dimension.
question_embeddings["gemini-embedding-001"].shape

(377, 3072)

## Run the manual benchmark

This loop runs the benchmark for each embedding model:

- For each model, it calculates the cosine similarity between every question and all chunks.
- For each question, it ranks the chunks by similarity and finds the position (rank) of the correct chunk.
- It computes four metrics (MRR, Recall@1, Recall@5, NDCG@5) for each question using the rank.
- After processing all questions, it averages the scores for each metric and stores the results for the model.

This way, you get a summary of how well each model retrieves the correct chunk for all questions.

In [11]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity


def get_rank(ranked_chunk_ids: list[int], correct_chunk_id: int) -> int | None:
    try:
        return ranked_chunk_ids.index(correct_chunk_id) + 1
    except ValueError:
        return None


# Per-model averages of each metric, keyed by model name.
benchmark_results = {}
for model_name in tqdm(models_to_benchmark, total=len(models_to_benchmark)):
    # Rows are questions and columns are chunks.
    similarity_matrix = cosine_similarity(
        question_embeddings[model_name], chunk_embeddings[model_name]
    )

    # One list of per-question scores for each metric.
    per_question_scores = {"mrr": [], "recall@1": [], "recall@5": [], "ndcg@5": []}

    for question_idx in range(len(questions)):
        # Pair each chunk ID with its similarity, then order best match first.
        scored_chunks = list(zip(chunk_ids_list, similarity_matrix[question_idx]))
        scored_chunks.sort(key=lambda pair: pair[1], reverse=True)
        ranked_chunk_ids = [chunk_id for chunk_id, _ in scored_chunks]

        rank = get_rank(ranked_chunk_ids, ground_truth[question_idx])
        per_question_scores["mrr"].append(calculate_mrr(rank))
        per_question_scores["recall@1"].append(calculate_recall_at_k(rank, k=1))
        per_question_scores["recall@5"].append(calculate_recall_at_k(rank, k=5))
        per_question_scores["ndcg@5"].append(calculate_ndcg_at_k(rank, k=5))

    # Collapse the per-question lists into one mean per metric.
    benchmark_results[model_name] = {
        metric: np.mean(values) for metric, values in per_question_scores.items()
    }

100%|██████████| 7/7 [00:00<00:00, 61.97it/s]


## Display results

Find the best and worst scores for each metric to highlight them in the results table.

In [12]:
metrics = ["mrr", "recall@1", "recall@5", "ndcg@5"]

# Start from the infinities so the first real score replaces them.
max_scores = dict.fromkeys(metrics, -float("inf"))
min_scores = dict.fromkeys(metrics, float("inf"))

for scores in benchmark_results.values():
    for metric in metrics:
        # max/min keep the current value unless the new one is strictly better/worse.
        max_scores[metric] = max(max_scores[metric], scores[metric])
        min_scores[metric] = min(min_scores[metric], scores[metric])

print(max_scores)
print(min_scores)

{'mrr': np.float64(0.8389737436156535), 'recall@1': np.float64(0.7480106100795756), 'recall@5': np.float64(0.9681697612732095), 'ndcg@5': np.float64(0.8683514691382817)}
{'mrr': np.float64(0.7120984864084288), 'recall@1': np.float64(0.6127320954907162), 'recall@5': np.float64(0.8408488063660478), 'ndcg@5': np.float64(0.7340709524343748)}


Show the benchmark results in a formatted table, highlighting the best and worst scores for each metric.

In [21]:
from rich.box import HEAVY
from rich.table import Table
from rich.console import Console

console = Console()

# Table skeleton: a model-name column followed by one right-aligned column per metric.
table = Table(show_header=True, header_style="bold bright_magenta", box=HEAVY)
table.add_column("Model", style="yellow", width=25)
for metric in metrics:
    table.add_column(metric, justify="right")

for model_name, scores in benchmark_results.items():
    cells = [model_name]
    for metric in metrics:
        value = scores[metric]
        text = f"{value:.4f}"

        # Green marks the best score for a metric, red the worst. The best
        # check runs first, so it wins when a score is both.
        cells.append(
            f"[bold bright_green]{text}[/bold bright_green]"
            if value == max_scores[metric]
            else f"[bold bright_red]{text}[/bold bright_red]"
            if value == min_scores[metric]
            else text
        )

    table.add_row(*cells)

console.print(table)

If you want to sort the models by their average score across all metrics, you can calculate the average for each model and then sort them accordingly. This way, you can see which models perform best overall, rather than just on individual metrics.

In [22]:
from rich.box import HEAVY
from rich.table import Table
from rich.console import Console

console = Console()

# Same layout as the per-metric table, plus a trailing "Average" column.
table = Table(show_header=True, header_style="bold bright_magenta", box=HEAVY)
table.add_column("Model", style="yellow", width=25)
for metric in metrics:
    table.add_column(metric, justify="right")
table.add_column("Average", justify="right", style="bold cyan")

# Unweighted mean of the metric scores for each model.
model_averages = {
    model_name: np.mean([scores[metric] for metric in metrics])
    for model_name, scores in benchmark_results.items()
}

# Highest average first; the first entry is the overall best model.
sorted_models = sorted(
    model_averages.items(), key=lambda item: item[1], reverse=True
)
best_model_name, _ = sorted_models[0]

for model_name, avg_score in sorted_models:
    if model_name == best_model_name:
        name_cell = f"[bold bright_green]{model_name}[/bold bright_green]"
    else:
        name_cell = model_name

    cells = [name_cell]
    model_scores = benchmark_results[model_name]

    for metric in metrics:
        value = model_scores[metric]
        text = f"{value:.4f}"

        # Green for the best score on a metric, red for the worst.
        if value == max_scores[metric]:
            text = f"[bold bright_green]{text}[/bold bright_green]"
        elif value == min_scores[metric]:
            text = f"[bold bright_red]{text}[/bold bright_red]"

        cells.append(text)

    cells.append(f"{avg_score:.4f}")
    table.add_row(*cells)

console.print(table)