## Load the data

In [24]:
import json

# Embeddings file for this benchmark; the relative path is resolved against
# the current working directory.
file_path = "../data/embeddings/rog_strix_gaming_notebook_pc_unscanned_file_chunks_embeddings.json"

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the JSON.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# The file holds the document chunks and the question/answer pairs side by side.
chunks = data["chunks"]
questions = data["question_answer_pairs"]

print(f"Number of chunks    : {len(chunks)}")
print(f"Number of questions : {len(questions)}")

Number of chunks    : 43
Number of questions : 377


## Create Qrels

[Qrels (Query Relevance Judgments)](https://amenra.github.io/ranx/qrels/) are the ground truth (the answer key) used to evaluate a search system. They explicitly map each query to the documents that are considered correct and relevant answers for that query.

In [25]:
from ranx import Qrels


def create_qrels_from_data(questions: list[dict]) -> Qrels:
    """Build the relevance judgments for a list of questions.

    Each question is given the query id ``q_<position>`` (its index in
    ``questions``) and is mapped to a single relevant document: the chunk
    referenced by its ``"chunk_id"`` entry, with relevance 1.

    Args:
        questions: Question records; each must contain a ``"chunk_id"`` key.

    Returns:
        A ``Qrels`` built from the ``{query_id: {chunk_id: 1}}`` mapping.
    """
    relevance_by_query = {
        f"q_{position}": {str(entry["chunk_id"]): 1}
        for position, entry in enumerate(questions)
    }
    return Qrels(relevance_by_query)


# Ground-truth relevance judgments for every loaded question.
qrels = create_qrels_from_data(questions)

## Create runs

A [Run](https://amenra.github.io/ranx/run/) is the output from your model. It shows which documents your model thinks are most relevant for each query, along with their similarity scores. These results are then compared to the Qrels (the answer key) to measure how well your model did.

In [26]:
import numpy as np

from ranx import Run
from sklearn.metrics.pairwise import cosine_similarity


def create_run_from_data(
    chunks: list[dict], questions: list[dict], model_name: str
) -> Run:
    """Score every chunk against every question for one embedding model.

    The embeddings stored under ``model_name`` are compared with cosine
    similarity. Each question gets the query id ``q_<position>`` (its index
    in ``questions``) and a mapping from every chunk id (as a string) to its
    similarity score.

    Args:
        chunks: Chunk records with ``"id"`` and ``"embeddings"`` entries.
        questions: Question records with an ``"embeddings"`` entry.
        model_name: Key selecting which embedding to read from each record.

    Returns:
        A ``Run`` named after ``model_name``.
    """
    chunk_matrix = np.array([chunk["embeddings"][model_name] for chunk in chunks])
    question_matrix = np.array(
        [question["embeddings"][model_name] for question in questions]
    )
    # One row per question, one column per chunk.
    similarities = cosine_similarity(question_matrix, chunk_matrix)

    chunk_ids = [str(chunk["id"]) for chunk in chunks]
    rankings = {
        f"q_{row}": dict(zip(chunk_ids, row_scores))
        for row, row_scores in enumerate(similarities)
    }
    return Run(rankings, name=model_name)


# Benchmark every embedding model stored on the first chunk.
models_to_benchmark = list(chunks[0]["embeddings"])

# One run per model, in the same order as `models_to_benchmark`.
runs = [
    create_run_from_data(chunks=chunks, questions=questions, model_name=candidate)
    for candidate in models_to_benchmark
]

## Run the benchmark

Ranx handles the evaluation and statistical testing internally. You just need to call the [compare](https://amenra.github.io/ranx/compare/) function with the appropriate parameters.

In [27]:
from ranx import compare


# Metrics computed for every run.
metrics = ["mrr", "recall@1", "recall@5", "ndcg@5"]

# Evaluate all runs against the qrels; pairwise differences are tested with
# the "fisher" test, and 0.05 is the p-value threshold stored on the report.
report = compare(
    qrels=qrels,
    runs=runs,
    metrics=metrics,
    stat_test="fisher",
    max_p=0.05,
)

## Display results

### Metrics table

Assign a letter to each model for easy reference in the superscripts.

In [28]:
import string

# Pair each model, in report order, with the next lowercase letter
# (zip stops at the shorter sequence, so at most 26 models get a letter).
model_to_letter = dict(zip(report.model_names, string.ascii_lowercase))

model_to_letter

{'all-minilm-l6-v2': 'a',
 'qwen3-embedding-0.6b': 'b',
 'gemini-embedding-001': 'c',
 'qwen3-embedding-4b': 'd',
 'qwen3-embedding-8b': 'e',
 'text-embedding-3-small': 'f',
 'text-embedding-3-large': 'g'}

Compute the cell data for each model and metric, including formatting the score and adding superscripts for statistical significance.

When you see something like this in a cell `0.8390 abcfg`, it means:
- `0.8390` is the score for that model on that metric.
- `abcfg` are the letters corresponding to other models that this model's score is statistically significantly better than.

The `max_widths` dictionary keeps track of the maximum width (in characters) for each column to make sure the output table lines up neatly and looks well-formatted. This helps each metric column fit its longest value, so the table is easy to read.

In [29]:
report.rounding_digits = 4
cell_data = {}
# Every column is at least as wide as its header.
max_widths = {metric: len(metric) for metric in report.metrics}

for model in report.model_names:
    model_cells = {}
    cell_data[model] = model_cells

    for metric in report.metrics:
        own_score = report.results[model][metric]

        # Letters of the models this one beats with a significant difference.
        beaten_letters = []
        for rival in report.model_names:
            if rival == model:
                continue

            p_value = report.comparisons[{model, rival}][metric]["p_value"]
            rival_score = report.results[rival][metric]
            if p_value < report.max_p and own_score > rival_score:
                beaten_letters.append(model_to_letter[rival])

        formatted_score = f"{own_score:.{report.rounding_digits}f}"
        letters = "".join(sorted(beaten_letters))
        model_cells[metric] = (formatted_score, letters)

        # Rendered width is the score, plus a space and the letters when present.
        rendered_width = len(formatted_score) + (1 + len(letters) if letters else 0)
        max_widths[metric] = max(max_widths[metric], rendered_width)

In [30]:
# Inspect the column width computed for each metric.
print(max_widths)

{'mrr': 12, 'recall@1': 11, 'recall@5': 12, 'ndcg@5': 12}


In [31]:
# Show the (score string, superscript letters) tuple stored per model and metric.
cell_data

{'all-minilm-l6-v2': {'mrr': ('0.7121', ''),
  'recall@1': ('0.6127', ''),
  'recall@5': ('0.8408', ''),
  'ndcg@5': ('0.7341', '')},
 'qwen3-embedding-0.6b': {'mrr': ('0.8075', 'acfg'),
  'recall@1': ('0.7188', 'a'),
  'recall@5': ('0.9098', 'ag'),
  'ndcg@5': ('0.8254', 'acfg')},
 'gemini-embedding-001': {'mrr': ('0.7836', 'a'),
  'recall@1': ('0.7029', 'a'),
  'recall@5': ('0.8886', 'a'),
  'ndcg@5': ('0.8015', 'a')},
 'qwen3-embedding-4b': {'mrr': ('0.8390', 'abcfg'),
  'recall@1': ('0.7480', 'acfg'),
  'recall@5': ('0.9682', 'abcfg'),
  'ndcg@5': ('0.8684', 'abcfg')},
 'qwen3-embedding-8b': {'mrr': ('0.8307', 'abcfg'),
  'recall@1': ('0.7401', 'acg'),
  'recall@5': ('0.9496', 'abcfg'),
  'ndcg@5': ('0.8559', 'abcfg')},
 'text-embedding-3-small': {'mrr': ('0.7851', 'a'),
  'recall@1': ('0.7056', 'a'),
  'recall@5': ('0.8859', 'a'),
  'ndcg@5': ('0.8028', 'a')},
 'text-embedding-3-large': {'mrr': ('0.7847', 'a'),
  'recall@1': ('0.7003', 'a'),
  'recall@5': ('0.8780', 'a'),
  'ndcg@

Find the best and worst scores for each metric so that both can be highlighted in the results table.

In [32]:
max_scores = {}
min_scores = {}
for metric in report.metrics:
    metric_scores = [report.results[name][metric] for name in report.model_names]
    # Seed with +/- infinity so an empty model list still yields a value.
    max_scores[metric] = max([-float("inf"), *metric_scores])
    min_scores[metric] = min([float("inf"), *metric_scores])

print(max_scores)
print(min_scores)

{'mrr': 0.8389737436156535, 'recall@1': 0.7480106100795756, 'recall@5': 0.9681697612732095, 'ndcg@5': 0.8683514691382817}
{'mrr': 0.7120984864084288, 'recall@1': 0.6127320954907162, 'recall@5': 0.8408488063660478, 'ndcg@5': 0.7340709524343749}


In [33]:
from typing import Any
from rich.box import HEAVY
from rich.text import Text
from rich.table import Table
from rich.console import Console


console = Console()
table = Table(show_header=True, header_style="bold bright_magenta", box=HEAVY)
table.add_column("#", style="dim")
table.add_column("Model", style="yellow")
for metric_name in report.metrics:
    table.add_column(metric_name, justify="left", width=max_widths[metric_name])

# One row per model, in report order: letter, name, then one cell per metric.
for name in report.model_names:
    row_cells: list[Any] = [f"[dim]{model_to_letter[name]}[/dim]", name]

    for metric_name in report.metrics:
        raw_score = report.results[name][metric_name]
        score_text, significance_letters = cell_data[name][metric_name]

        # Green marks the best score of the column, red the worst.
        if raw_score == max_scores[metric_name]:
            highlight = "bold bright_green"
        elif raw_score == min_scores[metric_name]:
            highlight = "bold bright_red"
        else:
            highlight = None

        rendered = Text()
        rendered.append(score_text, style=highlight)
        if significance_letters:
            rendered.append(" ")
            rendered.append(significance_letters, style="bold bright_cyan")

        row_cells.append(rendered)

    table.add_row(*row_cells)

console.print(table)

If you want to sort the models by their average score across all metrics, you can calculate the average for each model and then sort them accordingly. This way, you can see which models perform best overall, rather than just on individual metrics.

In [35]:
from typing import Any
from rich.box import HEAVY
from rich.text import Text
from rich.table import Table
from rich.console import Console


console = Console()
table = Table(show_header=True, header_style="bold bright_magenta", box=HEAVY)
table.add_column("#", style="dim")
table.add_column("Model", style="yellow")

for metric in report.metrics:
    table.add_column(metric, justify="left", width=max_widths[metric])

table.add_column("Average", justify="right", style="bold cyan")

# Unweighted mean of every benchmarked metric, per model.
model_averages = {}
for model_name in report.model_names:
    avg_score = np.mean(
        [report.results[model_name][metric] for metric in report.metrics]
    )
    model_averages[model_name] = avg_score

# Highest average first; the top entry (if any) is highlighted as the best model.
sorted_models = sorted(model_averages.items(), key=lambda x: x[1], reverse=True)
best_model_name = sorted_models[0][0] if sorted_models else None

for model_name, avg_score in sorted_models:
    # Use a Text object so the model name is rendered literally instead of
    # being parsed as console markup.
    name_style = "bold bright_green" if model_name == best_model_name else ""
    model_name_cell_value = Text(model_name, style=name_style)

    row_data: list[Any] = [
        f"[dim]{model_to_letter[model_name]}[/dim]",
        model_name_cell_value,
    ]

    for metric in report.metrics:
        cell_text = Text()
        score = report.results[model_name][metric]
        score_part, superscript_part = cell_data[model_name][metric]

        # Green marks the best score of the column, red the worst.
        if score == max_scores[metric]:
            cell_text.append(score_part, style="bold bright_green")
        elif score == min_scores[metric]:
            cell_text.append(score_part, style="bold bright_red")
        else:
            cell_text.append(score_part)

        if superscript_part:
            cell_text.append(" ")
            cell_text.append(superscript_part, style="bold bright_cyan")

        row_data.append(cell_text)

    # Same precision as the metric cells rather than a hard-coded 4 digits.
    row_data.append(f"{avg_score:.{report.rounding_digits}f}")
    table.add_row(*row_data)

console.print(table)

### Win Tie Loss table

The `win_tie_loss` dictionary shows how many times a model won, tied, or lost against other models based on the statistical significance of their scores.

Here is an example of an object from the `win_tie_loss` dictionary:

```json
"('all-minilm-l6-v2', 'qwen3-embedding-0.6b')": {
    "mrr": {
        "W": 37,
        "T": 229,
        "L": 111
    },
    "recall@1": {
        "W": 15,
        "T": 307,
        "L": 55
    },
    "recall@5": {
        "W": 10,
        "T": 331,
        "L": 36
    },
    "ndcg@5": {
        "W": 29,
        "T": 252,
        "L": 96
    }
}
```

The keys are tuples representing pairs of models being compared. The values are dictionaries where each key is a metric (like `mrr`, `recall@1`, etc.) and the value is another dictionary with counts of Wins (`W`), Ties (`T`), and Losses (`L`) for the first model against the second model.

In [64]:
import json


def tuple_keys_to_str(dictionary: dict) -> dict:
    """Return a copy of ``dictionary`` whose keys are replaced by ``str(key)``.

    Values are kept as-is (not copied or converted). If two keys share the
    same string form, the later one wins.
    """
    stringified = {}
    for key, value in dictionary.items():
        stringified[str(key)] = value
    return stringified


# json.dumps cannot serialize tuple keys, so convert them to strings first.
print(json.dumps(tuple_keys_to_str(report.win_tie_loss), indent=2))

{
  "('all-minilm-l6-v2', 'qwen3-embedding-0.6b')": {
    "mrr": {
      "W": 37,
      "T": 229,
      "L": 111
    },
    "recall@1": {
      "W": 15,
      "T": 307,
      "L": 55
    },
    "recall@5": {
      "W": 10,
      "T": 331,
      "L": 36
    },
    "ndcg@5": {
      "W": 29,
      "T": 252,
      "L": 96
    }
  },
  "('all-minilm-l6-v2', 'gemini-embedding-001')": {
    "mrr": {
      "W": 36,
      "T": 233,
      "L": 108
    },
    "recall@1": {
      "W": 17,
      "T": 309,
      "L": 51
    },
    "recall@5": {
      "W": 8,
      "T": 343,
      "L": 26
    },
    "ndcg@5": {
      "W": 29,
      "T": 264,
      "L": 84
    }
  },
  "('all-minilm-l6-v2', 'qwen3-embedding-4b')": {
    "mrr": {
      "W": 25,
      "T": 231,
      "L": 121
    },
    "recall@1": {
      "W": 16,
      "T": 294,
      "L": 67
    },
    "recall@5": {
      "W": 2,
      "T": 325,
      "L": 50
    },
    "ndcg@5": {
      "W": 22,
      "T": 240,
      "L": 115
    }
  },
  "('all-mi

Create a table that displays the Win, Tie, and Loss counts for each pair of models across all metrics.

In [None]:
console = Console()
table = Table(show_header=True, header_style="bold bright_magenta", box=HEAVY)
for header, options in (
    ("Model A", {"style": "yellow"}),
    ("Model B", {"style": "yellow"}),
    ("Metric", {"style": "white"}),
    ("Wins (A)", {"justify": "center", "style": "bold bright_green"}),
    ("Ties", {"justify": "center", "style": "grey70"}),
    ("Losses (A)", {"justify": "center", "style": "bold bright_red"}),
    ("p-value", {"justify": "right", "style": "bold"}),
):
    table.add_column(header, **options)

win_tie_loss = report.win_tie_loss
shown_pairs = set()

for key, metrics_data in win_tie_loss.items():
    model_a, model_b = key  # type: ignore
    # Order-insensitive identity of the pair, so each pair is listed only once.
    pair = frozenset([model_a, model_b])
    if pair not in shown_pairs:
        shown_pairs.add(pair)

        for metric, scores in metrics_data.items():
            wins, ties, losses = scores["W"], scores["T"], scores["L"]
            p_value = report.comparisons[key][metric]["p_value"]

            # The model with more wins on this metric is highlighted in green.
            model_a_text = Text(model_a, style="bold bright_green" if wins > losses else "")
            model_b_text = Text(model_b, style="bold bright_green" if losses > wins else "")

            # Green p-value marks a difference below the report's threshold.
            significant = p_value < report.max_p
            p_value_text = Text(
                f"{p_value:.3f}",
                style="bold bright_green" if significant else "white",
            )

            table.add_row(
                model_a_text,
                model_b_text,
                metric,
                str(wins),
                str(ties),
                str(losses),
                p_value_text,
            )

        # Blank row separating one model pair from the next.
        table.add_row("")

console.print(table)