In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/rag-grounding/ranking-api/ranking_api_beir_evaluation.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Frag-grounding%2Franking-api%2Franking_api_beir_evaluation.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/rag-grounding/ranking-api/ranking_api_beir_evaluation.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/rag-grounding/ranking-api/ranking_api_beir_evaluation.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/rag-grounding/ranking-api/ranking_api_beir_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/rag-grounding/ranking-api/ranking_api_beir_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/rag-grounding/ranking-api/ranking_api_beir_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/rag-grounding/ranking-api/ranking_api_beir_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/rag-grounding/ranking-api/ranking_api_beir_evaluation.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author |
| --- |
| [Jannis Grönberg](https://github.com/puebster) |

## Overview
### Ranking API BEIR Evaluation

This notebook serves to replicate the results from `googles/semantic-ranker-default-004` on the BEIR dataset.

**Important Notes on Evaluation:**

*   **Labeled Data Only:** We have restricted our evaluation to only the labeled datapoints within each BEIR subdataset. This decision stems from the observation that many datasets contain examples that are clearly relevant but lack explicit labels. Including these unlabeled but relevant examples would lead to an underestimation of the model's performance, making a fair evaluation challenging.
*   **Datasets with Multiple Labels:** We have further filtered the BEIR datasets to include only those that have more than one unique label assigned. This is because computing meaningful Normalized Discounted Cumulative Gain (NDCG) scores on datasets where each query has only a single relevant document is problematic and can lead to misleading results.

This notebook aims to provide a focused and more reliable assessment of the `googles/semantic-ranker-default-004` model on a subset of the BEIR benchmark.

### Install Google Cloud SDKs and other required packages


In [2]:
# Install or upgrade the Discovery Engine client, pytrec_eval and the data
# libraries used in this notebook. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel.
%pip install --upgrade --quiet google-cloud-discoveryengine pytrec_eval numpy pandas scikit-learn tqdm

# Setup

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [3]:
# @title Colab authentication
import sys

# The `google.colab` package is only loaded inside a Colab runtime, so its
# presence in `sys.modules` is used to decide whether to authenticate here.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    auth.authenticate_user()

In [4]:
# @title Import Libraries

# Standard library imports
import json
import pickle
import time

from google.cloud import aiplatform
from google.cloud import discoveryengine_v1 as discoveryengine
from google.cloud import storage

# Third-party library imports
import numpy as np
import pandas as pd
import pytrec_eval
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
import vertexai

print(f"pickle version: {pickle.format_version}")

pickle version: 4.0


In [5]:
# @title Helper functions
def call_model_in_batches(
    df: pd.DataFrame, model_name: str, project_id: str, location: str
) -> dict:
    """Scores every (query, document) row of ``df`` with the Discovery Engine Rank API.

    Rows are grouped by ``query_id``. For each query the documents are sent to
    the Rank API in batches of at most 200 records, and the returned scores are
    collected per query. Only the time spent building and sending the rank
    requests is measured and reported.

    Args:
        df: A Pandas DataFrame with one row per (query, document) pair. It must
          have the columns "query_id", "query", "document_id", "title",
          "content" and "name". "name" is read from the first row only and is
          used as the label of the progress bar.
        model_name: The name of the ranking model to use in the Discovery Engine.
        project_id: The ID of the Google Cloud project.
        location: The location (region) of the Discovery Engine instance.

    Returns:
        A dictionary whose keys are the query IDs found in ``df`` and whose
        values are dictionaries mapping document IDs to their ranking scores,
        each rounded to four decimal places. An empty DataFrame yields an empty
        dictionary without contacting the API.
    """
    max_records_per_request = 200
    # Titles are cut to this many characters before being sent.
    # NOTE(review): presumably an API limit on title length - confirm.
    max_title_chars = 799

    results = {}
    if df.empty:
        # Nothing to score; also avoids indexing row 0 and dividing by zero below.
        return results

    client = discoveryengine.RankServiceClient()
    ranking_config = client.ranking_config_path(
        project=project_id,
        location=location,
        ranking_config="default_ranking_config",
    )
    query_groups = df.groupby("query_id")
    dataset_name = df.iloc[0]["name"]
    overall_passed_seconds = 0.0

    def _rank_batch(query, records):
        """Ranks one batch and returns ({document_id: rounded score}, seconds spent)."""
        started = time.perf_counter()
        request = discoveryengine.RankRequest(
            ranking_config=ranking_config,
            model=model_name,
            query=query,
            records=records,
            ignore_record_details_in_response=True,
        )
        response = client.rank(request=request)
        elapsed = time.perf_counter() - started
        # Every batch is rounded here so that all scores of a query share the
        # same precision, regardless of which batch they were scored in.
        return {rec.id: round(rec.score, 4) for rec in response.records}, elapsed

    # Iterate through the queries of the dataset
    for query_id, group in tqdm(query_groups, desc=f"Scoring {dataset_name}"):
        # All rows of a group share the same query text.
        query = group.iloc[0]["query"]
        query_scores = {}
        results[query_id] = query_scores

        records = []
        for _, row in group.iterrows():
            records.append(
                discoveryengine.RankingRecord(
                    id=row["document_id"],
                    title=row["title"][:max_title_chars],
                    content=row["content"],
                )
            )
            # Flush as soon as a full batch has been collected.
            if len(records) >= max_records_per_request:
                batch_scores, elapsed = _rank_batch(query, records)
                query_scores.update(batch_scores)
                overall_passed_seconds += elapsed
                records = []

        # Score whatever is left over after the last full batch.
        if records:
            batch_scores, elapsed = _rank_batch(query, records)
            query_scores.update(batch_scores)
            overall_passed_seconds += elapsed

    print(
        f"The time to process {len(query_groups)} queries with at total number of"
        f" {len(df)} documents took: {round(overall_passed_seconds, 2)}s.\n That"
        f" is {round(1000*overall_passed_seconds/len(query_groups), 2)}ms per"
        f" query or {round(1000*overall_passed_seconds/len(df), 2)}ms per"
        " document."
    )
    return results


def evaluate(
    qrels: dict[str, dict[str, int]],
    results: dict[str, dict[str, float]],
    k_values: list[int],
    ignore_identical_ids: bool = True,
    verbose: bool = True,
) -> dict[str, float]:
    """Computes the mean NDCG@k of retrieval results against relevance judgments.

    The per-query ``ndcg_cut`` values are computed with pytrec_eval and averaged
    over all queries that pytrec_eval returns a score for.
    This code is taken from the official BEIR evaluation code.

    Args:
        qrels: A dictionary representing the ground truth relevance judgments. The
          outer keys are query IDs (str), and the inner dictionary maps document
          IDs (str) to their relevance scores (int).
        results: A dictionary representing the retrieval results. The outer keys
          are query IDs (str), and the inner dictionary maps retrieved document
          IDs (str) to their retrieval scores (float). When
          ``ignore_identical_ids`` is True this dictionary is modified in place.
        k_values: A list of integers representing the cutoff ranks for NDCG
          calculation (e.g., [1, 3, 5, 10]). NDCG will be calculated for each k in
          this list.
        ignore_identical_ids: If True, a document whose ID equals the query ID is
          removed from ``results`` before evaluation to prevent self-ranking.
          Defaults to True.
        verbose: If True, the averaged NDCG value for each k is printed.
          Defaults to True.

    Returns:
        A dictionary mapping "NDCG@k" (one entry per k in ``k_values``) to the
        mean NDCG@k over all scored queries, rounded to five decimal places.
        If no query was scored, every value is 0.0.
    """
    if ignore_identical_ids:
        # Drop the entry whose document ID equals the query ID, if there is one.
        for qid, doc_scores in results.items():
            doc_scores.pop(qid, None)

    ndcg = {f"NDCG@{k}": 0.0 for k in k_values}

    # pytrec_eval takes all cutoffs in a single measure string, e.g. "ndcg_cut.1,3,5".
    ndcg_string = "ndcg_cut." + ",".join(str(k) for k in k_values)
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {ndcg_string})
    scores = evaluator.evaluate(results)

    for per_query in scores.values():
        for k in k_values:
            ndcg[f"NDCG@{k}"] += per_query[f"ndcg_cut_{k}"]

    # Guard against an empty score set, which would otherwise divide by zero.
    if scores:
        for k in k_values:
            ndcg[f"NDCG@{k}"] = round(ndcg[f"NDCG@{k}"] / len(scores), 5)

    if verbose:
        for metric_name, value in ndcg.items():
            print(f"{metric_name}: {value:.4f}")

    return ndcg


def load_bytes_from_gcs(bucket_name: str, file_path: str):
    """Downloads a single object from a Google Cloud Storage (GCS) bucket.

    Any exception raised while creating the client or downloading is caught,
    reported with ``print`` and turned into a ``None`` return value.

    Args:
        bucket_name: The name of the GCS bucket.
        file_path: The path to the file within the bucket (e.g.,
          'data/my_data.json').

    Returns:
        The content of the object as bytes, or None if an error occurred.
    """
    try:
        # Client -> bucket -> blob handle, then fetch the content in one call.
        gcs_blob = storage.Client().bucket(bucket_name).blob(file_path)
        return gcs_blob.download_as_bytes()
    except Exception as e:
        print(f"An error occurred: {e}")

    return None


def save_dict_to_gcs_json(
    data: dict, bucket_name: str, file_path: str, encoding: str = "utf-8"
) -> None:
    """Writes a dictionary to a Google Cloud Storage (GCS) bucket as a JSON file.

    The dictionary is serialized with a two-space indent and without escaping
    non-ASCII characters, encoded with ``encoding`` and uploaded with the
    content type "application/json". Any exception is caught and reported with
    ``print`` instead of being raised.

    Args:
        data: The dictionary to be saved.
        bucket_name: The name of the GCS bucket.
        file_path: The desired path for the JSON file within the bucket (e.g.,
          'data/my_data.json').
        encoding: The encoding to use when writing the JSON file. Defaults to
          'utf-8'.
    """
    try:
        # Resolve the target object first, then serialize and upload.
        target_blob = storage.Client().bucket(bucket_name).blob(file_path)

        payload = json.dumps(data, indent=2, ensure_ascii=False).encode(encoding)
        target_blob.upload_from_string(payload, content_type="application/json")
    except Exception as e:
        print(f"An error occurred while saving to GCS: {e}")
    else:
        print(f"Dictionary successfully saved to gs://{bucket_name}/{file_path}")

# If you want to run the scoring yourself, run the following section. Otherwise, you can skip it.

In [6]:
# @title GCP parameters

# @markdown Change this to your bucket, project id and location
OUTPUT_BUCKET_NAME = "ranking_datasets"  # @param {type: "string"}
PROJECT_NUMBER = 1234567  # @param {type: "integer"}
LOCATION = "<YOUR-LOCATION>"  # @param {type: "string"}
MODEL_NAME = "semantic-ranker-default-004"  # @param {type: "string"}

aiplatform.init(project=PROJECT_NUMBER, location=LOCATION)
vertexai.init(project=PROJECT_NUMBER, location=LOCATION)

# Bucket and object paths of the input dataset and of the score files.
INPUT_BUCKET_NAME = "ranking_datasets"
DATASET_PATH = "input_data/labeled_beir_with_more_than_1_label.pkl"
SCORE_PATH = "scores"

# Authenticate user to Google Cloud with your current credentials.
# `google.colab.auth` only exists inside Colab, so the call is guarded; without
# the guard this cell raises NameError on any other runtime.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

In [7]:
# @title Read Dataset
dataset_bytes = load_bytes_from_gcs(INPUT_BUCKET_NAME, DATASET_PATH)
if dataset_bytes is None:
    # load_bytes_from_gcs prints the underlying error and returns None; fail
    # here with a clear message instead of an opaque TypeError from pickle.
    raise RuntimeError(
        f"Could not download gs://{INPUT_BUCKET_NAME}/{DATASET_PATH}; see the"
        " error printed above."
    )

# NOTE: unpickling executes code embedded in the file, so only load pickles
# from a bucket you trust.
df = pickle.loads(dataset_bytes)

# Number of distinct queries and documents per BEIR sub-dataset.
display(df.groupby("name").agg({"query_id": "nunique", "document_id": "nunique"}))

Unnamed: 0_level_0,query_id,document_id
name,Unnamed: 1_level_1,Unnamed: 2_level_1
dbpedia-entity,400,40724
msmarco,43,9139
nfcorpus,323,3128
scidocs,1000,25657
trec-covid,50,35480
webis-touche2020,49,2099


In [8]:
# @title Test connection

# A single toy query with three candidate documents; every row repeats the
# query fields because the scoring helper expects one row per (query, document).
docs = [
    {
        "query_id": "1",
        "query": "Cheap red car",
        "name": "test",
        "document_id": document_id,
        "title": title,
        "content": content,
    }
    for document_id, title, content in [
        (
            "1",
            "Purple Ferrari",
            "The purple Ferrari is a variation of the red Ferrari."
            " However it does not shine as bright.",
        ),
        (
            "2",
            "Purple VW",
            "The purple VW is a variation of the red VW."
            " However it does not shine as bright.",
        ),
        (
            "3",
            "Used red Car",
            "This car is used and 3 years old. The paint has a red color.",
        ),
    ]
]

res = call_model_in_batches(
    df=pd.DataFrame(docs),
    model_name=MODEL_NAME,
    project_id=PROJECT_NUMBER,
    location=LOCATION,
)

# Print each title next to its score, in the order the documents were defined.
scores_for_query = res["1"]
for doc in docs:
    doc_id = doc["document_id"]
    if doc_id in scores_for_query:
        print(doc["title"], scores_for_query[doc_id])

Scoring test:   0%|          | 0/1 [00:00<?, ?it/s]

The time to process 1 queries with at total number of 3 documents took: 0.2s.
 That is 202.05ms per query or 67.35ms per document.
Purple Ferrari 0.1854
Purple VW 0.1874
Used red Car 0.3871


In [9]:
# @title Score Query-Document pairs

# Score every BEIR sub-dataset separately and store one JSON file per
# sub-dataset under SCORE_PATH in the output bucket.
dataset_groups = df.groupby("name")
for subdataset_name, dataset in dataset_groups:
    # Hyphens are replaced by underscores in both path components.
    score_output_path = "/".join(
        [
            SCORE_PATH,
            subdataset_name.replace("-", "_"),
            MODEL_NAME.replace("-", "_") + ".json",
        ]
    )

    print(f"Evaluating : {subdataset_name.upper()}")
    print(f"Shape: {dataset.shape}")

    scores = call_model_in_batches(
        df=dataset,
        model_name=MODEL_NAME,
        project_id=PROJECT_NUMBER,
        location=LOCATION,
    )
    save_dict_to_gcs_json(
        data=scores,
        bucket_name=OUTPUT_BUCKET_NAME,
        file_path=score_output_path,
    )

Evaluating : DBPEDIA-ENTITY
Shape: (43515, 7)


Scoring dbpedia-entity:   0%|          | 0/400 [00:00<?, ?it/s]

The time to process 400 queries with at total number of 43515 documents took: 59.99s.
 That is 149.98ms per query or 1.38ms per document.
Dictionary successfully saved to gs://ranking_datasets/scores/dbpedia_entity/semantic_ranker_private_preview_004.json
Evaluating : MSMARCO
Shape: (9260, 7)


Scoring msmarco:   0%|          | 0/43 [00:00<?, ?it/s]

The time to process 43 queries with at total number of 9260 documents took: 7.06s.
 That is 164.15ms per query or 0.76ms per document.
Dictionary successfully saved to gs://ranking_datasets/scores/msmarco/semantic_ranker_private_preview_004.json
Evaluating : NFCORPUS
Shape: (12334, 7)


Scoring nfcorpus:   0%|          | 0/323 [00:00<?, ?it/s]

The time to process 323 queries with at total number of 12334 documents took: 22.14s.
 That is 68.53ms per query or 1.79ms per document.
Dictionary successfully saved to gs://ranking_datasets/scores/nfcorpus/semantic_ranker_private_preview_004.json
Evaluating : SCIDOCS
Shape: (29928, 7)


Scoring scidocs:   0%|          | 0/1000 [00:00<?, ?it/s]

The time to process 1000 queries with at total number of 29928 documents took: 55.7s.
 That is 55.7ms per query or 1.86ms per document.
Dictionary successfully saved to gs://ranking_datasets/scores/scidocs/semantic_ranker_private_preview_004.json
Evaluating : TREC-COVID
Shape: (66336, 7)


Scoring trec-covid:   0%|          | 0/50 [00:00<?, ?it/s]

The time to process 50 queries with at total number of 66336 documents took: 22.84s.
 That is 456.72ms per query or 0.34ms per document.
Dictionary successfully saved to gs://ranking_datasets/scores/trec_covid/semantic_ranker_private_preview_004.json
Evaluating : WEBIS-TOUCHE2020
Shape: (2214, 7)


Scoring webis-touche2020:   0%|          | 0/49 [00:00<?, ?it/s]

The time to process 49 queries with at total number of 2214 documents took: 2.78s.
 That is 56.77ms per query or 1.26ms per document.
Dictionary successfully saved to gs://ranking_datasets/scores/webis_touche2020/semantic_ranker_private_preview_004.json


# Calculate NDCG and ROC AUC Scores

In [10]:
# @title GCP parameters

# @markdown Change this to your bucket, model name and score path
# Bucket that holds the qrels and the model score files read below.
INPUT_BUCKET_NAME = "ranking_datasets"  # @param {type: "string"}
# Model whose score files are evaluated; hyphens become underscores in the file name.
MODEL_NAME = "semantic-ranker-default-004"  # @param {type: "string"}
# Folder inside the bucket that contains one sub-folder per sub-dataset.
SCORE_PATH = "scores"  # @param {type: "string"}

# Rank cutoffs k for which NDCG@k is computed.
k_values = [1, 3, 5, 10]

In [11]:
# @title Calculate...

# Get datasets
# NOTE(review): `client` and `bucket` are not referenced again in this cell.
client = storage.Client()
bucket = client.bucket(INPUT_BUCKET_NAME)
blobs = [
    "nfcorpus",
    "scidocs",
    "trec-covid",
    "webis-touche2020",
    "dbpedia-entity",
    "msmarco",
]


def _load_json_from_gcs(path: str) -> dict:
    """Downloads ``path`` from INPUT_BUCKET_NAME and parses it as JSON."""
    raw = load_bytes_from_gcs(INPUT_BUCKET_NAME, path)
    if raw is None:
        # load_bytes_from_gcs already printed the cause; stop with a clear error
        # instead of failing on `None.decode()`.
        raise RuntimeError(f"Could not download gs://{INPUT_BUCKET_NAME}/{path}")
    return json.loads(raw.decode())


def _drop_unmatched_pairs(first: dict, second: dict) -> tuple[int, int]:
    """Removes from ``first`` all queries and documents missing in ``second``.

    Returns the number of deleted queries and the number of deleted documents.
    """
    deleted_queries = 0
    deleted_docs = 0
    for qid in list(first):
        if qid not in second:
            del first[qid]
            deleted_queries += 1
            # The query is gone, so there are no documents left to compare.
            continue
        for did in list(first[qid]):
            if did not in second[qid]:
                del first[qid][did]
                deleted_docs += 1
    return deleted_queries, deleted_docs


metric_columns = []

for sub_name in blobs:
    print(sub_name)
    sub_folder = f"{SCORE_PATH}/{sub_name.replace('-', '_')}"
    qrel_path = f"{sub_folder}/qrels.json"
    qpred_path = f"{sub_folder}/{MODEL_NAME.replace('-', '_')}.json"

    qrel = _load_json_from_gcs(qrel_path)
    qpred = _load_json_from_gcs(qpred_path)

    # Keep only the query-document pairs that are present in both qpred and qrel
    for first, second, n1, n2 in [
        (qrel, qpred, "Qrel", "Qpred"),
        (qpred, qrel, "Qpred", "Qrel"),
    ]:
        deleted_queries, deleted_docs = _drop_unmatched_pairs(first, second)
        if deleted_queries > 0 or deleted_docs > 0:
            print(
                f"Deleted from {n1} (because not present in {n2}):"
                f" {deleted_queries} Queries, {deleted_docs} Documents"
            )

    # ROC AUC needs binary ground-truth labels: a label counts as positive when
    # it is larger than half of the highest label found in this sub-dataset.
    _max_score = max(
        [0] + [yt for dids in qrel.values() for yt in dids.values()]
    )

    y_true, y_pred = [], []
    for qid, dids in qrel.items():
        for did, yt in dids.items():
            # `_max_score > 0` avoids a division by zero when all labels are 0.
            yt_label = 1 if _max_score > 0 and yt / _max_score > 0.5 else 0
            y_true.append(yt_label)
            y_pred.append(qpred[qid][did])

    ndcg = evaluate(qrel, qpred, k_values, verbose=False)
    results = dict(ndcg)
    results["ROC AUC"] = float(roc_auc_score(np.array(y_true), np.array(y_pred)))

    # One column per sub-dataset, indexed by metric name.
    metric_columns.append(pd.Series(results, name=sub_name))

# Build the table once instead of merging a growing frame on every iteration.
_res = pd.concat(metric_columns, axis=1)
_res["Macro Avg."] = _res.mean(axis=1)
_res.index.name = MODEL_NAME

nfcorpus
scidocs
trec-covid
webis-touche2020
dbpedia-entity
msmarco


In [12]:
# Show the metrics table: one row per metric, one column per sub-dataset plus
# the "Macro Avg." column.
_res

Unnamed: 0_level_0,nfcorpus,scidocs,trec-covid,webis-touche2020,dbpedia-entity,msmarco,Macro Avg.
semantic-ranker-default-004,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NDCG@1,0.97059,0.942,0.92,0.85714,0.735,0.80233,0.871177
NDCG@3,0.96246,0.89994,0.91235,0.80132,0.67754,0.77696,0.838428
NDCG@5,0.96433,0.824,0.89551,0.78376,0.6681,0.77811,0.818968
NDCG@10,0.97109,0.90327,0.86081,0.74412,0.6756,0.77976,0.822442
ROC AUC,0.619083,0.929131,0.847284,0.84312,0.800851,0.840652,0.813353
