In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluating Third-Party LLMs with the Vertex AI Gen AI Evaluation SDK

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_third_party_llms_vertex_ai_gen_ai_eval_sdk.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author |
| --- |
| [Jason Dai](https://github.com/jsondai) |

## Overview

This notebook demonstrates how to use the new Vertex Gen AI Evaluation SDK to evaluate various types of third-party models. Whether you're working with open models you've deployed yourself, models hosted on other platforms, or managed model services available in the Model Garden, this SDK provides a unified way to assess their performance.

We will explore three main scenarios:

1.  **Evaluating Third-Party Models via APIs:** This method is ideal for accessing closed-source models from various third-party (3P) providers such as OpenAI, Anthropic, Cohere, etc. These models are typically accessed via an API key. The Vertex AI GenAI Evaluation SDK integrates with the `litellm` library, which acts as a universal translator, allowing you to call over 100 different LLM APIs using a consistent format. You simply need to provide the appropriate API key for the service you wish to use (e.g., `OPENAI_API_KEY` for OpenAI models like `gpt-5`), and the Vertex AI GenAI Evaluation SDK handles the provider-specific API calls. For a full list of supported providers and model string formats, refer to the [LiteLLM Supported Providers](https://docs.litellm.ai/docs/providers) page.
   

2.   **Model as a Service (MaaS) from Model Garden:** Evaluating partner models, such as `llama-4` from Meta, which are offered as managed services within Vertex AI Model Garden. These models often utilize an OpenAI-compatible API format for inference.


3.  **Bring Your Own Model (BYOM) Endpoint:**  This method supports evaluating models that you manage and serve independently. This could be on your own hardware, a different cloud service, or a local machine. To integrate with the Vertex AI Evaluation SDK, you implement a Python function that knows how to communicate with your model's specific serving endpoint to generate responses for given prompts.



Throughout this tutorial, we will use the `vertexai` SDK's new `Client` interface, which provides a streamlined way to interact with Vertex AI services, including the GenAI Evaluation Service.


### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started


In [None]:
# @title ### Install Vertex AI SDK for Gen AI Evaluation Service

# Installs the Vertex AI SDK with its `evaluation` extra (version 1.115.0 or
# newer) together with `litellm`, which the third-party model cells rely on.
# NOTE(review): because of `--force-reinstall`, the kernel may need a restart
# before the freshly installed versions are importable — confirm in your runtime.
%pip install --upgrade "google-cloud-aiplatform[evaluation]>=1.115.0" litellm --force-reinstall --quiet --no-warn-conflicts

In [None]:
# @title ### Authenticate your notebook environment (Colab only)
# @markdown If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

import sys

# The Colab runtime pre-loads the `google.colab` package, so its presence in
# sys.modules tells us whether interactive Colab authentication is available.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    from google.colab import auth

    auth.authenticate_user()

In [None]:
# @title ### Set Google Cloud project information
# @markdown To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).
# @markdown  Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

import os

from vertexai import Client, types

PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}  # fmt: skip
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. Default to "" rather than wrapping the
    # lookup in str(): str(None) would silently yield the literal project "None".
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
if not PROJECT_ID:
    raise ValueError(
        "No Google Cloud project ID found. Set PROJECT_ID in this cell or "
        "export the GOOGLE_CLOUD_PROJECT environment variable."
    )

LOCATION = "us-central1"  # @param {type: "string", placeholder: "us-central1", isTemplate: true}  # fmt: skip
# An exported GOOGLE_CLOUD_REGION takes precedence over the form value above.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", LOCATION)

# Client used by the inference and evaluation cells in the rest of the notebook.
client = Client(project=PROJECT_ID, location=LOCATION)

## Evaluating Third-Party Models via API

The Vertex Gen AI Evaluation SDK allows you to evaluate a wide range of closed-source models from various third-party (3P) providers such as OpenAI, Anthropic, Cohere, and others. To connect to these models, you typically need to provide an API key for the respective service.

The SDK simplifies this process by providing a unified way to call these different LLM APIs. You can generally use the same code structure, just changing the model identifier string and ensuring the correct API key is available in your environment.

**Key Features:**

*   **Broad Provider Support:** Evaluate models from many popular LLM providers.
*   **Simplified Workflow:** Use a consistent method within the SDK to run inference, regardless of the 3P provider.
*   **API Key Authentication:** Securely authenticate with each provider using their standard API keys.

**Note:** You need to set the appropriate API keys as environment variables for the provider you intend to use (e.g., `OPENAI_API_KEY` for OpenAI models, `ANTHROPIC_API_KEY` for Anthropic models, etc.). Consult the specific provider's documentation for details on obtaining and setting API keys. For a list of compatible model names, you can refer to the [LiteLLM Supported Providers](https://docs.litellm.ai/docs/providers) page.


In [None]:
# 1. Example using OpenAI
LITELLM_MODEL_ID = "gpt-5-mini"

# WARNING: Setting API keys directly in code is insecure. Prefer the Colab
# Secrets manager or an environment variable exported outside the notebook.
OPENAI_API_KEY = ""  # @param {type:"string", placeholder: "[your-openai-api-key]"}  # fmt: skip

# Resolve the key in priority order:
#   1. the form field above,
#   2. the OPENAI_API_KEY secret in the Colab Secrets manager (Colab only),
#   3. an OPENAI_API_KEY that is already exported in the environment.
if not OPENAI_API_KEY:
    try:
        from google.colab import userdata
    except ImportError:
        # Not running on Colab; rely on the environment variable instead.
        pass
    else:
        try:
            OPENAI_API_KEY = userdata.get("OPENAI_API_KEY") or ""
        except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
            # Secret missing or not shared with this notebook; fall through.
            OPENAI_API_KEY = ""

if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
elif not os.environ.get("OPENAI_API_KEY"):
    raise ValueError(
        "No OpenAI API key found. Fill in the form field, add an "
        "OPENAI_API_KEY secret in Colab, or export OPENAI_API_KEY."
    )


# 2. Example for Anthropic:
# LITELLM_MODEL_ID = "anthropic/claude-3-5-sonnet-20240620" # Example using Anthropic
# os.environ["ANTHROPIC_API_KEY"] = "" # @param {type:"string", placeholder: "[your-anthropic-api-key]"}  # fmt: skip


# Run inference: generate a response from the selected model for every prompt
# in the sample dataset, then display the resulting dataset.
openai_responses = client.evals.run_inference(
    model=LITELLM_MODEL_ID,
    src="gs://vertex-evaluation-llm-dataset-us-central1/genai_eval_sdk/test_prompts.jsonl",
)
openai_responses.show()

In [None]:
# Metrics applied to the OpenAI responses: two rubric metrics plus the
# `rouge_1` and `bleu` metrics.
openai_metrics = [
    types.RubricMetric.GENERAL_QUALITY,
    types.RubricMetric.INSTRUCTION_FOLLOWING,
    types.Metric(name="rouge_1"),
    types.Metric(name="bleu"),
]

eval_result = client.evals.evaluate(dataset=openai_responses, metrics=openai_metrics)
eval_result.show()

## Model as a Service (MaaS) from Model Garden

Vertex AI Model Garden offers partner models, such as Llama from Meta, as fully managed services. These Model as a Service (MaaS) offerings allow you to use these models without needing to deploy or manage the underlying infrastructure. These MaaS endpoints often expose an OpenAI-compatible API for inference.

The Vertex Gen AI Evaluation SDK can directly interact with these MaaS endpoints. You will typically need to use a service account with appropriate permissions to authenticate.

**Key Features:**

*   **Managed Endpoints:** No need to deploy or manage model serving infrastructure.
*   **OpenAI-Compatible API:** Many MaaS models use a familiar API structure.
*   **Integrated Evaluation:** Seamlessly evaluate these models within your Google Cloud environment using the Vertex AI GenAI Evaluation SDK.

**Note:** Ensure your Google Cloud project has enabled the specific MaaS model API from the Model Garden and that your service account has the `Vertex AI User` role.


**Steps to set the environment variable `GOOGLE_APPLICATION_CREDENTIALS`:**

1. [Create a service account key](https://cloud.google.com/iam/docs/keys-create-delete#creating) inside IAM & Admin from your Google Cloud Console.

2. Upload the downloaded key in JSON format to this Colab notebook's runtime by clicking the folder icon on the left sidebar, then the "Upload" icon (a sheet of paper with an arrow pointing up). Select the JSON key file from your local computer.

3. Once uploaded, right click on the file and select "Copy Path". You can refer to the file by its path in the notebook below.

In [None]:
GOOGLE_APP_CRED_PATH = ""  # @param {type:"string", placeholder: "[your-google-application-cred-file-path]"}  # fmt: skip

# Only export the key-file path when one was actually provided. Assigning an
# empty string would replace any GOOGLE_APPLICATION_CREDENTIALS value that is
# already configured in the environment with a path that points nowhere.
if GOOGLE_APP_CRED_PATH:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APP_CRED_PATH

# Export the project and location chosen earlier in the notebook under the
# VERTEXAI_* variable names as well.
os.environ["VERTEXAI_PROJECT"] = PROJECT_ID
os.environ["VERTEXAI_LOCATION"] = LOCATION

In [None]:
# @title Run inference on MaaS models to create eval dataset

# Pick one MaaS model below; availability differs by region, so check it first.
# Other model IDs you can try:
#   "deepseek-ai/deepseek-r1-0528-maas"
#   "meta/llama-3.1-70b-instruct-maas"
#   "meta/llama-4-maverick-17b-128e-instruct-maas"  (us-east5)
#   "claude-3-5-haiku"                              (us-east5)
#   "qwen/qwen3-coder-480b-a35b-instruct-maas"      (us-south1)

MODEL_ID = "deepseek-ai/deepseek-r1-0528-maas"  # @param {type:"string"}

# Sample prompt file read by run_inference.
maas_prompts_uri = "gs://vertex-evaluation-llm-dataset-us-central1/genai_eval_sdk/test_prompts.jsonl"

eval_dataset = client.evals.run_inference(model=MODEL_ID, src=maas_prompts_uri)
eval_dataset.show()

In [None]:
# Metrics applied to the MaaS responses: two rubric metrics plus the
# `rouge_1` and `bleu` metrics.
maas_metrics = [
    types.RubricMetric.GENERAL_QUALITY,
    types.RubricMetric.INSTRUCTION_FOLLOWING,
    types.Metric(name="rouge_1"),
    types.Metric(name="bleu"),
]

maas_eval_result = client.evals.evaluate(dataset=eval_dataset, metrics=maas_metrics)
maas_eval_result.show()

In [None]:
# @title End-to-End Example: Evaluating 3P partner models for MaaS

# Ensure GOOGLE_APPLICATION_CREDENTIALS, VERTEXAI_PROJECT, and VERTEXAI_LOCATION are set.
# The credentials path is exported only when it is non-empty, so a blank form
# value does not overwrite credentials already configured in the environment.
if GOOGLE_APP_CRED_PATH:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APP_CRED_PATH
os.environ["VERTEXAI_PROJECT"] = PROJECT_ID
os.environ["VERTEXAI_LOCATION"] = LOCATION

# Select a MaaS model. Remember to check regional availability!
# model = "deepseek-ai/deepseek-r1-0528-maas" # Example model
# model = "meta/llama-3.1-70b-instruct-maas"  # Example model
# model = "meta/llama-4-maverick-17b-128e-instruct-maas"   # Example model in us-east5
# model = "claude-3-5-haiku"  # Example model in us-east5
# model = "qwen/qwen3-coder-480b-a35b-instruct-maas"  # Example model in us-south1

MAAS_MODEL_ID = "meta/llama-3.1-70b-instruct-maas"  # Replace with the MaaS model you want to evaluate


# Step 1: generate a response from the selected model for every prompt.
print(f"--- Running Inference for MaaS Model: {MAAS_MODEL_ID} ---")

maas_responses = client.evals.run_inference(
    model=MAAS_MODEL_ID,
    src="gs://vertex-evaluation-llm-dataset-us-central1/genai_eval_sdk/test_prompts.jsonl",
)

# Step 2: score those responses with two rubric metrics plus rouge_1 and bleu.
print(f"\n--- Running Evaluation for MaaS Model: {MAAS_MODEL_ID} ---")
maas_eval_result = client.evals.evaluate(
    dataset=maas_responses,
    metrics=[
        types.RubricMetric.GENERAL_QUALITY,
        types.RubricMetric.INSTRUCTION_FOLLOWING,
        types.Metric(name="rouge_1"),
        types.Metric(name="bleu"),
    ],
)
print("Evaluation complete. Displaying report:")
maas_eval_result.show()

## Bring Your Own Model (BYOM) Endpoint

The Vertex Gen AI Evaluation SDK allows you to provide a generic Python function as input to specify how the model or application should be invoked for batch inference. This could be done through an endpoint or an SDK. This flexible approach accommodates a wide range of open and closed models, enabling you to evaluate models that you manage and serve independently.

In [None]:
# @title #### Define your custom inference function
# @markdown Here we use a one-click deployed `llama-4-maverick` model endpoint as an example.
# @markdown When creating your own endpoint inference function, make sure to verify that
# @markdown the dedicated endpoint is specified correctly in the inference function.


import json
import subprocess

import requests

PROJECT_ID_BYOM = ""  # @param {type:"string", placeholder: "[your-project-id]"}
# Numeric project number; it is part of the dedicated endpoint's hostname.
PROJECT_NUMBER_BYOM = ""  # @param {type:"string", placeholder: "[your-project-number]"}
ENDPOINT_ID_BYOM = ""  # @param {type:"string", placeholder: "[your-endpoint-id]"}
LOCATION_BYOM = "us-central1"  # @param {type:"string", placeholder: "[your-location]"}

# Seconds to wait for the endpoint before abandoning a single request, so a
# stalled endpoint cannot hang batch inference indefinitely.
REQUEST_TIMEOUT_SECONDS_BYOM = 120


def custom_model_inference_fn(prompt: str) -> str | None:
    """Sends one prompt to the configured Vertex AI dedicated endpoint.

    The endpoint is addressed with the module-level ``PROJECT_ID_BYOM``,
    ``PROJECT_NUMBER_BYOM``, ``ENDPOINT_ID_BYOM`` and ``LOCATION_BYOM``
    settings, and the request is authorized with the access token printed by
    ``gcloud auth print-access-token``.

    Args:
        prompt (str): The input prompt for the model.

    Returns:
        str | None: The content of the first choice in the model's response,
        or None if the configuration is incomplete, the access token cannot be
        obtained, the HTTP request fails, or the response does not have the
        expected structure. Failures are reported with ``print`` rather than
        raised.
    """
    project_id: str = PROJECT_ID_BYOM
    project_number: str = PROJECT_NUMBER_BYOM
    endpoint_id: str = ENDPOINT_ID_BYOM
    location: str = LOCATION_BYOM

    if not (project_id and project_number and endpoint_id and location):
        print(
            "Error: PROJECT_ID_BYOM, PROJECT_NUMBER_BYOM, ENDPOINT_ID_BYOM and "
            "LOCATION_BYOM must all be set before calling the endpoint."
        )
        return None

    # Dedicated endpoint URL. The hostname embeds the project *number*, which
    # is why it is configurable instead of being hard-coded to one project.
    endpoint_url: str = (
        f"https://{endpoint_id}.{location}-{project_number}.prediction.vertexai.goog"
        f"/v1/projects/{project_id}/locations/{location}/endpoints/{endpoint_id}:predict"
    )

    try:
        token = subprocess.run(
            ["gcloud", "auth", "print-access-token"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError covers the case where the gcloud CLI is not installed.
        print(f"Error getting gcloud access token: {e}")
        return None
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    # Construct the JSON payload in the chatCompletions format
    payload = {
        "instances": [
            {
                "@requestFormat": "chatCompletions",
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 100,
            }
        ]
    }

    try:
        # Make the POST request; `json=` serializes the payload for us.
        response = requests.post(
            endpoint_url,
            headers=headers,
            json=payload,
            timeout=REQUEST_TIMEOUT_SECONDS_BYOM,
        )
        response.raise_for_status()
        json_response = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error calling Vertex AI endpoint: {e}")
        return None

    # Extract the content from the response
    try:
        return json_response["predictions"]["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as e:
        print(
            f"Could not extract content from response: {e}. Response: {json_response}"
        )
        return None

In [None]:
# Smoke-test the custom inference function with a single prompt before using
# it for batch inference. The cell displays the returned text; the function
# returns None (after printing the reason) when the call fails.
custom_model_inference_fn("hello")

In [None]:
print("--- Running Inference for Bring Your Own Model (BYOM) Endpoint ---")

# Passing a callable as `model` makes run_inference use the custom function
# defined above to produce the responses for the prompts read from `src`.
byom_prompts_uri = "gs://vertex-evaluation-llm-dataset-us-central1/genai_eval_sdk/test_prompts.jsonl"

vertex_endpoint_responses = client.evals.run_inference(
    model=custom_model_inference_fn, src=byom_prompts_uri
)
vertex_endpoint_responses.show()

In [None]:
# Metrics applied to the BYOM endpoint responses: two rubric metrics plus the
# `rouge_1` and `bleu` metrics.
byom_metrics = [
    types.RubricMetric.GENERAL_QUALITY,
    types.RubricMetric.INSTRUCTION_FOLLOWING,
    types.Metric(name="rouge_1"),
    types.Metric(name="bleu"),
]

vertex_endpoint_eval_result = client.evals.evaluate(
    dataset=vertex_endpoint_responses, metrics=byom_metrics
)
vertex_endpoint_eval_result.show()

## Example: Comparing Multiple Models


The Vertex Gen AI Evaluation SDK makes it easy to compare the performance of multiple models on the same dataset and metrics. You can achieve this by running inference separately for each model and then passing the resulting `EvaluationDataset` objects as a list to the `client.evals.evaluate()` method.

The evaluation service will then compute the specified metrics for each model's responses, allowing for a side-by-side comparison.

**Note:** The following example uses a small set of 10 prompts for demonstration purposes. This is not intended for rigorous benchmarking but to illustrate the comparison functionality.


In [None]:
import os

# Refer to instructions in MaaS section above for setting up credentials.
GOOGLE_APP_CRED_PATH = ""  # @param {type:"string", placeholder: "[your-google-application-cred-file-path]"}  # fmt: skip

# Only export the key-file path when one was actually provided. Leaving the
# form blank would otherwise overwrite the GOOGLE_APPLICATION_CREDENTIALS value
# configured earlier with an empty string.
if GOOGLE_APP_CRED_PATH:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APP_CRED_PATH

# Export the project and location chosen earlier in the notebook under the
# VERTEXAI_* variable names as well.
os.environ["VERTEXAI_PROJECT"] = PROJECT_ID
os.environ["VERTEXAI_LOCATION"] = LOCATION

In [None]:
import pandas as pd

# Ten hand-written prompts that every model in the comparison answers.
comparison_prompts = [
    "Explain the difference between correlation and causation, and provide a real-world example where confusing the two could lead to poor decision-making.",
    "Write a Python function that finds the longest palindromic substring in a given string. Include comments explaining your approach and time complexity.",
    "A train leaves Station A at 9:00 AM traveling at 60 mph toward Station B. Another train leaves Station B at 10:00 AM traveling at 80 mph toward Station A. If the stations are 280 miles apart, at what time do the trains meet?",
    "Analyze the ethical implications of using AI in hiring decisions. Present arguments from multiple perspectives and discuss potential safeguards.",
    "Translate the following sentence to French, Spanish, and German, then explain any cultural nuances that might affect the translation: 'The early bird catches the worm, but the second mouse gets the cheese.'",
    "Create a short story (200 words) that includes these elements: a mysterious package, a lighthouse keeper, and a revelation that changes everything. The story should have a clear beginning, middle, and end.",
    "Compare and contrast the economic theories of Adam Smith and Karl Marx. How would each theorist likely view modern gig economy platforms like Uber?",
    "Debug this code and explain what's wrong: def fibonacci(n): if n <= 1: return n else: return fibonacci(n-1) + fibonacci(n-2) + fibonacci(n-3)",
    "You're a manager and an employee consistently delivers excellent work but is frequently late to meetings. Write a constructive feedback message addressing this issue while maintaining morale.",
    "Explain how transformer architecture works in machine learning to someone with basic programming knowledge but no ML background. Use an analogy to clarify the concept of attention mechanisms.",
]

# Single-column frame; the column is named "prompt".
prompts_df = pd.DataFrame({"prompt": comparison_prompts})

# Generate rubrics for the prompts under the "general_quality_rubrics" group,
# using the predefined GENERAL_QUALITY spec.
data_with_rubrics = client.evals.generate_rubrics(
    src=prompts_df,
    rubric_group_name="general_quality_rubrics",
    predefined_spec_name=types.RubricMetric.GENERAL_QUALITY,
)

In [None]:
# Run inference once per candidate model on the same rubric-annotated data.

# Candidate 1: Gemini 2.5 Flash
gemini_dataset = client.evals.run_inference(
    model="gemini-2.5-flash", src=data_with_rubrics
)

# Candidate 2: DeepSeek (MaaS)
deepseek_dataset = client.evals.run_inference(
    model="deepseek-ai/deepseek-r1-0528-maas", src=data_with_rubrics
)

# Candidate 3: Llama 3.1 (MaaS)
llama_dataset = client.evals.run_inference(
    model="meta/llama-3.1-70b-instruct-maas", src=data_with_rubrics
)

# Evaluate all three datasets in one call, using the GENERAL_QUALITY metric
# bound to the "general_quality_rubrics" rubric group.
candidate_datasets = [gemini_dataset, deepseek_dataset, llama_dataset]
general_quality_metric = types.RubricMetric.GENERAL_QUALITY(
    rubric_group_name="general_quality_rubrics"
)

comparison_eval_result = client.evals.evaluate(
    dataset=candidate_datasets, metrics=[general_quality_metric]
)
comparison_eval_result.show()

In [None]:
# Run inference once per candidate model on the raw prompts.

# Candidate 1: Gemini 2.5 Pro
gemini_dataset = client.evals.run_inference(model="gemini-2.5-pro", src=prompts_df)

# Candidate 2: OpenAI GPT model
openai_dataset = client.evals.run_inference(model="gpt-5-mini", src=prompts_df)

# Candidate 3: DeepSeek (MaaS)
deepseek_dataset = client.evals.run_inference(
    model="deepseek-ai/deepseek-r1-0528-maas", src=prompts_df
)

# Evaluate all three datasets in one call. No `metrics` argument is passed,
# so the SDK's default applies.
candidate_datasets = [gemini_dataset, openai_dataset, deepseek_dataset]

comparison_eval_result = client.evals.evaluate(dataset=candidate_datasets)
comparison_eval_result.show()