In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - E5 Text Embedding Models

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_e5.ipynb">
      <img alt="Workbench logo" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" width="32px"><br> Run in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_e5.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_e5.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates how to deploy an **E5** open text embedding model on Google Cloud Vertex AI.

### Objectives

- Deploy E5 on GPU using a containerized serving backend such as Hugging Face [Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference).
- Use the deployed model to generate text embeddings and score the similarity between queries and passages.

### File a Bug

If you encounter issues with this notebook, report them on [GitHub](https://github.com/GoogleCloudPlatform/vertex-ai-samples/issues/new).

### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI
- Cloud Storage

Refer to the [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing) pages for more information. Use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to estimate your projected costs.

## Get Started

### Install Vertex AI SDK and other required packages

In [None]:
%pip install --upgrade --force-reinstall --quiet 'google-cloud-aiplatform>=1.106.0' 'openai' 'google-auth==2.27.0' 'requests==2.32.3'

### Authenticate the Notebook Environment (Colab only)

If you're running this notebook in Google Colab, run the following cell to authenticate.

In [None]:
import sys

# Treat the presence of `google.colab` in `sys.modules` as the signal that the
# notebook is running in Colab; in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    # Imported inside the guard so the cell does not fail where the
    # `google.colab` package is unavailable.
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud Project Information

To get started with Vertex AI, ensure you have an existing Google Cloud project and that the [Vertex AI API is enabled](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

See the guide on [setting up your project and development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment). Also confirm that [billing is enabled](https://cloud.google.com/billing/docs/how-to/modify-project).


In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}

# Fall back to the environment when the form field is empty or still holds the
# placeholder. Default to "" instead of wrapping the lookup in str(): str(None)
# would turn a missing variable into the literal project ID "None".
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

# Stop here with an actionable message rather than initializing the SDK with
# an empty or bogus project ID.
if not PROJECT_ID:
    raise ValueError(
        "No project ID found. Set PROJECT_ID in this cell or define the "
        "GOOGLE_CLOUD_PROJECT environment variable."
    )

# Region comes from the environment, defaulting to us-central1.
REGION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=REGION)

### Import libraries

In [None]:
from vertexai import model_garden

## Deploy model

### Choose model variant

You can proceed with the default model variant or select a different one.

In [None]:
# Variant selected from the form dropdown. It is appended after "@" to the
# fixed "intfloat/e5" prefix to build the name later passed to
# `model_garden.OpenModel`.
model_version = "multilingual-e5-large-instruct"  # @param ["e5-base-v2", "e5-large-v2", "e5-small-v2", "multilingual-e5-large", "multilingual-e5-large-instruct", "multilingual-e5-small"] {isTemplate:true}
MODEL_NAME = f"intfloat/e5@{model_version}"

To see all deployable model variants available in Model Garden, use:

In [None]:
# Ask Model Garden for the deployable models whose name matches "e5".
# `list_hf_models=False` is passed through as in the SDK README example;
# presumably it leaves Hugging Face-hosted models out of the listing.
all_model_versions = model_garden.list_deployable_models(
    model_filter="e5", list_hf_models=False
)
# End the cell with the bare name so the notebook actually displays the result
# instead of silently storing it.
all_model_versions

Once you've selected a model variant, initialize it:

In [None]:
# Handle for the selected variant; the cells below call `list_deploy_options()`
# and `deploy()` on this object.
model = model_garden.OpenModel(MODEL_NAME)

### Check the Deployment Configuration

Use the `list_deploy_options()` method to view the verified deployment configurations for your selected model. This helps ensure you have sufficient resources (e.g., GPU quota) available to deploy it.

In [None]:
# Fetch the verified deployment configurations in their concise form and print
# them so the machine/accelerator requirements can be checked before deploying.
deploy_options = model.list_deploy_options(concise=True)
print(deploy_options)

### Deploy the Model

Now that you’ve reviewed the deployment options, use the `deploy()` method to serve the selected open model to a Vertex AI endpoint. Deployment time may vary depending on the model size and infrastructure requirements.

> **Note**: If the model requires accepting a license agreement (EULA), set the `accept_eula=True` flag in the deploy call. Set `use_dedicated_endpoint` to `False` if you don't want to use a [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).

In [None]:
# Shared switch: the same value is passed to both `model.deploy(...)` calls and
# to every `endpoint.predict(...)` call later in the notebook.
use_dedicated_endpoint = True

In [None]:
# Deploy without specifying a container, machine type, or accelerator.
# The returned object is bound to `endpoint`, which the prediction and
# clean-up cells use.
endpoint = model.deploy(
    accept_eula=True,
    use_dedicated_endpoint=use_dedicated_endpoint,
)

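Alternatively, you can select one of the verified deployment configurations listed above. Run either the previous cell or the following one, not both: each call creates its own endpoint, and the clean-up cell only deletes the endpoint that was created last.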

In [None]:
# Deploy with an explicit configuration: a Hugging Face Text Embeddings
# Inference container image on a g2-standard-8 machine with one NVIDIA L4.
# NOTE: this rebinds `endpoint`. If the default `model.deploy(...)` cell was
# also executed, the endpoint it returned is no longer referenced by this name,
# and the clean-up cell only deletes whatever `endpoint` points to at the end.
endpoint = model.deploy(
    accept_eula=True,
    use_dedicated_endpoint=use_dedicated_endpoint,
    serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cu122.1-2.ubuntu2204",
    machine_type="g2-standard-8",
    accelerator_type="NVIDIA_L4",
    accelerator_count=1,
)

To further customize your deployment, you can configure:

- **Compute Resources**: Machine type, replica count (min/max), accelerator type and quantity.
- **Infrastructure**: Use Spot VMs, reservation affinity, or dedicated endpoints.
- **Serving Container**: Customize container image, ports, health checks, and environment variables.

See the [Model Garden SDK README](https://github.com/googleapis/python-aiplatform/blob/main/vertexai/model_garden/README.md) for advanced configuration options.

### Predict

Once deployment succeeds, you can send requests to the endpoint with text prompts.

In [None]:
# @title Run sample prompt
# @markdown Below is an example to encode queries and passages from the [MS-MARCO passage ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) dataset.

# @markdown Example:

# @markdown ```
# @markdown query: how much protein should a female eat
# @markdown query: summit define
# @markdown passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.
# @markdown passage: Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.
# @markdown ```

# @markdown NOTE: Inputs are not limited to 2 queries and 2 passages. To add more inputs, you may modify the code directly.

from torch import Tensor

# Form inputs: two search queries and two candidate passages. Further down in
# this cell they are prefixed with "query: " / "passage: " and sent to the
# endpoint in a single request.
query1 = "how much protein should a female eat?"  # @param {type: "string"}
query2 = "summit define"  # @param {type: "string"}
passage1 = "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day."  # @param {type: "string"}
passage2 = "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."  # @param {type: "string"}

# @markdown Click "Show code" to see more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# from google.cloud import aiplatform
#
# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)


# Each input text should start with "query: " or "passage: ".
# For tasks other than retrieval, you can simply use the "query: " prefix.
# To embed more texts, extend these two lists; the score computation below
# derives the query/passage split from the number of queries, so it stays
# correct when inputs are added.
query_inputs = [
    f"query: {query1}",
    f"query: {query2}",
]
passage_inputs = [
    f"passage: {passage1}",
    f"passage: {passage2}",
]
num_queries = len(query_inputs)

# A single instance carries every text, queries first and passages after.
instances = [
    {
        "inputs": query_inputs + passage_inputs,
    },
]
response = endpoint.predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    # Assumes each prediction holds one embedding row per input text, in the
    # order the inputs were sent — TODO confirm against the serving container.
    embeddings = Tensor(prediction)
    # Rows before `num_queries` are queries, the rest are passages; the result
    # has one row per query and one column per passage, scaled by 100.
    scores = (embeddings[:num_queries] @ embeddings[num_queries:].T) * 100
    print(scores.tolist())

In [None]:
# @title Run sample prompt for instruction-tuned models
# @markdown For instruction-tuned models (e.g. intfloat/multilingual-e5-large-instruct), the task definition should be a one-sentence instruction that describes the task. This is a way to customize text embeddings for different scenarios through natural language instructions.

# @markdown Below is an example to encode queries and passages from the [MS-MARCO passage ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) dataset.

# @markdown Example:

# @markdown ```
# @markdown Instruct: Given a web search query, retrieve relevant passages that answer the query
# @markdown Query: how much protein should a female eat
# @markdown Instruct: Given a web search query, retrieve relevant passages that answer the query
# @markdown Query: 南瓜的家常做法
# @markdown ```

# @markdown Hugging Face API reference: [Text Embeddings Inference API](https://huggingface.github.io/text-embeddings-inference/#/).

# @markdown NOTE: Inputs are not limited to 1 instruction, 2 queries, and 2 documents. To add more inputs, you may modify the code directly.

# Form inputs: one task instruction, two queries (English and Chinese) and two
# documents. Later in this cell the instruction is combined with each query;
# the documents are sent as-is.
instruction = "Given a web search query, retrieve relevant passages that answer the query"  # @param {type: "string"}
query1 = "how much protein should a female eat"  # @param {type: "string"}
query2 = "南瓜的家常做法"  # @param {type: "string"}
document1 = "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day."  # @param {type: "string"}
document2 = "1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅"  # @param {type: "string"}

# @markdown Click "Show code" to see more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# from google.cloud import aiplatform
#
# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt used for instruction-tuned embedding queries.

    Args:
        task_description: One-sentence description of the task.
        query: The query text to embed.

    Returns:
        The text "Instruct: <task_description>" followed by a newline and
        "Query: <query>".
    """
    instruct_line = f"Instruct: {task_description}"
    query_line = f"Query: {query}"
    return "\n".join((instruct_line, query_line))


# `Tensor` is otherwise only imported by the previous sample cell; importing it
# here as well lets this cell run on its own (the default model variant is the
# instruction-tuned one, so this may be the only sample cell that is executed).
# NOTE(review): torch is not part of the %pip install line at the top of the
# notebook; presumably it is preinstalled in the runtime — confirm.
from torch import Tensor

# Each query must come with a one-sentence instruction that describes the task
queries = [
    get_detailed_instruct(instruction, query1),
    get_detailed_instruct(instruction, query2),
]
# No need to add instruction for retrieval documents
documents = [
    document1,
    document2,
]
# The split point used for scoring follows the number of queries, so extending
# `queries` or `documents` keeps the scores correct.
num_queries = len(queries)

# A single instance carries every text, queries first and documents after.
instances = [{"inputs": queries + documents}]

response = endpoint.predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
)

for prediction in response.predictions:
    # Assumes each prediction holds one embedding row per input text, in the
    # order the inputs were sent — TODO confirm against the serving container.
    embeddings = Tensor(prediction)
    # Rows before `num_queries` are queries, the rest are documents; the result
    # has one row per query and one column per document, scaled by 100.
    scores = (embeddings[:num_queries] @ embeddings[num_queries:].T) * 100
    print(scores.tolist())

## Clean up resources

In [None]:
# @title Delete the models and endpoints

# @markdown Delete the endpoint.

# Deletes only the endpoint currently bound to `endpoint` (the one returned by
# the most recently executed deploy cell).
# NOTE(review): `force=True` presumably undeploys any models still on the
# endpoint before deleting it — confirm in the aiplatform `Endpoint.delete`
# docs. Nothing here deletes a model resource, despite the cell title.
if endpoint:
    endpoint.delete(force=True)