In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - E5 Text Embedding Models

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_e5.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_e5.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates deploying E5 text embedding models in Vertex AI.

### Objective

- Deploy prebuilt E5 models with Hugging Face [Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) (TEI) docker image on a Vertex AI Endpoint
    - [intfloat/multilingual-e5-large-instruct](https://huggingface.co/intfloat/multilingual-e5-large-instruct): 560M params, instruction-tuned
    - [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large): 560M params
    - [intfloat/e5-large-v2](https://huggingface.co/intfloat/e5-large-v2): 335M params
    - [intfloat/multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small): 118M params
    - [intfloat/e5-base-v2](https://huggingface.co/intfloat/e5-base-v2): 109M params
    - [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2): 33M params
- Run inference on the deployed Vertex AI Endpoint


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [Cloud NL API pricing](https://cloud.google.com/natural-language/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Run the notebook

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (e.g. "us") is not considered a match for a single region covered by the multi-region range (e.g. "us-central1"). If not set, a unique GCS bucket will be created instead.

import os
import uuid
from datetime import datetime
from typing import Tuple

from google.cloud import aiplatform

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Cloud Storage bucket for storing the experiment artifacts.
# A unique GCS bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value yourself below.
now = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET_URI = "gs://"  # @param {type: "string"}
assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."

# @markdown Click "Show code" to see more details.

# Create a unique GCS bucket for this notebook, if not specified by the user.
assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
if BUCKET_URI is None or BUCKET_URI.strip() == "" or BUCKET_URI == "gs://":
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    ! gsutil mb -l {REGION} {BUCKET_URI}
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
else:
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    shell_output = ! gsutil ls -Lb {BUCKET_NAME} | grep "Location constraint:" | sed "s/Location constraint://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            "Bucket region %s is different from notebook region %s"
            % (bucket_region, REGION)
        )

print(f"Using this GCS Bucket: {BUCKET_URI}")


! gcloud config set project $PROJECT_ID
! gcloud services enable language.googleapis.com

STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")

# Gets the default BUCKET_URI and SERVICE_ACCOUNT if they were not specified by the user.

SERVICE_ACCOUNT = None
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

print("Using this default Service Account:", SERVICE_ACCOUNT)

# Provision permissions to the SERVICE_ACCOUNT with the GCS bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_NAME

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)


def create_name_with_datetime(prefix: str) -> str:
    """Returns `prefix` followed by a `_YYYYMMDD_HHMMSS` timestamp.

    The timestamp is taken from the current local time, so names generated at
    different seconds differ. Used to name training or deployment jobs in
    Vertex AI.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return prefix + timestamp_suffix


def deploy_model_tei(
    model_name: str,
    model_id: str,
    service_account: str,
    docker_uri: str,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    max_model_len: int = 512,
    gpu_memory_utilization: float = 0.9,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Uploads a TEI-served model and deploys it to a newly created endpoint.

    Args:
        model_name: Display name of the uploaded model. The endpoint's display
            name is this value with an "-endpoint" suffix.
        model_id: Model ID or path to model weights. Handed to the serving
            container both as the `--model-id` argument and as the `MODEL_ID`
            environment variable.
        service_account: Service account passed to the deployment.
        docker_uri: URI of the serving container image.
        machine_type: Deployment machine type.
        accelerator_type: Deployment accelerator type.
        accelerator_count: Number of accelerators to use.
        max_model_len: Maximum model length. Currently not read anywhere in
            the function body.
        gpu_memory_utilization: Fraction of GPU memory to be used for the
            model executor. Currently not read anywhere in the function body.

    Returns:
        The uploaded model and the endpoint it was deployed to, in that order.
    """
    # Container configuration is plain data, so build it before touching any
    # cloud resources.
    container_args = [f"--model-id={model_id}"]
    container_env = {"MODEL_ID": model_id, "DEPLOY_SOURCE": "notebook"}

    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=docker_uri,
        serving_container_args=container_args,
        serving_container_ports=[7080],
        serving_container_environment_variables=container_env,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )

    return model, endpoint

In [None]:
# @title Deploy
# @markdown This section uploads a prebuilt model to Model Registry and deploys it on the Endpoint. The model deployment step will take ~15 minutes to complete.

prebuilt_model_id = "intfloat/e5-small-v2"  # @param ["intfloat/multilingual-e5-large-instruct", "intfloat/multilingual-e5-large", "intfloat/e5-large-v2", "intfloat/multilingual-e5-small", "intfloat/e5-base-v2", "intfloat/e5-small-v2"]

# @markdown Specify a processor for the TEI docker image. E5 models can be run on either GPU or CPU.
processor = "NVIDIA_L4"  # @param["NVIDIA_TESLA_V100", "NVIDIA_L4", "NVIDIA_TESLA_A100", "CPU"]

# Deployment settings per processor choice, as
# (accelerator_type, machine_type, accelerator_count).
_PROCESSOR_CONFIGS = {
    "NVIDIA_TESLA_V100": ("NVIDIA_TESLA_V100", "n1-highmem-16", 2),
    "NVIDIA_L4": ("NVIDIA_L4", "g2-standard-8", 1),
    "NVIDIA_TESLA_A100": ("NVIDIA_TESLA_A100", "a2-highgpu-1g", 1),
    "CPU": (None, None, None),
}
if processor not in _PROCESSOR_CONFIGS:
    raise ValueError(f"Unsupported processor: {processor}")
accelerator_type, machine_type, accelerator_count = _PROCESSOR_CONFIGS[processor]


# The pre-built serving docker images with TEI: one CPU build, one CUDA build.
TEI_DOCKER_URI = (
    "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cpu.1-2"
    if processor == "CPU"
    else "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-embeddings-inference-cu122.1-2.ubuntu2204"
)

# @markdown Click "Show code" to see more details.

# Accelerators and regions supported by Vertex AI prediction are listed at
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.

deployment_name = create_name_with_datetime(prefix="e5-serve-tei")
model, endpoint = deploy_model_tei(
    model_name=deployment_name,
    model_id=prebuilt_model_id,
    service_account=SERVICE_ACCOUNT,
    docker_uri=TEI_DOCKER_URI,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)

print("endpoint_name:", endpoint.name)
print("model_name:", model.display_name)
print("model_id:", model.resource_name)

### Predict

Once deployment succeeds, you can send requests to the endpoint with text prompts.

In [None]:
# @title Run sample prompt
# @markdown Below is an example to encode queries and passages from the [MS-MARCO passage ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) dataset.

# @markdown Example:

# @markdown ```
# @markdown query: how much protein should a female eat
# @markdown query: summit define
# @markdown passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.
# @markdown passage: Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.
# @markdown ```

# @markdown NOTE: Inputs are not limited to 2 queries and 2 passages. To add more inputs, you may modify the code directly.

query1 = "how much protein should a female eat?"  # @param {type: "string"}
query2 = "summit define"  # @param {type: "string"}
passage1 = "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day."  # @param {type: "string"}
passage2 = "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."  # @param {type: "string"}

# @markdown Click "Show code" to see more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

from torch import Tensor

# Each input text should start with "query: " or "passage: ".
# For tasks other than retrieval, you can simply use the "query: " prefix.
# To score more inputs, extend these two lists; the slicing below follows
# their lengths instead of assuming exactly two queries.
queries = [query1, query2]
passages = [passage1, passage2]
instances = [
    {
        "inputs": [f"query: {query}" for query in queries]
        + [f"passage: {passage}" for passage in passages],
    },
]
response = endpoint.predict(instances=instances)

# Assumes the endpoint returns one embedding per input, in input order
# (queries first, then passages).
num_queries = len(queries)
for prediction in response.predictions:
    embeddings = Tensor(prediction)
    # Rows are queries, columns are passages; the dot products are multiplied
    # by 100 before printing.
    scores = (embeddings[:num_queries] @ embeddings[num_queries:].T) * 100
    print(scores.tolist())

In [None]:
# @title Run sample prompt for instruction-tuned models
# @markdown For instruction-tuned models (e.g. intfloat/multilingual-e5-large-instruct), the task definition should be a one-sentence instruction that describes the task. This is a way to customize text embeddings for different scenarios through natural language instructions.

# @markdown Below is an example to encode queries and passages from the [MS-MARCO passage ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) dataset.

# @markdown Example:

# @markdown ```
# @markdown Instruct: Given a web search query, retrieve relevant passages that answer the query
# @markdown Query: how much protein should a female eat
# @markdown Instruct: Given a web search query, retrieve relevant passages that answer the query
# @markdown Query: 南瓜的家常做法
# @markdown As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.
# @markdown 1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅
# @markdown ```

# @markdown NOTE: Inputs are not limited to 1 instruction, 2 queries, and 2 documents. To add more inputs, you may modify the code directly.

instruction = "Given a web search query, retrieve relevant passages that answer the query"  # @param {type: "string"}
query1 = "how much protein should a female eat"  # @param {type: "string"}
query2 = "南瓜的家常做法"  # @param {type: "string"}
document1 = "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day."  # @param {type: "string"}
document2 = "1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅"  # @param {type: "string"}

# @markdown Click "Show code" to see more details.

# Loads an existing endpoint instance using the endpoint name:
# - Using `endpoint_name = endpoint.name` allows us to get the
#   endpoint name of the endpoint `endpoint` created in the cell
#   above.
# - Alternatively, you can set `endpoint_name = "1234567890123456789"` to load
#   an existing endpoint with the ID 1234567890123456789.
# You may uncomment the code below to load an existing endpoint.

# endpoint_name = ""  # @param {type:"string"}
# aip_endpoint_name = (
#     f"projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint_name}"
# )
# endpoint = aiplatform.Endpoint(aip_endpoint_name)

from torch import Tensor


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Builds the two-line prompt used for instruction-tuned queries.

    Args:
        task_description: One-sentence instruction describing the task.
        query: The query text.

    Returns:
        An "Instruct: ..." line and a "Query: ..." line joined by a newline.
    """
    instruct_line = f"Instruct: {task_description}"
    query_line = f"Query: {query}"
    return f"{instruct_line}\n{query_line}"


# Each query must come with a one-sentence instruction that describes the task
queries = [
    get_detailed_instruct(instruction, query1),
    get_detailed_instruct(instruction, query2),
]
# No need to add instruction for retrieval documents
documents = [
    document1,
    document2,
]

instances = [{"inputs": queries + documents}]

response = endpoint.predict(instances=instances)

# Assumes the endpoint returns one embedding per input, in input order
# (queries first, then documents). The split point follows the length of
# `queries`, so extending either list above keeps the scores correct.
num_queries = len(queries)
for prediction in response.predictions:
    embeddings = Tensor(prediction)
    # Rows are queries, columns are documents; the dot products are multiplied
    # by 100 before printing.
    scores = (embeddings[:num_queries] @ embeddings[num_queries:].T) * 100
    print(scores.tolist())

### End

In [None]:
# @title Clean up resources

# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may be incurred.

# Delete the endpoint first, then the uploaded model.
# NOTE(review): `force=True` presumably undeploys any models still on the
# endpoint before deleting it — confirm against the Vertex AI SDK docs.
endpoint.delete(force=True)
model.delete()

# Optional: tick the box to also remove the Cloud Storage location used by this
# notebook. `rm -r` recursively deletes everything under BUCKET_URI, which is
# either the bucket created during setup or the one you supplied yourself.
delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI