In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
!pip install git+https://github.com/googleapis/python-aiplatform.git@copybara_559287614
# Remember to restart your runtime!

In [None]:
# Colab-only setup: authenticate the current user. This runs before the
# Vertex AI SDK is imported (presumably so the SDK can pick up the user's
# credentials -- confirm).
from google.colab import auth as google_auth
google_auth.authenticate_user()
import vertexai
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# Colab form parameters. The trailing `# @param` annotations drive the form UI.
# Note: PROJECT_ID is pre-filled for now; it will be made empty when publishing.
PROJECT_ID = "cloud-llm-preview1" # @param {type:"string"}
TASK_TYPE = "RETRIEVAL_QUERY" # @param ["RETRIEVAL_QUERY", "RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY", "CLASSIFICATION", "CLUSTERING"]
TITLE = "" # @param {type:"string"}
TEXT = "Embedding text." # @param {type:"string"}
MODEL_NAME = "textembedding-gecko@latest" # @param ["textembedding-gecko@latest", "textembedding-gecko-multilingual@latest"]

# Required fields, checked in form order so the first empty one is reported.
for _value, _message in (
    (PROJECT_ID, "Please set your PROJECT_ID."),
    (TASK_TYPE, "Please set TASK_TYPE."),
    (TEXT, "Please set TEXT."),
    (MODEL_NAME, "Please set MODEL_NAME."),
):
  if not _value:
    raise ValueError(_message)

# A title is only accepted together with the RETRIEVAL_DOCUMENT task type.
if TITLE and TASK_TYPE != "RETRIEVAL_DOCUMENT":
  raise ValueError("Title can only be provided if the task_type is RETRIEVAL_DOCUMENT")

# Point the Vertex AI SDK at the chosen project in the us-central1 region.
vertexai.init(project=PROJECT_ID, location="us-central1")

In [None]:
def text_embedding(
    model_name: str,
    task_type: str,
    text: str,
    title: str = "",
) -> list:
    """Embed one piece of text and return its embedding vector.

    Args:
        model_name: Name handed to ``TextEmbeddingModel.from_pretrained``.
        task_type: Task type forwarded to ``TextEmbeddingInput``.
        text: The text to embed.
        title: Title forwarded to ``TextEmbeddingInput``. Defaults to the
            empty string.

    Returns:
        The ``values`` attribute of the first embedding the model returns
        for the single input that is sent.
    """
    # Load the model first, then build the single-item request for it.
    embedder = TextEmbeddingModel.from_pretrained(model_name)
    request = TextEmbeddingInput(task_type=task_type, title=title, text=text)
    return embedder.get_embeddings([request])[0].values

# Embed the configured TEXT and print how many dimensions the vector has.
embedding = text_embedding(
    model_name=MODEL_NAME,
    task_type=TASK_TYPE,
    text=TEXT,
    title=TITLE,
)
print(len(embedding))

768


In [None]:
# End of public demo
# The following checks the code above: for each model it prints the sum of the
# embedding values for three task-type/title combinations, in a fixed order.
vertexai.init(project="cloud-llm-preview1", location="us-central1")
for model_name in (
    "textembedding-gecko@latest",
    "textembedding-gecko-multilingual@latest",
):
    for task_type, title in (
        ("SEMANTIC_SIMILARITY", ""),
        ("RETRIEVAL_DOCUMENT", ""),
        ("RETRIEVAL_DOCUMENT", "Google"),
    ):
        print(sum(text_embedding(model_name, task_type, title=title, text="simple embedding")))

-0.8425277400092455
-0.5084050247105552
-0.6567934176709969
0.3477527925315371
0.06128880959295202
0.283250759935072
