From f93e2e0ec25c2c893974a218258b21b105639008 Mon Sep 17 00:00:00 2001 From: Yvonne Li Date: Thu, 22 Feb 2024 09:36:19 -0800 Subject: [PATCH] feat(Imagen): Add python sdk samples for Multimodal Embeddings (#10267) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add embedding examples * fix lint * wait for ga * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Add MM Embeddings samples for Images, Text and Video * Fix lint errors * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Updated SDK version in requirements * Fix `dimensions` -> `dimension` * Fix error in `VideoSegmentConfig` - There's a bug in the Vertex AI SDK, submitted a CL to fix... --------- Co-authored-by: Owl Bot Co-authored-by: Holt Skinner Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> --- generative_ai/multimodal_embedding_image.py | 62 ++++++++++++++ .../multimodal_embedding_image_test.py | 35 ++++++++ .../multimodal_embedding_image_video_text.py | 82 +++++++++++++++++++ ...timodal_embedding_image_video_text_test.py | 37 +++++++++ generative_ai/multimodal_embedding_video.py | 75 +++++++++++++++++ .../multimodal_embedding_video_test.py | 35 ++++++++ generative_ai/requirements.txt | 2 +- 7 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 generative_ai/multimodal_embedding_image.py create mode 100644 generative_ai/multimodal_embedding_image_test.py create mode 100644 generative_ai/multimodal_embedding_image_video_text.py create mode 100644 generative_ai/multimodal_embedding_image_video_text_test.py create mode 100644 generative_ai/multimodal_embedding_video.py create mode 100644 generative_ai/multimodal_embedding_video_test.py diff --git a/generative_ai/multimodal_embedding_image.py b/generative_ai/multimodal_embedding_image.py new file mode 100644 
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START aiplatform_sdk_multimodal_embedding_image]
from typing import Optional

import vertexai
from vertexai.vision_models import (
    Image,
    MultiModalEmbeddingModel,
    MultiModalEmbeddingResponse,
)


def get_image_embeddings(
    project_id: str,
    location: str,
    image_path: str,
    contextual_text: Optional[str] = None,
    dimension: Optional[int] = 1408,
) -> MultiModalEmbeddingResponse:
    """Example of how to generate multimodal embeddings from image and text.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai
        location: Google Cloud Region, used to initialize vertexai
        image_path: Path to image (local or Google Cloud Storage) to generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension

    Returns:
        MultiModalEmbeddingResponse with ``image_embedding`` populated and, when
        ``contextual_text`` was supplied, ``text_embedding`` as well.
    """

    vertexai.init(project=project_id, location=location)

    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    image = Image.load_from_file(image_path)

    embeddings = model.get_embeddings(
        image=image,
        contextual_text=contextual_text,
        dimension=dimension,
    )
    print(f"Image Embedding: {embeddings.image_embedding}")
    print(f"Text Embedding: {embeddings.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_image]

    return embeddings


if __name__ == "__main__":
    # Fix: the original called get_image_embeddings() with no arguments, which
    # raises TypeError because project_id, location, and image_path are required.
    import os

    get_image_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        contextual_text="Colosseum",
    )
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_image

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_image() -> None:
    """Smoke-test the image embedding sample against a public GCS image."""
    response = multimodal_embedding_image.get_image_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        contextual_text="Colosseum",
    )
    # Both modalities were supplied, so both embeddings must be present.
    assert response is not None
    assert response.image_embedding is not None
    assert response.text_embedding is not None
# [START aiplatform_sdk_multimodal_embedding_image_video_text]
from typing import Optional

import vertexai
from vertexai.vision_models import (
    Image,
    MultiModalEmbeddingModel,
    MultiModalEmbeddingResponse,
    Video,
    VideoSegmentConfig,
)


def get_image_video_text_embeddings(
    project_id: str,
    location: str,
    image_path: str,
    video_path: str,
    contextual_text: Optional[str] = None,
    dimension: Optional[int] = 1408,
    video_segment_config: Optional[VideoSegmentConfig] = None,
) -> MultiModalEmbeddingResponse:
    """Example of how to generate multimodal embeddings from image, video, and text.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai
        location: Google Cloud Region, used to initialize vertexai
        image_path: Path to image (local or Google Cloud Storage) to generate embeddings for.
        video_path: Path to video (local or Google Cloud Storage) to generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
        video_segment_config: Define specific segments to generate embeddings for.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices

    Returns:
        MultiModalEmbeddingResponse with image, per-segment video, and (if
        contextual_text was given) text embeddings populated.
    """

    vertexai.init(project=project_id, location=location)

    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    image = Image.load_from_file(image_path)
    video = Video.load_from_file(video_path)

    embeddings = model.get_embeddings(
        image=image,
        video=video,
        video_segment_config=video_segment_config,
        contextual_text=contextual_text,
        dimension=dimension,
    )

    print(f"Image Embedding: {embeddings.image_embedding}")

    # Video Embeddings are segmented based on the video_segment_config.
    print("Video Embeddings:")
    for video_embedding in embeddings.video_embeddings:
        print(
            f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
        )
        print(f"Embedding: {video_embedding.embedding}")

    print(f"Text Embedding: {embeddings.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_image_video_text]

    return embeddings


if __name__ == "__main__":
    # Fix: the original called get_image_video_text_embeddings() with no
    # arguments, which raises TypeError (project_id, location, image_path,
    # and video_path are required).
    import os

    get_image_video_text_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_image_video_text

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_image_video_text() -> None:
    """Smoke-test the combined image/video/text sample on public GCS assets."""
    response = multimodal_embedding_image_video_text.get_image_video_text_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
    # All three modalities were supplied, so all three embeddings must exist.
    assert response is not None
    assert response.image_embedding is not None
    assert response.video_embeddings is not None
    assert response.text_embedding is not None
# [START aiplatform_sdk_multimodal_embedding_video]
from typing import Optional

import vertexai
from vertexai.vision_models import (
    MultiModalEmbeddingModel,
    MultiModalEmbeddingResponse,
    Video,
    VideoSegmentConfig,
)


def get_video_embeddings(
    project_id: str,
    location: str,
    video_path: str,
    contextual_text: Optional[str] = None,
    dimension: Optional[int] = 1408,
    video_segment_config: Optional[VideoSegmentConfig] = None,
) -> MultiModalEmbeddingResponse:
    """Example of how to generate multimodal embeddings from video and text.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai
        location: Google Cloud Region, used to initialize vertexai
        video_path: Path to video (local or Google Cloud Storage) to generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
        video_segment_config: Define specific segments to generate embeddings for.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices

    Returns:
        MultiModalEmbeddingResponse with per-segment video embeddings and (if
        contextual_text was given) a text embedding populated.
    """

    vertexai.init(project=project_id, location=location)

    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    video = Video.load_from_file(video_path)

    embeddings = model.get_embeddings(
        video=video,
        video_segment_config=video_segment_config,
        contextual_text=contextual_text,
        dimension=dimension,
    )

    # Video Embeddings are segmented based on the video_segment_config.
    print("Video Embeddings:")
    for video_embedding in embeddings.video_embeddings:
        print(
            f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
        )
        print(f"Embedding: {video_embedding.embedding}")

    print(f"Text Embedding: {embeddings.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_video]

    return embeddings


if __name__ == "__main__":
    # Fix: the original called get_video_embeddings() with no arguments, which
    # raises TypeError because project_id, location, and video_path are required.
    import os

    get_video_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_video

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_video() -> None:
    """Smoke-test the video embedding sample against a public GCS video."""
    response = multimodal_embedding_video.get_video_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
    # Video and text were supplied, so both embedding kinds must be present.
    assert response is not None
    assert response.video_embeddings is not None
    assert response.text_embedding is not None