feat(Imagen): Add python sdk samples for Multimodal Embeddings (#10267)

* add embedding examples * fix lint * wait for ga * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Add MM Embeddings samples for Images, Text and Video * Fix lint errors * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Updated SDK version in requirements * Fix `dimensions` -> `dimension` * Fix error in `VideoSegmentConfig` - There's a bug in the Vertex AI SDK, submitted a CL to fix... --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Holt Skinner <holtskinner@google.com> Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
GoogleCloudPlatform · Feb 22, 2024 · f93e2e0 · f93e2e0
1 parent 18632ae
commit f93e2e0
Show file tree

Hide file tree

Showing 7 changed files with 327 additions and 1 deletion.
diff --git a/generative_ai/multimodal_embedding_image.py b/generative_ai/multimodal_embedding_image.py
@@ -0,0 +1,62 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START aiplatform_sdk_multimodal_embedding_image]
+from typing import Optional
+
+import vertexai
+from vertexai.vision_models import (
+    Image,
+    MultiModalEmbeddingModel,
+    MultiModalEmbeddingResponse,
+)
+
+
+def get_image_embeddings(
+    project_id: str,
+    location: str,
+    image_path: str,
+    contextual_text: Optional[str] = None,
+    dimension: Optional[int] = 1408,
+) -> MultiModalEmbeddingResponse:
+    """Example of how to generate multimodal embeddings from image and text.
+
+    Args:
+        project_id: Google Cloud Project ID, used to initialize vertexai
+        location: Google Cloud Region, used to initialize vertexai
+        image_path: Path to image (local or Google Cloud Storage) to generate embeddings for.
+        contextual_text: Optional text to generate embeddings for alongside the image.
+        dimension: Dimension for the returned embeddings (1408 unless overridden).
+            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
+
+    Returns:
+        The response object returned by `MultiModalEmbeddingModel.get_embeddings`.
+        Its `image_embedding` and `text_embedding` attributes are printed
+        before the response is returned.
+    """
+
+    vertexai.init(project=project_id, location=location)
+
+    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
+    image = Image.load_from_file(image_path)
+
+    # A single request covers the image and the optional contextual text.
+    embeddings = model.get_embeddings(
+        image=image,
+        contextual_text=contextual_text,
+        dimension=dimension,
+    )
+    print(f"Image Embedding: {embeddings.image_embedding}")
+    print(f"Text Embedding: {embeddings.text_embedding}")
+    # [END aiplatform_sdk_multimodal_embedding_image]
+
+    # Returned outside the documentation region tag so callers (e.g. tests)
+    # can inspect the response.
+    return embeddings
+
+
+if __name__ == "__main__":
+    # get_image_embeddings has required parameters, so it cannot be called
+    # without arguments. Use the same public sample inputs as the test for
+    # this module; the project is read from the environment and a missing
+    # GOOGLE_CLOUD_PROJECT fails immediately with a KeyError.
+    import os
+
+    get_image_embeddings(
+        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
+        location="us-central1",
+        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
+        contextual_text="Colosseum",
+    )
diff --git a/generative_ai/multimodal_embedding_image_test.py b/generative_ai/multimodal_embedding_image_test.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import backoff
+from google.api_core.exceptions import ResourceExhausted
+
+import multimodal_embedding_image
+
+# Project is taken from the environment; the region is fixed for this test.
+_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
+_LOCATION = "us-central1"
+
+
+@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
+def test_multimodal_embedding_image() -> None:
+    """Run the image sample and check that both embeddings are populated.
+
+    The decorator retries the test with exponential backoff when the call
+    raises ResourceExhausted (max_time=10).
+    """
+    response = multimodal_embedding_image.get_image_embeddings(
+        project_id=_PROJECT_ID,
+        location=_LOCATION,
+        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
+        contextual_text="Colosseum",
+    )
+
+    # Checked one at a time so a failure points at the missing piece.
+    assert response is not None
+    assert response.image_embedding is not None
+    assert response.text_embedding is not None
diff --git a/generative_ai/multimodal_embedding_image_video_text.py b/generative_ai/multimodal_embedding_image_video_text.py
@@ -0,0 +1,82 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START aiplatform_sdk_multimodal_embedding_image_video_text]
+from typing import Optional
+
+import vertexai
+from vertexai.vision_models import (
+    Image,
+    MultiModalEmbeddingModel,
+    MultiModalEmbeddingResponse,
+    Video,
+    VideoSegmentConfig,
+)
+
+
+def get_image_video_text_embeddings(
+    project_id: str,
+    location: str,
+    image_path: str,
+    video_path: str,
+    contextual_text: Optional[str] = None,
+    dimension: Optional[int] = 1408,
+    video_segment_config: Optional[VideoSegmentConfig] = None,
+) -> MultiModalEmbeddingResponse:
+    """Generate multimodal embeddings for an image, a video, and text in one request.
+
+    Args:
+        project_id: Google Cloud Project ID, used to initialize vertexai
+        location: Google Cloud Region, used to initialize vertexai
+        image_path: Path to image (local or Google Cloud Storage) to generate embeddings for.
+        video_path: Path to video (local or Google Cloud Storage) to generate embeddings for.
+        contextual_text: Text to generate embeddings for.
+        dimension: Dimension for the returned embeddings.
+            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
+        video_segment_config: Define specific segments to generate embeddings for.
+            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices
+
+    Returns:
+        The response from `get_embeddings`; its image, per-segment video, and
+        text embeddings are printed before it is returned.
+    """
+
+    vertexai.init(project=project_id, location=location)
+
+    embedding_model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
+
+    # Inputs are loaded inline, image first and then video.
+    response = embedding_model.get_embeddings(
+        image=Image.load_from_file(image_path),
+        video=Video.load_from_file(video_path),
+        video_segment_config=video_segment_config,
+        contextual_text=contextual_text,
+        dimension=dimension,
+    )
+
+    print(f"Image Embedding: {response.image_embedding}")
+
+    # Video Embeddings are segmented based on the video_segment_config.
+    print("Video Embeddings:")
+    for segment in response.video_embeddings:
+        start_sec, end_sec = segment.start_offset_sec, segment.end_offset_sec
+        print(f"Video Segment: {start_sec} - {end_sec}")
+        print(f"Embedding: {segment.embedding}")
+
+    print(f"Text Embedding: {response.text_embedding}")
+    # [END aiplatform_sdk_multimodal_embedding_image_video_text]
+
+    return response
+
+
+if __name__ == "__main__":
+    # get_image_video_text_embeddings has required parameters, so it cannot
+    # be called without arguments. Use the same public sample inputs as the
+    # test for this module; the project is read from the environment and a
+    # missing GOOGLE_CLOUD_PROJECT fails immediately with a KeyError.
+    import os
+
+    get_image_video_text_embeddings(
+        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
+        location="us-central1",
+        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
+        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
+        contextual_text="Cars on Highway",
+    )
diff --git a/generative_ai/multimodal_embedding_image_video_text_test.py b/generative_ai/multimodal_embedding_image_video_text_test.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import backoff
+from google.api_core.exceptions import ResourceExhausted
+
+import multimodal_embedding_image_video_text
+
+# Project is taken from the environment; the region is fixed for this test.
+_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
+_LOCATION = "us-central1"
+
+
+@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
+def test_multimodal_embedding_image_video_text() -> None:
+    """Run the combined sample and check that all three embeddings are populated.
+
+    The decorator retries the test with exponential backoff when the call
+    raises ResourceExhausted (max_time=10).
+    """
+    sample = multimodal_embedding_image_video_text
+    response = sample.get_image_video_text_embeddings(
+        project_id=_PROJECT_ID,
+        location=_LOCATION,
+        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
+        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
+        contextual_text="Cars on Highway",
+    )
+
+    # Checked one at a time so a failure points at the missing piece.
+    assert response is not None
+    assert response.image_embedding is not None
+    assert response.video_embeddings is not None
+    assert response.text_embedding is not None
diff --git a/generative_ai/multimodal_embedding_video.py b/generative_ai/multimodal_embedding_video.py
@@ -0,0 +1,75 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START aiplatform_sdk_multimodal_embedding_video]
+from typing import Optional
+
+import vertexai
+from vertexai.vision_models import (
+    MultiModalEmbeddingModel,
+    MultiModalEmbeddingResponse,
+    Video,
+    VideoSegmentConfig,
+)
+
+
+def get_video_embeddings(
+    project_id: str,
+    location: str,
+    video_path: str,
+    contextual_text: Optional[str] = None,
+    dimension: Optional[int] = 1408,
+    video_segment_config: Optional[VideoSegmentConfig] = None,
+) -> MultiModalEmbeddingResponse:
+    """Generate multimodal embeddings for a video and optional text.
+
+    Args:
+        project_id: Google Cloud Project ID, used to initialize vertexai
+        location: Google Cloud Region, used to initialize vertexai
+        video_path: Path to video (local or Google Cloud Storage) to generate embeddings for.
+        contextual_text: Text to generate embeddings for.
+        dimension: Dimension for the returned embeddings.
+            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
+        video_segment_config: Define specific segments to generate embeddings for.
+            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices
+
+    Returns:
+        The response from `get_embeddings`; its per-segment video embeddings
+        and text embedding are printed before it is returned.
+    """
+
+    vertexai.init(project=project_id, location=location)
+
+    embedding_model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
+
+    # The video is loaded inline as part of building the request.
+    response = embedding_model.get_embeddings(
+        video=Video.load_from_file(video_path),
+        video_segment_config=video_segment_config,
+        contextual_text=contextual_text,
+        dimension=dimension,
+    )
+
+    # Video Embeddings are segmented based on the video_segment_config.
+    print("Video Embeddings:")
+    for segment in response.video_embeddings:
+        start_sec, end_sec = segment.start_offset_sec, segment.end_offset_sec
+        print(f"Video Segment: {start_sec} - {end_sec}")
+        print(f"Embedding: {segment.embedding}")
+
+    print(f"Text Embedding: {response.text_embedding}")
+    # [END aiplatform_sdk_multimodal_embedding_video]
+
+    return response
+
+
+if __name__ == "__main__":
+    # get_video_embeddings has required parameters, so it cannot be called
+    # without arguments. Use the same public sample inputs as the test for
+    # this module; the project is read from the environment and a missing
+    # GOOGLE_CLOUD_PROJECT fails immediately with a KeyError.
+    import os
+
+    get_video_embeddings(
+        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
+        location="us-central1",
+        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
+        contextual_text="Cars on Highway",
+    )
diff --git a/generative_ai/multimodal_embedding_video_test.py b/generative_ai/multimodal_embedding_video_test.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import backoff
+from google.api_core.exceptions import ResourceExhausted
+
+import multimodal_embedding_video
+
+# Project is taken from the environment; the region is fixed for this test.
+_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
+_LOCATION = "us-central1"
+
+
+@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
+def test_multimodal_embedding_video() -> None:
+    """Run the video sample and check that video and text embeddings are populated.
+
+    The decorator retries the test with exponential backoff when the call
+    raises ResourceExhausted (max_time=10).
+    """
+    response = multimodal_embedding_video.get_video_embeddings(
+        project_id=_PROJECT_ID,
+        location=_LOCATION,
+        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
+        contextual_text="Cars on Highway",
+    )
+
+    # Checked one at a time so a failure points at the missing piece.
+    assert response is not None
+    assert response.video_embeddings is not None
+    assert response.text_embedding is not None
diff --git a/generative_ai/requirements.txt b/generative_ai/requirements.txt
@@ -2,5 +2,5 @@ pandas==1.3.5; python_version == '3.7'
 pandas==2.0.1; python_version > '3.7'
 pillow==9.5.0; python_version < '3.8'
 pillow==10.0.1; python_version >= '3.8'
-google-cloud-aiplatform[pipelines]==1.42.0
+google-cloud-aiplatform[pipelines]==1.42.1
 google-auth==2.17.3