From f93e2e0ec25c2c893974a218258b21b105639008 Mon Sep 17 00:00:00 2001 From: Yvonne Li Date: Thu, 22 Feb 2024 09:36:19 -0800 Subject: [PATCH] feat(Imagen): Add python sdk samples for Multimodal Embeddings (#10267) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add embedding examples * fix lint * wait for ga * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Add MM Embeddings samples for Images, Text and Video * Fix lint errors * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Updated SDK version in requirements * Fix `dimensions` -> `dimension` * Fix error in `VideoSegmentConfig` - There's a bug in the Vertex AI SDK, submitted a CL to fix... --------- Co-authored-by: Owl Bot Co-authored-by: Holt Skinner Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> --- generative_ai/multimodal_embedding_image.py | 62 ++++++++++++++ .../multimodal_embedding_image_test.py | 35 ++++++++ .../multimodal_embedding_image_video_text.py | 82 +++++++++++++++++++ ...timodal_embedding_image_video_text_test.py | 37 +++++++++ generative_ai/multimodal_embedding_video.py | 75 +++++++++++++++++ .../multimodal_embedding_video_test.py | 35 ++++++++ generative_ai/requirements.txt | 2 +- 7 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 generative_ai/multimodal_embedding_image.py create mode 100644 generative_ai/multimodal_embedding_image_test.py create mode 100644 generative_ai/multimodal_embedding_image_video_text.py create mode 100644 generative_ai/multimodal_embedding_image_video_text_test.py create mode 100644 generative_ai/multimodal_embedding_video.py create mode 100644 generative_ai/multimodal_embedding_video_test.py diff --git a/generative_ai/multimodal_embedding_image.py b/generative_ai/multimodal_embedding_image.py new file mode 100644 
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START aiplatform_sdk_multimodal_embedding_image]
from typing import Optional

import vertexai
from vertexai.vision_models import (
    Image,
    MultiModalEmbeddingModel,
    MultiModalEmbeddingResponse,
)


def get_image_embeddings(
    project_id: str,
    location: str,
    image_path: str,
    contextual_text: Optional[str] = None,
    dimension: Optional[int] = 1408,
) -> MultiModalEmbeddingResponse:
    """Example of how to generate multimodal embeddings from image and text.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai
        location: Google Cloud Region, used to initialize vertexai
        image_path: Path to image (local or Google Cloud Storage) to generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension

    Returns:
        MultiModalEmbeddingResponse with ``image_embedding`` populated and, when
        ``contextual_text`` was supplied, ``text_embedding`` as well.
    """

    vertexai.init(project=project_id, location=location)

    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    image = Image.load_from_file(image_path)

    embeddings = model.get_embeddings(
        image=image,
        contextual_text=contextual_text,
        dimension=dimension,
    )
    print(f"Image Embedding: {embeddings.image_embedding}")
    print(f"Text Embedding: {embeddings.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_image]

    return embeddings


if __name__ == "__main__":
    # Fix: the original called get_image_embeddings() with no arguments, which
    # raises TypeError because project_id, location, and image_path are required.
    import os

    get_image_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        contextual_text="Colosseum",
    )
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_image

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_image() -> None:
    """Smoke-test the image embedding sample against a public GCS image."""
    response = multimodal_embedding_image.get_image_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        contextual_text="Colosseum",
    )
    # Both modalities were supplied, so both embeddings must be present.
    assert response is not None
    assert response.image_embedding is not None
    assert response.text_embedding is not None
# [START aiplatform_sdk_multimodal_embedding_image_video_text]
from typing import Optional

import vertexai
from vertexai.vision_models import (
    Image,
    MultiModalEmbeddingModel,
    MultiModalEmbeddingResponse,
    Video,
    VideoSegmentConfig,
)


def get_image_video_text_embeddings(
    project_id: str,
    location: str,
    image_path: str,
    video_path: str,
    contextual_text: Optional[str] = None,
    dimension: Optional[int] = 1408,
    video_segment_config: Optional[VideoSegmentConfig] = None,
) -> MultiModalEmbeddingResponse:
    """Example of how to generate multimodal embeddings from image, video, and text.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai
        location: Google Cloud Region, used to initialize vertexai
        image_path: Path to image (local or Google Cloud Storage) to generate embeddings for.
        video_path: Path to video (local or Google Cloud Storage) to generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
        video_segment_config: Define specific segments to generate embeddings for.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices

    Returns:
        MultiModalEmbeddingResponse with image, per-segment video, and (if
        contextual_text was given) text embeddings populated.
    """

    vertexai.init(project=project_id, location=location)

    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    image = Image.load_from_file(image_path)
    video = Video.load_from_file(video_path)

    embeddings = model.get_embeddings(
        image=image,
        video=video,
        video_segment_config=video_segment_config,
        contextual_text=contextual_text,
        dimension=dimension,
    )

    print(f"Image Embedding: {embeddings.image_embedding}")

    # Video Embeddings are segmented based on the video_segment_config.
    print("Video Embeddings:")
    for video_embedding in embeddings.video_embeddings:
        print(
            f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
        )
        print(f"Embedding: {video_embedding.embedding}")

    print(f"Text Embedding: {embeddings.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_image_video_text]

    return embeddings


if __name__ == "__main__":
    # Fix: the original called get_image_video_text_embeddings() with no
    # arguments, which raises TypeError (project_id, location, image_path,
    # and video_path are required).
    import os

    get_image_video_text_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_image_video_text

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_image_video_text() -> None:
    """Smoke-test the combined image/video/text sample on public GCS assets."""
    response = multimodal_embedding_image_video_text.get_image_video_text_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
    # All three modalities were supplied, so all three embeddings must exist.
    assert response is not None
    assert response.image_embedding is not None
    assert response.video_embeddings is not None
    assert response.text_embedding is not None
# [START aiplatform_sdk_multimodal_embedding_video]
from typing import Optional

import vertexai
from vertexai.vision_models import (
    MultiModalEmbeddingModel,
    MultiModalEmbeddingResponse,
    Video,
    VideoSegmentConfig,
)


def get_video_embeddings(
    project_id: str,
    location: str,
    video_path: str,
    contextual_text: Optional[str] = None,
    dimension: Optional[int] = 1408,
    video_segment_config: Optional[VideoSegmentConfig] = None,
) -> MultiModalEmbeddingResponse:
    """Example of how to generate multimodal embeddings from video and text.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai
        location: Google Cloud Region, used to initialize vertexai
        video_path: Path to video (local or Google Cloud Storage) to generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
        video_segment_config: Define specific segments to generate embeddings for.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices

    Returns:
        MultiModalEmbeddingResponse with per-segment video embeddings and (if
        contextual_text was given) a text embedding populated.
    """

    vertexai.init(project=project_id, location=location)

    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    video = Video.load_from_file(video_path)

    embeddings = model.get_embeddings(
        video=video,
        video_segment_config=video_segment_config,
        contextual_text=contextual_text,
        dimension=dimension,
    )

    # Video Embeddings are segmented based on the video_segment_config.
    print("Video Embeddings:")
    for video_embedding in embeddings.video_embeddings:
        print(
            f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
        )
        print(f"Embedding: {video_embedding.embedding}")

    print(f"Text Embedding: {embeddings.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_video]

    return embeddings


if __name__ == "__main__":
    # Fix: the original called get_video_embeddings() with no arguments, which
    # raises TypeError because project_id, location, and video_path are required.
    import os

    get_video_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_video

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_video() -> None:
    """Smoke-test the video embedding sample against a public GCS video."""
    response = multimodal_embedding_video.get_video_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
    # Video and text were supplied, so both embedding kinds must be present.
    assert response is not None
    assert response.video_embeddings is not None
    assert response.text_embedding is not None