feat(Imagen): Add python sdk samples for Multimodal Embeddings (#10267)
* add embedding examples

* fix lint

* wait for ga

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* Add MM Embeddings samples for Images, Text and Video

* Fix lint errors

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* Updated SDK version in requirements

* Fix `dimensions` -> `dimension`

* Fix error in `VideoSegmentConfig`

- There's a bug in the Vertex AI SDK, submitted a CL to fix...

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Holt Skinner <holtskinner@google.com>
Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
4 people committed Feb 22, 2024
1 parent 18632ae commit f93e2e0
Showing 7 changed files with 327 additions and 1 deletion.
62 changes: 62 additions & 0 deletions generative_ai/multimodal_embedding_image.py
@@ -0,0 +1,62 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START aiplatform_sdk_multimodal_embedding_image]
from typing import Optional

import vertexai
from vertexai.vision_models import (
Image,
MultiModalEmbeddingModel,
MultiModalEmbeddingResponse,
)


def get_image_embeddings(
project_id: str,
location: str,
image_path: str,
contextual_text: Optional[str] = None,
dimension: int = 1408,
) -> MultiModalEmbeddingResponse:
"""Example of how to generate multimodal embeddings from image and text.
Args:
project_id: Google Cloud Project ID, used to initialize vertexai
location: Google Cloud Region, used to initialize vertexai
image_path: Path to image (local or Google Cloud Storage) to generate embeddings for.
contextual_text: Text to generate embeddings for.
dimension: Dimension for the returned embeddings.
https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
"""

vertexai.init(project=project_id, location=location)

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
image = Image.load_from_file(image_path)

embeddings = model.get_embeddings(
image=image,
contextual_text=contextual_text,
dimension=dimension,
)
print(f"Image Embedding: {embeddings.image_embedding}")
print(f"Text Embedding: {embeddings.text_embedding}")
# [END aiplatform_sdk_multimodal_embedding_image]

return embeddings


if __name__ == "__main__":
    # Placeholder invocation: the required arguments have no defaults, so calling
    # with no arguments would raise a TypeError. Replace "your-project-id" with a real project.
    get_image_embeddings(
        project_id="your-project-id",
        location="us-central1",
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        contextual_text="Colosseum",
    )
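The image and text embeddings returned by this sample share the same vector space (1408 dimensions by default), so their cosine similarity gives a rough relevance score for the contextual text. A minimal sketch using only the standard library, assuming `embeddings` is the `MultiModalEmbeddingResponse` returned by `get_image_embeddings` with `contextual_text` supplied:

from typing import Sequence


def cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float:
    """Cosine similarity between two equal-length embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(y * y for y in b) ** 0.5
    return dot / (norm_a * norm_b)


# Both vectors come back with the same dimension, so they can be compared directly.
score = cosine_similarity(embeddings.image_embedding, embeddings.text_embedding)
print(f"Image/text similarity: {score:.4f}")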
35 changes: 35 additions & 0 deletions generative_ai/multimodal_embedding_image_test.py
@@ -0,0 +1,35 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_image

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_image() -> None:
embeddings = multimodal_embedding_image.get_image_embeddings(
project_id=_PROJECT_ID,
location=_LOCATION,
image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
contextual_text="Colosseum",
)
assert embeddings is not None
assert embeddings.image_embedding is not None
assert embeddings.text_embedding is not None
82 changes: 82 additions & 0 deletions generative_ai/multimodal_embedding_image_video_text.py
@@ -0,0 +1,82 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START aiplatform_sdk_multimodal_embedding_image_video_text]
from typing import Optional

import vertexai
from vertexai.vision_models import (
Image,
MultiModalEmbeddingModel,
MultiModalEmbeddingResponse,
Video,
VideoSegmentConfig,
)


def get_image_video_text_embeddings(
project_id: str,
location: str,
image_path: str,
video_path: str,
contextual_text: Optional[str] = None,
dimension: Optional[int] = 1408,
video_segment_config: Optional[VideoSegmentConfig] = None,
) -> MultiModalEmbeddingResponse:
"""Example of how to generate multimodal embeddings from image, video, and text.
Args:
project_id: Google Cloud Project ID, used to initialize vertexai
location: Google Cloud Region, used to initialize vertexai
image_path: Path to image (local or Google Cloud Storage) to generate embeddings for.
video_path: Path to video (local or Google Cloud Storage) to generate embeddings for.
contextual_text: Text to generate embeddings for.
dimension: Dimension for the returned embeddings.
https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
video_segment_config: Define specific segments to generate embeddings for.
https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices
"""

vertexai.init(project=project_id, location=location)

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
image = Image.load_from_file(image_path)
video = Video.load_from_file(video_path)

embeddings = model.get_embeddings(
image=image,
video=video,
video_segment_config=video_segment_config,
contextual_text=contextual_text,
dimension=dimension,
)

print(f"Image Embedding: {embeddings.image_embedding}")

# Video Embeddings are segmented based on the video_segment_config.
print("Video Embeddings:")
for video_embedding in embeddings.video_embeddings:
print(
f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
)
print(f"Embedding: {video_embedding.embedding}")

print(f"Text Embedding: {embeddings.text_embedding}")
# [END aiplatform_sdk_multimodal_embedding_image_video_text]

return embeddings


if __name__ == "__main__":
    # Placeholder invocation: the required arguments have no defaults.
    # Replace "your-project-id" with a real project.
    get_image_video_text_embeddings(
        project_id="your-project-id",
        location="us-central1",
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
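The `video_segment_config` parameter above is accepted but never constructed in the sample. A minimal sketch of building one, which is then passed through the sample's `video_segment_config` parameter; the field names (`start_offset_sec`, `end_offset_sec`, `interval_sec`) assume the Vertex AI SDK's `VideoSegmentConfig` and may need adjusting to the installed version:

from vertexai.vision_models import VideoSegmentConfig

# Embed only the first 60 seconds of the video, producing one embedding
# roughly every 15 seconds of footage. Pass this object as
# video_segment_config=segment_config when calling the sample.
segment_config = VideoSegmentConfig(
    start_offset_sec=0,
    end_offset_sec=60,
    interval_sec=15,
)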
37 changes: 37 additions & 0 deletions generative_ai/multimodal_embedding_image_video_text_test.py
@@ -0,0 +1,37 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_image_video_text

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_image_video_text() -> None:
embeddings = multimodal_embedding_image_video_text.get_image_video_text_embeddings(
project_id=_PROJECT_ID,
location=_LOCATION,
image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
contextual_text="Cars on Highway",
)
assert embeddings is not None
assert embeddings.image_embedding is not None
assert embeddings.video_embeddings is not None
assert embeddings.text_embedding is not None
75 changes: 75 additions & 0 deletions generative_ai/multimodal_embedding_video.py
@@ -0,0 +1,75 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START aiplatform_sdk_multimodal_embedding_video]
from typing import Optional

import vertexai
from vertexai.vision_models import (
MultiModalEmbeddingModel,
MultiModalEmbeddingResponse,
Video,
VideoSegmentConfig,
)


def get_video_embeddings(
project_id: str,
location: str,
video_path: str,
contextual_text: Optional[str] = None,
dimension: Optional[int] = 1408,
video_segment_config: Optional[VideoSegmentConfig] = None,
) -> MultiModalEmbeddingResponse:
"""Example of how to generate multimodal embeddings from video and text.
Args:
project_id: Google Cloud Project ID, used to initialize vertexai
location: Google Cloud Region, used to initialize vertexai
video_path: Path to video (local or Google Cloud Storage) to generate embeddings for.
contextual_text: Text to generate embeddings for.
dimension: Dimension for the returned embeddings.
https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
video_segment_config: Define specific segments to generate embeddings for.
https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices
"""

vertexai.init(project=project_id, location=location)

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
video = Video.load_from_file(video_path)

embeddings = model.get_embeddings(
video=video,
video_segment_config=video_segment_config,
contextual_text=contextual_text,
dimension=dimension,
)

# Video Embeddings are segmented based on the video_segment_config.
print("Video Embeddings:")
for video_embedding in embeddings.video_embeddings:
print(
f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
)
print(f"Embedding: {video_embedding.embedding}")

print(f"Text Embedding: {embeddings.text_embedding}")
# [END aiplatform_sdk_multimodal_embedding_video]

return embeddings


if __name__ == "__main__":
    # Placeholder invocation: the required arguments have no defaults.
    # Replace "your-project-id" with a real project.
    get_video_embeddings(
        project_id="your-project-id",
        location="us-central1",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
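Each entry in `embeddings.video_embeddings` lives in the same vector space as the text embedding, so the segments can be ranked by how closely they match the contextual text. A minimal sketch, assuming `embeddings` is the response returned above and `contextual_text` was supplied so `embeddings.text_embedding` is populated:

def _cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    return dot / ((sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5))


# Rank video segments by similarity to the contextual text embedding.
ranked = sorted(
    embeddings.video_embeddings,
    key=lambda seg: _cosine(seg.embedding, embeddings.text_embedding),
    reverse=True,
)
best = ranked[0]
print(f"Best-matching segment: {best.start_offset_sec}s - {best.end_offset_sec}s")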
35 changes: 35 additions & 0 deletions generative_ai/multimodal_embedding_video_test.py
@@ -0,0 +1,35 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import backoff
from google.api_core.exceptions import ResourceExhausted

import multimodal_embedding_video

_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"


@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_video() -> None:
embeddings = multimodal_embedding_video.get_video_embeddings(
project_id=_PROJECT_ID,
location=_LOCATION,
video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
contextual_text="Cars on Highway",
)
assert embeddings is not None
assert embeddings.video_embeddings is not None
assert embeddings.text_embedding is not None
2 changes: 1 addition & 1 deletion generative_ai/requirements.txt
@@ -2,5 +2,5 @@ pandas==1.3.5; python_version == '3.7'
pandas==2.0.1; python_version > '3.7'
pillow==9.5.0; python_version < '3.8'
pillow==10.0.1; python_version >= '3.8'
-google-cloud-aiplatform[pipelines]==1.42.0
+google-cloud-aiplatform[pipelines]==1.42.1
google-auth==2.17.3
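The samples are pinned to `google-cloud-aiplatform` 1.42.1 by this change. A quick sanity check of the installed SDK version from Python, assuming the package exposes `__version__` as recent releases do:

import google.cloud.aiplatform as aiplatform

# requirements.txt above pins 1.42.1; confirm what is actually installed.
print(f"Installed google-cloud-aiplatform version: {aiplatform.__version__}")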
