-
Notifications
You must be signed in to change notification settings - Fork 6.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(Imagen): Add python sdk samples for Multimodal Embeddings (#10267)
* add embedding examples * fix lint * wait for ga * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Add MM Embeddings samples for Images, Text and Video * Fix lint errors * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Updated SDK version in requirements * Fix `dimensions` -> `dimension` * Fix error in `VideoSegmentConfig` - There's a bug in the Vertex AI SDK, submitted a CL to fix... --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Holt Skinner <holtskinner@google.com> Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
- Loading branch information
1 parent
18632ae
commit f93e2e0
Showing
7 changed files
with
327 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# [START aiplatform_sdk_multimodal_embedding_image] | ||
from typing import Optional | ||
|
||
import vertexai | ||
from vertexai.vision_models import ( | ||
Image, | ||
MultiModalEmbeddingModel, | ||
MultiModalEmbeddingResponse, | ||
) | ||
|
||
|
||
def get_image_embeddings(
    project_id: str,
    location: str,
    image_path: str,
    contextual_text: Optional[str] = None,
    dimension: int = 1408,
) -> MultiModalEmbeddingResponse:
    """Generate multimodal embeddings for an image and optional text.

    Initializes vertexai, loads the image, requests embeddings from the
    "multimodalembedding" model, prints the image and text embeddings, and
    returns the response.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai.
        location: Google Cloud Region, used to initialize vertexai.
        image_path: Path to image (local or Google Cloud Storage) to
            generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension

    Returns:
        The response produced by the model's ``get_embeddings`` call.
    """
    vertexai.init(project=project_id, location=location)

    embedding_model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    source_image = Image.load_from_file(image_path)

    response = embedding_model.get_embeddings(
        image=source_image,
        contextual_text=contextual_text,
        dimension=dimension,
    )

    print(f"Image Embedding: {response.image_embedding}")
    print(f"Text Embedding: {response.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_image]

    return response
|
||
|
||
if __name__ == "__main__":
    # get_image_embeddings() has required parameters, so calling it with no
    # arguments raises TypeError. Take the project from the environment and
    # use the same region and sample inputs as the accompanying test module.
    import os

    get_image_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        contextual_text="Colosseum",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import os | ||
|
||
import backoff | ||
from google.api_core.exceptions import ResourceExhausted | ||
|
||
import multimodal_embedding_image | ||
|
||
# Project is read from the environment (None when the variable is unset);
# the region is fixed for this test module.
_PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"
|
||
|
||
@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_image() -> None:
    """Check that the image sample yields image and text embeddings.

    The call is retried with exponential backoff while ResourceExhausted is
    raised, bounded by ``max_time=10``.
    """
    sample_image = "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"

    response = multimodal_embedding_image.get_image_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        image_path=sample_image,
        contextual_text="Colosseum",
    )

    assert response is not None
    assert response.image_embedding is not None
    assert response.text_embedding is not None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# [START aiplatform_sdk_multimodal_embedding_image_video_text] | ||
from typing import Optional | ||
|
||
import vertexai | ||
from vertexai.vision_models import ( | ||
Image, | ||
MultiModalEmbeddingModel, | ||
MultiModalEmbeddingResponse, | ||
Video, | ||
VideoSegmentConfig, | ||
) | ||
|
||
|
||
def get_image_video_text_embeddings(
    project_id: str,
    location: str,
    image_path: str,
    video_path: str,
    contextual_text: Optional[str] = None,
    dimension: Optional[int] = 1408,
    video_segment_config: Optional[VideoSegmentConfig] = None,
) -> MultiModalEmbeddingResponse:
    """Generate multimodal embeddings for an image, a video, and optional text.

    Initializes vertexai, loads the image and the video, requests embeddings
    from the "multimodalembedding" model, prints the image embedding, every
    video segment embedding, and the text embedding, then returns the
    response.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai.
        location: Google Cloud Region, used to initialize vertexai.
        image_path: Path to image (local or Google Cloud Storage) to
            generate embeddings for.
        video_path: Path to video (local or Google Cloud Storage) to
            generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
        video_segment_config: Define specific segments to generate
            embeddings for.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices

    Returns:
        The response produced by the model's ``get_embeddings`` call.
    """
    vertexai.init(project=project_id, location=location)

    embedding_model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    source_image = Image.load_from_file(image_path)
    source_video = Video.load_from_file(video_path)

    response = embedding_model.get_embeddings(
        image=source_image,
        video=source_video,
        video_segment_config=video_segment_config,
        contextual_text=contextual_text,
        dimension=dimension,
    )

    print(f"Image Embedding: {response.image_embedding}")

    # One entry per video segment, as determined by video_segment_config.
    print("Video Embeddings:")
    for segment in response.video_embeddings:
        print(
            f"Video Segment: {segment.start_offset_sec} - {segment.end_offset_sec}"
        )
        print(f"Embedding: {segment.embedding}")

    print(f"Text Embedding: {response.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_image_video_text]

    return response
|
||
|
||
if __name__ == "__main__":
    # get_image_video_text_embeddings() has required parameters, so calling
    # it with no arguments raises TypeError. Take the project from the
    # environment and use the same region and sample inputs as the
    # accompanying test module.
    import os

    get_image_video_text_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        image_path="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
37 changes: 37 additions & 0 deletions
37
generative_ai/multimodal_embedding_image_video_text_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import os | ||
|
||
import backoff | ||
from google.api_core.exceptions import ResourceExhausted | ||
|
||
import multimodal_embedding_image_video_text | ||
|
||
# Project is read from the environment (None when the variable is unset);
# the region is fixed for this test module.
_PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"
|
||
|
||
@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_image_video_text() -> None:
    """Check that the combined sample yields image, video, and text embeddings.

    The call is retried with exponential backoff while ResourceExhausted is
    raised, bounded by ``max_time=10``.
    """
    sample_image = "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"
    sample_video = "gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4"

    response = multimodal_embedding_image_video_text.get_image_video_text_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        image_path=sample_image,
        video_path=sample_video,
        contextual_text="Cars on Highway",
    )

    assert response is not None
    assert response.image_embedding is not None
    assert response.video_embeddings is not None
    assert response.text_embedding is not None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# [START aiplatform_sdk_multimodal_embedding_video] | ||
from typing import Optional | ||
|
||
import vertexai | ||
from vertexai.vision_models import ( | ||
MultiModalEmbeddingModel, | ||
MultiModalEmbeddingResponse, | ||
Video, | ||
VideoSegmentConfig, | ||
) | ||
|
||
|
||
def get_video_embeddings(
    project_id: str,
    location: str,
    video_path: str,
    contextual_text: Optional[str] = None,
    dimension: Optional[int] = 1408,
    video_segment_config: Optional[VideoSegmentConfig] = None,
) -> MultiModalEmbeddingResponse:
    """Generate multimodal embeddings for a video and optional text.

    Initializes vertexai, loads the video, requests embeddings from the
    "multimodalembedding" model, prints every video segment embedding and
    the text embedding, then returns the response.

    Args:
        project_id: Google Cloud Project ID, used to initialize vertexai.
        location: Google Cloud Region, used to initialize vertexai.
        video_path: Path to video (local or Google Cloud Storage) to
            generate embeddings for.
        contextual_text: Text to generate embeddings for.
        dimension: Dimension for the returned embeddings.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#low-dimension
        video_segment_config: Define specific segments to generate
            embeddings for.
            https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-multimodal-embeddings#video-best-practices

    Returns:
        The response produced by the model's ``get_embeddings`` call.
    """
    vertexai.init(project=project_id, location=location)

    embedding_model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")
    source_video = Video.load_from_file(video_path)

    response = embedding_model.get_embeddings(
        video=source_video,
        video_segment_config=video_segment_config,
        contextual_text=contextual_text,
        dimension=dimension,
    )

    # One entry per video segment, as determined by video_segment_config.
    print("Video Embeddings:")
    for segment in response.video_embeddings:
        print(
            f"Video Segment: {segment.start_offset_sec} - {segment.end_offset_sec}"
        )
        print(f"Embedding: {segment.embedding}")

    print(f"Text Embedding: {response.text_embedding}")
    # [END aiplatform_sdk_multimodal_embedding_video]

    return response
|
||
|
||
if __name__ == "__main__":
    # get_video_embeddings() has required parameters, so calling it with no
    # arguments raises TypeError. Take the project from the environment and
    # use the same region and sample inputs as the accompanying test module.
    import os

    get_video_embeddings(
        project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
        location="us-central1",
        video_path="gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4",
        contextual_text="Cars on Highway",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import os | ||
|
||
import backoff | ||
from google.api_core.exceptions import ResourceExhausted | ||
|
||
import multimodal_embedding_video | ||
|
||
# Project is read from the environment (None when the variable is unset);
# the region is fixed for this test module.
_PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
_LOCATION = "us-central1"
|
||
|
||
@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
def test_multimodal_embedding_video() -> None:
    """Check that the video sample yields video and text embeddings.

    The call is retried with exponential backoff while ResourceExhausted is
    raised, bounded by ``max_time=10``.
    """
    sample_video = "gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4"

    response = multimodal_embedding_video.get_video_embeddings(
        project_id=_PROJECT_ID,
        location=_LOCATION,
        video_path=sample_video,
        contextual_text="Cars on Highway",
    )

    assert response is not None
    assert response.video_embeddings is not None
    assert response.text_embedding is not None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters