In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Product recognition from images using Gemini 1.5 Flash

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retail/product_recognition_from_image.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fretail%2Fproduct_recognition_from_image.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/retail/product_recognition_from_image.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retail/product_recognition_from_image.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retail/product_recognition_from_image.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retail/product_recognition_from_image.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retail/product_recognition_from_image.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retail/product_recognition_from_image.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retail/product_recognition_from_image.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| | |
|-|-|
|Author(s) | [Tianli Yu](https://github.com/tianli) |

## Overview

This notebook shows how to build a general-purpose agent on top of Gemini that recognizes any product in an image. Product recognition combines Gemini's image-to-text capability with a supporting text search engine.

In the following sections we will:

*   Write the helper library needed for loading and parsing images.
*   Create a product recognition agent.
*   Run the agent on an image to recognize and find the link to the product in the image.

# Get started (Skip if using Vertex Colab Enterprise)

In [None]:
#@title Install gcp aiplatform package
! pip3 install --upgrade --user --quiet google-cloud-aiplatform

In [None]:
#@title Restart the kernel after installing package.
import sys

# The restart is only triggered on Colab, which is detected by the
# google.colab module already being loaded in this kernel.
if "google.colab" in sys.modules:
    import IPython

    # Shut the kernel down, passing True for the restart argument.
    IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
#@title Authentication
import sys

# Interactive user authentication is only performed on Colab, which is
# detected by the google.colab module already being loaded in this kernel.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

In [None]:
#@title Setup Vertex project.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# If the form placeholder was left untouched, fall back to the project
# configured in the environment instead of initializing with a bogus ID.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

# Fail here with a clear message rather than later, on the first model call.
if not PROJECT_ID:
    raise ValueError(
        "PROJECT_ID is not set. Enter your Google Cloud project ID in the form "
        "above or set the GOOGLE_CLOUD_PROJECT environment variable."
    )

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Product Recognition using Gemini Demo

In [9]:
#@title [Library] Gemini based Product Image Search/Recognition.
import ipywidgets as widgets
import os
import pprint
import requests

from IPython.display import display
from PIL import Image
from googleapiclient.discovery import build # For google custom search engine.
from io import BytesIO
from typing import Union
from vertexai.preview.generative_models import GenerationConfig, GenerativeModel, Part


# Google Custom search engine developer key.
# To get the key, follow the instructions at
# https://developers.google.com/custom-search/v1/overview
#
# This module-level value is read by GeminiImageSearchAgent.search_cse when it
# builds the Custom Search client. Fill it in through the Colab form; avoid
# saving or sharing a copy of the notebook with a real key left in this cell.
YOUR_DEVELOPER_KEY="" #@param {"type": "string"}


def get_image_mime_from_uri(image_uri: str) -> Union[None, str]:
    """Guess the image MIME type from the file extension of a URI or path.

    The extension is matched case-insensitively. For http:// and https://
    URIs any query string or fragment is ignored, so
    "https://host/a.png?w=100" is still recognized as a PNG.

    Args:
        image_uri (str): A gs://, http(s):// URI or a local file path.

    Returns:
        The MIME type string, or None for .svg files (not a raster image).
        Unknown or missing extensions default to "image/jpeg".
    """
    path = image_uri
    if image_uri.lower().startswith(("http://", "https://")):
        # Drop "?query" and "#fragment" so they do not hide the extension.
        path = path.partition("#")[0].partition("?")[0]

    extension = os.path.splitext(path)[1].lower()
    if extension == ".svg":
        return None  # not a normal image.

    known_mime_types = {
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    # Assume JPEG as the default mime
    return known_mime_types.get(extension, "image/jpeg")


def load_image_part_from_uri(image_uri: str) -> Union[None, Part]:
  """Load image to a prompt Part from a remote or local file URI.

  Support scheme: gs://, http://, https:// and local files.
  gs:// URIs are handed to the model by reference; http(s) URIs and local
  files are read here and embedded in the Part as raw bytes.
  Note that http:// and https:// downloads may not work on every runtime
  (e.g. runtimes without outbound internet access).

  Args:
    image_uri (str): The uri or the local file path for the image.

  Returns:
    The prompt Part object, or None when the image type is not supported
    (e.g. .svg) or the image could not be fetched or read. Failures are
    reported with a printed message instead of an exception.
  """
  mime_type = get_image_mime_from_uri(image_uri)
  if not mime_type:
    return None

  if image_uri.startswith("gs://"):
    return Part.from_uri(image_uri, mime_type=mime_type)

  if image_uri.startswith(("http://", "https://")):
    # Bound the wait so one slow host cannot hang the notebook.
    fetch_timeout_seconds = 30
    try:
      response = requests.get(image_uri, timeout=fetch_timeout_seconds)
    except requests.RequestException as e:
      print(f"Fetch image failed for {image_uri}: {e}")
      return None
    if response.status_code != 200:
      print(f"Fetch image failed for {image_uri}, status code: {response.status_code}")
      return None
    return Part.from_data(response.content, mime_type=mime_type)

  # Anything else is treated as a local file path.
  try:
    with open(image_uri, "rb") as image_file:
      image_bytes = image_file.read()
  except FileNotFoundError as e:
    print(f"File not found -- {e} skipping...\n")
    return None
  except OSError as e:
    # E.g. a directory path or a permission problem.
    print(f"Could not read file -- {e} skipping...\n")
    return None

  return Part.from_data(image_bytes, mime_type=mime_type)


class GeminiImageSearchAgent:
  """An agent that wraps around Gemini 1.5 to perform images search of products
    based on a (retailer's) text search engine.

    The agent works in three steps, each exposed as a method:
      1. get_search_key_words: ask Gemini for search key words describing
         the product in the query image.
      2. search_cse: run the key words through Google's custom search API.
      3. pick_matching_product: ask Gemini to pick the search result that
         best matches the query image.

    Args:
      cx_id (str): The ID of the custom search engine to query.
      gemini_model_version (str): The version string of the Gemini 1.5 model.
          gemini-1.5-pro-001 or gemini-1.5-flash-001
      developer_key (str): Optional custom search developer key. When omitted,
          the module-level YOUR_DEVELOPER_KEY is used.
  """

  def __init__(self, cx_id: str,
               gemini_model_version: str = "gemini-1.5-flash-001",
               developer_key: Union[None, str] = None):
    # The agent only asks for short answers (key words or a single index),
    # hence the small output budget.
    config = GenerationConfig(temperature=0.0, max_output_tokens=256)

    # System instructions, add any common instructions here.
    sys_inst = ""
    self.gemini_model = GenerativeModel(
        gemini_model_version, generation_config=config,
        system_instruction=sys_inst)
    self.cx_id = cx_id
    self.developer_key = developer_key

  def get_search_key_words(self, query_image_bytes: bytes,
                           mime_type: str = "image/jpeg",
                           debug: bool = False) -> str:
    """Returns the search key words for the given image using Gemini.

      Args:
        query_image_bytes: The image bytes to process.
        mime_type: The MIME type of the query image.
        debug: When True, display a preview of the query image (scaled down
            to at most 300 pixels high) before calling the model.

      Returns:
        The generated key words, with surrounding whitespace removed.
    """
    image_part = Part.from_data(query_image_bytes, mime_type=mime_type)
    prompt = """
      If I want to search a retailer's website for the product in this image,
      what key words should I use. Please only reply with the key words. In general,
      you should use a format like [brand] [product name] [product varient] [quantity and weight].
      If there are multiple products in the image, you should only select the
      most prominent one.
      """
    if debug:
      print("====== Begin Debug Info ======")
      image = Image.open(BytesIO(query_image_bytes))
      # scale image to max height of 300 and preserves the aspect ratio.
      display_height = 300
      if image.height > display_height:
        display_width = int(image.width * display_height / image.height)
        image = image.resize((display_width, display_height))
      display(image)
      print("====== End Debug Info ======")

    model_response = self.gemini_model.generate_content([image_part, prompt])
    # Trim stray whitespace/newlines so the key words are a clean search query.
    return model_response.text.strip()

  def search_cse(self, keywords: str) -> list[tuple[str, str, str]]:
    """ Perform search using google's custom search api.

     Args:
       keywords: The search query.

     Returns: a list of (link, image_url, title) tuples, one for each search
       result that has both a link and an image. Empty when there is nothing
       to search for or nothing was found.
    """
    if not keywords or not keywords.strip():
      # Nothing to search for; skip the API call.
      return []

    # Prefer the key given to the constructor, else the notebook-level one.
    developer_key = getattr(self, "developer_key", None) or YOUR_DEVELOPER_KEY
    service = build("customsearch", "v1", developerKey=developer_key)
    res = service.cse().list(q=keywords, cx=self.cx_id).execute()
    return self._extract_products(res)

  @staticmethod
  def _extract_products(res: dict) -> list[tuple[str, str, str]]:
    """Turn a custom search response into (link, image_url, title) tuples.

    Items without a link or without a "cse_image" entry are skipped
    individually, so one incomplete item does not discard the others.
    """
    products = []
    for item in res.get("items") or []:
      link = item.get("link")
      images = (item.get("pagemap") or {}).get("cse_image") or []
      image_url = images[0].get("src") if images else None
      if not link or not image_url:
        continue
      products.append((link, image_url, item.get("title", "")))
    return products

  def pick_matching_product(self,
                            product_results: list[tuple[str, str, str]],
                            query_image_bytes: bytes,
                            mime_type: str = "image/jpeg",
                            debug: bool = False) -> Union[None, tuple[str, str, str]]:
    """Perform reranking of product images using Gemini, return the
    top rank product.

    Args:
      product_results: Candidate (link, image_url, title) tuples, as returned
          by search_cse.
      query_image_bytes: The bytes of the query image.
      mime_type: The MIME type of the query image.
      debug: Currently unused.

    Returns:
      The selected candidate tuple, or None when there are no candidates or
      the model reply is not a valid candidate index.
    """
    if not product_results:
      # No results, cannot create prompt
      return None

    prompt = [
      """You are a product search engine and your responsibility is to recognize
      the product in the query image by selecting the closest image from a set of
      candidates. Please follow these instructions:

      a. The overall picture layout and content of the candidate should match exactly with the product in the query image.
      b. The title of the candidate should also match the product in the query image.
      c. Pay attention to details on the product label like product name, product variant,
         product weight, size and quantity.
      d. You should also pay attention to the package type, shape and color scheme which could
         indicate different variants, weight and quantity.
      e. The query images are captured in non-ideal conditions
         and might have distortions or occlusions. If the image is blurry or not complete, you
         should find the best match based on partial similarity of graphics and text layout on the packaging.

      Please output the index of the closest matched candidate only without any markup or explanation.
      """
    ]
    query_image = Part.from_data(query_image_bytes, mime_type=mime_type)
    prompt.extend(["Query image: ", query_image, ";\n"])

    for i, (_, image_url, title) in enumerate(product_results):
      prompt.append(f"Candidate {i} : ")
      prompt.append("title - " + title + ".")
      # Candidates whose image cannot be loaded are described by title only.
      candidate_image = load_image_part_from_uri(image_url)
      if candidate_image:
        prompt.append("image - ")
        prompt.append(candidate_image)
        prompt.append(";\n")

    model_response = self.gemini_model.generate_content(prompt)
    try:
      # Reading .text can raise ValueError when the response carries no text.
      response_text = model_response.text
    except ValueError as e:
      print(f"Could not read model response: {e}")
      return None
    print(f"Got model response: {response_text}")

    return self._select_candidate(product_results, response_text)

  @staticmethod
  def _select_candidate(product_results: list[tuple[str, str, str]],
                        response_text: str) -> Union[None, tuple[str, str, str]]:
    """Map the model's reply (expected to be a bare index) to a candidate."""
    try:
      candidate_index = int(response_text)
    except (TypeError, ValueError):
      return None
    if 0 <= candidate_index < len(product_results):
      return product_results[candidate_index]
    return None

In [None]:
#@title [Demo] Recognize Product Image for Any Retailer using Gemini & Google Custom Search Engine.
import html
import time

from google.colab import files


# You can use any text search engine with Gemini for product image search.
# In this demo we use google's custom search engine as a replacement for
# a retailer's own text search engine.
# To search a specific retailer or website you will need to create a custom
# search engine. Please follow instructions here:
# https://developers.google.com/custom-search/v1/overview
CUSTOM_SEARCH_ENGINE_ID = "" # @param {type: "string"}
RETAILER_NAME = "" # @param {type: "string"}
GEMINI_MODEL_VERSION = "gemini-1.5-flash-002" # @param {type: "string"}

agent = GeminiImageSearchAgent(cx_id=CUSTOM_SEARCH_ENGINE_ID,
                               gemini_model_version=GEMINI_MODEL_VERSION)

print(f"Please upload a product image to search in {RETAILER_NAME}...")
# Keep the upload result under its own name: rebinding `files` would shadow
# the google.colab.files module for any later use in the kernel.
uploaded_files = files.upload()
if uploaded_files:
  start_time = time.perf_counter()

  print("** Step 1: Asking Gemini to convert the image to keywords...")
  # Only the first uploaded file is used as the query image.
  query_name, query_bytes = next(iter(uploaded_files.items()))
  # Derive the MIME type from the file name; fall back to JPEG when the
  # helper reports an unsupported type.
  query_mime_type = get_image_mime_from_uri(query_name) or "image/jpeg"
  keywords = agent.get_search_key_words(query_bytes, mime_type=query_mime_type,
                                        debug=True)
  print(f"keywords = {keywords}")

  print(f"** Step 2: Calling Google Cloud Custom Search Engine {CUSTOM_SEARCH_ENGINE_ID} for {RETAILER_NAME} ...")
  # REPLACE the following line with your own text search API call if
  # you already have a text search engine.
  products = agent.search_cse(keywords)

  if products:
    # Only visualize images to save space.
    print(f"Found {len(products)} products in the search results.")
    # Search results are external text: escape them before building HTML.
    thumbnail_cells = "".join(
        f"<td><img src='{html.escape(product[1], quote=True)}' height='150px'></td>\n"
        for product in products)
    visualize_html = f"<table><tr>\n{thumbnail_cells}</tr></table>\n"
    display(widgets.HTML(visualize_html))

  print(f"** Step 3: Asking Gemini to select the product from {len(products)} candidates...")
  selected = agent.pick_matching_product(products, query_bytes, query_mime_type,
                                         debug=True)
  end_time = time.perf_counter()
  print(f"Step 1 - 3 end-to-end latency: {end_time - start_time:.2f} seconds")
  if selected:
    selected_link = html.escape(selected[0], quote=True)
    selected_image_url = html.escape(selected[1], quote=True)
    selected_title = html.escape(selected[2], quote=True)
    visualize_html = f"""
        <table>
        <tr>
          <td><img src='{selected_image_url}' height='300px'></td>
          <td width="300px"><h2>{selected_title}</h2><br>
          <a href='{selected_link}'>{selected_link}</a>
          </td>
        </tr>
        </table>
        """
    display(widgets.HTML(visualize_html))
  else:
    print("No matching product found!")
