In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Creating an Image-Based Home Search Engine

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/analytics/image_based_home_search.ipynb">
      <img src="https://avatars.githubusercontent.com/u/33467679?s=200&v=4" width="32px" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/analytics/image_based_home_search.ipynb">
      <img src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/analytics/image_based_home_search.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/analytics/image_based_home_search.ipynb">
      <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35">
      Open in BQ Studio
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks%2Fanalytics%2Fimage_based_home_search.ipynb">
    <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo">
    Open in Colab Enterprise
    </a>
  </td>

</table>

## Overview

### The Business Challenge:
For e-commerce platforms, particularly those selling products where images factor into the purchase decision, like appliances, furniture or real estate, the limitations of traditional keyword-based search create significant friction in the customer journey. A user often has a clear visual idea of the product they want but may struggle to describe it with a precise set of keywords (e.g., "mid-century modern home with a big front lawn"). This mismatch between visual intent and text-based search can lead to user frustration, poor search results, and ultimately, abandoned shopping carts.

The business objective is to create a more intuitive and effective product discovery experience that aligns with how users think visually. The technical challenge is to build a system that can accept an image as a search query and return a ranked list of the most visually similar products from a catalog of potentially millions of items. Solving this problem of visual search is critical for improving search relevance, increasing customer engagement, and driving higher conversion rates.



### The Data Science Approach:
For this use case, our approach is to implement a large-scale similarity search system using vector embeddings. We will treat the problem of visual similarity as one of proximity in a high-dimensional vector space, building the entire workflow on BigQuery's native multimodal capabilities. We will start by using a remote foundation model, called from a BigQuery ML SQL query, to convert each product image into a high-dimensional vector embedding. To enable fast querying over millions of vectors, a `VECTOR INDEX` can be created on the embeddings; because the sample catalog in this notebook is small (80 images), the index step is shown but left optional and the search runs in brute-force mode.

The search functionality is then exposed through the `VECTOR_SEARCH` function. When a user provides a query image, we will convert it into an embedding and use this function to find the products with the closest embeddings in the catalog, returning a ranked list of visually similar items in real time.


## Setup and Imports

### Install and load required libraries
This cell installs and loads the necessary Python libraries and extensions for interacting with BigQuery.

In [None]:
%pip install bigquery-magics -q

In [None]:
%load_ext bigquery_magics

### Define configuration variables
This cell sets up the necessary configuration for the project, including the Google Cloud Project ID, BigQuery Dataset ID, location, and the Google Cloud Storage paths for the image datasets.

In [None]:
# Replace the two placeholder values below with your own project and dataset
# before running the rest of the notebook.
PROJECT_ID = "PROJECT_ID"  # passed to every %%bigquery cell via --project and used in table names
DATASET_ID = "DATASET_ID"  # dataset that holds the model and tables created below
LOCATION = "US"  # location option used when the dataset is created
# Wildcard Cloud Storage URIs: the catalog images and the query (test) image.
GCS_IMAGE_DATASET_PATH = "gs://dataproc-metastore-public-binaries/home_image_search/house_images/*"
GCS_TEST_IMAGE_DATASET_PATH = "gs://dataproc-metastore-public-binaries/home_image_search/test_image/*"

### Create BigQuery Dataset
This section executes a SQL query to create a new BigQuery dataset. The `IF NOT EXISTS` clause ensures that the query doesn't fail if the dataset already exists.

In [None]:
#@title Create BigQuery dataset if not exists

# The SQL is kept in `query` so the following cell can run it with
# `%%bigquery ... $query`. The f-string fills in the project, dataset and
# location from the configuration cell.
query = f"""
CREATE SCHEMA IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}`
  OPTIONS (
    location = '{LOCATION}');
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

### Create a BigQuery ML Model for Embeddings
This SQL statement creates a remote BQML model that connects to the `multimodalembedding@001` Vertex AI endpoint. This model will be used to generate vector embeddings for the images.

In [None]:
#@title Create BQML model

# Defines the remote model `home_search` in the dataset; the embedding cells
# further down reference it by this name. `CREATE OR REPLACE` makes the cell
# safe to re-run.
query = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.{DATASET_ID}.home_search`
REMOTE WITH CONNECTION DEFAULT
OPTIONS (ENDPOINT = 'multimodalembedding@001');
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

### Create an External Table for the Image Dataset
An external table is created in BigQuery to reference the home images stored in Google Cloud Storage. This allows BigQuery to query the images directly without needing to load them into a native table.

In [None]:
#@title Create external table

# External table over the catalog images: `uris` receives the wildcard path
# from GCS_IMAGE_DATASET_PATH, interpolated by the f-string.
query = f"""
CREATE OR REPLACE EXTERNAL TABLE `{PROJECT_ID}.{DATASET_ID}.external_images_table`
WITH CONNECTION DEFAULT
OPTIONS(
object_metadata = 'SIMPLE',
uris = ['{GCS_IMAGE_DATASET_PATH}'],
max_staleness = INTERVAL 1 DAY,
metadata_cache_mode = 'AUTOMATIC');
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

### Generate Image Embeddings
Using the `ML.GENERATE_EMBEDDING` function, this query processes each image in the external table, calls the multimodal embedding model, and stores the resulting vector embeddings in a new table called `home_embeddings`.

In [None]:
#@title Generate embeddings for the images

# Stores the output of ML.GENERATE_EMBEDDING over the catalog image table in
# `home_embeddings`. The 512 output_dimensionality is the same value the
# test-image embedding cell uses, so both tables hold same-sized vectors.
query = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_ID}.home_embeddings` AS
SELECT *
FROM ML.GENERATE_EMBEDDING(
 MODEL `{PROJECT_ID}.{DATASET_ID}.home_search`,
 TABLE `{PROJECT_ID}.{DATASET_ID}.external_images_table`,
 STRUCT(TRUE AS flatten_json_output,
 512 AS output_dimensionality)
);
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

### View the Generated Embeddings
This query retrieves and displays the contents of the `home_embeddings` table to verify that the embeddings were generated successfully. Each row contains the original image URI and its corresponding vector embedding.

In [None]:
# Reads back every row of `home_embeddings` as a sanity check on the previous
# step (no LIMIT: the whole table is returned).
query = f"""
SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.home_embeddings`;
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

## Create Vector Indexes

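Use a vector index to enable faster and more scalable semantic search. A vector index, created with the `CREATE VECTOR INDEX` statement, lets BigQuery efficiently find the nearest neighbors of a query embedding within a large collection of embeddings.
While vector indexes are ideal for large datasets, we are not creating an index in this case because we are only generating embeddings for 80 images. The index cells below are therefore left commented out. If you enable them, replace `{DATASET_ID}` with your dataset name: unlike the f-string queries used elsewhere in this notebook, the body of a raw `%%bigquery` cell does not interpolate Python variables.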

Learn more about Vector Indexes on BigQuery [here](https://cloud.google.com/bigquery/docs/vector-index#choose-vector-index-type).




In [None]:

# %%bigquery --project $PROJECT_ID
# CREATE OR REPLACE
#   VECTOR INDEX `house_images_index`
# ON
#   {DATASET_ID}.home_embeddings(ml_generate_embedding_result)
#   OPTIONS (
#     index_type = 'IVF',
#     distance_type = 'COSINE');

### Check Vector Index Status
(Optional) After creating a vector index, you can use this query to check its status. The `INFORMATION_SCHEMA.VECTOR_INDEXES` view provides metadata about the index, including its `index_status` and `coverage_percentage`.

In [None]:
# %%bigquery --project $PROJECT_ID
# SELECT table_name, index_name, index_status,
#   coverage_percentage, last_refresh_time, disable_reason
# FROM {DATASET_ID}.INFORMATION_SCHEMA.VECTOR_INDEXES
# WHERE index_name = 'house_images_index';

## Perform Image-Based Search

Now that the image catalog has been processed to generate embeddings, we can perform a visual search. We'll start by providing a new image and finding the most similar images from the catalog.

### Create an External Table for the Test Image
Similar to the main dataset, an external table is created for the test image stored in Google Cloud Storage. This allows it to be processed by the BQML model.

In [None]:
#@title Create external table for the test image

# Same table definition as the catalog image table, but pointed at the
# test-image path (GCS_TEST_IMAGE_DATASET_PATH) and stored under a separate
# table name so the query image stays apart from the catalog.
query = f"""
CREATE OR REPLACE EXTERNAL TABLE `{PROJECT_ID}.{DATASET_ID}.external_images_test_table`
WITH CONNECTION DEFAULT
OPTIONS(
    object_metadata = 'SIMPLE',
    uris = ['{GCS_TEST_IMAGE_DATASET_PATH}'],
    max_staleness = INTERVAL 1 DAY,
    metadata_cache_mode = 'AUTOMATIC');
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

### Verify the Test Image Table
This query confirms that the external table for the test image has been created correctly and is accessible.

In [None]:
# Reads back the test-image table to confirm it was created and can be queried.
query = f"""
SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.external_images_test_table`;
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

### Display the Test Image
This Python code downloads the test image from Google Cloud Storage and displays it within the notebook to visualize the search query.

In [None]:
#@title Viewing the test image

from io import BytesIO

import matplotlib.pyplot as plt
from google.cloud import storage
from PIL import Image

# Object name of the sample query image stored under the test-image prefix.
TEST_IMAGE_FILE_NAME = "house_test_image.jpg"

# Initialize a client
storage_client = storage.Client()

# Split "gs://<bucket>/<prefix>/*" into the bucket name and the object pattern.
path_after_protocol = GCS_TEST_IMAGE_DATASET_PATH.split("gs://")[1]
bucket_name, object_pattern = path_after_protocol.split("/", 1)
# Strip only the trailing "*" wildcard (rather than whatever the last character
# happens to be) before appending the file name.
blob_name = object_pattern.rstrip("*") + TEST_IMAGE_FILE_NAME

# bucket() builds a local reference without an API call; get_bucket() would
# first fetch the bucket's metadata, which is not needed to read one object
# and requires bucket-level read permission.
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_name)

# Download the image data into a BytesIO object
image_bytes = blob.download_as_bytes()
image_stream = BytesIO(image_bytes)

# Open the image using PIL (Pillow)
img = Image.open(image_stream)

# Render the query image without axis ticks.
fig, ax = plt.subplots()
ax.imshow(img)
ax.axis('off')
plt.show()

### Generate Embedding for the Test Image
The `ML.GENERATE_EMBEDDING` function is used again, this time to generate the vector embedding for the single test image. The result is stored in a new table.

In [None]:
#@title Generate embeddings for the test image

# Embeds the test image with the same model (`home_search`) and the same
# 512 output_dimensionality as the catalog-embedding cell, writing the result
# to `test_embeddings`, which the vector-search cell reads as its query input.
query = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_ID}.test_embeddings` AS
SELECT *
FROM ML.GENERATE_EMBEDDING(
    MODEL `{PROJECT_ID}.{DATASET_ID}.home_search`,
    TABLE `{PROJECT_ID}.{DATASET_ID}.external_images_test_table`,
    STRUCT(TRUE AS flatten_json_output,
    512 AS output_dimensionality)
);
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

### Perform the Vector Search
The `VECTOR_SEARCH` function is the core of the image search engine. It takes the test image's embedding as the query and searches the `home_embeddings` table to find the top 10 most similar images based on cosine distance. The `use_brute_force` option is specified here, but for larger datasets, a vector index would be used for performance. The results, containing the GCS URIs of the similar images and their distances, are stored in a new table.

In [None]:
#@title Vector search

# Searches `home_embeddings` (column 'ml_generate_embedding_result') with the
# rows of `test_embeddings` as the query and keeps, for each match, the
# catalog image URI (base.uri, renamed gcs_uri) and its distance.
# The doubled braces `{{` / `}}` are f-string escapes: they emit the literal
# `{` / `}` of the JSON options value instead of starting an interpolation.
query = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_ID}.vector_search_results` AS
SELECT base.uri AS gcs_uri, distance
FROM
VECTOR_SEARCH(
    TABLE  `{PROJECT_ID}.{DATASET_ID}.home_embeddings`,
    'ml_generate_embedding_result',
    (
    SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.test_embeddings`
    ),
    top_k => 10,
    distance_type => 'COSINE',
    options => '{{"use_brute_force":true}}'
);
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

### Inspect the Search Results Table
This query shows the raw results of the vector search, listing the GCS URIs of the closest matches and their calculated cosine distances to the query image (a smaller distance indicates higher similarity).

In [None]:
# Reads back the stored search results (gcs_uri and distance columns) as-is;
# no ORDER BY is applied here.
query = f"""
SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.vector_search_results`;
"""

In [None]:
%%bigquery --project $PROJECT_ID
$query

## Visualize the Search Results

To better understand the performance of the image search, we will now visualize the results.

### Define a Helper Function to Display Images
This Python function, `print_images`, is defined to fetch images from Google Cloud Storage based on their URIs and display them in a grid format within the notebook. It will also show the similarity distance for each image.

In [None]:
#@title Define print_images function

def print_images(query_result):
    """
    Display images from BigQuery results containing GCS URIs and distances.

    The images are laid out in a grid of at most three columns, in the order
    the rows are received. A row whose image cannot be loaded is reported on
    stdout and drawn as an error placeholder so the other images still render.

    Args:
        query_result: Iterable of rows (for example a BigQuery query job).
            Each row must support key access for 'gcs_uri' (a
            gs://bucket/object URI) and 'distance' (any value accepted by
            float()).

    Returns:
        None. The grid is shown with matplotlib and a summary line is printed.
    """
    # Materialize up front: the grid size depends on the row count and the
    # input may be a one-shot iterator.
    results = list(query_result)

    if not results:
        print("No results found.")
        return

    # Imported only once there is something to draw, so an empty result set
    # returns without loading the plotting and storage libraries.
    import math
    from io import BytesIO

    import matplotlib.pyplot as plt
    from google.cloud import storage
    from PIL import Image

    # Initialize storage client
    storage_client = storage.Client()

    # Calculate grid dimensions: at most 3 columns, as many rows as needed.
    num_images = len(results)
    cols = min(3, num_images)
    rows = math.ceil(num_images / cols)

    # squeeze=False makes subplots() always return a 2-D array, so a single
    # flatten covers the 1x1, single-row and multi-row layouts alike.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.flatten()

    # One bucket handle per bucket name, reused across rows.
    bucket_handles = {}

    for ax, row in zip(axes, results):
        gcs_uri = row["gcs_uri"]
        distance = float(row["distance"])

        try:
            # Parse "gs://<bucket>/<object>".
            if not gcs_uri.startswith("gs://"):
                raise ValueError(f"expected a gs:// URI, got {gcs_uri!r}")
            bucket_name, blob_name = gcs_uri[len("gs://"):].split("/", 1)

            # bucket() only builds a local reference; get_bucket() would issue
            # a bucket-metadata request for every image, which is not needed
            # just to download an object.
            if bucket_name not in bucket_handles:
                bucket_handles[bucket_name] = storage_client.bucket(bucket_name)
            blob = bucket_handles[bucket_name].blob(blob_name)

            # Download and decode the image.
            img = Image.open(BytesIO(blob.download_as_bytes()))

            # Display image
            ax.imshow(img)
            ax.set_title(f"Distance: {distance:.4f}\n{blob_name}", fontsize=10)
            ax.axis('off')

        except Exception as e:
            # Deliberately best effort: one unreadable image must not abort
            # the whole grid, so report it and draw a placeholder instead.
            print(f"Error loading image {gcs_uri}: {e}")
            ax.text(0.5, 0.5, f"Error loading\n{gcs_uri}",
                    ha='center', va='center', transform=ax.transAxes)
            ax.axis('off')

    # Hide unused subplots
    for ax in axes[num_images:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    # Print summary
    print(f"\nDisplayed {num_images} similar images, ordered by similarity (lower distance = more similar)")

### Display the Similar Images
This final code cell executes a query to get the search results, ordered by similarity, and then passes them to the `print_images` helper function to display the top visually similar homes found by the vector search.

In [None]:
#@title Viewing the search result

from google.cloud import bigquery

# Use the same project as the %%bigquery cells instead of whatever default
# project the environment happens to provide.
client = bigquery.Client(project=PROJECT_ID)

# `distance` is selected as the numeric column. Casting it to STRING under the
# same alias would make ORDER BY resolve to the string value and sort
# lexicographically (e.g. "9e-05" after "0.5"); print_images converts the
# value with float() either way.
query = f"""
  SELECT gcs_uri, distance FROM `{PROJECT_ID}.{DATASET_ID}.vector_search_results`
  ORDER BY distance;
"""

# Closest matches first (smallest distance = most similar).
print_images(client.query(query))

## Conclusion
Learn more about image embeddings in BigQuery [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-embedding) and Vector Search on BigQuery [here](https://cloud.google.com/bigquery/docs/vector-search-intro).
