# Creating embeddings from your data in BigQuery

In this notebook, you create vector embeddings from image data in your lakehouse using BigQuery ML and Vertex AI.

In [None]:
#@title Set your Project ID

# Google Cloud project ID, entered through the notebook form field. Later
# cells read PROJECT_ID to create the BigQuery client and to build the name
# of the image dataset.
PROJECT_ID="" # @param {type:"string"}
# Make this project the active one for gcloud commands run from the notebook.
!gcloud config set project $PROJECT_ID

In [None]:
#@title Create a connection to the BigQuery client

from google.cloud import bigquery

# Shared BigQuery client bound to the project selected in PROJECT_ID; the
# query cells further down submit their jobs through it.
client = bigquery.Client(project=PROJECT_ID)

In [None]:
#@title Load a helper function for viewing images in your notebook

import io
from PIL import Image
import matplotlib.pyplot as plt
import tensorflow as tf

def printImages(results):
  """Render query results as a two-column grid: image on the left, text on the right.

  Args:
    results: An iterable of rows (for example a BigQuery query job). For each
      row, element 0 is read as a file path/URI opened through
      `tf.io.gfile.GFile` and element 1 is rendered as text next to the image.

  Returns:
    None. The figure is displayed with `plt.show()`. When `results` is empty,
    a short message is printed and no figure is created.
  """
  image_results_list = list(results)
  amt_of_images = len(image_results_list)

  # plt.subplots() rejects nrows=0, so bail out early on an empty result set.
  if amt_of_images == 0:
    print("No images to display.")
    return

  # squeeze=False keeps `axes` two-dimensional even for a single row, so the
  # axes[i, j] indexing below also works when only one image is returned.
  fig, axes = plt.subplots(
    nrows=amt_of_images, ncols=2, figsize=(20, 20), squeeze=False
  )
  fig.tight_layout()
  fig.subplots_adjust(hspace=0.5)
  for i, row in enumerate(image_results_list):
    gcs_uri = row[0]
    text = row[1]
    # Read the whole file into memory and close the handle right away.
    with tf.io.gfile.GFile(gcs_uri, 'rb') as f:
      stream = io.BytesIO(f.read())
    img = Image.open(stream)
    axes[i, 0].axis('off')
    axes[i, 0].imshow(img)
    axes[i, 1].axis('off')
    axes[i, 1].text(0, 0, text, fontsize=10)
  plt.show()

In [None]:
#@title View images in your dataset

# The image dataset name is derived from the project ID, with hyphens
# swapped for underscores.
images_dataset = f"ga4_images_{PROJECT_ID.replace('-', '_')}"

# Fetch ten JPEG rows from the image table and show them in a grid.
display_first_10_images_query = f"""
  SELECT *
  FROM {images_dataset}.ga4_obfuscated_sample_ecommerce_images
  WHERE content_type = 'image/jpeg'
  LIMIT 10;
"""
first_10_images_job = client.query(display_first_10_images_query)
printImages(first_10_images_job)

In [None]:
%%bigquery
#@title Create a multimodalembedding model
# NOTE: %%bigquery has to be the first line of the cell; IPython only
# recognises a cell magic there. Lines starting with # are SQL comments.

# Create (or replace) the remote model `gcp_lakehouse_ds.embeddings`, which
# uses the `us-central1.storage` connection and the multimodalembedding@001
# endpoint.
CREATE OR REPLACE MODEL `gcp_lakehouse_ds.embeddings`
REMOTE WITH CONNECTION `us-central1.storage`
OPTIONS (endpoint="multimodalembedding@001");


In [None]:

#@title Generate image embeddings from the image object table.

# This cell is plain Python, so it must not carry the %%bigquery cell magic:
# the magic would send the Python source below to BigQuery as if it were SQL.
# The query is an f-string because the image dataset name is derived from
# PROJECT_ID (hyphens replaced with underscores).
embedding_query = f"""
CREATE OR REPLACE TABLE `gcp_lakehouse_ds.ga4_embeddings`
AS
SELECT *
FROM
  ML.GENERATE_EMBEDDING(
    MODEL `gcp_lakehouse_ds.embeddings`,
    (
      SELECT * FROM `ga4_images_{PROJECT_ID.replace("-", "_")}.ga4_obfuscated_sample_ecommerce_images` WHERE content_type = 'image/jpeg' LIMIT 10000
    ));
"""
# client.query() only submits the job. Wait for it to finish so that
# `gcp_lakehouse_ds.ga4_embeddings` exists before later cells read it, and so
# that any query error surfaces here.
embedding_job = client.query(embedding_query)
embedding_job.result()

In [None]:
%%bigquery
#@title Show image embedding results.
# NOTE: %%bigquery has to be the first line of the cell; IPython only
# recognises a cell magic there. Lines starting with # are SQL comments.

# Preview ten rows of the generated embeddings table.
SELECT * FROM `gcp_lakehouse_ds.ga4_embeddings` limit 10;


In [None]:
%%bigquery
#@title Input text prompt: "give me a best match picture of a water bottle". Vector search finds the best matches and saves them to a new table.
# NOTE: %%bigquery has to be the first line of the cell; IPython only
# recognises a cell magic there. Lines starting with # are SQL comments.

# Embed the text prompt with the same model used for the images, search the
# image embeddings table for the 5 nearest rows, and store each match's URI
# and distance in `gcp_lakehouse_ds.vector_search`.
CREATE OR REPLACE TABLE `gcp_lakehouse_ds.vector_search` AS
SELECT base.uri AS gcs_uri, distance
FROM
  VECTOR_SEARCH(
    TABLE `gcp_lakehouse_ds.ga4_embeddings`,
    "ml_generate_embedding_result",
    (
      SELECT * FROM ML.GENERATE_EMBEDDING(
        MODEL `gcp_lakehouse_ds.embeddings`,
        (
          SELECT "give me a best match picture of a water bottle" AS content
        )
      )
    ),
    "ml_generate_embedding_result",
    top_k => 5);

In [None]:
#@title Perform vector search but without saving to a new table

# Free-text search prompt entered through the notebook form field.
prompt = "" # @param {type:"string"}

# Find the best match images.
# The prompt is passed as a named query parameter (@prompt) instead of being
# interpolated into the SQL text, so quotes or other special characters in
# the prompt cannot break or alter the query.
best_match_query = """
  SELECT base.uri AS gcs_uri, distance
  FROM VECTOR_SEARCH(
    TABLE `gcp_lakehouse_ds.ga4_embeddings`,
    "ml_generate_embedding_result",
    (
      SELECT * FROM ML.GENERATE_EMBEDDING(
        MODEL `gcp_lakehouse_ds.embeddings`,
        (
          SELECT @prompt AS content
        )
      )
    ),
    "ml_generate_embedding_result",
    top_k => 5);
"""
best_match_job_config = bigquery.QueryJobConfig(
  query_parameters=[
    bigquery.ScalarQueryParameter("prompt", "STRING", prompt),
  ]
)

# Each result row is (gcs_uri, distance); printImages shows the image next to
# its distance.
printImages(client.query(best_match_query, job_config=best_match_job_config))
