### <font color='#4285f4'>Overview</font>

Overview: This notebook loops through all the tables in the project and creates vector embeddings for every string field.  This allows the agent to search for data using vector embeddings based upon the data in the tables, not just the metadata about each table.

Cost:
* Approximate cost: Less than $1

Author:
* Adam Paternostro

### <font color='#4285f4'>License</font>

```
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```

### <font color='#4285f4'>Pip installs</font>

In [None]:
# PIP Installs (if necessary)
import sys

# !{sys.executable} -m pip install REPLACE-ME

### <font color='#4285f4'>Initialize</font>

In [None]:
from PIL import Image
from IPython.display import HTML
import IPython.display
import google.auth
import requests
import json
import uuid
import base64
import os
import cv2
import random
import time
import datetime
import base64
import random

import logging
from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log, retry_if_exception

In [None]:
# Set these (run this cell to verify the output)
# NOTE(review): the "${...}" values look like template placeholders that are
# substituted before the notebook is run - confirm against the deployment tooling.

bigquery_location = "${bigquery_non_multi_region}"
region = "${region}"
location = "${location}"  # Read by GeminiLLM to build the Vertex AI endpoint URL

# Get the current date and time
now = datetime.datetime.now()

# Format the date and time as desired (year-month-day-hour-minute)
# NOTE(review): formatted_date is not referenced by any cell visible in this notebook.
formatted_date = now.strftime("%Y-%m-%d-%H-%M")

# Get some values using gcloud
# project_id comes from the environment; a missing variable raises KeyError here.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# IPython shell capture: runs gcloud and collects its output lines.
user = !(gcloud auth list --filter=status:ACTIVE --format="value(account)")

# Exactly one active account is expected; anything else stops the notebook.
if len(user) != 1:
  raise RuntimeError(f"user is not set: {user}")
user = user[0]

print(f"project_id = {project_id}")
print(f"user = {user}")

### <font color='#4285f4'>Helper Methods</font>

#### restAPIHelper
Calls the Google Cloud REST API using the current user's credentials.

In [None]:
def restAPIHelper(url: str, http_verb: str, request_body: "dict | None") -> "dict | list":
  """Calls the Google Cloud REST API passing in the current user's credentials.

  Args:
    url: Fully qualified REST URL to call.
    http_verb: One of "GET", "POST", "PUT", "PATCH" or "DELETE".
    request_body: JSON-serializable payload (it is handed to the `json=`
      argument of requests). Only sent for POST, PUT and PATCH; ignored for
      GET and DELETE, so None is fine for those verbs.

  Returns:
    The parsed JSON response body, or an empty dict when a successful
    response carries no body.

  Raises:
    RuntimeError: If http_verb is not supported or the response status is
      not in the 2xx range.
  """

  import google.auth.transport.requests
  import requests
  import google.auth
  import json

  # Get an access token based upon the current user
  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request()
  creds.refresh(auth_req)
  access_token = creds.token

  headers = {
    "Content-Type" : "application/json",
    "Authorization" : "Bearer " + access_token
  }

  if http_verb == "GET":
    response = requests.get(url, headers=headers)
  elif http_verb == "POST":
    response = requests.post(url, json=request_body, headers=headers)
  elif http_verb == "PUT":
    response = requests.put(url, json=request_body, headers=headers)
  elif http_verb == "PATCH":
    response = requests.patch(url, json=request_body, headers=headers)
  elif http_verb == "DELETE":
    response = requests.delete(url, headers=headers)
  else:
    raise RuntimeError(f"Unknown HTTP verb: {http_verb}")

  # Any 2xx status is a success; treating only 200 as success would turn a
  # successful "204 No Content" style reply into an error.
  if 200 <= response.status_code < 300:
    # A successful reply may have no body at all, which json.loads cannot parse.
    if not response.content:
      return {}
    return json.loads(response.content)

  error = f"Error restAPIHelper -> ' Status: '{response.status_code}' Text: '{response.text}'"
  raise RuntimeError(error)

#### RetryCondition (for retrying LLM calls)

In [None]:
def RetryCondition(error):
  """Tell the retry decorator whether a failed call is worth retrying.

  The error is converted to text and printed. The call is retried only when
  that text contains one of the known transient-failure markers.

  Args:
    error: The exception (or any object) describing the failure.

  Returns:
    True if the failure should be retried, otherwise False.
  """
  message = str(error)
  print(message)

  # Substrings that identify a retryable failure; add more here as needed.
  retryable_markers = (
      "RESOURCE_EXHAUSTED",
      "No content in candidate",
  )

  if any(marker in message for marker in retryable_markers):
    print("Retrying...")
    return True

  return False

#### Gemini LLM

In [None]:
@retry(
    wait=wait_exponential(multiplier=1, min=1, max=60),
    stop=stop_after_attempt(10),
    retry=retry_if_exception(RetryCondition),
    before_sleep=before_sleep_log(logging.getLogger(), logging.INFO),
)
def GeminiLLM(prompt, model = "gemini-2.0-flash", response_schema = None,
                 temperature = 1, topP = 1, topK = 32):
  """Send a prompt to a Gemini model on Vertex AI and return the text reply.

  The request always asks for an "application/json" response. The call is
  wrapped in a retry decorator: errors accepted by RetryCondition are retried
  with an exponential wait (1 to 60 seconds) for up to 10 attempts.

  Args:
    prompt: The user prompt text.
    model: Model name placed in the endpoint URL.
    response_schema: Optional response schema; sent as "responseSchema" when
      it is not None.
    temperature: Sampling temperature; negative values are clamped to 0.
    topP: Value sent as "topP".
    topK: Value sent as "topK", only when model is "gemini-2.0-flash".

  Returns:
    The text of the first part of the first candidate, with "```json",
    "```" and newline characters removed.

  Raises:
    RuntimeError: If the HTTP status is not 200, the body is not valid JSON,
      or the reply has no candidate / content / parts / text.
  """

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models

  # Imported locally: the notebook's top-level imports only load `google.auth`,
  # which does not guarantee the `transport.requests` submodule is available.
  import google.auth.transport.requests

  if temperature < 0:
    temperature = 0

  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request() # required to access the access token
  creds.refresh(auth_req)
  access_token = creds.token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + access_token
  }

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent"

  generation_config = {
    "temperature": temperature,
    "topP": topP,
    "maxOutputTokens": 8192,
    "candidateCount": 1,
    "responseMimeType": "application/json",
  }

  # Add in the response schema when it is provided
  if response_schema is not None:
    generation_config["responseSchema"] = response_schema

  if model == "gemini-2.0-flash":
    generation_config["topK"] = topK

  payload = {
    "contents": {
      "role": "user",
      "parts": {
          "text": prompt
      },
    },
    "generation_config": {
      **generation_config
    },
    "safety_settings": {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  try:
    json_response = json.loads(response.content)
  except Exception as error:
    raise RuntimeError(f"An error occurred parsing the JSON: {error}") from error

  # Walk candidates[0].content.parts[0].text, failing loudly at the first
  # missing level. The message prefixes are kept stable because
  # RetryCondition matches on "No content in candidate".
  if "candidates" not in json_response or len(json_response["candidates"]) == 0:
    raise RuntimeError(f"No candidates: {response.content}")
  candidate = json_response["candidates"][0]

  if "content" not in candidate:
    raise RuntimeError(f"No content in candidate: {response.content}")
  content = candidate["content"]

  if "parts" not in content or len(content["parts"]) == 0:
    raise RuntimeError(f"No parts in content: {response.content}")
  part = content["parts"][0]

  if "text" not in part:
    raise RuntimeError(f"No text in part: {response.content}")
  llm_response = part["text"]

  # Remove some typical response characters (if asking for a JSON reply)
  llm_response = llm_response.replace("```json","")
  llm_response = llm_response.replace("```","")
  llm_response = llm_response.replace("\n","")

  return llm_response

#### Helper Functions

In [None]:
def RunQuery(sql):
  """Run a SQL statement in BigQuery.

  Statements that begin with SELECT or WITH (ignoring leading whitespace and
  letter case) are treated as queries and their rows are returned. Anything
  else is submitted as an interactive-priority job and polled every two
  seconds until it reaches the DONE state.

  Args:
    sql: The SQL text to execute.

  Returns:
    A DataFrame of results for SELECT/WITH statements, otherwise True once
    the job has finished without an error.

  Raises:
    Exception: With the job's error_result when a non-query job fails.
  """
  import time
  from google.cloud import bigquery
  client = bigquery.Client()

  # Normalise before testing so "\n  select ..." is still recognised as a
  # query instead of silently falling into the DML path and returning True.
  if sql.lstrip().upper().startswith(("SELECT", "WITH")):
    return client.query(sql).to_dataframe()

  job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)
  query_job = client.query(sql, job_config=job_config)

  # Check on the progress by getting the job's updated state.
  while True:
    query_job = client.get_job(query_job.job_id, location=query_job.location)
    print("Job {} is currently in state {} with error result of {}".format(query_job.job_id, query_job.state, query_job.error_result))
    if query_job.state == "DONE":
      break
    time.sleep(2)

  if query_job.error_result is None:
    return True
  raise Exception(query_job.error_result)

In [None]:
def PrettyPrintJson(json_string):
  """Re-serialize a JSON string with a two-space indent.

  Args:
    json_string: Text containing a valid JSON document.

  Returns:
    The same document formatted with indent=2.
  """
  return json.dumps(json.loads(json_string), indent=2)

In [None]:
def GetNextPrimaryKey(fully_qualified_table_name, field_name):
  """Return the next key value for a table: the current maximum plus one.

  Args:
    fully_qualified_table_name: Table to inspect, as it should appear inside
      backticks in the query.
    field_name: The key column whose maximum is read.

  Returns:
    MAX(field_name) + 1, where a NULL maximum is treated as 0.
  """
  from google.cloud import bigquery

  max_key_sql = f"""
  SELECT IFNULL(MAX({field_name}),0) AS result
    FROM `{fully_qualified_table_name}`
  """
  max_key_frame = bigquery.Client().query(max_key_sql).to_dataframe()
  current_max = max_key_frame['result'].iloc[0]
  return current_max + 1

### <font color='#4285f4'>MAIN CODE - Generate Vector Embeddings</font>

In [None]:
%%bigquery

-- Setup DDL, left commented out. It defines the two objects the later cells
-- reference: the remote text embedding model and the table that stores one
-- embedding per distinct string value. Uncomment to (re)create them; note that
-- CREATE OR REPLACE TABLE replaces any existing embedding table.
/*
CREATE OR REPLACE MODEL `data_analytics_agent_metadata.textembedding_model`
 REMOTE WITH CONNECTION `us-central1.vertex-ai`
 OPTIONS(ENDPOINT = 'text-embedding-005');


CREATE OR REPLACE TABLE `data_analytics_agent_metadata.vector_embedding_metadata` (
    project_id     STRING       OPTIONS(description="The BigQuery project for which the vector embedding is generated."),
    dataset_name   STRING       OPTIONS(description="The dataset for which the vector embedding is generated."),
    table_name     STRING       OPTIONS(description="The table for which the vector embedding is generated."),
    column_name    STRING       OPTIONS(description="The column for which the vector embedding is generated."),
    text_content   STRING       OPTIONS(description="The value or content which was embedded."),
    text_embedding ARRAY<FLOAT64> OPTIONS(description="The vector embedding generated with text-embedding-005.")
)
OPTIONS(
    description="Contains the distinct values of every string column for every dataset and table in the current BigQuery project.  This is used for NL2SQL to lookup the actual value of fields that a human might want to use in the queries."
);
*/

In [None]:
# Build the list of columns to embed: every STRING column listed in the
# region's INFORMATION_SCHEMA.COLUMNS view, skipping columns whose name ends
# in "_id" and everything in the data_analytics_agent_metadata dataset (the
# dataset the embeddings are written to).
# NOTE(review): the inline SQL comment on the ENDS_WITH filter reads
# "Let's in index"; the filter excludes "_id" columns, so it presumably meant
# "Let's not index". It lives inside the query string, so it is left untouched here.
# NOTE(review): the region is hard-coded as `region-us-central1` rather than
# taken from the location variables set earlier - confirm this is intended.
sql=f"""SELECT table_catalog AS project_id,
       table_schema  AS dataset_name,
       table_name    AS table_name,
       column_name   AS column_name,
  FROM `region-us-central1`.INFORMATION_SCHEMA.COLUMNS
  WHERE data_type = 'STRING'
    AND ENDS_WITH(column_name, '_id') = FALSE  -- Let's in index the "id" columns (typically UUIDs)
    AND table_schema != 'data_analytics_agent_metadata'
ORDER BY 1, 2, 3, 4
"""

# The statement starts with SELECT, so RunQuery returns the rows as a DataFrame.
df_data_to_embed = RunQuery(sql)

In [None]:
# For every discovered STRING column, embed its distinct values and append
# them to the metadata table (one INSERT job per column).
# NOTE: this is a plain INSERT, so re-running the cell appends the same rows again.
for item in df_data_to_embed.itertuples():
  project_id = item.project_id
  dataset_name = item.dataset_name
  table_name = item.table_name
  column_name = item.column_name

  print(f"Embedding {project_id}.{dataset_name}.{table_name}.{column_name}")

  # The column name is backtick-quoted where it is used as an identifier so
  # that columns named after reserved keywords do not break the statement.
  sql = f"""INSERT INTO `data_analytics_agent_metadata.vector_embedding_metadata`
            (project_id, dataset_name, table_name, column_name, text_content, text_embedding)
            SELECT '{project_id}'   AS project_id,
                   '{dataset_name}' AS dataset_name,
                   '{table_name}'   AS table_name,
                   '{column_name}'  AS column_name,
                   content          AS text_content,
                   text_embedding
              FROM ML.GENERATE_TEXT_EMBEDDING(
                    MODEL `data_analytics_agent_metadata.textembedding_model`,
                    (SELECT DISTINCT `{column_name}` AS content
                       FROM `{project_id}.{dataset_name}.{table_name}`),
                    STRUCT(TRUE AS flatten_json_output,
                           'SEMANTIC_SIMILARITY' as task_type,
                           768 AS output_dimensionality)
                   );
  """

  # Starts with INSERT, so RunQuery submits a job and waits for it to finish.
  RunQuery(sql)

In [None]:
# Example lookup: embed a search term with the same model and settings used
# for the stored values, then run VECTOR_SEARCH (top_k => 5) against the
# embeddings stored for one specific column.
# NOTE(review): search_term is interpolated straight into a SQL string literal;
# a value containing a single quote would break the statement.
search_term = "cabin"

# The inner SELECT restricts the search to agentic_beans_curated.camera.camera_location_type
# and keeps only rows whose embedding has exactly 768 elements.
sql = f"""SELECT TO_JSON_STRING(TO_JSON(STRUCT(
          base. text_content AS text_content,
          ROUND(distance,6) as distance
        ))) AS json_result
  FROM VECTOR_SEARCH((SELECT *
                        FROM `data_analytics_agent_metadata.vector_embedding_metadata`
                      WHERE project_id   = '{project_id}'
                        AND dataset_name = 'agentic_beans_curated'
                        AND table_name   = 'camera'
                        AND column_name  = 'camera_location_type'
                        AND array_length(text_embedding) = 768 -- For "null" vector embedding values
                      ),
                     'text_embedding',
                     (SELECT text_embedding,
                             content AS query
                        FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `data_analytics_agent_metadata.textembedding_model`,
                                                        (SELECT '{search_term}' AS content),
                                                        STRUCT(TRUE AS flatten_json_output,
                                                              'SEMANTIC_SIMILARITY' as task_type,
                                                              768 AS output_dimensionality)
                                                        )),
                     top_k => 5)
ORDER BY distance;"""

# The statement starts with SELECT, so RunQuery returns the matches as a DataFrame.
match_df = RunQuery(sql)

# Print each match as a JSON string containing text_content and distance.
for item in match_df.itertuples():
  json_result = item.json_result

  print(f"{json_result}")