### <font color='#4285f4'>Overview</font>

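Overview: Generates synthetic product data (names, descriptions, image prompts and images) with Gemini and Imagen, storing the results in BigQuery and Google Cloud Storage.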

Author:
* Adam Paternostro

### <font color='#4285f4'>License</font>

```
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```

### <font color='#4285f4'>Pip installs</font>

In [None]:
# PIP Installs (if necessary)
import sys

# !{sys.executable} -m pip install REPLACE-ME

### <font color='#4285f4'>Initialize</font>

In [None]:
from PIL import Image
from IPython.display import HTML
import IPython.display
import google.auth
import requests
import json
import uuid
import base64
import os
import cv2
import random
import time
import datetime
import base64
import random

import logging
from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log, retry_if_exception

In [None]:
# Set these (run this cell to verify the output)
# NOTE(review): the "${...}" values look like template placeholders that are
# substituted before the notebook is run — confirm against the deployment tooling.

bigquery_location = "${bigquery_non_multi_region}"
region = "${region}"
# `location` is used below to build the Vertex AI endpoint URLs.
location = "${location}"


# Get the current date and time
now = datetime.datetime.now()

# Format the date and time as desired (year-month-day-hour-minute)
formatted_date = now.strftime("%Y-%m-%d-%H-%M")

# project_id comes from the GOOGLE_CLOUD_PROJECT environment variable (a KeyError
# is raised when it is not set); the active account is captured from gcloud using
# IPython's "!" shell syntax, which returns the command output as a list of lines.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
user = !(gcloud auth list --filter=status:ACTIVE --format="value(account)")

# Exactly one active account is expected; anything else aborts the cell.
if len(user) != 1:
  raise RuntimeError(f"user is not set: {user}")
user = user[0]

print(f"project_id = {project_id}")
print(f"user = {user}")

### <font color='#4285f4'>Helper Methods</font>

#### restAPIHelper
Calls the Google Cloud REST API using the current user's credentials.

In [None]:
def restAPIHelper(url: str, http_verb: str, request_body: str) -> str:
  """Send an authenticated request to a Google Cloud REST endpoint.

  An access token is obtained from the application default credentials and
  sent as a Bearer token together with a JSON content type.

  Args:
      url: The full REST endpoint to call.
      http_verb: One of "GET", "POST", "PUT", "PATCH" or "DELETE".
      request_body: The payload handed to requests as ``json=`` for POST, PUT
          and PATCH calls; ignored for GET and DELETE.

  Returns:
      The response body parsed with ``json.loads`` (the parsed JSON value,
      despite the ``str`` annotation).

  Raises:
      RuntimeError: If the verb is not supported or the status code is not 200.
  """

  import google.auth.transport.requests
  import requests
  import google.auth
  import json

  # Refresh the default credentials so that a current access token is available
  credentials, _ = google.auth.default()
  credentials.refresh(google.auth.transport.requests.Request())

  request_headers = {
    "Content-Type" : "application/json",
    "Authorization" : "Bearer " + credentials.token
  }

  # Verbs are split by whether the request carries a JSON body
  senders_without_body = {"GET": requests.get, "DELETE": requests.delete}
  senders_with_body = {"POST": requests.post, "PUT": requests.put, "PATCH": requests.patch}

  if http_verb in senders_without_body:
    response = senders_without_body[http_verb](url, headers=request_headers)
  elif http_verb in senders_with_body:
    response = senders_with_body[http_verb](url, json=request_body, headers=request_headers)
  else:
    raise RuntimeError(f"Unknown HTTP verb: {http_verb}")

  # Only an exact 200 is treated as success
  if response.status_code != 200:
    raise RuntimeError(f"Error restAPIHelper -> ' Status: '{response.status_code}' Text: '{response.text}'")

  return json.loads(response.content)

#### RetryCondition (for retrying LLM calls)

In [None]:
def RetryCondition(error):
  """Decide whether a failed LLM call should be retried.

  The error is converted to text and printed; the call is retryable when that
  text contains one of the known transient-failure markers.

  Args:
      error: The exception (or any object) describing the failure.

  Returns:
      True when a retry marker is found in the error text, otherwise False.
  """
  message = str(error)
  print(message)

  retryable_markers = (
      "RESOURCE_EXHAUSTED",
      "No content in candidate",
      # Add more error messages here as needed
  )

  if any(marker in message for marker in retryable_markers):
    print("Retrying...")
    return True

  return False

#### Gemini LLM

In [None]:
@retry(wait=wait_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(10), retry=retry_if_exception(RetryCondition), before_sleep=before_sleep_log(logging.getLogger(), logging.INFO))
def GeminiLLM(prompt, model = "gemini-2.5-flash", response_schema = None,
                 temperature = 1, topP = 1, topK = 32):
  """Call the Vertex AI generateContent endpoint and return the text reply.

  The endpoint URL is built from the notebook-level `location` and `project_id`
  variables. The call is wrapped by tenacity: failures accepted by
  RetryCondition are retried with exponential backoff (1-60 seconds) for up to
  10 attempts.

  Args:
      prompt: The text prompt sent as the single user part.
      model: The publisher model name placed in the endpoint URL.
      response_schema: Optional schema added to the generation config as
          "responseSchema".
      temperature: Sampling temperature; negative values are clamped to 0.
      topP: The topP sampling value.
      topK: The topK sampling value (only sent when model is "gemini-2.0-flash").

  Returns:
      The text of the first part of the first candidate with "```json", "```"
      and newline characters removed.

  Raises:
      RuntimeError: If the status code is not 200, the body is not valid JSON,
          or the response has no candidate/content/parts/text.
  """

  # `import google.auth` does not load the transport submodule, so import it
  # here instead of relying on another cell/library having loaded it already.
  import google.auth.transport.requests

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models

  if temperature < 0:
    temperature = 0

  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request() # required to access the access token
  creds.refresh(auth_req)
  access_token=creds.token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + access_token
  }

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent"

  generation_config = {
    "temperature": temperature,
    "topP": topP,
    "maxOutputTokens": 8192,
    "candidateCount": 1,
    "responseMimeType": "application/json",
  }

  # Add in the response schema when it is provided
  if response_schema is not None:
    generation_config["responseSchema"] = response_schema

  if model == "gemini-2.0-flash":
    generation_config["topK"] = topK

  payload = {
    "contents": {
      "role": "user",
      "parts": {
          "text": prompt
      },
    },
    "generation_config": {
      **generation_config
    },
    "safety_settings": {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  try:
    json_response = json.loads(response.content)
  except Exception as error:
    raise RuntimeError(f"An error occurred parsing the JSON: {error}") from error

  # Walk candidates[0].content.parts[0].text, failing with the raw response in
  # the message. The message prefixes are kept as-is because RetryCondition
  # matches on "No content in candidate".
  candidates = json_response.get("candidates")
  if not candidates:
    raise RuntimeError(f"No candidates: {response.text}")

  candidate = candidates[0]
  if "content" not in candidate:
    raise RuntimeError(f"No content in candidate: {response.text}")

  parts = candidate["content"].get("parts")
  if not parts:
    raise RuntimeError(f"No parts in content: {response.text}")

  part = parts[0]
  if "text" not in part:
    raise RuntimeError(f"No text in part: {response.text}")

  # Remove some typical response characters (if asking for a JSON reply)
  llm_response = part["text"]
  llm_response = llm_response.replace("```json","")
  llm_response = llm_response.replace("```","")
  llm_response = llm_response.replace("\n","")

  return llm_response

In [None]:
@retry(wait=wait_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(10), retry=retry_if_exception(RetryCondition), before_sleep=before_sleep_log(logging.getLogger(), logging.INFO))
def GeminiLLM_VerifyImage(prompt, imageBase64, model = "gemini-2.0-flash", response_schema = None,
                 temperature = 1, topP = 1, topK = 32):
  """Call the Vertex AI generateContent endpoint with a prompt plus an image.

  The request carries two user parts: the text prompt and the image as inline
  "image/png" data. The endpoint URL is built from the notebook-level
  `location` and `project_id` variables. The call is wrapped by tenacity:
  failures accepted by RetryCondition are retried with exponential backoff
  (1-60 seconds) for up to 10 attempts.

  Args:
      prompt: The text prompt sent as the first user part.
      imageBase64: The image content, sent as the inline data of the second part.
      model: The publisher model name placed in the endpoint URL.
      response_schema: Optional schema added to the generation config as
          "responseSchema".
      temperature: Sampling temperature; negative values are clamped to 0.
      topP: The topP sampling value.
      topK: The topK sampling value (only sent when model is "gemini-2.0-flash").

  Returns:
      The text of the first part of the first candidate with "```json", "```"
      and newline characters removed.

  Raises:
      RuntimeError: If the status code is not 200, the body is not valid JSON,
          or the response has no candidate/content/parts/text.
  """

  # `import google.auth` does not load the transport submodule, so import it
  # here instead of relying on another cell/library having loaded it already.
  import google.auth.transport.requests

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models

  if temperature < 0:
    temperature = 0

  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request() # required to access the access token
  creds.refresh(auth_req)
  access_token=creds.token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + access_token
  }

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent"

  generation_config = {
    "temperature": temperature,
    "topP": topP,
    "maxOutputTokens": 8192,
    "candidateCount": 1,
    "responseMimeType": "application/json",
  }

  # Add in the response schema when it is provided
  if response_schema is not None:
    generation_config["responseSchema"] = response_schema

  if model == "gemini-2.0-flash":
    generation_config["topK"] = topK

  payload = {
    "contents": {
      "role": "user",
      "parts": [
          { "text": prompt },
          { "inlineData": {  "mimeType": "image/png", "data": f"{imageBase64}" } }
        ]
    },
    "generation_config": {
      **generation_config
    },
    "safety_settings": {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  try:
    json_response = json.loads(response.content)
  except Exception as error:
    raise RuntimeError(f"An error occurred parsing the JSON: {error}") from error

  # Walk candidates[0].content.parts[0].text, failing with the raw response in
  # the message. The message prefixes are kept as-is because RetryCondition
  # matches on "No content in candidate".
  candidates = json_response.get("candidates")
  if not candidates:
    raise RuntimeError(f"No candidates: {response.text}")

  candidate = candidates[0]
  if "content" not in candidate:
    raise RuntimeError(f"No content in candidate: {response.text}")

  parts = candidate["content"].get("parts")
  if not parts:
    raise RuntimeError(f"No parts in content: {response.text}")

  part = parts[0]
  if "text" not in part:
    raise RuntimeError(f"No text in part: {response.text}")

  # Remove some typical response characters (if asking for a JSON reply)
  llm_response = part["text"]
  llm_response = llm_response.replace("```json","")
  llm_response = llm_response.replace("```","")
  llm_response = llm_response.replace("\n","")

  return llm_response

#### Imagen

In [None]:
def ImageGen(prompt):
  """Generate one image with Imagen and save it as a PNG in the working directory.

  The predict endpoint URL is built from the notebook-level `location` variable
  and the project returned by the application default credentials.

  Args:
      prompt: The text prompt describing the image to generate.

  Returns:
      The name of the local file ("<uuid4>.png") that holds the decoded image.

  Raises:
      RuntimeError: If the status code is not 200, or the response does not
          contain a prediction with image bytes.
  """

  # `import google.auth` does not load the transport submodule, so import it
  # here instead of relying on another cell/library having loaded it already.
  import google.auth.transport.requests

  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request()
  creds.refresh(auth_req)
  access_token=creds.token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + access_token
  }

  model_version = "imagen-4.0-generate-preview-06-06" # Preview Access Model

  # https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/image-generation
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project}/locations/{location}/publishers/google/models/{model_version}:predict"

  payload = {
    "instances": [
      {
        "prompt": prompt
      }
    ],
    "parameters": {
      "sampleCount": 1,
      "personGeneration" : "dont_allow"  # change to allow_adult for people generation
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    error = f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'"
    raise RuntimeError(error)

  response_json = json.loads(response.content)

  if "blocked" in response_json:
    print(f"Blocked: {response_json['blocked']}")

  # A missing key and an empty list are both reported as "no predictions"
  # instead of failing later with an IndexError.
  predictions = response_json.get("predictions")
  if not predictions:
    raise RuntimeError(f"No predictions in response: {response.content}")

  # A prediction without image bytes is reported with the same error type
  # rather than surfacing as a KeyError.
  encoded_image = predictions[0].get("bytesBase64Encoded")
  if not encoded_image:
    raise RuntimeError(f"No image bytes in prediction: {response.content}")

  filename = str(uuid.uuid4()) + ".png"
  with open(filename, "wb") as f:
    f.write(base64.b64decode(encoded_image))
  print("Image generated OK.")
  return filename

#### Helper Functions

In [None]:
def RunQuery(sql):
  """Run a SQL statement in BigQuery.

  Statements that start with SELECT or WITH (ignoring leading whitespace and
  letter case) are treated as read queries; everything else is submitted as an
  interactive job that is polled every two seconds until it is DONE.

  Args:
      sql: The SQL text to execute.

  Returns:
      A DataFrame with the result rows for read queries, otherwise True once
      the job has finished without an error.

  Raises:
      Exception: With the job's error_result when the job finished with an error.
  """
  import time
  from google.cloud import bigquery
  client = bigquery.Client()

  # Leading whitespace or lowercase keywords must not send a read query down
  # the job-polling path, which would discard its rows.
  if sql.lstrip().upper().startswith(("SELECT", "WITH")):
    return client.query(sql).to_dataframe()

  job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)
  query_job = client.query(sql, job_config=job_config)

  # Check on the progress by getting the job's updated state.
  while True:
    query_job = client.get_job(
        query_job.job_id, location=query_job.location
    )
    print("Job {} is currently in state {} with error result of {}".format(query_job.job_id, query_job.state, query_job.error_result))
    if query_job.state == "DONE":
      break
    time.sleep(2)

  if query_job.error_result is None:
    return True

  raise Exception(query_job.error_result)

#### GCS

In [None]:
# This was generated by GenAI

def copy_file_to_gcs(local_file_path, bucket_name, destination_blob_name):
  """Copies a file from a local drive to a GCS bucket.

  Files ending in ".html" or ".json" are uploaded with an explicit UTF-8
  content type; any other file is uploaded without one being specified.

  Args:
      local_file_path: The full path to the local file.
      bucket_name: The name of the GCS bucket to upload to.
      destination_blob_name: The desired name of the uploaded file in the bucket.

  Returns:
      None

  Raises:
      FileNotFoundError: If local_file_path does not exist.
  """

  import os

  # Ensure the file exists locally (checked before anything GCS related is
  # loaded so that a bad path fails fast)
  if not os.path.exists(local_file_path):
      raise FileNotFoundError(f"Local file '{local_file_path}' not found.")

  from google.cloud import storage

  # Create a storage client and a blob object with the desired destination path
  storage_client = storage.Client()
  bucket = storage_client.bucket(bucket_name)
  blob = bucket.blob(destination_blob_name)

  # Pick an explicit content type for the file types that need one
  content_type = ""
  if local_file_path.endswith(".html"):
    content_type = "text/html; charset=utf-8"
  elif local_file_path.endswith(".json"):
    content_type = "application/json; charset=utf-8"

  # Upload the file from the local filesystem
  if content_type == "":
    blob.upload_from_filename(local_file_path)
  else:
    blob.upload_from_filename(local_file_path, content_type = content_type)

  print(f"File '{local_file_path}' uploaded to GCS bucket '{bucket_name}' as '{destination_blob_name}'.  Content-Type: '{content_type}'.")

### <font color='#4285f4'>MAIN CODE - Create Product</font>

In [None]:
%%bigquery
-- Create the dataset that holds the raw tables if it does not exist yet.
-- The dataset location is hard-coded to us-central1 in this statement.
CREATE SCHEMA IF NOT EXISTS `agentic_beans_raw` OPTIONS(location = 'us-central1');

In [None]:
%%bigquery
--DROP TABLE IF EXISTS `agentic_beans_raw.product`;

In [None]:
%%bigquery

-- Create the product table if it does not exist yet.
-- The name/description columns are required; the two image columns are
-- nullable and are filled in by a later cell via UPDATE statements.
CREATE TABLE IF NOT EXISTS `agentic_beans_raw.product`
(
    product_id           INT64  NOT NULL OPTIONS(description="The unique identifier and primary key for each product."),
    product_category_id  INT64  NOT NULL OPTIONS(description="A foreign key that links this product to its corresponding category in the product_category table."),
    product_name         STRING NOT NULL OPTIONS(description="The public-facing name of the individual product (e.g., 'Latte', 'Blueberry Muffin', 'Cold Brew')."),
    product_description  STRING NOT NULL OPTIONS(description="A detailed, customer-facing description of the product, suitable for a menu."),
    product_image_prompt STRING          OPTIONS(description="The specific text prompt provided to a generative AI model to create the product image."),
    product_image_uri    STRING          OPTIONS(description="The URI location of the product image stored in Google Cloud Storage (format: gs://bucket-name/image-path/image-name.png).")
)
CLUSTER BY product_id
OPTIONS(
    description="A table containing all individual products available for sale on the coffee trucks' menus."
);

In [None]:
# Response schema that constrains the LLM reply to the two product fields.
# It was generated from the following request:
#   Write me the json in  OpenAPI 3.0 schema object for the below object.
#   Make all fields required.
#    {
#      "product_name" : "text",
#      "product_description" : "text",
#    }
response_schema = {
  "type": "object",
  "required": [
    "product_name",
    "product_description"
  ],
  "properties": {
    "product_name": {
      "type": "string"
    },
    "product_description": {
      "type": "string"
    }
  }
}

# Pick up where you left off (the next id is MAX(product_id) + 1, or 1 for an empty table)
max_product_id_df = RunQuery("SELECT IFNULL(MAX(product_id) + 1,1) as next_product_id FROM `agentic_beans_raw.product`")
max_product_id = int(max_product_id_df['next_product_id'][0])
print(f"max_product_id: {max_product_id}")

# Product category info (cached so each category is only looked up when it changes)
load_product_category_id = 0
product_category_name = ""
product_category_description = ""

for product_id_pk in range(max_product_id,100,1):

  # NOTE(review): ids 1-9 map to category 1, 10-19 to category 2, ... 90-99 to
  # category 10, so the first category receives nine products — confirm this is intended.
  product_category_id = (product_id_pk //  10) + 1
  print(f"product_category_id: {product_category_id}")

  if load_product_category_id != product_category_id:
    product_category_df = RunQuery(f"SELECT product_category_name, product_category_description  FROM `agentic_beans_raw.product_category` WHERE product_category_id = {product_category_id}")
    if product_category_df.empty:
      raise RuntimeError(f"product_category_id {product_category_id} was not found in agentic_beans_raw.product_category")
    product_category_name = str(product_category_df['product_category_name'][0])
    product_category_description = str(product_category_df['product_category_description'][0])
    print(f"product_category_name: {product_category_name}")
    print(f"product_category_description: {product_category_description}")
    # Remember the loaded category; without this the lookup runs on every iteration.
    load_product_category_id = product_category_id

  # The names generated so far are passed to the LLM so it does not repeat them
  existing_products_df = RunQuery("SELECT IFNULL(STRING_AGG(product_name),'') as product_names FROM `agentic_beans_raw.product`")
  existing_products = str(existing_products_df['product_names'][0])
  print(f"existing_products: {existing_products}")

  prompt = f"""Create the below fields for a company that has a fleet of coffee trucks.
                - product_name: A descriptive enticing menu name.  Keep it short and simple.
                - product_description: A long description for the menu item with lots of detail.
                - The data you are generating is for this product category is '{product_category_name}'.
                - The data you are generating is for this product category description is '{product_category_description}'.
                - The product names should be somewhat futuristic and AI related, but not too futuristic.
                - Do not generate these products, they are already exist ({existing_products})"""

  # Use LLM to generate data
  menu_response = GeminiLLM(prompt, response_schema=response_schema)

  # Parse response (we know the JSON since we passed it to our LLM)
  menu_json_response = json.loads(menu_response)
  print(json.dumps(menu_json_response, indent=2))

  # Escape the text for a single-quoted SQL literal. Backslashes are escaped
  # first so that a backslash in the LLM output cannot swallow the closing quote.
  product_name = menu_json_response["product_name"].replace("\\", "\\\\").replace("'","\\'").replace("\r", " ").replace("\n", " ")
  product_description = menu_json_response["product_description"].replace("\\", "\\\\").replace("'", "\\'").replace("\r", " ").replace("\n", " ")

  # Insert to BigQuery
  # Returning a known json schema and then generating an insert statement seems more reliable than having the LLM generate the SQL
  sql = f"""INSERT INTO `agentic_beans_raw.product` (product_id, product_category_id, product_name, product_description)
  VALUES ({product_id_pk}, {product_category_id}, '{product_name}', '{product_description}')"""
  print(f"SQL: {sql}")

  RunQuery(sql)


In [None]:
# Name of the GCS bucket that receives the generated product images.
# NOTE(review): the value looks like a placeholder — set it to an existing bucket before running.
storage_account = "gcs-bucket-namet"

In [None]:
# Response schema that constrains the LLM reply to a single image prompt field.
# Equivalent request:
#   Write me the json in  OpenAPI 3.0 schema object for the below object.
#   Make all fields required.
#    {
#      "image_prompt" : "text",
#    }
response_schema = {
  "type": "object",
  "required": [
    "image_prompt"
  ],
  "properties": {
    "image_prompt": {
      "type": "string"
    }
  }
}

# Pick up where you left off (only products without an image prompt are considered)
min_product_id_df = RunQuery("SELECT IFNULL(MIN(product_id),1) as product_id FROM `agentic_beans_raw.product` WHERE product_image_prompt IS NULL")
min_product_id = int(min_product_id_df['product_id'][0])
print(f"min_product_id: {min_product_id}")

max_product_id_df = RunQuery("SELECT IFNULL(MAX(product_id),0) as product_id FROM `agentic_beans_raw.product` WHERE product_image_prompt IS NULL")
max_product_id = int(max_product_id_df['product_id'][0])
print(f"max_product_id: {max_product_id}")


for product_id_pk in range(min_product_id,max_product_id + 1 ,1):

  # The id range can contain gaps and products that already have an image, so
  # only rows that still need one are loaded and everything else is skipped.
  project_df = RunQuery(f"SELECT product_name, product_description FROM `agentic_beans_raw.product` WHERE product_id = {product_id_pk} AND product_image_prompt IS NULL")
  if project_df.empty:
    print(f"Skipping product_id {product_id_pk}: not found or image already generated.")
    continue

  product_name = str(project_df['product_name'][0])
  product_description = str(project_df['product_description'][0])
  print(f"product_name: {product_name}")
  print(f"product_description: {product_description}")

  prompt = f"""Create an image prompt for the following:
                - This is for a company that has a fleet of coffee trucks.
                - The image is for a product.
                - The product is '{product_name}'
                - The product description is '{product_description}'
                - Just return the image prompt without any other text.
                - The image should represent a product and not contain any people.
                - This image show should a delicious picture of the product.
                - Make the background have a AI theme.  Keep it suttle since we want the product to be the main focus of the image.
                - IMPORTANT: The image should not contain any text (restate this in the generated prompt).
                - IMPORTANT: Do not output the product name in single quotes, this will cause text in the image.
               Best practices for image prompt:
               - Subject: The first thing to think about with any prompt is the subject: the object, person, animal, or scenery you want an image of.
               - Context and background: Just as important is the background or context in which the subject will be placed. Try placing your subject in a variety of backgrounds. For example, a studio with a white background, outdoors, or indoor environments.
               - Style: Finally, add the style of image you want. Styles can be general (painting, photograph, sketches) or very specific (pastel painting, charcoal drawing, isometric 3D).
               - Use descriptive language: Employ detailed adjectives and adverbs to paint a clear picture for Imagen 3.
               - Provide context: If necessary, include background information to aid the AI's understanding.
               - Reference specific artists or styles: If you have a particular aesthetic in mind, referencing specific artists or art movements can be helpful.
               - Lighting - natural, dramatic, warm, cold.
               - Camera Settings - motion blur, soft focus, bokeh, portrait.
               - Lens types - 35mm, 50mm, fisheye, wide angle, macro.
               - General Modifiers - high-quality, beautiful, stylized.
               - Photos - 4K, HDR, Studio Photo.
               - Art, Illustration - by a professional, detailed.
"""

  # Use LLM to generate data
  image_response = GeminiLLM(prompt, response_schema=response_schema)

  # Parse response (we know the JSON since we passed it to our LLM)
  image_json_response = json.loads(image_response)
  print(json.dumps(image_json_response, indent=2))
  product_image_prompt = image_json_response["image_prompt"]

  # Generate the menu image
  filename = ImageGen(product_image_prompt)

  # View it
  img = Image.open(filename)
  img.thumbnail([500,500]) # width, height
  IPython.display.display(img)

  # Copy to GCS
  product_image_uri = f"gs://{storage_account}/agentic-beans/generate-product/{product_id_pk}.png"
  copy_file_to_gcs(filename, storage_account, f"agentic-beans/generate-product/{product_id_pk}.png")

  # Update BigQuery
  # Returning a known json schema and then generating the statement seems more reliable than having the LLM generate the SQL
  # Escape the text for a single-quoted SQL literal. Backslashes are escaped
  # first so that a backslash in the LLM output cannot swallow the closing quote.
  product_image_prompt = product_image_prompt.replace("\\", "\\\\").replace("'","\\'").replace("\r", " ").replace("\n", " ")
  sql = f"""UPDATE `agentic_beans_raw.product`
  SET product_image_prompt = '{product_image_prompt}',
      product_image_uri = '{product_image_uri}'
  WHERE product_id = {product_id_pk}"""
  print(f"SQL: {sql}")

  RunQuery(sql)