### <font color='#4285f4'>Overview</font>

Overview: Generates synthetic customer data


Cost:
* Approximate cost: $1

Author:
* Adam Paternostro

### <font color='#4285f4'>License</font>

```
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```

### <font color='#4285f4'>Pip installs</font>

In [None]:
# PIP Installs (if necessary)
import sys

# !{sys.executable} -m pip install REPLACE-ME

### <font color='#4285f4'>Initialize</font>

In [None]:
from PIL import Image
from IPython.display import HTML
import IPython.display
import google.auth
import requests
import json
import uuid
import base64
import os
import cv2
import random
import time
import datetime
import base64
import random

import logging
from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log, retry_if_exception

In [None]:
# Set these (run this cell to verify the output)

# NOTE(review): the "${...}" values look like template placeholders that are
# presumably substituted before the notebook is run - confirm.
bigquery_location = "${bigquery_non_multi_region}"
region = "${region}"
# `location` is interpolated into the Vertex AI endpoint URLs built by the helper functions
location = "${location}"

# Get the current date and time (naive local time; no timezone is attached)
now = datetime.datetime.now()

# Format the date and time as desired
formatted_date = now.strftime("%Y-%m-%d-%H-%M")

# Get some values using gcloud
# project_id comes from the environment; a missing variable raises KeyError
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# IPython shell escape: runs gcloud and captures its output lines as a list
user = !(gcloud auth list --filter=status:ACTIVE --format="value(account)")

# Exactly one active account is expected; anything else aborts the cell
if len(user) != 1:
  raise RuntimeError(f"user is not set: {user}")
user = user[0]

print(f"project_id = {project_id}")
print(f"user = {user}")

### <font color='#4285f4'>Helper Methods</font>

#### restAPIHelper
Calls the Google Cloud REST API using the current user's credentials.

In [None]:
def restAPIHelper(url: str, http_verb: str, request_body: "dict | None") -> "dict | list":
  """Call a Google Cloud REST endpoint with the current user's credentials.

  Args:
      url: Full URL of the REST endpoint.
      http_verb: One of "GET", "POST", "PUT", "PATCH" or "DELETE".
      request_body: JSON-serializable payload sent with POST/PUT/PATCH
          requests; ignored for GET and DELETE.

  Returns:
      The parsed JSON response body, or an empty dict when a successful
      response carries no body (e.g. HTTP 204).

  Raises:
      RuntimeError: If the verb is not supported or the response status is
          not a 2xx success code.
  """

  # Validate the verb before doing any credential work so bad calls fail fast
  verbs_with_body = ("POST", "PUT", "PATCH")
  verbs_without_body = ("GET", "DELETE")
  if http_verb not in verbs_with_body + verbs_without_body:
    raise RuntimeError(f"Unknown HTTP verb: {http_verb}")

  import google.auth.transport.requests
  import requests
  import google.auth
  import json

  # Get an access token based upon the current user
  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request()
  creds.refresh(auth_req)
  access_token = creds.token

  headers = {
    "Content-Type" : "application/json",
    "Authorization" : "Bearer " + access_token
  }

  # requests exposes one function per verb (requests.get, requests.post, ...)
  send = getattr(requests, http_verb.lower())
  if http_verb in verbs_with_body:
    response = send(url, json=request_body, headers=headers)
  else:
    response = send(url, headers=headers)

  # Any 2xx status is a success; some calls legitimately return an empty body
  if 200 <= response.status_code < 300:
    if not response.content:
      return {}
    return json.loads(response.content)

  error = f"Error restAPIHelper -> Status: '{response.status_code}' Text: '{response.text}'"
  raise RuntimeError(error)

#### RetryCondition (for retrying LLM calls)

In [None]:
def RetryCondition(error):
  """Decide whether a failed LLM call is worth retrying.

  The error is converted to text and printed, then checked for any of the
  known transient-failure markers. Returns True (after printing
  "Retrying...") when a marker is found, otherwise False.
  """
  message = str(error)
  print(message)

  # Substrings that identify a retryable failure; add more here as needed
  retryable_markers = (
      "RESOURCE_EXHAUSTED",
      "No content in candidate",
  )

  if any(marker in message for marker in retryable_markers):
    print("Retrying...")
    return True

  return False

#### Gemini LLM

In [None]:
@retry(wait=wait_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(10), retry=retry_if_exception(RetryCondition), before_sleep=before_sleep_log(logging.getLogger(), logging.INFO))
def GeminiLLM(prompt, model = "gemini-2.5-flash", response_schema = None,
                 temperature = 1, topP = 1, topK = 32):
  """Send a text prompt to a Gemini model on Vertex AI and return the reply text.

  The request asks for a JSON response ("responseMimeType": "application/json")
  and uses the notebook-level `location` and `project_id` values to build the
  endpoint URL.

  Args:
      prompt: Text of the user prompt.
      model: Model name placed in the endpoint URL.
      response_schema: Optional schema added to the generation config as
          "responseSchema".
      temperature: Sampling temperature; negative values are clamped to 0.
      topP: Value sent as "topP".
      topK: Value sent as "topK", only when model is "gemini-2.0-flash".

  Returns:
      The text of the first part of the first candidate, with Markdown code
      fences ("```json", "```") and newline characters removed.

  Raises:
      RuntimeError: On a non-200 status, an unparsable body, or a response
          lacking candidates/content/parts/text. The decorator retries the
          call (up to 10 attempts, exponential wait) when RetryCondition
          accepts the error.
  """

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models

  # Imported locally (as restAPIHelper does) so the transport submodule is
  # guaranteed to be loaded; the notebook's top-level imports only name google.auth.
  import google.auth.transport.requests

  if temperature < 0:
    temperature = 0

  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request() # required to access the access token
  creds.refresh(auth_req)
  access_token = creds.token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + access_token
  }

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent"

  generation_config = {
    "temperature": temperature,
    "topP": topP,
    "maxOutputTokens": 65536,
    "candidateCount": 1,
    "responseMimeType": "application/json",
  }

  # Add in the response schema when it is provided
  if response_schema is not None:
    generation_config["responseSchema"] = response_schema

  if model == "gemini-2.0-flash":
    generation_config["topK"] = topK

  payload = {
    "contents": {
      "role": "user",
      "parts": {
          "text": prompt
      },
    },
    "generation_config": {
      **generation_config
    },
    "safety_settings": {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  try:
    json_response = json.loads(response.content)
  except Exception as error:
    raise RuntimeError(f"An error occurred parsing the JSON: {error}")

  # Walk candidates[0].content.parts[0].text, failing with a specific message
  # at the first missing level (RetryCondition matches on these messages).
  if "candidates" not in json_response or len(json_response["candidates"]) == 0:
    raise RuntimeError(f"No candidates: {response.content}")
  candidate = json_response["candidates"][0]

  if "content" not in candidate:
    raise RuntimeError(f"No content in candidate: {response.content}")
  content = candidate["content"]

  if "parts" not in content or len(content["parts"]) == 0:
    raise RuntimeError(f"No parts in content: {response.content}")
  part = content["parts"][0]

  if "text" not in part:
    raise RuntimeError(f"No text in part: {response.content}")
  llm_response = part["text"]

  # Remove some typical wrapper characters (if asking for a JSON reply)
  llm_response = llm_response.replace("```json","")
  llm_response = llm_response.replace("```","")
  llm_response = llm_response.replace("\n","")

  return llm_response

In [None]:
@retry(wait=wait_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(10), retry=retry_if_exception(RetryCondition), before_sleep=before_sleep_log(logging.getLogger(), logging.INFO))
def GeminiLLM_VerifyImage(prompt, imageBase64, model = "gemini-2.0-flash", response_schema = None,
                 temperature = 1, topP = 1, topK = 32):
  """Send a prompt plus an inline PNG image to a Gemini model and return the reply text.

  The request asks for a JSON response ("responseMimeType": "application/json")
  and uses the notebook-level `location` and `project_id` values to build the
  endpoint URL.

  Args:
      prompt: Text of the user prompt.
      imageBase64: Image data sent as inline data with MIME type "image/png"
          (the name indicates base64-encoded content).
      model: Model name placed in the endpoint URL.
      response_schema: Optional schema added to the generation config as
          "responseSchema".
      temperature: Sampling temperature; negative values are clamped to 0.
      topP: Value sent as "topP".
      topK: Value sent as "topK", only when model is "gemini-2.0-flash".

  Returns:
      The text of the first part of the first candidate, with Markdown code
      fences ("```json", "```") and newline characters removed.

  Raises:
      RuntimeError: On a non-200 status, an unparsable body, or a response
          lacking candidates/content/parts/text. The decorator retries the
          call (up to 10 attempts, exponential wait) when RetryCondition
          accepts the error.
  """

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models

  # Imported locally (as restAPIHelper does) so the transport submodule is
  # guaranteed to be loaded; the notebook's top-level imports only name google.auth.
  import google.auth.transport.requests

  if temperature < 0:
    temperature = 0

  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request() # required to access the access token
  creds.refresh(auth_req)
  access_token = creds.token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + access_token
  }

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent"

  generation_config = {
    "temperature": temperature,
    "topP": topP,
    "maxOutputTokens": 8192,
    "candidateCount": 1,
    "responseMimeType": "application/json",
  }

  # Add in the response schema when it is provided
  if response_schema is not None:
    generation_config["responseSchema"] = response_schema

  if model == "gemini-2.0-flash":
    generation_config["topK"] = topK

  payload = {
    "contents": {
      "role": "user",
      "parts": [
          { "text": prompt },
          { "inlineData": {  "mimeType": "image/png", "data": f"{imageBase64}" } }
        ]
    },
    "generation_config": {
      **generation_config
    },
    "safety_settings": {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  try:
    json_response = json.loads(response.content)
  except Exception as error:
    raise RuntimeError(f"An error occurred parsing the JSON: {error}")

  # Walk candidates[0].content.parts[0].text, failing with a specific message
  # at the first missing level (RetryCondition matches on these messages).
  if "candidates" not in json_response or len(json_response["candidates"]) == 0:
    raise RuntimeError(f"No candidates: {response.content}")
  candidate = json_response["candidates"][0]

  if "content" not in candidate:
    raise RuntimeError(f"No content in candidate: {response.content}")
  content = candidate["content"]

  if "parts" not in content or len(content["parts"]) == 0:
    raise RuntimeError(f"No parts in content: {response.content}")
  part = content["parts"][0]

  if "text" not in part:
    raise RuntimeError(f"No text in part: {response.content}")
  llm_response = part["text"]

  # Remove some typical wrapper characters (if asking for a JSON reply)
  llm_response = llm_response.replace("```json","")
  llm_response = llm_response.replace("```","")
  llm_response = llm_response.replace("\n","")

  return llm_response

#### Imagen

In [None]:
def ImageGen(prompt):
  """Generate one image from a text prompt with Imagen and save it as a local PNG.

  Args:
      prompt: Text prompt sent to the image model.

  Returns:
      The name of the file written to the current working directory
      (a random UUID with a ".png" extension).

  Raises:
      RuntimeError: If the response status is not 200 or the response holds
          no predictions.
  """
  # Imported locally (as restAPIHelper does) so the transport submodule is
  # guaranteed to be loaded; the notebook's top-level imports only name google.auth.
  import google.auth.transport.requests

  creds, project = google.auth.default()
  auth_req = google.auth.transport.requests.Request()
  creds.refresh(auth_req)
  access_token = creds.token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + access_token
  }

  model_version = "imagen-4.0-generate-preview-06-06" # Preview Access Model

  # google.auth.default() may not report a project; fall back to the
  # notebook-level project_id that the Gemini helpers already use.
  target_project = project or project_id

  # https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/image-generation
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{target_project}/locations/{location}/publishers/google/models/{model_version}:predict"

  payload = {
    "instances": [
      {
        "prompt": prompt
      }
    ],
    "parameters": {
      "sampleCount": 1,
      "personGeneration" : "dont_allow"  # change to allow_adult for people generation
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    error = f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'"
    raise RuntimeError(error)

  response_json = json.loads(response.content)

  if "blocked" in response_json:
    print(f"Blocked: {response_json['blocked']}")

  # A missing key and an empty list both mean there is no image to save
  if "predictions" not in response_json or not response_json["predictions"]:
    raise RuntimeError(f"No predictions in response: {response.content}")

  image_data = base64.b64decode(response_json["predictions"][0]["bytesBase64Encoded"])
  filename = str(uuid.uuid4()) + ".png"
  with open(filename, "wb") as f:
    f.write(image_data)
  print("Image generated OK.")
  return filename

#### Helper Functions

In [None]:
def RunQuery(sql):
  """Run a BigQuery SQL statement.

  Statements that begin with SELECT or WITH (ignoring leading whitespace and
  letter case) are executed and returned as a DataFrame. Any other statement
  is submitted as an interactive-priority job and polled every 2 seconds,
  printing its state, until it reports "DONE".

  Args:
      sql: The SQL text to execute.

  Returns:
      A DataFrame for SELECT/WITH statements; True for other statements that
      finish without an error.

  Raises:
      Exception: With the job's error_result when a non-query job fails.
  """
  import time
  from google.cloud import bigquery
  client = bigquery.Client()

  # Tolerate leading whitespace and lower-case keywords when detecting queries
  if sql.lstrip().upper().startswith(("SELECT", "WITH")):
    return client.query(sql).to_dataframe()

  job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)
  query_job = client.query(sql, job_config=job_config)

  # Check on the progress by re-fetching the job until it reports DONE
  while True:
    query_job = client.get_job(query_job.job_id, location=query_job.location)
    print("Job {} is currently in state {} with error result of {}".format(query_job.job_id, query_job.state, query_job.error_result))
    if query_job.state == "DONE":
      break
    time.sleep(2)

  if query_job.error_result is None:
    return True
  raise Exception(query_job.error_result)

In [None]:
def GetTableSchema(project_id, dataset_name, table_name):
  """Return a BigQuery table's schema serialized as a JSON string.

  Args:
      project_id: Project that owns the dataset.
      dataset_name: Dataset containing the table.
      table_name: Table whose schema is read.

  Returns:
      The JSON text written by the BigQuery client's schema_to_json.
  """
  import io
  from google.cloud import bigquery

  client = bigquery.Client()

  # Build the reference directly; Client.dataset() is deprecated in favor of DatasetReference
  table_ref = bigquery.DatasetReference(project_id, dataset_name).table(table_name)
  table = client.get_table(table_ref)

  # schema_to_json writes to a file-like object, so capture it in memory
  buffer = io.StringIO("")
  client.schema_to_json(table.schema, buffer)
  return buffer.getvalue()

#### GCS

In [None]:
# This was generated by GenAI

def copy_file_to_gcs(local_file_path, bucket_name, destination_blob_name):
  """Copies a file from a local drive to a GCS bucket.

  Files ending in ".html" or ".json" are uploaded with an explicit UTF-8
  content type; any other file is uploaded without specifying one.

  Args:
      local_file_path: The full path to the local file.
      bucket_name: The name of the GCS bucket to upload to.
      destination_blob_name: The desired name of the uploaded file in the bucket.

  Returns:
      None

  Raises:
      FileNotFoundError: If local_file_path does not exist.
  """

  import os

  # Ensure the file exists locally before touching the storage client
  if not os.path.exists(local_file_path):
      raise FileNotFoundError(f"Local file '{local_file_path}' not found.")

  from google.cloud import storage

  # Create a storage client and a blob handle for the destination path
  storage_client = storage.Client()
  bucket = storage_client.bucket(bucket_name)
  blob = bucket.blob(destination_blob_name)

  # Explicit content types for the file extensions that need one
  content_types_by_suffix = {
    ".html": "text/html; charset=utf-8",
    ".json": "application/json; charset=utf-8",
  }
  content_type = ""
  for suffix, mapped_type in content_types_by_suffix.items():
    if local_file_path.endswith(suffix):
      content_type = mapped_type

  # Upload the file from the local filesystem
  if content_type == "":
    blob.upload_from_filename(local_file_path)
  else:
    blob.upload_from_filename(local_file_path, content_type = content_type)

  print(f"File '{local_file_path}' uploaded to GCS bucket '{bucket_name}' as '{destination_blob_name}'.  Content-Type: '{content_type}'.")

### <font color='#4285f4'>MAIN CODE - Create Customers</font>

In [None]:
%%bigquery
-- Creates the raw dataset if it is missing.
-- NOTE(review): the location is hard-coded to 'us-central1' while the notebook also defines a bigquery_location setting - confirm they are meant to match.
CREATE SCHEMA IF NOT EXISTS `agentic_beans_raw` OPTIONS(location = 'us-central1');

In [None]:
%%bigquery
--DROP TABLE IF EXISTS `agentic_beans_raw.customer`;

In [None]:
%%bigquery

-- Creates the customer table (if it does not already exist) that the data
-- generation loop inserts into. All columns are required; rows are clustered by customer_id.
CREATE TABLE IF NOT EXISTS `agentic_beans_raw.customer`
(
    customer_id             INTEGER NOT NULL OPTIONS(description="The unique identifier and primary key for each customer."),
    customer_name           STRING  NOT NULL OPTIONS(description="The full name of the customer."),
    customer_yob            INTEGER NOT NULL OPTIONS(description="The customer's year of birth, used for demographic analysis."),
    customer_email          STRING  NOT NULL OPTIONS(description="The unique email address of the customer, used for marketing and receipts."),
    customer_inception_date DATE    NOT NULL OPTIONS(description="The date of the customer's first transaction, marking their start date."),
    country_code            STRING  NOT NULL OPTIONS(description="The two-letter ISO 3166-1 alpha-2 country code of the customer (e.g., 'US', 'CA', 'GB').")
)
CLUSTER BY customer_id
OPTIONS(
    description="A table containing demographic and contact information for individual customers."
);

In [None]:
from datetime import timedelta
from datetime import date

# Inclusive range of birth years; each generated customer gets a random year from it
yob_start = 1960
yob_end = 2009

In [None]:
# Generates synthetic customers in batches: ask the LLM for names/emails for a
# randomly chosen nationality, fill in the remaining columns locally, and
# insert each batch into BigQuery. A failed batch is retried without
# consuming customer ids or polluting the "already used names" list.

# Write me the json in  OpenAPI 3.0 schema object for the below object.
# Make all fields required.
#  {
#    "customer_name" : "text",
#    "customer_email" : "text",
#  }
response_schema = {
  "type": "object",
  "required": [
    "customer_data"
  ],
  "properties": {
    "customer_data": {
      "type": "array",
      "items": {
        "type": "object",
        "required": [
          "customer_name",
          "customer_email",
        ],
        "properties": {
          "customer_name": {
            "type": "string"
          },
          "customer_email": {
            "type": "string"
          }
        }
      }
    }
  }
}


# Pick up where you left off: despite its name, max_customer_id holds the NEXT
# id to assign (MAX + 1, or 1 when the table is empty).
max_customer_id_df = RunQuery("SELECT IFNULL(MAX(customer_id) + 1,1) as customer_id FROM `agentic_beans_raw.customer`")
max_customer_id = int(max_customer_id_df['customer_id'][0])

dataset_name = "agentic_beans_raw"
table_name = "customer"
table_schema = GetTableSchema(project_id,dataset_name,table_name)

other_countries = [
    ("Haiti", "HT"),
    ("Colombia", "CO"),
    ("Russia", "RU"),
    ("Trinidad and Tobago", "TT"),
    ("Guyana", "GY"),
    ("Dominican Republic", "DO"),
    ("Jamaica", "JM"),
    ("Mexico", "MX"),
    ("Ecuador", "EC"),
    ("India", "IN"),
    ("El Salvador", "SV"),
    ("Bangladesh", "BD"),
    ("Brazil", "BR")
]

existing_customer_names_df = RunQuery("SELECT IFNULL(STRING_AGG(customer_name),'') as customer_names FROM `agentic_beans_raw.customer`")
existing_customer_names = str(existing_customer_names_df['customer_names'][0])

# Inception dates are drawn uniformly from this inclusive date range
inception_start_date = date(2020, 1, 1)
inception_end_date = date(2025, 12, 31)
inception_total_days = (inception_end_date - inception_start_date).days


def _pick_country():
  """Return a (country, country_code) pair using the weighted distribution below."""
  rand_int = random.randint(1, 100)

  # Per Gemini
  if rand_int <= 65:
    # ~65% of the population is native-born.
    return "American", "US"
  if rand_int <= 75:
    # Dominican Republic is the largest immigrant group. (~10%)
    return "Dominican", "DO"
  if rand_int <= 83:
    # ~8% is drawn at random from the other_countries list.
    return random.choice(other_countries)
  if rand_int <= 88:
    # Jamaica has a significant presence. (~5%)
    return "Jamaican", "JM"
  if rand_int <= 92:
    # Mexico is another major group. (~4%)
    return "Mexican", "MX"
  if rand_int <= 95:
    # India has a growing population in the city. (~3%)
    return "Indian", "IN"
  # The remaining 5% is a mix of other prominent nationalities.
  return random.choice(other_countries)


def _sql_string_body(value):
  """Escape text for use inside a single-quoted BigQuery string literal."""
  # Backslashes must be escaped first so the quote escapes added next are not doubled
  escaped = value.replace("\\", "\\\\").replace("'", "\\'")
  return escaped.replace("\r", " ").replace("\n", " ")


customer_id = max_customer_id
while customer_id <= 10000:
  print(f"customer_id: {customer_id}")
  success = False
  # NOTE: a batch is retried until it succeeds; there is no attempt limit
  while not success:
    try:
      country, country_code = _pick_country()

      prompt = f"""You are a database engineer and need to generate data for a table for the below schema.
      I need you to generate a 100 customer names and email addresses based in the country {country}.
      The customer email should be a random email address.
      Read the description of each field for valid values.
      Encourage unconventional ideas and fresh perspectives and inspires unique variations when creating the customer's name.

      Here is the table schema:
      <schema>
      {table_schema}
      </schema>

      Here are the existing customer name, do not reuse any of these names:
      <existing_customer_names>
      {existing_customer_names}
      </existing_customer_names>
      """

      # Use LLM to generate data
      customer_response = GeminiLLM(prompt, response_schema=response_schema)

      # Parse response (we know the JSON since we passed it to our LLM)
      customer_response = json.loads(customer_response)

      # Build the batch with local state only; the shared counters are updated
      # after the INSERT succeeds so a failed batch leaves no gaps behind.
      value_rows = []
      batch_names = []
      next_customer_id = customer_id
      for item in customer_response["customer_data"]:
        customer_name = _sql_string_body(item["customer_name"])
        customer_email = _sql_string_body(item["customer_email"])
        customer_yob = random.randint(yob_start, yob_end)
        random_days = random.randint(0, inception_total_days)
        random_inception_date = inception_start_date + timedelta(days=random_days)
        customer_inception_date = random_inception_date.strftime('%Y-%m-%d')
        value_rows.append(f"""({next_customer_id}, '{customer_name}', {customer_yob}, '{customer_email}', '{customer_inception_date}', '{country_code}')""")
        # The prompt gets the readable name, not the SQL-escaped one
        batch_names.append(item["customer_name"].replace("\r", " ").replace("\n", " "))
        next_customer_id += 1

      if not value_rows:
        # An empty VALUES list would be invalid SQL; treat it as a failed attempt
        raise RuntimeError("LLM returned no customer rows")

      sql = f"""INSERT INTO `{project_id}.{dataset_name}.{table_name}`
              (customer_id, customer_name, customer_yob, customer_email, customer_inception_date, country_code)
              VALUES """ + ",".join(value_rows)
      RunQuery(sql)

      # Commit the batch: advance the id counter and remember the new names
      customer_id = next_customer_id
      existing_customer_names = ",".join(
          name for name in [existing_customer_names, *batch_names] if name
      )

      success = True
    except Exception as error:
      print(f"An error occurred: {error}")