### <font color='#4285f4'>Overview</font>

Overview: Generates synthetic event data

Author:
* Adam Paternostro

### <font color='#4285f4'>License</font>

```
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```

### <font color='#4285f4'>Pip installs</font>

In [None]:
# PIP Installs (if necessary)
import sys

# !{sys.executable} -m pip install REPLACE-ME

### <font color='#4285f4'>Initialize</font>

In [None]:
from PIL import Image
from IPython.display import HTML
import IPython.display
import google.auth
import requests
import json
import uuid
import base64
import os
import cv2
import random
import time
import datetime
import base64
import random

import logging
from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log, retry_if_exception

In [None]:
# Notebook configuration (run this cell to verify the output).
# NOTE(review): the "${...}" values look like template placeholders that are
# substituted before the notebook is delivered -- confirm they were replaced.

bigquery_location = "${bigquery_non_multi_region}"
region = "${region}"
# `location` is interpolated into the Vertex AI endpoint URLs built by the helper functions below.
location = "${location}"

# Get the current date and time
now = datetime.datetime.now()

# Minute-resolution timestamp string, e.g. "2025-01-31-14-05"
formatted_date = now.strftime("%Y-%m-%d-%H-%M")

# The project comes from the environment; the active account comes from gcloud.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# IPython shell capture: the result is a list-like with one entry per output line.
user = !(gcloud auth list --filter=status:ACTIVE --format="value(account)")

# Exactly one ACTIVE account is expected; anything else is treated as a setup error.
if len(user) != 1:
  raise RuntimeError(f"user is not set: {user}")
user = user[0]

print(f"project_id = {project_id}")
print(f"user = {user}")

### <font color='#4285f4'>Helper Methods</font>

#### restAPIHelper
Calls the Google Cloud REST API using the current user's credentials.

In [None]:
def restAPIHelper(url: str, http_verb: str, request_body: object) -> object:
  """Calls a Google Cloud REST API using the current user's credentials.

  Args:
      url: The full REST endpoint to call.
      http_verb: One of "GET", "POST", "PUT", "PATCH" or "DELETE" (upper case).
      request_body: JSON-serializable payload sent as the request body for
          POST, PUT and PATCH. Ignored for GET and DELETE.

  Returns:
      The response body parsed from JSON.

  Raises:
      RuntimeError: If the verb is not supported or the response status is not 200.
  """
  verbs_with_body = ("POST", "PUT", "PATCH")
  verbs_without_body = ("GET", "DELETE")

  # Fail fast on a bad verb before doing a credential refresh (a network call).
  if http_verb not in verbs_with_body + verbs_without_body:
    raise RuntimeError(f"Unknown HTTP verb: {http_verb}")

  import google.auth.transport.requests
  import requests
  import google.auth
  import json

  # Get an access token based upon the current user
  creds, _ = google.auth.default()
  creds.refresh(google.auth.transport.requests.Request())

  headers = {
    "Content-Type" : "application/json",
    "Authorization" : "Bearer " + creds.token
  }

  if http_verb in verbs_with_body:
    response = requests.request(http_verb, url, json=request_body, headers=headers)
  else:
    response = requests.request(http_verb, url, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error restAPIHelper -> Status: '{response.status_code}' Text: '{response.text}'")

  return json.loads(response.content)

#### RetryCondition (for retrying LLM calls)

In [None]:
def RetryCondition(error):
  """Decides whether a failed LLM call should be retried.

  The error is converted to text and printed. The call is retryable when that
  text contains any of the known transient-failure markers, in which case
  "Retrying..." is printed as well.

  Args:
      error: The exception (or any object) describing the failure.

  Returns:
      True when the error text contains a retryable marker, otherwise False.
  """
  message = str(error)
  print(message)

  retryable_markers = (
      "RESOURCE_EXHAUSTED",
      "No content in candidate",
      # Add more error messages here as needed
  )

  should_retry = any(marker in message for marker in retryable_markers)
  if should_retry:
    print("Retrying...")

  return should_retry

#### Gemini LLM

In [None]:
@retry(wait=wait_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(10), retry=retry_if_exception(RetryCondition), before_sleep=before_sleep_log(logging.getLogger(), logging.INFO))
def GeminiLLM(prompt, model = "gemini-2.5-flash", response_schema = None,
                 temperature = 1, topP = 1, topK = 32):
  """Sends a text prompt to a Gemini model on Vertex AI and returns the reply text.

  The request asks for a JSON response ("responseMimeType": "application/json").
  The endpoint is built from the notebook-level `location` and `project_id`.
  Failures whose message matches RetryCondition are retried by the decorator.

  Args:
      prompt: The text prompt.
      model: The Gemini model id placed in the endpoint URL.
      response_schema: Optional schema sent as "responseSchema".
      temperature: Sampling temperature; negative values are clamped to 0.
      topP: Sent as "topP".
      topK: Sent as "topK" only when model is "gemini-2.0-flash".

  Returns:
      The text of the first part of the first candidate, with Markdown code
      fences and newline characters removed.

  Raises:
      RuntimeError: On a non-200 status, an unparsable body, or a response
          without candidates / content / parts / text.
  """
  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models

  # Import the transport submodule explicitly so this function does not rely
  # on another cell having loaded it first.
  import google.auth.transport.requests

  temperature = max(temperature, 0)

  creds, _ = google.auth.default()
  creds.refresh(google.auth.transport.requests.Request()) # required to access the token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + creds.token
  }

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent"

  generation_config = {
    "temperature": temperature,
    "topP": topP,
    "maxOutputTokens": 65536,
    "candidateCount": 1,
    "responseMimeType": "application/json",
  }

  # Add in the response schema when it is provided
  if response_schema is not None:
    generation_config["responseSchema"] = response_schema

  if model == "gemini-2.0-flash":
    generation_config["topK"] = topK

  payload = {
    "contents": {
      "role": "user",
      "parts": {
          "text": prompt
      },
    },
    "generation_config": {
      **generation_config
    },
    "safety_settings": {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  try:
    json_response = json.loads(response.content)
  except Exception as error:
    raise RuntimeError(f"An error occurred parsing the JSON: {error}") from error

  # Walk candidates[0].content.parts[0].text, failing with a specific message
  # at the first missing level. RetryCondition matches on some of these texts.
  if "candidates" not in json_response:
    raise RuntimeError(f"No candidates: {response.content}")

  candidates = json_response["candidates"]
  if len(candidates) == 0:
    print(f"GeminiLLM response: {response.content}")
    raise RuntimeError(f"No candidates: {response.content}")

  candidate = candidates[0]
  if "content" not in candidate:
    raise RuntimeError(f"No content in candidate: {response.content}")

  content = candidate["content"]
  if "parts" not in content or not content["parts"]:
    raise RuntimeError(f"No parts in content: {response.content}")

  part = content["parts"][0]
  if "text" not in part:
    raise RuntimeError(f"No text in part: {response.content}")

  # Remove characters that typically wrap a JSON reply
  llm_response = part["text"]
  llm_response = llm_response.replace("```json","")
  llm_response = llm_response.replace("```","")
  llm_response = llm_response.replace("\n","")

  return llm_response

In [None]:
@retry(wait=wait_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(10), retry=retry_if_exception(RetryCondition), before_sleep=before_sleep_log(logging.getLogger(), logging.INFO))
def GeminiLLM_VerifyImage(prompt, imageBase64, model = "gemini-2.0-flash", response_schema = None,
                 temperature = 1, topP = 1, topK = 32):
  """Sends a text prompt plus an inline PNG image to Gemini and returns the reply text.

  The request asks for a JSON response ("responseMimeType": "application/json").
  The endpoint is built from the notebook-level `location` and `project_id`.
  Failures whose message matches RetryCondition are retried by the decorator.

  Args:
      prompt: The text prompt.
      imageBase64: The image content, sent as inline data with MIME type "image/png".
      model: The Gemini model id placed in the endpoint URL.
      response_schema: Optional schema sent as "responseSchema".
      temperature: Sampling temperature; negative values are clamped to 0.
      topP: Sent as "topP".
      topK: Sent as "topK" only when model is "gemini-2.0-flash".

  Returns:
      The text of the first part of the first candidate, with Markdown code
      fences and newline characters removed.

  Raises:
      RuntimeError: On a non-200 status, an unparsable body, or a response
          without candidates / content / parts / text.
  """
  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models

  # Import the transport submodule explicitly so this function does not rely
  # on another cell having loaded it first.
  import google.auth.transport.requests

  temperature = max(temperature, 0)

  creds, _ = google.auth.default()
  creds.refresh(google.auth.transport.requests.Request()) # required to access the token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + creds.token
  }

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent"

  generation_config = {
    "temperature": temperature,
    "topP": topP,
    "maxOutputTokens": 8192,
    "candidateCount": 1,
    "responseMimeType": "application/json",
  }

  # Add in the response schema when it is provided
  if response_schema is not None:
    generation_config["responseSchema"] = response_schema

  if model == "gemini-2.0-flash":
    generation_config["topK"] = topK

  payload = {
    "contents": {
      "role": "user",
      "parts": [
          { "text": prompt },
          { "inlineData": {  "mimeType": "image/png", "data": f"{imageBase64}" } }
        ]
    },
    "generation_config": {
      **generation_config
    },
    "safety_settings": {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  try:
    json_response = json.loads(response.content)
  except Exception as error:
    raise RuntimeError(f"An error occurred parsing the JSON: {error}") from error

  # Walk candidates[0].content.parts[0].text, failing with a specific message
  # at the first missing level. The messages are f-strings so the raw response
  # body is actually included (they previously printed a literal placeholder).
  if "candidates" not in json_response:
    raise RuntimeError(f"No candidates: {response.content}")

  candidates = json_response["candidates"]
  if len(candidates) == 0:
    raise RuntimeError(f"No candidates: {response.content}")

  candidate = candidates[0]
  if "content" not in candidate:
    raise RuntimeError(f"No content in candidate: {response.content}")

  content = candidate["content"]
  if "parts" not in content or not content["parts"]:
    raise RuntimeError(f"No parts in content: {response.content}")

  part = content["parts"][0]
  if "text" not in part:
    raise RuntimeError(f"No text in part: {response.content}")

  # Remove characters that typically wrap a JSON reply
  llm_response = part["text"]
  llm_response = llm_response.replace("```json","")
  llm_response = llm_response.replace("```","")
  llm_response = llm_response.replace("\n","")

  return llm_response

#### Imagen

In [None]:
def ImageGen(prompt):
  """Generates one image with Imagen on Vertex AI and saves it as a local PNG.

  The endpoint is built from the notebook-level `location` and from the project
  returned by google.auth.default().

  Args:
      prompt: The text prompt describing the image.

  Returns:
      The name of the PNG file written to the current working directory
      (a random UUID plus ".png").

  Raises:
      RuntimeError: On a non-200 status or when the response holds no predictions.
  """
  # Import the transport submodule explicitly so this function does not rely
  # on another cell having loaded it first.
  import google.auth.transport.requests

  # NOTE(review): the Gemini helpers use the notebook-level project_id, while
  # this uses the project reported by the credentials -- confirm they match.
  creds, project = google.auth.default()
  creds.refresh(google.auth.transport.requests.Request())

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + creds.token
  }

  model_version = "imagen-4.0-generate-preview-06-06" # Preview Access Model

  # https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/image-generation
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project}/locations/{location}/publishers/google/models/{model_version}:predict"

  payload = {
    "instances": [
      {
        "prompt": prompt
      }
    ],
    "parameters": {
      "sampleCount": 1,
      "personGeneration" : "dont_allow"  # change to allow_adult for people generation
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  response_json = json.loads(response.content)

  if "blocked" in response_json:
    print(f"Blocked: {response_json['blocked']}")

  # A missing key and an empty list are both reported the same way, rather
  # than letting an empty list surface as a bare IndexError.
  if "predictions" not in response_json or not response_json["predictions"]:
    raise RuntimeError(f"No predictions in response: {response.content}")

  image_bytes = base64.b64decode(response_json["predictions"][0]["bytesBase64Encoded"])
  filename = str(uuid.uuid4()) + ".png"
  with open(filename, "wb") as f:
    f.write(image_bytes)

  print("Image generated OK.")
  return filename

#### Helper Functions

In [None]:
def RunQuery(sql):
  """Runs a BigQuery SQL statement.

  Statements that begin with SELECT or WITH (ignoring leading whitespace and
  letter case) are treated as queries and their rows are returned. Any other
  statement is submitted as an interactive job and polled every two seconds,
  printing the job state, until it is DONE.

  Args:
      sql: The SQL text to execute.

  Returns:
      A DataFrame for SELECT / WITH statements; True for any other statement
      that finishes without an error.

  Raises:
      Exception: Carrying the job's error_result when a non-query statement fails.
  """
  import time
  from google.cloud import bigquery
  client = bigquery.Client()

  # Only the first few characters are needed to classify the statement, so
  # avoid upper-casing a potentially very large INSERT.
  statement_start = sql.lstrip()[:6].upper()
  if statement_start.startswith(("SELECT", "WITH")):
    return client.query(sql).to_dataframe()

  job_config = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)
  query_job = client.query(sql, job_config=job_config)

  # Refresh and report the job state until it completes.
  while True:
    query_job = client.get_job(query_job.job_id, location=query_job.location)
    print("Job {} is currently in state {} with error result of {}".format(query_job.job_id, query_job.state, query_job.error_result))
    if query_job.state == "DONE":
      break
    time.sleep(2)

  if query_job.error_result is None:
    return True

  raise Exception(query_job.error_result)

In [None]:
def GetTableSchema(project_id, dataset_name, table_name):
  """Returns the schema of a BigQuery table as JSON text.

  Args:
      project_id: The project that owns the dataset.
      dataset_name: The dataset that contains the table.
      table_name: The table whose schema is wanted.

  Returns:
      The text produced by the BigQuery client's schema_to_json for the table.
  """
  import io
  from google.cloud import bigquery

  client = bigquery.Client()

  # Build the reference directly; Client.dataset() is deprecated.
  table_ref = bigquery.DatasetReference(project_id, dataset_name).table(table_name)
  table = client.get_table(table_ref)

  with io.StringIO() as buffer:
    client.schema_to_json(table.schema, buffer)
    return buffer.getvalue()

#### GCS

In [None]:
# This was generated by GenAI

def copy_file_to_gcs(local_file_path, bucket_name, destination_blob_name):
  """Uploads a local file to a Google Cloud Storage bucket.

  Files ending in ".html" or ".json" are uploaded with an explicit UTF-8
  content type; every other file is uploaded without one.

  Args:
      local_file_path: The full path to the local file.
      bucket_name: The name of the GCS bucket to upload to.
      destination_blob_name: The desired name of the uploaded file in the bucket.

  Returns:
      None

  Raises:
      FileNotFoundError: If the local file does not exist.
  """

  import os
  from google.cloud import storage

  if not os.path.exists(local_file_path):
      raise FileNotFoundError(f"Local file '{local_file_path}' not found.")

  # Client -> bucket -> blob for the destination object
  blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)

  # Pick a content type from the file suffix, if one is known
  suffix_content_types = (
      (".html", "text/html; charset=utf-8"),
      (".json", "application/json; charset=utf-8"),
  )
  content_type = ""
  for suffix, mime_type in suffix_content_types:
    if local_file_path.endswith(suffix):
      content_type = mime_type

  # Only pass content_type when one was selected
  upload_kwargs = {"content_type": content_type} if content_type != "" else {}
  blob.upload_from_filename(local_file_path, **upload_kwargs)

  print(f"File '{local_file_path}' uploaded to GCS bucket '{bucket_name}' as '{destination_blob_name}.  Content-Type: {content_type}'.")

### <font color='#4285f4'>MAIN CODE - Create Events</font>

In [None]:
%%bigquery
CREATE SCHEMA IF NOT EXISTS `agentic_beans_raw` OPTIONS(location = 'us-central1');

In [None]:
%%bigquery
--DROP TABLE IF EXISTS `agentic_beans_raw.events`;

In [None]:
%%bigquery

-- Creates the events table only when it does not already exist, clustered by event_id.
-- Column and table descriptions are attached through OPTIONS(description=...).
-- NOTE(review): the event_id description calls it a primary key, but no
-- PRIMARY KEY constraint is declared in this statement.
CREATE TABLE IF NOT EXISTS `agentic_beans_raw.events`
(
    event_id            INT64      NOT NULL OPTIONS(description="The unique identifier and primary key for each event."),
    event_title         STRING     NOT NULL OPTIONS(description="The public-facing title of the event."),
    event_location      STRING     NOT NULL OPTIONS(description="The specific address or location of the event."),
    event_description   STRING              OPTIONS(description="A detailed description of the event."),
    event_start_date_time TIMESTAMP NOT NULL OPTIONS(description="The date and time when the event begins (UTC)."),
    event_end_date_time TIMESTAMP NOT NULL OPTIONS(description="The date and time when the event ends (UTC)."),
    age_range           STRING              OPTIONS(description="The recommended or required age range for event attendees (e.g., 'All Ages', '18+', '21+')."),
    event_venue         STRING              OPTIONS(description="The name of the venue where the event is held, if applicable."),
    event_neighborhood  STRING              OPTIONS(description="The Manhattan neighborhood where the event takes place, based on the provided list.")
)
CLUSTER BY event_id
OPTIONS(
    description="A table containing information about events where the coffee trucks will be present."
);

In [None]:
from datetime import datetime, timedelta

# OpenAPI 3.0 schema for the events data to be generated by the LLM.
# Shape: a single object whose required "events_data" property is an array of
# event objects. event_id is not part of the schema.
response_schema = {
  "type": "object",
  "required": [
    "events_data"
  ],
  "properties": {
    "events_data": {
      "type": "array",
      "items": {
        "type": "object",
        # Every property of an event is required
        "required": [
          "event_title",
          "event_location",
          "event_description",
          "event_start_date_time",
          "event_end_date_time",
          "age_range",
          "event_venue",
          "event_neighborhood"
        ],
        "properties": {
          "event_title": { "type": "string" },
          "event_location": { "type": "string" },
          "event_description": { "type": "string" },
          # Start and end are strings in "date-time" format
          "event_start_date_time": { "type": "string", "format": "date-time" },
          "event_end_date_time": { "type": "string", "format": "date-time" },
          # age_range is restricted to a fixed set of values
          "age_range": { "type": "string", "enum": ["All Ages", "18+", "21+", "Family-Friendly"] },
          "event_venue": { "type": "string" },
          "event_neighborhood": { "type": "string" }
        }
      }
    }
  }
}

# Manhattan neighborhood names; the list is embedded in the LLM prompt as the
# allowed values for 'event_neighborhood'.
# NOTE(review): the original comment cited "[Item A]" as the source of this
# list; that item is not present in this notebook.
manhattan_neighborhoods = [
    "Marble Hill", "Central Harlem North", "Penn Station/Madison Sq West", "Roosevelt Island",
    "Lincoln Square East", "Midtown Center", "East Village", "Upper West Side South",
    "Manhattanville", "UN/Turtle Bay South", "Lenox Hill West", "Times Sq/Theatre District",
    "Financial District South", "Clinton East", "Lincoln Square West", "Alphabet City",
    "Lower East Side", "Chinatown", "Upper East Side South", "West Chelsea/Hudson Yards",
    "Gramercy", "Lenox Hill East", "Clinton West", "Inwood Hill Park", "Bloomingdale",
    "Greenwich Village South", "Upper East Side North", "Garment District",
    "Greenwich Village North", "Hudson Sq", "Randalls Island", "Battery Park",
    "Battery Park City", "Morningside Heights", "Washington Heights North", "Kips Bay",
    "Murray Hill", "Seaport", "Yorkville West", "Two Bridges/Seward Park",
    "TriBeCa/Civic Center", "Union Sq", "Midtown South", "Midtown North", "Flatiron",
    "Sutton Place/Turtle Bay North", "World Trade Center", "SoHo", "East Chelsea",
    "Manhattan Valley", "Governor's Island/Ellis Island/Liberty Island",
    "Meatpacking/West Village West", "Upper West Side North", "East Harlem North",
    "Hamilton Heights", "Little Italy/NoLiTa", "East Harlem South",
    "Stuy Town/Peter Cooper Village", "Highbridge Park", "Central Park", "Midtown East",
    "Washington Heights South", "Central Harlem", "West Village", "Yorkville East",
    "Financial District North", "Inwood"
]

# Generates events one calendar day at a time with the LLM and inserts each
# day's batch into BigQuery with a single INSERT statement. The run resumes
# from whatever is already in the table and stops at max_event_date.

def _sql_text(value):
  """Returns value as text that is safe inside a single-quoted BigQuery string literal."""
  if value is None:
    return ""
  # Escape backslashes first so the escapes added for quotes are not doubled.
  return str(value).replace("\\", "\\\\").replace("'", "\\'").replace("\n", " ")


# Get the maximum existing event_id to start from
max_event_id_df = RunQuery("SELECT IFNULL(MAX(event_id) + 1, 1) as event_id FROM `agentic_beans_raw.events`")
max_event_id = int(max_event_id_df['event_id'][0])
print(f"Starting event_id: {max_event_id}")

# Resume from the date of the latest event end time (2020-01-01 for an empty table).
# NOTE(review): if the latest end time falls on the last generated day, that
# day is generated again on resume -- confirm whether that is intended.
event_start_date_df = RunQuery("SELECT CAST(CAST(IFNULL(MAX(event_end_date_time), '2020-01-01') AS DATE) AS STRING) as event_start_date FROM `agentic_beans_raw.events`")
event_start_date = str(event_start_date_df['event_start_date'][0])
event_start_date = datetime.strptime(event_start_date, "%Y-%m-%d").date()
print(f"Starting event_start_date: {event_start_date}")

max_event_date = datetime.strptime('2027-01-01', "%Y-%m-%d").date()

dataset_name = "agentic_beans_raw"
table_name = "events"
# JSON text describing the table; embedded as-is in the prompt.
table_schema = GetTableSchema(project_id, dataset_name, table_name)
event_response_raw = ""

# Loop to generate and insert events
event_id = max_event_id
while event_start_date < max_event_date:
  print(f"Generating events: {event_start_date} | {event_id}")
  success = False
  while not success:
    # Remember where this attempt started so a failed attempt does not burn ids.
    attempt_first_event_id = event_id
    try:
      prompt = f"""You are a database engineer and need to generate data for a table for the below schema.
        The data is for events that occur in Manhattan, NYC.
        I need you to generate approximately 25 realistic events per call.
        The events should be typical events in NYC (events in parks, art festivals, events at bars, etc...)

        Ensure the following constraints are met:
        - The 'event_neighborhood' must be chosen from the provided list of Manhattan neighborhoods.
        - The 'event_location' must be a specific, plausible street address or landmark within the chosen 'event_neighborhood'.
        - The 'event_start_date_time' and 'event_end_date_time' must be valid UTC timestamps.
        - The 'event_start_date_time' must be on the day: {event_start_date}
        - 'event_end_date_time' must always be after 'event_start_date_time', with a duration typically between 2 to 8 hours.
        - Make the 'event_title' and 'event_description' engaging and relevant.
        - Vary the 'age_range' between 'All Ages', '18+', '21+', or 'Family-Friendly'.
        - The 'event_venue' should be relevant to the location.

        Here is the table schema:
        <schema>
        {table_schema}
        </schema>

        Here are the specific Manhattan neighborhoods for the events:
        <event_neighborhoods>
        {manhattan_neighborhoods}
        </event_neighborhoods>
        """

      # Use LLM to generate data
      # print(f"Prompt: {prompt}") # Uncomment to see the full prompt
      event_response_raw = GeminiLLM(prompt, response_schema=response_schema)

      # Parse response
      event_response = json.loads(event_response_raw)
      #print(json.dumps(event_response, indent=2)) # Uncomment to see the LLM's raw response

      if not event_response["events_data"]:
        print("LLM returned no events. Retrying...")
        continue

      sql_values = []
      for item in event_response["events_data"]:
        event_title = _sql_text(item["event_title"])
        event_location = _sql_text(item["event_location"])
        event_description = _sql_text(item["event_description"])
        age_range = _sql_text(item["age_range"])
        event_neighborhood = _sql_text(item["event_neighborhood"])

        # The venue column is nullable: store NULL when the LLM gives none,
        # instead of reusing the previous row's value.
        event_venue = item.get("event_venue")
        event_venue_sql = "NULL" if event_venue is None else f"'{_sql_text(event_venue)}'"

        # The parse format below expects a trailing 'Z', so a "+00:00" offset
        # is rewritten to 'Z' rather than dropped.
        event_start_date_time = _sql_text(item["event_start_date_time"]).replace("+00:00", "Z")
        event_end_date_time = _sql_text(item["event_end_date_time"]).replace("+00:00", "Z")

        sql_values.append(
            f"({event_id}, '{event_title}', '{event_location}', '{event_description}', "
            f"PARSE_TIMESTAMP('%Y-%m-%dT%H:%M:%SZ', '{event_start_date_time}'), "
            f"PARSE_TIMESTAMP('%Y-%m-%dT%H:%M:%SZ', '{event_end_date_time}'), "
            f"'{age_range}', {event_venue_sql}, '{event_neighborhood}')"
        )
        event_id += 1

      if sql_values:
        sql = f"""INSERT INTO `{project_id}.{dataset_name}.{table_name}`
                    (event_id, event_title, event_location, event_description, event_start_date_time, event_end_date_time, age_range, event_venue, event_neighborhood)
                    VALUES """
        sql += ",\n".join(sql_values)
        RunQuery(sql)

      event_start_date = event_start_date + timedelta(days=1)

      success = True
    except Exception as error:
      # Best effort: report the failure and retry the same day from the same id.
      event_id = attempt_first_event_id
      print(f"event_response_raw: {event_response_raw}")
      print(f"An error occurred during event generation/insertion: {error}")