### <font color='#4285f4'>Overview</font>

Overview: Runs the knowledge engine on a dataset.  Creates a business glossary based upon the knowledge scan.

Cost:
* Approximate cost: Less than $1

Author:
* Adam Paternostro

### <font color='#4285f4'>Video Walkthrough</font>

[Video](https://storage.googleapis.com/data-analytics-golden-demo/colab-videos/Demo-Knowledge-Engine.mp4)


In [None]:
from IPython.display import HTML

# Embed the walkthrough video inline. The HTML object is the last expression
# of the cell, so the notebook renders it as the cell output.
HTML("""
<video width="800" height="600" controls>
  <source src="https://storage.googleapis.com/data-analytics-golden-demo/colab-videos/Demo-Knowledge-Engine.mp4" type="video/mp4">
  Your browser does not support the video tag.
</video>
""")

### <font color='#4285f4'>License</font>

```
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```

### <font color='#4285f4'>Pip installs</font>

In [None]:
# PIP Installs (if necessary)
import sys

# !{sys.executable} -m pip install REPLACE-ME

### <font color='#4285f4'>Initialize</font>

In [None]:
from PIL import Image
from IPython.display import HTML
import IPython.display
import google.auth
import requests
import json
import uuid
import base64
import os
import cv2
import random
import time
import datetime
import base64
import random

import logging
from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log, retry_if_exception

In [None]:
# Set these (run this cell to verify the output)
# NOTE(review): the "${...}" values look like placeholders that are substituted
# when the notebook is deployed; if they are still literal, set them by hand.
# NOTE(review): the business glossary helpers in this notebook read a global
# named `business_glossary_region` that is not set in this cell - confirm it is
# defined elsewhere before running them.

bigquery_location = "${bigquery_non_multi_region}"
dataplex_region = "${dataplex_region}"
dataform_region = "${dataform_region}"
location = "${location}" # for Gemini


# Root logger (no name passed to getLogger).
logger = logging.getLogger()

# Get the current date and time
now = datetime.datetime.now()

# Format the date and time as desired
formatted_date = now.strftime("%Y-%m-%d-%H-%M")

# Get some values using gcloud
# os.environ[...] raises KeyError when GOOGLE_CLOUD_PROJECT is not set.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# IPython shell capture: runs gcloud and returns its output lines as a list.
user = !(gcloud auth list --filter=status:ACTIVE --format="value(account)")

# Exactly one active account is expected; anything else is treated as an error.
if len(user) != 1:
  raise RuntimeError(f"user is not set: {user}")
user = user[0]

print(f"project_id = {project_id}")
print(f"user = {user}")

### <font color='#4285f4'>Helper Methods</font>

#### rest_api_helper
Calls the Google Cloud REST API using the current user's credentials.

In [None]:
def rest_api_helper(url: str, http_verb: str, request_body: "dict | None") -> dict:
  """Calls a Google Cloud REST API passing in the current user's credentials.

  Args:
    url: Fully-qualified REST endpoint to call.
    http_verb: One of "GET", "POST", "PUT", "PATCH" or "DELETE".
    request_body: JSON-serializable payload sent with POST/PUT/PATCH requests.
      It is ignored for GET and DELETE, so callers may pass None.

  Returns:
    The decoded JSON response body. An empty dict is returned when the service
    answers with a successful status but no body.

  Raises:
    RuntimeError: If the verb is not supported or the service answers with a
      non-2xx status code (the status and response text are in the message).
  """

  import google.auth.transport.requests
  import requests
  import google.auth
  import json

  # Get an access token based upon the current user
  creds, _ = google.auth.default()
  auth_req = google.auth.transport.requests.Request()
  creds.refresh(auth_req)
  access_token = creds.token

  headers = {
    "Content-Type" : "application/json",
    "Authorization" : "Bearer " + access_token
  }

  if http_verb == "GET":
    response = requests.get(url, headers=headers)
  elif http_verb == "POST":
    response = requests.post(url, json=request_body, headers=headers)
  elif http_verb == "PUT":
    response = requests.put(url, json=request_body, headers=headers)
  elif http_verb == "PATCH":
    response = requests.patch(url, json=request_body, headers=headers)
  elif http_verb == "DELETE":
    response = requests.delete(url, headers=headers)
  else:
    raise RuntimeError(f"Unknown HTTP verb: {http_verb}")

  # Any 2xx status is a success; some calls legitimately reply without a body,
  # which json.loads would otherwise reject.
  if 200 <= response.status_code < 300:
    if not response.content:
      return {}
    return json.loads(response.content)

  error = f"Error rest_api_helper -> Status: '{response.status_code}' Text: '{response.text}'"
  raise RuntimeError(error)

#### RetryCondition (for retrying LLM calls)

In [None]:
def RetryCondition(error):
  """Tell the retry decorator whether a failed call should be attempted again.

  The error is converted to text and printed, then compared against a short
  list of substrings that mark the failure as transient.

  Args:
    error: The exception (or any object) describing the failure.

  Returns:
    True when the error text contains one of the known transient markers
    (after printing "Retrying..."), otherwise False.
  """
  message = str(error)
  print(message)

  # Substrings identifying errors worth retrying; add more here as needed.
  transient_markers = (
      "RESOURCE_EXHAUSTED",
      "No content in candidate",
  )

  should_retry = any(marker in message for marker in transient_markers)
  if should_retry:
    print("Retrying...")

  return should_retry

#### Gemini LLM

In [None]:
@retry(wait=wait_exponential(multiplier=1, min=1, max=60), stop=stop_after_attempt(10), retry=retry_if_exception(RetryCondition), before_sleep=before_sleep_log(logging.getLogger(), logging.INFO))
def GeminiLLM(prompt, model = "gemini-2.5-flash", response_schema = None,
                 temperature = 1, topP = 1, topK = 32):
  """Sends a single user prompt to a Gemini model on Vertex AI and returns the reply text.

  The call is wrapped in a retry decorator: failures for which RetryCondition
  returns True are retried with exponential backoff (1 to 60 seconds), for at
  most 10 attempts.

  Args:
    prompt: The text sent as the single user part of the request.
    model: Publisher model name used in the request URL.
    response_schema: Optional response schema; added to the generation config
      only when it is not None.
    temperature: Sampling temperature; negative values are clamped to 0.
    topP: Passed through as "topP".
    topK: Passed through as "topK", but only when model is "gemini-2.5-flash".

  Returns:
    The text of the first part of the first candidate, with "```json" and
    "```" markers and all newline characters removed.

  Raises:
    RuntimeError: If the HTTP status is not 200, the body is not valid JSON,
      or the response lacks candidates, content, parts or text.
  """
  # `import google.auth` is not guaranteed to load the transport submodule, so
  # import it here instead of relying on another cell having done so.
  import google.auth.transport.requests

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#supported_models

  if temperature < 0:
    temperature = 0

  creds, _ = google.auth.default()
  auth_req = google.auth.transport.requests.Request() # required to access the access token
  creds.refresh(auth_req)
  access_token = creds.token

  headers = {
      "Content-Type" : "application/json",
      "Authorization" : "Bearer " + access_token
  }

  # https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
  url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/publishers/google/models/{model}:generateContent"

  generation_config = {
    "temperature": temperature,
    "topP": topP,
    "maxOutputTokens": 65536,
    "candidateCount": 1,
    "responseMimeType": "application/json",
  }

  # Add in the response schema when it is provided
  if response_schema is not None:
    generation_config["responseSchema"] = response_schema

  if model == "gemini-2.5-flash":
    generation_config["topK"] = topK

  payload = {
    "contents": {
      "role": "user",
      "parts": {
          "text": prompt
      },
    },
    "generation_config": {
      **generation_config
    },
    "safety_settings": {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "threshold": "BLOCK_LOW_AND_ABOVE"
    }
  }

  response = requests.post(url, json=payload, headers=headers)

  if response.status_code != 200:
    raise RuntimeError(f"Error with prompt:'{prompt}'  Status:'{response.status_code}' Text:'{response.text}'")

  try:
    json_response = json.loads(response.content)
  except Exception as error:
    raise RuntimeError(f"An error occurred parsing the JSON: {error}") from error

  # Walk candidates -> content -> parts -> text, failing with the raw response
  # in the message as soon as a level is missing. The "No content in candidate"
  # wording is one of the markers RetryCondition looks for.
  if "candidates" not in json_response or len(json_response["candidates"]) == 0:
    raise RuntimeError(f"No candidates: {response.text}")

  candidate = json_response["candidates"][0]
  if "content" not in candidate:
    raise RuntimeError(f"No content in candidate: {response.text}")

  content = candidate["content"]
  if "parts" not in content or len(content["parts"]) == 0:
    raise RuntimeError(f"No parts in content: {response.text}")

  part = content["parts"][0]
  if "text" not in part:
    raise RuntimeError(f"No text in part: {response.text}")

  llm_response = part["text"]

  # Remove some typical response characters (if asking for a JSON reply)
  llm_response = llm_response.replace("```json","")
  llm_response = llm_response.replace("```","")
  llm_response = llm_response.replace("\n","")

  return llm_response

#### Helper Functions

In [None]:
def RunQuery(sql):
  """Runs a BigQuery statement.

  Statements whose text starts with "SELECT" or "WITH" are executed and
  returned as a DataFrame. Anything else is submitted as an interactive
  priority job and polled every 2 seconds until its state is "DONE", printing
  the job state on every check.

  Args:
    sql: The SQL text to execute.

  Returns:
    A DataFrame for SELECT/WITH statements, otherwise True once the job has
    finished without an error result.

  Raises:
    Exception: Carrying the job's error result when the job finished with one.
  """
  import time
  from google.cloud import bigquery

  bq_client = bigquery.Client()

  # Query statements: hand the rows straight back to the caller.
  if sql.startswith(("SELECT", "WITH")):
    return bq_client.query(sql).to_dataframe()

  interactive = bigquery.QueryJobConfig(priority=bigquery.QueryPriority.INTERACTIVE)
  job = bq_client.query(sql, job_config=interactive)

  status_line = "Job {} is currently in state {} with error result of {}"

  # Refresh the job, report its state, and keep waiting until it is done.
  while True:
    job = bq_client.get_job(job.job_id, location=job.location)
    print(status_line.format(job.job_id, job.state, job.error_result))
    if job.state == "DONE":
      break
    time.sleep(2)

  if job.error_result is not None:
    raise Exception(job.error_result)
  return True

### <font color='#4285f4'>Knowledge Engine - Helper Methods</font>

These methods all return a "dict" so we can use them in the data analytics agent code.

#### get_knowledge_engine_scans

In [None]:
def get_knowledge_engine_scans() -> dict:
    """
    Lists all Dataplex knowledge engine scans in the configured region.

    Every page of the data scan listing is fetched (following 'nextPageToken'),
    then the results are filtered to scans whose type is 'KNOWLEDGE_ENGINE'.
    The project and region come from the notebook globals 'project_id' and
    'dataplex_region'.

    Returns:
        dict: A dictionary containing the status and the list of knowledge engine scans.
        {
            "status": "success" or "failed",
            "tool_name": "get_knowledge_engine_scans",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {
                "dataScans": [ ... list of scan objects of type KNOWLEDGE_ENGINE ... ]
            }
        }
        On failure "results" is None and the error is appended to "messages".
    """
    from urllib.parse import quote

    messages = []

    # The URL to list all data scans in the specified project and region.
    base_url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{dataplex_region}/dataScans"

    try:
        # The list API is paginated; keep requesting pages until the service
        # stops returning a continuation token.
        all_scans = []
        page_token = None
        while True:
            url = base_url if not page_token else f"{base_url}?pageToken={quote(page_token, safe='')}"
            json_result = rest_api_helper(url, "GET", None)
            all_scans.extend(json_result.get("dataScans", []))
            page_token = json_result.get("nextPageToken")
            if not page_token:
                break

        messages.append("Successfully retrieved list of all data scans from the API.")

        # Keep only the scans of type KNOWLEDGE_ENGINE
        knowledge_engine_scans = [
            scan for scan in all_scans if scan.get("type") == "KNOWLEDGE_ENGINE"
        ]

        messages.append(f"Filtered results. Found {len(knowledge_engine_scans)} knowledge engine scans.")

        return {
            "status": "success",
            "tool_name": "get_knowledge_engine_scans",
            "query": None,
            "messages": messages,
            "results": {"dataScans": knowledge_engine_scans}
        }
    except Exception as e:
        messages.append(f"An error occurred while listing knowledge engine scans: {e}")
        return {
            "status": "failed",
            "tool_name": "get_knowledge_engine_scans",
            "query": None,
            "messages": messages,
            "results": None
        }


#### exists_knowledge_engine_scan

In [None]:
def exists_knowledge_engine_scan(knowledge_engine_scan_name: str) -> dict:
    """
    Reports whether a Dataplex knowledge engine scan with the given ID exists.

    The check is done by listing the knowledge engine scans and comparing each
    scan's full resource name against the one built from the notebook globals
    'project_id' and 'dataplex_region' plus the supplied scan ID.

    Args:
        knowledge_engine_scan_name (str): The short name/ID of the knowledge engine scan.

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "exists_knowledge_engine_scan",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {
                "exists": True # or False
            }
        }
        "results" is None when the listing or the comparison failed.
    """
    listing = get_knowledge_engine_scans()
    messages = listing.get("messages", [])

    def _reply(status, results):
        # All exits share the same envelope; only status and results vary.
        return {
            "status": status,
            "tool_name": "exists_knowledge_engine_scan",
            "query": None,
            "messages": messages,
            "results": results
        }

    # A failed listing is passed on as a failure of this check.
    if listing["status"] == "failed":
        return _reply("failed", None)

    try:
        wanted_name = f"projects/{project_id}/locations/{dataplex_region}/dataScans/{knowledge_engine_scan_name}"
        scans = listing.get("results", {}).get("dataScans", [])

        found = any(scan.get("name") == wanted_name for scan in scans)

        if found:
            messages.append(f"Found matching scan: '{knowledge_engine_scan_name}'.")
        else:
            messages.append(f"Scan '{knowledge_engine_scan_name}' does not exist.")

        return _reply("success", {"exists": found})
    except Exception as e: # Anything unexpected while reading the listing
        messages.append(f"An unexpected error occurred while processing scan list: {e}")
        return _reply("failed", None)


#### create_knowledge_engine_scan

In [None]:
def create_knowledge_engine_scan(knowledge_engine_scan_name: str, knowledge_engine_display_name: str, bigquery_dataset_name: str) -> dict:
    """
    Creates a new Dataplex knowledge engine scan if it does not already exist.

    The scan targets a whole BigQuery dataset and is configured with an
    on-demand trigger. The project and region come from the notebook globals
    'project_id' and 'dataplex_region'.

    Args:
        knowledge_engine_scan_name (str): The short name/ID for the new knowledge engine scan.
        knowledge_engine_display_name (str): The user-friendly display name for the scan.
            When empty, the scan ID is used as the display name.
        bigquery_dataset_name (str): The BigQuery dataset to be scanned.

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "create_knowledge_engine_scan",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... response from the API call ... }
        }
        When the scan already exists, "results" is
        {"name": <full scan name>, "created": False}. On failure it is None.
    """
    # First, check if the knowledge engine scan already exists.
    existence_check = exists_knowledge_engine_scan(knowledge_engine_scan_name)
    messages = existence_check.get("messages", [])

    # If the check failed, propagate the failure.
    if existence_check["status"] == "failed":
        return {
            "status": "failed",
            "tool_name": "create_knowledge_engine_scan",
            "query": None,
            "messages": messages,
            "results": None
        }

    full_scan_name = f"projects/{project_id}/locations/{dataplex_region}/dataScans/{knowledge_engine_scan_name}"

    # If the scan already exists, report success and stop.
    if existence_check["results"]["exists"]:
        return {
            "status": "success",
            "tool_name": "create_knowledge_engine_scan",
            "query": None,
            "messages": messages,
            "results": {"name": full_scan_name, "created": False}
        }

    # If the scan does not exist, proceed with creation.
    messages.append(f"Creating knowledge engine Scan '{knowledge_engine_scan_name}'.")

    # API endpoint to create a data scan. The scan ID is passed as a query parameter.
    # https://cloud.google.com/dataplex/docs/reference/rest/v1/projects.locations.dataScans/create
    url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{dataplex_region}/dataScans?dataScanId={knowledge_engine_scan_name}"

    # NOTE(review): "name" and "uid" are kept from the original request body;
    # confirm whether the API needs them, since the ID is already in the URL.
    request_body = {
        "name": full_scan_name,
        "uid": knowledge_engine_scan_name,
        "description": f"Knowledge engine scan for the dataset {bigquery_dataset_name}",
        # Use the caller's display name; fall back to the scan ID when it is empty.
        "displayName": knowledge_engine_display_name or knowledge_engine_scan_name,
        "data": {
            "resource": f"//bigquery.googleapis.com/projects/{project_id}/datasets/{bigquery_dataset_name}"
        },
        "executionSpec": {
            "trigger": {
                "onDemand": {}
            }
        },
        "type": "KNOWLEDGE_ENGINE",
        "knowledgeEngineSpec": {}
    }

    try:
        # The create API returns a long-running operation object.
        json_result = rest_api_helper(url, "POST", request_body)

        operation_name = json_result.get("name", "Unknown Operation")
        messages.append(f"Successfully initiated knowledge engine Scan creation. Operation: {operation_name}")

        return {
            "status": "success",
            "tool_name": "create_knowledge_engine_scan",
            "query": None,
            "messages": messages,
            "results": json_result
        }

    except Exception as e:
        messages.append(f"An error occurred while creating the knowledge engine scan: {e}")
        return {
            "status": "failed",
            "tool_name": "create_knowledge_engine_scan",
            "query": None,
            "messages": messages,
            "results": None
        }


#### start_knowledge_engine_scan

In [None]:
def start_knowledge_engine_scan(knowledge_engine_scan_name: str) -> dict:
    """
    Starts a run of an existing Dataplex knowledge engine scan.

    A POST with an empty body is sent to the scan's ':run' endpoint
    (https://cloud.google.com/dataplex/docs/reference/rest/v1/projects.locations.dataScans/run).
    The job name found in the results can be passed to
    'get_knowledge_engine_scan_state' to follow the job's progress.

    Args:
        knowledge_engine_scan_name (str): The short name/ID of the knowledge engine scan to run.

    Returns:
        dict: A dictionary containing the status and the job information.
        {
            "status": "success" or "failed",
            "tool_name": "start_knowledge_engine_scan",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... raw API response; the job details are read from its "job" key ... }
        }
        On failure "results" is None and the error is appended to "messages".
    """
    run_url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{dataplex_region}/dataScans/{knowledge_engine_scan_name}:run"

    # Start from the failure envelope and upgrade it once the call succeeds.
    outcome = {
        "status": "failed",
        "tool_name": "start_knowledge_engine_scan",
        "query": None,
        "messages": [],
        "results": None
    }
    log = outcome["messages"]

    try:
        log.append(f"Attempting to run knowledge engine Scan '{knowledge_engine_scan_name}'.")

        api_response = rest_api_helper(run_url, "POST", {})

        # Read the job details defensively; the keys may be absent.
        job = api_response.get("job", {})
        name_of_job = job.get("name", "Unknown Job")
        state_of_job = job.get("state", "Unknown State")
        log.append(f"Successfully started knowledge engine Scan job: {name_of_job} - State: {state_of_job}")

        outcome["status"] = "success"
        outcome["results"] = api_response
    except Exception as e:
        log.append(f"An error occurred while starting the knowledge engine scan: {e}")

    return outcome

#### get_knowledge_engine_scan_state

In [None]:
def get_knowledge_engine_scan_state(knowledge_engine_scan_job_name: str) -> dict:
    """
    Looks up the current state of a knowledge engine scan job.

    The job name is the one returned when a scan is started with
    'start_knowledge_engine_scan'; it is appended to the Dataplex v1 base URL
    as-is to form the request.

    Args:
        knowledge_engine_scan_job_name (str): The full resource name of the scan job, e.g.,
                                          "projects/.../locations/.../dataScans/.../jobs/...".

    Returns:
        dict: A dictionary containing the status and the job state.
        {
            "status": "success" or "failed",
            "tool_name": "get_knowledge_engine_scan_state",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {
                "state": "SUCCEEDED" # or "RUNNING", "FAILED", etc.; "UNKNOWN" when the response has no state
            }
        }
        On failure "results" is None and the error is appended to "messages".
    """
    messages = []
    job_url = f"https://dataplex.googleapis.com/v1/{knowledge_engine_scan_job_name}"

    try:
        job = rest_api_helper(job_url, "GET", None)
        # Default to "UNKNOWN" when the response carries no state field.
        job_state = job.get("state", "UNKNOWN")
    except Exception as e:
        messages.append(f"An error occurred while getting the knowledge engine scan job state: {e}")
        status, results = "failed", None
    else:
        messages.append(f"knowledge engine job '{knowledge_engine_scan_job_name}' is in state: {job_state}")
        status, results = "success", {"state": job_state}

    return {
        "status": status,
        "tool_name": "get_knowledge_engine_scan_state",
        "query": None,
        "messages": messages,
        "results": results
    }



#### update_bigquery_dataset_dataplex_labels

In [None]:
def update_bigquery_dataset_dataplex_labels(knowledge_engine_scan_name: str, bigquery_dataset_name: str) -> dict:
    """
    Updates a BigQuery dataset's labels to link it to a Dataplex knowledge engine scan.

    Three labels are written on the dataset: the publishing project, the
    Dataplex location and the scan ID. The project and region come from the
    notebook globals 'project_id' and 'dataplex_region'.
    NOTE(review): the original author states these labels are what make the
    scan results show up in the BigQuery Console - confirm against the docs.

    Args:
        knowledge_engine_scan_name (str): The short name/ID of the knowledge engine scan to link.
        bigquery_dataset_name (str): The BigQuery dataset that the scan covers and that receives the labels.

    Returns:
        dict: A dictionary containing the status and the BigQuery API response.
        {
            "status": "success" or "failed",
            "tool_name": "update_bigquery_dataset_dataplex_labels",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... response from the BigQuery datasets.patch API call ... }
        }
        On failure "results" is None and the error is appended to "messages".
    """
    messages = []

    # API endpoint for patching a BigQuery dataset's metadata.
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/patch
    url = f"https://bigquery.googleapis.com/bigquery/v2/projects/{project_id}/datasets/{bigquery_dataset_name}"

    # The request body contains the specific labels that link the dataset to the scan.
    request_body = {
        "labels": {
            "dataplex-knowledge-engine-published-project": project_id,
            "dataplex-knowledge-engine-published-location": dataplex_region,
            "dataplex-knowledge-engine-published-scan": knowledge_engine_scan_name,
        }
    }

    try:
        messages.append(f"Patching BigQuery dataset '{bigquery_dataset_name}' with Dataplex labels.")

        # Call the REST API using PATCH to update the dataset's labels.
        json_result = rest_api_helper(url, "PATCH", request_body)

        messages.append("Successfully updated BigQuery dataset labels.")

        return {
            "status": "success",
            "tool_name": "update_bigquery_dataset_dataplex_labels",
            "query": None,
            "messages": messages,
            "results": json_result
        }

    except Exception as e:
        messages.append(f"An error occurred while updating the BigQuery dataset labels: {e}")
        return {
            "status": "failed",
            "tool_name": "update_bigquery_dataset_dataplex_labels",
            "query": None,
            "messages": messages,
            "results": None
        }

#### get_knowledge_engine_scan

In [None]:
def get_knowledge_engine_scan(knowledge_engine_scan_name: str) -> dict:
    """
    Fetches a single Dataplex knowledge engine scan from the configured region.

    The request asks for 'view=FULL', i.e. the complete scan details rather
    than the summary returned by a listing (e.g. tool: get_knowledge_engine_scans).
    The project and region come from the notebook globals 'project_id' and
    'dataplex_region'.

    Args:
        knowledge_engine_scan_name (str): The name of the knowledge engine scan.

    Returns:
        dict: A dictionary with the keys "status" ("success" or "failed"),
        "tool_name", "query" (always None), "messages" and "results".
        "results" holds the raw API response, or None on failure.
    """
    scan_url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{dataplex_region}/dataScans/{knowledge_engine_scan_name}?view=FULL"

    # Start from the failure envelope and upgrade it once the call succeeds.
    outcome = {
        "status": "failed",
        "tool_name": "get_knowledge_engine_scan",
        "query": None,
        "messages": [],
        "results": None
    }

    try:
        scan_details = rest_api_helper(scan_url, "GET", None)
    except Exception as e:
        outcome["messages"].append(f"An error occurred while listing knowledge engine scan: {e}")
    else:
        outcome["messages"].append("Successfully retrieved the data get_knowledge_engine_scan from the API.")
        outcome["status"] = "success"
        outcome["results"] = scan_details

    return outcome


### <font color='#4285f4'>Business Glossary - Helper Methods</font>

#### get_business_glossaries

In [None]:
def get_business_glossaries() -> dict:
    """
    Lists all Dataplex business glossaries in the configured region.

    Every page of the listing is fetched (following 'nextPageToken'). The
    project and region come from the notebook globals 'project_id' and
    'business_glossary_region'.

    Returns:
        dict: A dictionary containing the status and the list of business glossaries.
        {
            "status": "success" or "failed",
            "tool_name": "get_business_glossaries",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {
                "glossaries": [ ... list of glossary objects ... ]
            }
        }
        On failure "results" is None and the error is appended to "messages".
    """
    from urllib.parse import quote

    messages = []

    # The URL to list all business glossaries in the specified project and region.
    base_url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{business_glossary_region}/glossaries"

    try:
        # The list API is paginated; keep requesting pages until the service
        # stops returning a continuation token.
        glossaries = []
        page_token = None
        while True:
            url = base_url if not page_token else f"{base_url}?pageToken={quote(page_token, safe='')}"
            json_result = rest_api_helper(url, "GET", None)
            glossaries.extend(json_result.get("glossaries", []))
            page_token = json_result.get("nextPageToken")
            if not page_token:
                break

        messages.append("Successfully retrieved list of all business glossaries from the API.")
        messages.append(f"Found {len(glossaries)} business glossaries.")

        return {
            "status": "success",
            "tool_name": "get_business_glossaries",
            "query": None,
            "messages": messages,
            "results": {"glossaries": glossaries}
        }
    except Exception as e:
        messages.append(f"An error occurred while listing business glossaries: {e}")
        return {
            "status": "failed",
            "tool_name": "get_business_glossaries",
            "query": None,
            "messages": messages,
            "results": None
        }

#### exists_business_glossary

In [None]:
def exists_business_glossary(glossary_id: str) -> dict:
    """
    Checks if a Dataplex business glossary already exists by checking the full list.

    The glossaries are listed and each one's full resource name is compared
    against the name built from the notebook globals 'project_id' and
    'business_glossary_region' plus the supplied glossary ID.

    Args:
        glossary_id (str): The ID of the business glossary to look for.

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "exists_business_glossary",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {
                "exists": True # or False
            }
        }
        "results" is None when the listing or the comparison failed.
    """
    # One constant so every return path reports the same tool name.
    tool_name = "exists_business_glossary"

    # Call the dedicated function to list all glossaries
    list_result = get_business_glossaries()
    messages = list_result.get("messages", [])

    # If listing glossaries failed, propagate the failure
    if list_result["status"] == "failed":
        return {
            "status": "failed",
            "tool_name": tool_name,
            "query": None,
            "messages": messages,
            "results": None
        }

    try:
        json_payload = list_result.get("results", {})
        glossary_name_to_find = f"projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}"

        # Look for a glossary whose full resource name matches
        glossary_exists = any(
            item.get("name") == glossary_name_to_find
            for item in json_payload.get("glossaries", [])
        )

        if glossary_exists:
            messages.append(f"Found matching glossary: '{glossary_id}'.")
        else:
            messages.append(f"Glossary '{glossary_id}' does not exist.")

        return {
            "status": "success",
            "tool_name": tool_name,
            "query": None,
            "messages": messages,
            "results": {"exists": glossary_exists}
        }
    except Exception as e: # Catch potential errors while processing the list
        messages.append(f"An unexpected error occurred while processing glossary list: {e}")
        return {
            "status": "failed",
            "tool_name": tool_name,
            "query": None,
            "messages": messages,
            "results": None
        }


#### create_business_glossary

In [None]:
def create_business_glossary(glossary_id: str, display_name: str, description: str = "") -> dict:
    """
    Creates a Dataplex business glossary unless one with the same ID is already listed.

    The existing glossaries are listed via get_business_glossaries() and compared by
    full resource name. When a match is found nothing is created and the call still
    reports success.

    Args:
        glossary_id (str): The short name/ID for the new business glossary.
        display_name (str): The user-friendly display name for the glossary.
        description (str, optional): A brief description for the glossary. Defaults to "".

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "create_business_glossary",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {"name": ..., "created": False} when it already exists,
                       the API response when created, or None on failure
        }
    """
    tool_name = "create_business_glossary"
    messages = []

    def build_response(status, results):
        # Every exit path shares the same envelope; only status/results differ.
        return {
            "status": status,
            "tool_name": tool_name,
            "query": None,
            "messages": messages,
            "results": results
        }

    # Existence pre-check: list the glossaries and look for our full resource name.
    listing = get_business_glossaries()
    messages.extend(listing.get("messages", []))  # Carry over messages from the list operation

    if listing["status"] == "failed":
        return build_response("failed", None)

    known_glossaries = listing.get("results", {}).get("glossaries", [])
    target_name = f"projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}"

    if any(entry.get("name") == target_name for entry in known_glossaries):
        messages.append(f"Business glossary '{glossary_id}' already exists.")
        return build_response("success", {"name": target_name, "created": False})

    # Not listed, so create it. The glossary ID travels as a query parameter.
    messages.append(f"Creating business glossary '{glossary_id}'.")
    url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{business_glossary_region}/glossaries?glossaryId={glossary_id}"
    request_body = {
        "displayName": display_name,
        "description": description
    }

    try:
        api_response = rest_api_helper(url, "POST", request_body)
    except Exception as e:
        messages.append(f"An error occurred while creating the business glossary: {e}")
        return build_response("failed", None)

    messages.append(f"Successfully created business glossary '{glossary_id}'.")
    return build_response("success", api_response)

#### get_business_glossary

In [None]:
def get_business_glossary(glossary_id: str) -> dict:
    """
    Fetches one Dataplex business glossary from the configured project/region.

    Args:
        glossary_id (str): The ID of the business glossary to retrieve.

    Returns:
        dict: A dictionary containing the status and the details of the business glossary.
        {
            "status": "success" or "failed",
            "tool_name": "get_business_glossary",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... glossary object details ... } or None on failure
        }
    """
    messages = []
    # Resource URL of the single glossary
    url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}"

    try:
        glossary = rest_api_helper(url, "GET", None)
    except Exception as e:
        # Any failure of the REST call is reported through the envelope, not raised.
        messages.append(f"An error occurred while getting business glossary '{glossary_id}': {e}")
        status, glossary = "failed", None
    else:
        messages.append(f"Successfully retrieved details for business glossary '{glossary_id}' from the API.")
        status = "success"

    return {
        "status": status,
        "tool_name": "get_business_glossary",
        "query": None,
        "messages": messages,
        "results": glossary
    }

#### get_business_glossary_categories

In [None]:
def get_business_glossary_categories(glossary_id: str) -> dict:
    """
    Lists all categories within a specified Dataplex business glossary.

    The list endpoint is paged: each response may carry a "nextPageToken". This
    function keeps requesting pages (passing the token back as "pageToken") until
    no token is returned, so the result is the complete list rather than only the
    first page.

    Args:
        glossary_id (str): The ID of the parent business glossary.

    Returns:
        dict: A dictionary containing the status and the list of categories.
        {
            "status": "success" or "failed",
            "tool_name": "get_business_glossary_categories",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {
                "categories": [ ... list of category objects ... ]
            }
        }
        On any error "status" is "failed" and "results" is None.
    """
    # Local import: only this function needs it and the notebook's import cell is elsewhere.
    from urllib.parse import quote

    messages = []
    url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}/categories"

    try:
        categories = []
        page_token = None
        while True:
            # The first request uses the bare URL; follow-up requests add the page token.
            page_url = url if page_token is None else f"{url}?pageToken={quote(page_token, safe='')}"
            json_result = rest_api_helper(page_url, "GET", None)
            categories.extend(json_result.get("categories", []))

            page_token = json_result.get("nextPageToken")
            if not page_token:
                break

        messages.append(f"Successfully retrieved list of categories for glossary '{glossary_id}' from the API.")
        messages.append(f"Found {len(categories)} categories in glossary '{glossary_id}'.")

        return {
            "status": "success",
            "tool_name": "get_business_glossary_categories",
            "query": None,
            "messages": messages,
            "results": {"categories": categories}
        }
    except Exception as e:
        messages.append(f"An error occurred while listing categories for glossary '{glossary_id}': {e}")
        return {
            "status": "failed",
            "tool_name": "get_business_glossary_categories",
            "query": None,
            "messages": messages,
            "results": None
        }



#### get_business_glossary_category

In [None]:
def get_business_glossary_category(glossary_id: str, category_id: str) -> dict:
    """
    Fetches one category of a Dataplex business glossary.

    A lookup that fails is always reported as "failed" with results None; the only
    difference between "not found" and any other error is the message text. Callers
    in this file rely on the phrase "not found" in the messages to tell them apart.

    Args:
        glossary_id (str): The ID of the parent business glossary.
        category_id (str): The ID of the category to retrieve.

    Returns:
        dict: A dictionary containing the status and the details of the category.
        {
            "status": "success" or "failed",
            "tool_name": "get_business_glossary_category",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... category object details ... } or None
        }
    """
    messages = []
    url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}/categories/{category_id}"

    try:
        category = rest_api_helper(url, "GET", None)
    except Exception as e:
        error_text = str(e)
        # A "not found" scenario is recognised by the error text mentioning both 404 and "not found".
        looks_like_missing = "404" in error_text and "not found" in error_text.lower()
        if looks_like_missing:
            messages.append(f"Category '{category_id}' not found in glossary '{glossary_id}'.")
        else:
            messages.append(f"An error occurred while getting category '{category_id}' in glossary '{glossary_id}': {e}")
        status, category = "failed", None
    else:
        messages.append(f"Successfully retrieved details for category '{category_id}' in glossary '{glossary_id}' from the API.")
        status = "success"

    return {
        "status": status,
        "tool_name": "get_business_glossary_category",
        "query": None,
        "messages": messages,
        "results": category
    }


#### create_business_glossary_category

In [None]:
def create_business_glossary_category(glossary_id: str, category_id: str, display_name: str, description: str = "") -> dict:
    """
    Creates a category in a Dataplex business glossary unless it already exists.

    The existence pre-check goes through get_business_glossary_category(). Creation
    is attempted only when that check failed with a "not found" message; any other
    pre-check outcome is returned as a failure without calling the create API.

    Args:
        glossary_id (str): The ID of the parent business glossary.
        category_id (str): The short name/ID for the new category.
        display_name (str): The user-friendly display name for the category.
        description (str, optional): A brief description for the category. Defaults to "".

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "create_business_glossary_category",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {"name": ..., "created": False} when it already exists,
                       the API response when created, or None on failure
        }
    """
    tool_name = "create_business_glossary_category"
    messages = []

    def build_response(status, results):
        # Shared response envelope for every exit path.
        return {
            "status": status,
            "tool_name": tool_name,
            "query": None,
            "messages": messages,
            "results": results
        }

    # Pre-check: does the category already exist?
    probe = get_business_glossary_category(glossary_id, category_id)
    messages.extend(probe.get("messages", []))  # Keep the sub-call's messages in our log

    if probe["status"] == "success" and probe["results"]:
        existing_name = probe["results"].get("name")
        messages.append(f"Business glossary category '{category_id}' already exists in glossary '{glossary_id}'.")
        return build_response("success", {"name": existing_name, "created": False})

    # Only a failed lookup whose messages mention "not found" counts as "does not exist".
    confirmed_missing = probe["status"] == "failed" and "not found" in str(probe["messages"]).lower()
    if not confirmed_missing:
        messages.append(f"An unexpected error occurred during the existence check for category '{category_id}'.")
        return build_response("failed", None)

    messages.append(f"Pre-check confirmed category '{category_id}' does not exist. Proceeding with creation.")
    messages.append(f"Attempting to create business glossary category '{category_id}' in glossary '{glossary_id}'.")

    # The category is created directly under the glossary, so the glossary is its parent.
    parent_glossary_name = f"projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}"
    url = f"https://dataplex.googleapis.com/v1/{parent_glossary_name}/categories?categoryId={category_id}"
    request_body = {
        "displayName": display_name,
        "description": description,
        "parent": parent_glossary_name
    }

    try:
        api_response = rest_api_helper(url, "POST", request_body)
    except Exception as e:
        messages.append(f"An error occurred while creating the business glossary category '{category_id}': {e}")
        return build_response("failed", None)

    messages.append(f"Successfully created business glossary category '{category_id}' in glossary '{glossary_id}'.")
    return build_response("success", api_response)

#### get_business_glossary_term

In [None]:
def get_business_glossary_term(glossary_id: str, term_id: str) -> dict:
    """
    Fetches one term of a Dataplex business glossary.

    Every failed lookup returns status "failed" with results None; a missing term
    differs from other errors only by its "not found" message.

    Args:
        glossary_id (str): The ID of the parent business glossary.
        term_id (str): The ID of the term to retrieve.

    Returns:
        dict: A dictionary containing the status and the details of the term.
        {
            "status": "success" or "failed",
            "tool_name": "get_business_glossary_term",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... term object details ... } or None
        }
    """
    messages = []
    url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}/terms/{term_id}"

    def build_response(status, results):
        return {
            "status": status,
            "tool_name": "get_business_glossary_term",
            "query": None,
            "messages": messages,
            "results": results
        }

    try:
        term = rest_api_helper(url, "GET", None)
        messages.append(f"Successfully retrieved details for term '{term_id}' in glossary '{glossary_id}' from the API.")
        return build_response("success", term)
    except Exception as e:
        error_text = str(e)
        # The error text must mention both 404 and "not found" to be reported as a missing term.
        if "404" not in error_text or "not found" not in error_text.lower():
            messages.append(f"An error occurred while getting term '{term_id}' in glossary '{glossary_id}': {e}")
        else:
            messages.append(f"Term '{term_id}' not found in glossary '{glossary_id}'.")
        return build_response("failed", None)



#### list_business_glossary_terms

In [None]:
def list_business_glossary_terms(glossary_id: str) -> dict:
    """
    Lists all terms within a specified Dataplex business glossary.

    The list endpoint is paged: each response may carry a "nextPageToken". This
    function keeps requesting pages (passing the token back as "pageToken") until
    no token is returned, so glossaries with more terms than fit in one page are
    returned in full.

    Args:
        glossary_id (str): The ID of the parent business glossary.

    Returns:
        dict: A dictionary containing the status and the list of terms.
        {
            "status": "success" or "failed",
            "tool_name": "list_business_glossary_terms",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {
                "terms": [ ... list of term objects ... ]
            }
        }
        On any error "status" is "failed" and "results" is None.
    """
    # Local import: only this function needs it and the notebook's import cell is elsewhere.
    from urllib.parse import quote

    messages = []
    url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}/terms"

    try:
        terms = []
        page_token = None
        while True:
            # The first request uses the bare URL; follow-up requests add the page token.
            page_url = url if page_token is None else f"{url}?pageToken={quote(page_token, safe='')}"
            json_result = rest_api_helper(page_url, "GET", None)
            terms.extend(json_result.get("terms", []))

            page_token = json_result.get("nextPageToken")
            if not page_token:
                break

        messages.append(f"Successfully retrieved list of terms for glossary '{glossary_id}' from the API.")
        messages.append(f"Found {len(terms)} terms in glossary '{glossary_id}'.")

        return {
            "status": "success",
            "tool_name": "list_business_glossary_terms",
            "query": None,
            "messages": messages,
            "results": {"terms": terms}
        }
    except Exception as e:
        messages.append(f"An error occurred while listing terms for glossary '{glossary_id}': {e}")
        return {
            "status": "failed",
            "tool_name": "list_business_glossary_terms",
            "query": None,
            "messages": messages,
            "results": None
        }


#### create_business_glossary_term

In [None]:
def create_business_glossary_term(
    glossary_id: str,
    term_id: str,
    display_name: str,
    description: str = "",
    parent_category_id: str = None
) -> dict:
    """
    Creates a term in a Dataplex business glossary unless it already exists.

    The term's parent is the glossary itself, or one of its categories when
    parent_category_id is given. Existence is checked first through
    get_business_glossary_term(); creation is attempted only when that check
    failed with a "not found" message.

    Args:
        glossary_id (str): The ID of the parent business glossary.
        term_id (str): The short name/ID for the new term.
        display_name (str): The user-friendly display name for the term.
        description (str, optional): A brief description for the term. Defaults to "".
        parent_category_id (str, optional): The ID of the parent category if the term
                                             should be nested. Defaults to None (term under glossary).

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "create_business_glossary_term",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": {"name": ..., "created": False} when it already exists,
                       the API response when created, or None on failure
        }
    """
    tool_name = "create_business_glossary_term"
    messages = []

    def build_response(status, results):
        # Shared response envelope for every exit path.
        return {
            "status": status,
            "tool_name": tool_name,
            "query": None,
            "messages": messages,
            "results": results
        }

    # Pre-check: does the term already exist?
    probe = get_business_glossary_term(glossary_id, term_id)
    messages.extend(probe.get("messages", []))

    if probe["status"] == "success" and probe["results"]:
        existing_name = probe["results"].get("name")
        messages.append(f"Business glossary term '{term_id}' already exists in glossary '{glossary_id}'.")
        return build_response("success", {"name": existing_name, "created": False})

    # Only a failed lookup whose messages mention "not found" counts as "does not exist".
    confirmed_missing = probe["status"] == "failed" and "not found" in str(probe["messages"]).lower()
    if not confirmed_missing:
        messages.append(f"An unexpected error occurred during the existence check for term '{term_id}'.")
        return build_response("failed", None)

    messages.append(f"Pre-check confirmed term '{term_id}' does not exist. Proceeding with creation.")
    messages.append(f"Attempting to create business glossary term '{term_id}' in glossary '{glossary_id}'.")

    glossary_path = f"projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}"
    url = f"https://dataplex.googleapis.com/v1/{glossary_path}/terms?termId={term_id}"

    # The parent resource is the category when one was requested, otherwise the glossary.
    if not parent_category_id:
        parent_resource = glossary_path
        messages.append("Term will be created directly under the glossary.")
    else:
        parent_resource = f"{glossary_path}/categories/{parent_category_id}"
        messages.append(f"Term will be nested under category: '{parent_category_id}'.")

    request_body = {
        "displayName": display_name,
        "description": description,
        "parent": parent_resource
    }

    try:
        api_response = rest_api_helper(url, "POST", request_body)
    except Exception as e:
        messages.append(f"An error occurred while creating the business glossary term '{term_id}': {e}")
        return build_response("failed", None)

    messages.append(f"Successfully created business glossary term '{term_id}'.")
    return build_response("success", api_response)

#### update_business_glossary_term

In [None]:
def update_business_glossary_term(
    glossary_id: str,
    term_id: str,
    display_name: str = None,
    description: str = None
) -> dict:
    """
    Patches an existing Dataplex business glossary term.

    Only arguments that are not None are sent, and the same field names are listed
    in the request's update mask. When neither field is supplied no request is made
    and the call reports success with {"updated": False}.

    Args:
        glossary_id (str): The ID of the parent business glossary.
        term_id (str): The ID of the term to update.
        display_name (str, optional): The new user-friendly display name for the term.
        description (str, optional): The new brief description for the term.

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "update_business_glossary_term",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... response from the API call ... }
        }
    """
    tool_name = "update_business_glossary_term"
    messages = []
    url = f"https://dataplex.googleapis.com/v1/projects/{project_id}/locations/{business_glossary_region}/glossaries/{glossary_id}/terms/{term_id}"

    # API field name -> supplied value; None means "leave this field untouched".
    supplied = (("displayName", display_name), ("description", description))
    request_body = {field: value for field, value in supplied if value is not None}
    update_mask_fields = list(request_body)

    if len(update_mask_fields) == 0:
        messages.append("No fields provided for update. Skipping update operation.")
        return {
            "status": "success",
            "tool_name": tool_name,
            "query": None,
            "messages": messages,
            "results": {"updated": False, "reason": "No fields to update"}
        }

    # The mask tells the API which of the body's fields to apply.
    url_with_mask = url + "?update_mask=" + ",".join(update_mask_fields)

    messages.append(f"Attempting to update business glossary term '{term_id}' with fields: {update_mask_fields}.")

    try:
        api_response = rest_api_helper(url_with_mask, "PATCH", request_body)
    except Exception as e:
        messages.append(f"An error occurred while updating the business glossary term '{term_id}': {e}")
        outcome, api_response = "failed", None
    else:
        messages.append(f"Successfully updated business glossary term '{term_id}'.")
        outcome = "success"

    return {
        "status": outcome,
        "tool_name": tool_name,
        "query": None,
        "messages": messages,
        "results": api_response
    }

### <font color='#4285f4'>MAIN CODE - Knowledge Engine</font>

In [None]:
# Settings for the knowledge engine scan. These module-level names are read by the
# cells below: the scan name identifies the scan in every scan helper call, and the
# display name and dataset name are passed to create_knowledge_engine_scan.
knowledge_engine_scan_name = "agentic-beans-curated-knowledge-scan-01"
knowledge_engine_display_name = "Agents Beans Curated Knowledge Scan"
# Also passed to update_bigquery_dataset_dataplex_labels further down.
bigquery_dataset_name = "agentic_beans_curated"

In [None]:
# Call create_knowledge_engine_scan with the scan name, display name and dataset
# defined above; the response is kept so the next cell can display it.
create_knowledge_engine_scan_response = create_knowledge_engine_scan(
    knowledge_engine_scan_name,
    knowledge_engine_display_name,
    bigquery_dataset_name)

In [None]:
# Display the create response (a bare expression on a cell's last line is rendered by the notebook).
create_knowledge_engine_scan_response

In [None]:
# Start the scan; later cells read results.job.name from this response to poll the job.
start_knowledge_engine_scan_response = start_knowledge_engine_scan(knowledge_engine_scan_name)

In [None]:
# Display the start response. To find out whether the scan has completed, use the
# state cells below (they can be re-run, or looped, until the job finishes).
start_knowledge_engine_scan_response

In [None]:
# Fetch the current state of the job started above, identified by the job name
# (results.job.name) found in the start response.
scan_job_response = get_knowledge_engine_scan_state(start_knowledge_engine_scan_response["results"]["job"]["name"])

In [None]:
# Display the job state response fetched in the previous cell.
scan_job_response

In [None]:
# Wait for the scan job to complete, polling its state every 5 seconds.
# The job name is looked up once instead of on every poll.
scan_job_name = start_knowledge_engine_scan_response["results"]["job"]["name"]

# States that mean "not finished yet"; any other state ends the wait.
in_progress_states = {"PENDING", "STATE_UNSPECIFIED", "RUNNING", "CANCELING"}

# "results" is read defensively: the helpers visible in this notebook return
# results=None when a call fails (assumed to hold for get_knowledge_engine_scan_state
# as well), and indexing None would abort the cell with a TypeError.
while (scan_job_response.get("results") or {}).get("state") in in_progress_states:
    time.sleep(5)
    # Get the latest state
    scan_job_response = get_knowledge_engine_scan_state(scan_job_name)
    print("State: ", (scan_job_response.get("results") or {}).get("state"))

if not scan_job_response.get("results"):
    print("ERROR: Could not read the scan job state: ", scan_job_response.get("messages"))

In [None]:
# Update the BigQuery user interface to point the scan
update_bigquery_dataset_dataplex_labels_response = update_bigquery_dataset_dataplex_labels(
    knowledge_engine_scan_name,
    bigquery_dataset_name)

In [None]:
# Display the response from the label update.
update_bigquery_dataset_dataplex_labels_response

### <font color='#4285f4'>MAIN CODE - Business Glossary (Using Knowledge Engine)</font>

In [None]:
# Settings for the business glossary.
# business_glossary_region is read as a module-level global by the glossary helper
# functions when they build Dataplex resource names and URLs, so it must be set
# before any of them is called.
business_glossary_region = "global"
# ID, display name and description passed to the glossary helpers in the cells below.
glossary_id = "agentic-beans-glossary-01"
glossary_display_name = "Agentic Beans Glossary 01"
glossary_description = "Holds the terms and synonyms for the Agentic Beans coffee company"

In [None]:
# List the business glossaries and pretty-print the full response envelope.
get_business_glossaries_response = get_business_glossaries()
print(f"get_business_glossaries_response: {json.dumps(get_business_glossaries_response, indent=2)}")

In [None]:
# Check whether our glossary already exists; the next cell reads results.exists from this response.
exists_business_glossary_response = exists_business_glossary(glossary_id)
print(f"exists_business_glossary_response: {json.dumps(exists_business_glossary_response, indent=2)}")

In [None]:
# Create the glossary when the existence check above did not find it.
if not exists_business_glossary_response["results"]["exists"]:
    create_business_glossary_response = create_business_glossary(glossary_id, glossary_display_name, glossary_description)
    # Print the response of the create call itself (not the earlier existence check).
    print(f"create_business_glossary_response: {json.dumps(create_business_glossary_response, indent=2)}")

- We want to create categories under our glossary
- We also need to create our terms
- We will load the terms from the Knowledge Engine scan (the Knowledge Engine section above must be run before this section)
- To create the categories we will pass the terms to Gemini

In [None]:
# Print a Cloud Console link to the Dataplex glossaries page for this project.
# It can take a minute for the glossary to show up there.
print(f"https://console.cloud.google.com/dataplex/dp-glossaries?project={project_id}")

In [None]:
# Check that the knowledge engine scan exists; the next cell reads results.exists from this response.
exists_knowledge_engine_scan_response = exists_knowledge_engine_scan(knowledge_engine_scan_name)
print(f"exists_knowledge_engine_scan_response: {json.dumps(exists_knowledge_engine_scan_response, indent=2)}")

In [None]:
# The glossary terms are loaded from the knowledge engine scan, so the scan must exist.
if exists_knowledge_engine_scan_response["results"]["exists"]:
    print("The Knowledge Engine scan exists")
else:
    print("ERROR: You must run the Knowledge Engine notebook first")

In [None]:
# Fetch the scan; later cells read the generated glossary terms from
# results.knowledgeEngineResult.datasetResult.businessGlossary.terms in this response.
get_knowledge_engine_scan_response = get_knowledge_engine_scan(knowledge_engine_scan_name)

In [None]:
# Pretty-print the full scan response.
# This is a large output, you can clear the cell after viewing.
# The terms are under results.knowledgeEngineResult.datasetResult.businessGlossary.terms[]
print(f"get_knowledge_engine_scan_response: {json.dumps(get_knowledge_engine_scan_response, indent=2)}")

In [None]:
# JSON schema passed to Gemini as the response schema: an array of categories,
# each carrying the titles of the terms assigned to it.
response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "category_id": {"type": "string", "description": "Unique, kebab-case identifier for the category."},
            "display_name": {"type": "string", "description": "User-friendly name for the category (alphanumeric, underscores, hyphens, spaces only, no leading/trailing spaces, no commas or ampersands)."},
            "description": {"type": "string", "description": "Brief description of the category."},
            "terms": {"type": "array", "description": "The terms that belong to this category.", "items": {"type": "string"} }
        },
        "required": ["category_id", "display_name", "description", "terms"]
    }
}

# Reduce each scanned term to its title and description; that is all the prompt needs.
# A comprehension is used so the loop variable is never rebound to the new dict.
scanned_terms = get_knowledge_engine_scan_response["results"]["knowledgeEngineResult"]["datasetResult"]["businessGlossary"]["terms"]
terms_list = [
    {"title": scanned_term["title"], "description": scanned_term["description"]}
    for scanned_term in scanned_terms
]

# Serialized once here and embedded in the prompt in the next cell.
terms_str = json.dumps(terms_list)

In [None]:
# Ask Gemini to propose categories and to assign every scanned term to exactly one
# of them. The wording states the count rule as a total across all categories
# (the next cell compares that total with len(terms_list)) and asks for titles to be
# returned verbatim, because the term descriptions are later looked up by exact title.
prompt = f"""You are an expert in data governance and business glossary creation.
Based on the following list of business terms and their descriptions, propose 2 to 10 distinct business glossary categories.
Then place each term into its respective category.
Each term should be in one and only one category.
You must place every term in a category.
You are being given {len(terms_list)} terms.
The total number of terms across the 'terms' arrays of all categories must equal {len(terms_list)}.

Each category should have:
1.  A 'category_id' (string): A unique, kebab-case identifier (e.g., "sales-operations", "customer-data").
2.  A 'display_name' (string): A user-friendly name, which must ONLY contain alphabets, numbers, underscores, hyphens, and/or spaces. The first and last character cannot be a space. Do not use commas or ampersands.
3.  A 'description' (string): A brief explanation of what terms would belong to this category.
4.  A 'terms' (array): A list of the term titles that belong to the category. Use each title exactly as it is given below.

Ensure the 'display_name' strictly adheres to the specified format.

Here are the terms:
{terms_str}
"""

gemini_response = GeminiLLM(prompt, model = "gemini-2.5-pro", response_schema=response_schema)

In [None]:
# Parse Gemini's JSON answer and print each proposed category with its terms.
gemini_response_dict = json.loads(gemini_response)
terms_count = 0

for item in gemini_response_dict:
    category_id, display_name, description, terms = (
        item["category_id"],
        item["display_name"],
        item["description"],
        item["terms"],
    )
    summary_lines = [
        f"category_id: {category_id}",
        f"display_name: {display_name}",
        f"description: {description}",
        f"terms: {terms}",
        f"len(terms): {len(terms)}",
    ]
    print("\n".join(summary_lines))
    # Running total of terms across all categories, compared with the input size below.
    terms_count = terms_count + len(terms)

# NOTE: Not all terms might get returned by Gemini
print(f"terms list size: {len(terms_list)}, term count from gemini {terms_count}")

In [None]:
# Create the categories
for item in gemini_response_dict:
  category_id = item["category_id"]
  display_name = item["display_name"]
  description = item["description"]

  # This will check for existance before creating
  print(f"Creating category_id ({category_id})")
  create_business_glossary_category_response = create_business_glossary_category(glossary_id,category_id,display_name,description)
  print(f"create_business_glossary_category_response ({create_business_glossary_category_response})")
  print()

In [None]:
def get_term_description(get_knowledge_engine_scan_response, term_title_to_find):
    """
    Looks up a term by exact title in the knowledge engine scan response and
    returns its description.

    The terms are read from
    results.knowledgeEngineResult.datasetResult.businessGlossary.terms. The first
    term whose "title" equals term_title_to_find wins.

    Args:
        get_knowledge_engine_scan_response (dict): The full response dictionary from the knowledge engine scan.
        term_title_to_find (str): The title of the term whose description is needed.

    Returns:
        str or None: The matching term's description; None when no term matches, or
        when the response lacks the expected keys/structure (an error line is printed
        in that case).
    """
    try:
        results = get_knowledge_engine_scan_response["results"]
        glossary = results["knowledgeEngineResult"]["datasetResult"]["businessGlossary"]
        # Lazily yield descriptions of matching terms; only the first one is consumed.
        matching_descriptions = (
            entry.get("description")
            for entry in glossary["terms"]
            if entry.get("title") == term_title_to_find
        )
        # Default of None covers the "no such title" case.
        return next(matching_descriptions, None)
    except KeyError as e:
        print(f"Error: Missing key in response data: {e}")
    except TypeError:
        print("Error: Invalid response data structure. 'terms' might not be an array or other keys are missing.")
    return None


In [None]:
# Create the terms under their categories
import re  # Local to this cell; used only to build valid term ids.

for item in gemini_response_dict:
    category_id = item["category_id"]
    for term in item["terms"]:
        term_title = term

        # Field term_id must contain only lowercase letters, numbers, and/or hyphens.
        # Lowercase the title and turn every other character (spaces, punctuation,
        # underscores, ...) into a hyphen. Titles made only of letters, digits, spaces
        # and hyphens get exactly the same id as a plain space-to-hyphen replacement.
        term_id = re.sub(r"[^a-z0-9-]", "-", term_title.lower())

        # The lookup returns None when Gemini's title does not match a scanned term;
        # fall back to an empty description rather than sending None to the API.
        term_description = get_term_description(get_knowledge_engine_scan_response, term_title) or ""
        print(f"category_id ({category_id}) | term_id ({term_id}) | term ({term_title}) | description ({term_description})")

        create_business_glossary_term_response = create_business_glossary_term(glossary_id, term_id, term_title, term_description, category_id)
        print(f"create_business_glossary_term_response ({create_business_glossary_term_response})")
        print()
