In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Module 02c: Data Documentation Scan

In the previous notebook, you ran Data Documentation Scans against individual tables. In this notebook, you will run a Data Documentation Scan against the dataset itself - rscw_oltp_stg_ds.

**Motivation:** <br>

A Data Documentation Scan run against a dataset generates:
1. A dataset description
2. Inferred relationships between tables
3. Golden queries spanning tables in the dataset (versus just a single table)

<br>

Note: This feature was previously called Knowledge Engine Scan.

<br>


**Prerequisites:** <br>
Complete the prerequisites/dependencies detailed in the user module associated with this notebook.


In [None]:
import requests, json, time, logging
import google.auth
import google.auth.transport.requests
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
from google.api_core.exceptions import NotFound
from google.api_core import exceptions
from urllib.parse import urlencode # Import urlencode



def get_access_token():
    """
    Obtains an access token through Application Default Credentials (ADC).

    Credentials are resolved with google.auth.default() for the module-level
    SCOPES and then refreshed so that a current token is available.

    Returns:
        str: The access token, or None if any step raised an exception
             (the error is printed instead of propagated).
    """
    try:
        # ADC finds credentials set via `gcloud auth application-default login`
        # or supplied by the environment when running on GCP.
        adc_credentials, _ = google.auth.default(scopes=SCOPES)

        # Refreshing makes sure the token on the credentials object is up to date.
        adc_credentials.refresh(google.auth.transport.requests.Request())
        return adc_credentials.token
    except Exception as token_err:
        print(f"Error generating access token: {token_err}")
        return None

def patch_source_table_with_labels(access_token, dataset_id,scan_type, scan_id,  source_table_nm):
    """
    Patches the source BigQuery table with labels that correlate it with a data scan.

    The label payload comes from generate_patch_label_request_body() and is sent
    with an HTTP PATCH to the BigQuery tables endpoint of the module-level PROJECT_ID.

    Args:
        access_token (str): The Google Cloud access token
        dataset_id (str): The dataset id of the source table
        scan_type (str): Type of scan (DATA_PROFILE_SCAN/DATA_DOCUMENTATION_SCAN/DATA_KNOWLEDGE_ENGINE_SCAN)
        scan_id (str): The scan id
        source_table_nm (str): The source table name

    Returns:
        str | None: An error message when the access token is missing; otherwise
        None (HTTP and connection errors are printed, not raised).
    """
    if not access_token:
        msg = "Access token is missing. Cannot proceed with API call."
        # Print as well as return so the failure is visible even when the
        # caller ignores the return value.
        print(msg)
        return msg

    # No query parameters are sent, so the URL carries no trailing '?'.
    api_endpoint = f"https://bigquery.googleapis.com/bigquery/v2/projects/{PROJECT_ID}/datasets/{dataset_id}/tables/{source_table_nm}"

    patch_request_body = generate_patch_label_request_body(scan_type, scan_id)

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # Apply the patch
    try:
        response = requests.patch(api_endpoint, headers=headers, json=patch_request_body)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")
    return None


def generate_scan_request_body(dataset_id,scan_id, scan_type,
                                            source_table_nm, profile_results_table_nm):
    """
    Builds the request payload used to create a data scan.

    Args:
        dataset_id (str): Id of the BigQuery dataset being scanned
        scan_id (str): scan id, used as the scan's display name
        scan_type (str): Type of scan (DATA_PROFILE_SCAN/DATA_DOCUMENTATION_SCAN/DATA_KNOWLEDGE_ENGINE_SCAN)
        source_table_nm (str): Source table name (not used for DATA_KNOWLEDGE_ENGINE_SCAN,
                               which targets the dataset itself)
        profile_results_table_nm (str): The profile results table name (only used for
                                        DATA_PROFILE_SCAN)

    Returns:
        dict: The request body for the data scan API call; an empty dict when
              scan_type is not one of the three recognized values.
    """
    if scan_type == "DATA_PROFILE_SCAN":
        # Profile results are exported to a table in the scan results dataset.
        return {
            "displayName": f"{scan_id}",
            "data": {
                "resource": f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{dataset_id}/tables/{source_table_nm}"
            },
            "dataProfileSpec": {
                "postScanActions": {
                    "bigqueryExport": {
                        "resultsTable": f"projects/{PROJECT_ID}/datasets/{SCAN_RESULTS_BQ_DATASET_ID}/tables/{profile_results_table_nm}"
                    }
                }
            },
            # Run on demand for this example
            "executionSpec": {"trigger": {"onDemand": {}}},
        }

    if scan_type == "DATA_DOCUMENTATION_SCAN":
        # Table-level documentation scan.
        return {
            "displayName": f"{scan_id}",
            "type": "DATA_DOCUMENTATION",
            "dataDocumentationSpec": {},
            "data": {
                "resource": f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{dataset_id}/tables/{source_table_nm}"
            },
            # Run on demand for this example
            "executionSpec": {"trigger": {"onDemand": {}}},
        }

    if scan_type == "DATA_KNOWLEDGE_ENGINE_SCAN":
        # Dataset-level documentation scan: the resource stops at the dataset.
        return {
            "displayName": f"{scan_id}",
            "type": "DATA_DOCUMENTATION",
            "dataDocumentationSpec": {},
            "dataDocumentationResult": {},
            "data": {
                "resource": f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{dataset_id}"
            },
            # Run on demand for this example
            "executionSpec": {"trigger": {"onDemand": {}}},
        }

    # Unrecognized scan type: nothing to send.
    return {}

def generate_patch_label_request_body(scan_type, scan_id):
    """
    Builds the labels payload that is attached to the scanned resource so that
    programmatically created scans are tied to the UI.

    Args:
        scan_type (str): Type of scan (DATA_PROFILE_SCAN/DATA_DOCUMENTATION_SCAN/DATA_KNOWLEDGE_ENGINE_SCAN)
        scan_id (str): Scan id

    Returns:
        dict: {"labels": {...}} with the published scan, project and location labels
    """
    # Label-key infix per scan type; both documentation flavours share one infix.
    # An unrecognized scan type falls back to an empty infix.
    stub_by_scan_type = {
        "DATA_PROFILE_SCAN": "dp",
        "DATA_DOCUMENTATION_SCAN": "data-documentation",
        "DATA_KNOWLEDGE_ENGINE_SCAN": "data-documentation",
    }
    scan_stub = stub_by_scan_type.get(scan_type, "")

    published_labels = {
        f"dataplex-{scan_stub}-published-scan": f"{scan_id}",
        f"dataplex-{scan_stub}-published-project": f"{PROJECT_ID}",
        f"dataplex-{scan_stub}-published-location": f"{LOCATION}",
    }
    return {"labels": published_labels}


def get_scan_api_endpoint(scan_operation_type,scan_id):
    """
    Builds the data scan API URL for the requested operation.

    Args:
        scan_operation_type (str): CREATE_SCAN or LIST_SCAN; any other value is
                                   treated as a request to run the scan
        scan_id (str): Scan id

    Returns:
        string: API endpoint
    """
    # Creation has its own prefix; the scan id is appended directly to it.
    if scan_operation_type == "CREATE_SCAN":
        return f"{DATA_SCAN_API_CREATE_ENDPOINT_PREFIX}{scan_id}"

    # LIST_SCAN addresses the scan resource itself; everything else appends ':run'.
    action_suffix = "" if scan_operation_type == "LIST_SCAN" else ":run"
    return f"{DATA_SCAN_API_EXECUTION_ENDPOINT_PREFIX}{scan_id}{action_suffix}"

def check_if_scan_already_exists(access_token,scan_id):
    """
    Looks up a data scan by id to find out whether it already exists.

    Args:
        access_token (str): token
        scan_id (str): Scan id, appended to DATA_SCAN_API_EXECUTION_ENDPOINT_PREFIX

    Returns:
        string: "EXISTS_ALREADY" when the lookup succeeds; "NOT_FOUND" when the
                API answers with an HTTP error (or the body carries a top-level
                "status"); "NOT_FOUND2" when the request itself failed.
                None when the access token is missing.
    """
    if not access_token:
        print("Access token is missing. Cannot proceed with API call.")
        return

    # A GET carries no payload, so neither a body nor a Content-Type is sent.
    headers = {
        "Authorization": f"Bearer {access_token}",
    }

    scan_get_api_endpoint = DATA_SCAN_API_EXECUTION_ENDPOINT_PREFIX + scan_id

    try:
        response = requests.get(scan_get_api_endpoint, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors
        response_json = response.json()

        print(f"\nAPI Call Successful! Status Code: {response.status_code}")

        # A successful lookup returns the scan resource, which normally has no
        # top-level "status" key; use .get() so that case does not raise KeyError.
        # A populated "status" is still treated as "not found", as before.
        if response_json.get("status"):
            return "NOT_FOUND"
        return "EXISTS_ALREADY"

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {response.text}")
        return "NOT_FOUND"
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")
        return "NOT_FOUND2"



def create_scan_synchronous(access_token,dataset_id, scan_id, scan_type, source_table_nm, scan_results_table_nm):
    """
    Creates a data scan and waits for the create operation to finish.

    Args:
        access_token (str): token
        dataset_id (str): Id of the BigQuery dataset being scanned
        scan_id (str): Scan id
        scan_type (str): DATA_PROFILE_SCAN/DATA_DOCUMENTATION_SCAN/DATA_KNOWLEDGE_ENGINE_SCAN
        source_table_nm (str): Name of source table in BQ
        scan_results_table_nm (str): Name of profile results table in BQ

    Returns:
        dict: The result returned by poll_data_scan_operation() when the scan was
        created; None if the token is missing, the scan already exists, the
        request failed, or the operation failed / timed out.
    """
    if not access_token:
        print("Access token is missing. Cannot proceed with API call.")
        return None

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    scan_request_body = generate_scan_request_body(dataset_id,scan_id, scan_type, source_table_nm, scan_results_table_nm)
    scan_create_api_endpoint = get_scan_api_endpoint("CREATE_SCAN",scan_id)
    print(f"\nCalling API: {scan_create_api_endpoint}")
    print(f"Request Body: {json.dumps(scan_request_body, indent=2)}")

    try:
        # Creating a scan is a POST request. It returns a Long Running Operation (LRO).
        response = requests.post(scan_create_api_endpoint, headers=headers, json=scan_request_body)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Extract the operation name to poll
        operation_name = response.json().get("name")
        if not operation_name:
            print("Could not find 'name' in the initial response. Cannot poll for completion.")
            return None

        # Poll for the completion of the operation
        final_result = poll_data_scan_operation("CREATE_SCAN",operation_name, access_token)
        if final_result:
            print(f"Scan successfully created. Operation name: {operation_name}")
        else:
            print(f"Scan create operation did not complete successfully or timed out. Operation name: {operation_name}")
        return final_result

    except requests.exceptions.HTTPError as http_err:
        # HTTP 409 (Conflict) is how an already existing scan id is reported;
        # treat it as "nothing to do" so that re-running the notebook is harmless.
        if response.status_code == 409:
            print("Scan already exists; Skipping creation..")
            return None
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")
    return None

def poll_data_scan_operation(operation_type: str, operation_name: str, access_token: str, poll_interval_seconds=30, timeout_minutes=30):
    """
    Polls a Dataplex resource until it reaches a terminal state or the timeout expires.

    For operation_type "CREATE_SCAN" the polled resource is treated as a Long
    Running Operation and is finished once its "done" field is truthy. For any
    other operation_type the resource is judged by its "state" field.

    Args:
        operation_type (str): "CREATE_SCAN", or any other value for a scan run
        operation_name (str): The full resource name returned by the create/run
                              API call; it is appended to the Dataplex v1 base URL.
        access_token (str): The Google Cloud access token.
        poll_interval_seconds (int): How often to poll the API, in seconds.
        timeout_minutes (int): Maximum time to wait for the operation to complete, in minutes.

    Returns:
        dict: The "response" of the finished operation if present, otherwise the
        full status payload; None if the operation reported an error, ended in a
        failed/cancelled state, the HTTP call failed, or the timeout expired.
    """
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # The operation_name already contains the full resource path, so it is
    # appended to the API base URL as-is.
    operation_api_url = f"https://dataplex.googleapis.com/v1/{operation_name}"

    # States that end polling for a scan run. Without the failure set a failed
    # or cancelled run would be polled until the timeout expired.
    success_states = {"COMPLETED", "SUCCEEDED", "DONE"}
    failure_states = {"FAILED", "CANCELLED"}

    print(f"\nPolling operation: {operation_name}")
    start_time = time.time()

    while (time.time() - start_time) < (timeout_minutes * 60):
        try:
            response = requests.get(operation_api_url, headers=headers)
            response.raise_for_status()
            operation_status = response.json()
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred during polling: {http_err}")
            print(f"Response Body: {response.text}")
            return None
        except requests.exceptions.RequestException as req_err:
            print(f"An error occurred during polling the operation: {req_err}")
            return None

        if operation_type == "CREATE_SCAN":
            finished = bool(operation_status.get("done"))
        else:
            # .get() tolerates a payload without "state" (treated as still running).
            job_state = str(operation_status.get("state", "")).upper()
            if job_state in failure_states:
                print(f"Operation {operation_name} ended in state {job_state}: {operation_status.get('message', 'no message provided')}")
                return None
            finished = job_state in success_states

        if not finished:
            print(f"Operation {operation_name} still running. Retrying in {poll_interval_seconds} seconds...")
            time.sleep(poll_interval_seconds)
            continue

        print(f"Operation {operation_name} completed.")
        if "error" in operation_status:
            print(f"Operation failed with error: {operation_status['error']}")
            return None
        if "response" in operation_status:
            print(f"Operation succeeded. Result: {json.dumps(operation_status['response'], indent=2)}")
            return operation_status["response"]
        print("Operation finished, but no explicit response or error found.")
        return operation_status # Return the full status for further inspection

    print(f"Polling timed out after {timeout_minutes} minutes for operation: {operation_name}")
    return None


def run_scan_synchronous(access_token, scan_id):
    """
    Runs a previously created scan and polls the resulting job until it finishes.

    Args:
        access_token (str): token
        scan_id (str): Id of the precreated scan

    Returns:
        dict: The result returned by poll_data_scan_operation() when the run
        finished; None if the token is missing, the request failed, the response
        carried no job name, or the run failed / timed out.
    """
    if not access_token:
        print("Access token is missing. Cannot proceed with API call.")
        return None

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # The request body for a ':run' operation is typically empty for on-demand execution.
    request_body = {}

    scan_run_api_endpoint = get_scan_api_endpoint("RUN_SCAN",scan_id)

    print(f"\nAttempting to run scan: {scan_id} at {scan_run_api_endpoint}")

    try:
        # The ':run' method is a POST request; the job to poll is read from response['job']['name'].
        response = requests.post(scan_run_api_endpoint, headers=headers, json=request_body)
        response.raise_for_status()  # Raise an exception for HTTP errors
        initial_response = response.json()

        # Use .get() so a response without a job yields the message below
        # instead of an uncaught KeyError.
        operation_name = (initial_response.get("job") or {}).get("name")
        if not operation_name:
            print("Could not find 'name' in the initial response. Cannot poll for completion.")
            return None

        # Poll for the completion of the job
        final_result = poll_data_scan_operation("RUN_SCAN",operation_name, access_token)
        if final_result:
            print(f"\nScan successfully completed and results obtained. Operation name: {operation_name}")
            print(final_result)
        else:
            print(f"\nScan operation did not complete successfully or timed out. Operation name: {operation_name}")
        return final_result

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")
    return None

def run_scan_async(access_token, scan_id):
    """
    Triggers a run of a previously created scan without waiting for it to finish.

    The initial API response is printed; nothing is polled.

    Args:
        access_token (str): token
        scan_id (str): Id of the precreated scan

    Returns:
        None in every case (errors are printed, not raised).
    """
    if not access_token:
        print("Access token is missing. Cannot proceed with API call.")
        return

    run_url = get_scan_api_endpoint("RUN_SCAN",scan_id)
    auth_headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    print(f"\nAttempting to run scan: {scan_id} at {run_url}")

    try:
        # ':run' is a POST; an empty JSON body is sent for on-demand execution.
        run_response = requests.post(run_url, headers=auth_headers, json={})
        run_response.raise_for_status()  # Raise an exception for HTTP errors
        print(f"initial_response: {run_response.json()}")
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {run_response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")


def list_scans(access_token, scan_type):
    """
    Lists the data scans of PROJECT_ID/LOCATION as a markdown table, optionally
    filtered by scan type and restricted to the datasets in scope.

    Args:
        access_token (str): token
        scan_type (str): DATA_PROFILE_SCAN/DATA_DOCUMENTATION_SCAN/DATA_KNOWLEDGE_ENGINE_SCAN,
                         or "ALL" (or any unrecognized value) for no type filter

    Returns:
        str: The markdown table (header plus one row per in-scope scan) if scans
             were returned, "No data scans found in this location." if none were,
             or an error message string.
    """
    if not access_token:
        msg ="Access token is missing. Cannot proceed with API call."
        print(msg)
        return msg

    headers = {
        "Authorization": f"Bearer {access_token}",
    }

    scan_list_api_endpoint = f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/dataScans"

    scan_type_map = {
        "DATA_PROFILE_SCAN": "DATA_PROFILE",
        "DATA_DOCUMENTATION_SCAN": "DATA_DOCUMENTATION",
        "DATA_KNOWLEDGE_ENGINE_SCAN": "KNOWLEDGE_ENGINE"
    }

    # None for "ALL" and for unrecognized scan types: no server-side filter is sent.
    # Defined on every path so the "ALL" case no longer hits undefined names.
    api_scan_type = scan_type_map.get(scan_type) if scan_type != "ALL" else None
    params = {"filter": f'type="{api_scan_type}"'} if api_scan_type else {}

    # Knowledge-engine listings get a dataset-level layout; everything else is table-level.
    dataset_level_only = api_scan_type == scan_type_map["DATA_KNOWLEDGE_ENGINE_SCAN"]
    if dataset_level_only:
        markdown_table = "| Dataset |  Scan |  State | \n"
        markdown_table += "|---|---|---|\n"
    else:
        markdown_table = "| Dataset | Table | Scan |  State | \n"
        markdown_table += "|---|---|---|---|\n"

    try:
        # A plain string scope is read as a comma-separated list of dataset ids so
        # the check is an exact match rather than a substring match.
        datasets_in_scope = SOURCE_BQ_DATASETS_IN_SCOPE
        if isinstance(datasets_in_scope, str):
            datasets_in_scope = [name.strip() for name in datasets_in_scope.split(",")]

        response = requests.get(scan_list_api_endpoint, headers=headers, params=params)
        response.raise_for_status()  # Raises an error for bad status codes (4xx or 5xx)
        response_json = response.json()

        if "dataScans" in response_json and response_json["dataScans"]:
            for scan in response_json["dataScans"]:
                # Expected: //bigquery.googleapis.com/projects/P/datasets/D[/tables/T]
                # -> index 6 is the dataset id, index 8 (if present) the table id.
                resource_parts = scan.get('data', {}).get('resource', '').split("/")
                if len(resource_parts) < 7:
                    continue  # no dataset in the resource; nothing to show

                source_dataset_id = resource_parts[6]
                if source_dataset_id not in datasets_in_scope:
                    continue

                if dataset_level_only:
                    markdown_table += (f"| {source_dataset_id} |  {scan.get('displayName', 'N/A')} |  {scan.get('state', 'N/A')} | \n")
                elif len(resource_parts) > 8:
                    source_table_id = resource_parts[8]
                    markdown_table += (f"| {source_dataset_id} | {source_table_id} | {scan.get('displayName', 'N/A')} |  {scan.get('state', 'N/A')} | \n")
                elif scan_type == "ALL":
                    # Dataset-level scans have no table part; still list them when everything was requested.
                    markdown_table += (f"| {source_dataset_id} | N/A | {scan.get('displayName', 'N/A')} |  {scan.get('state', 'N/A')} | \n")

            # markdown_table is already a newline-terminated string; return it as-is.
            return markdown_table
        else:
            print("No data scans found in this location.")
            return "No data scans found in this location."

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e.response.status_code} - {e.response.text}")
        error_message = f"HTTP Error: {e.response.status_code} - {e.response.text}"
        return error_message
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        error_message = f"An unexpected error occurred: {e}"
        return error_message

def list_scan_jobs(access_token: str, scan_id: str):
    """
    Lists the jobs of one data scan as a markdown table.

    Args:
        access_token (str): token
        scan_id (str): scan name

    Returns:
        str: The job list in markdown table format if jobs were returned,
             "No jobs found for this data scan." if none were, or an error
             message string.
    """
    if not access_token:
        msg ="Access token is missing. Cannot proceed with API call."
        print(msg)
        return msg

    headers = {
        "Authorization": f"Bearer {access_token}",
    }

    scan_job_list_api_endpoint = f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/dataScans/{scan_id}/jobs"

    try:
        response = requests.get(scan_job_list_api_endpoint, headers=headers)
        response.raise_for_status()  # Raises an error for bad status codes (4xx or 5xx)
        scan_jobs = response.json().get("dataScanJobs")

        if not scan_jobs:
            # Return the message (not None) so callers always get a string back.
            msg = "No jobs found for this data scan."
            print(msg)
            return msg

        table_lines = [
            "| Job_Name | UID | State | Start_Time | End_Time |\n",
            "|---|---|---|---|---|\n",
        ]
        for job in scan_jobs:
            # Only the last path segment of the job resource name is shown.
            job_name = job.get('name', 'N/A').split('/')[-1]
            table_lines.append(f"| {job_name} | {job.get('uid', 'N/A')} | {job.get('state', 'N/A')} |  {job.get('startTime', 'N/A')} | {job.get('endTime', 'N/A')} | \n")

        # Every entry already ends with a newline, so join without a separator.
        return "".join(table_lines)

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e.response.status_code} - {e.response.text}")
        error_message = f"HTTP Error: {e.response.status_code} - {e.response.text}"
        return error_message
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        error_message = f"An unexpected error occurred: {e}"
        return error_message


def fetch_scan_results(access_token: str, scan_id: str):
    """
    Fetches one data scan with view=FULL and returns the parsed response.

    Args:
        access_token (str): token
        scan_id (str): scan name

    Returns:
        dict: The parsed JSON response on success. On an HTTP error, the parsed
        error body (or {"error": <raw text>} if it is not JSON); on any other
        failure, {"error": <description>}.
        str: The "missing token" message when access_token is empty.
    """
    if not access_token:
        missing_token_msg = "Access token is missing. Cannot proceed with API call."
        print(missing_token_msg)
        return missing_token_msg

    auth_headers = {"Authorization": f"Bearer {access_token}"}
    full_view_url = f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/dataScans/{scan_id}?view=FULL"

    try:
        api_response = requests.get(full_view_url, headers=auth_headers)
        api_response.raise_for_status()  # Raises an error for bad status codes (4xx or 5xx)
        return api_response.json()

    except requests.exceptions.HTTPError as http_error:
        failed_response = http_error.response
        print(f"HTTP Error: {failed_response.status_code} - {failed_response.text}")
        try:
            # Prefer the structured error body; fall back to the raw text.
            return failed_response.json()
        except json.JSONDecodeError:
            print(f"Failed to decode JSON from response: {http_error}")
            return {"error": failed_response.text}
    except json.JSONDecodeError as decode_error:
        print(f"Failed to decode JSON from response: {decode_error}")
        return {"error": f"Failed to decode JSON from response: {decode_error}"}
    except Exception as unexpected_error:
        print(f"An unexpected error occurred: {unexpected_error}")
        return {"error": f"An unexpected error occurred: {unexpected_error}"}


def persist_documentation_scan_table_metadata(access_token: str, scan_id: str):
    """
    Persists the table metadata generated by a data documentation scan: the
    table description and the column descriptions are written to the BigQuery
    table that the scan points at.

    Columns for which the scan produced no description are kept unchanged, so
    the schema written back always contains every existing column.

    Args:
        access_token: token
        scan_id: scan id

    Returns:
        A string indicating the status: "Succeeded", or a message describing
        why the metadata could not be persisted.
    """
    if not access_token:
        msg ="Access token is missing. Cannot proceed with API call."
        print(msg)
        return msg

    scan_result = fetch_scan_results(access_token, scan_id)

    # Check for errors from the API call BEFORE indexing into the result;
    # an error payload has none of the keys read below.
    if not isinstance(scan_result, dict) or "error" in scan_result:
        print(f"Error fetching scan results: {scan_result}")
        return f"Failed to fetch or parse scan results: {scan_result}"

    try:
        # Extract info from scan result
        table_resource_uri = scan_result["data"]["resource"]
        documentation_result = scan_result["dataDocumentationResult"]
        updated_table_description = documentation_result["overview"]
        generated_fields = documentation_result["schema"]["fields"]

        # Parse table URI
        # e.g., //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
        parts = table_resource_uri.split("/")
        project_id = parts[4]
        dataset_id = parts[6]
        table_id = parts[8]

        # Get BQ client and table
        bq_client = get_bq_client()
        if not bq_client:
            return "Failed to get BigQuery client."

        table = bq_client.get_table(f"{project_id}.{dataset_id}.{table_id}")

        # Update table description
        table.description = updated_table_description

        # Generated descriptions keyed by column name (entries without one are skipped).
        description_by_column = {
            item["name"]: item.get("description")
            for item in generated_fields
            if "name" in item
        }

        # Rebuild the schema column by column. Going through the API
        # representation carries over every other attribute of the field;
        # only the description is replaced.
        updated_schema = []
        for existing_field in table.schema:
            new_description = description_by_column.get(existing_field.name)
            if not new_description:
                updated_schema.append(existing_field)
                continue
            field_repr = existing_field.to_api_repr()
            field_repr["description"] = new_description
            updated_schema.append(bigquery.SchemaField.from_api_repr(field_repr))

        table.schema = updated_schema

        # Update the table
        bq_client.update_table(table, ["description", "schema"])

        print(
            "Successfully updated metadata for table "
            f"{project_id}.{dataset_id}.{table_id}"
        )
        return "Succeeded"

    except (KeyError, IndexError) as e:
        print(f"Error parsing scan result: {e}")
        return f"Failed to parse scan result: {e}"
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return f"An unexpected error occurred: {e}"


def patch_source_dataset_with_labels(access_token, dataset_id,scan_type, scan_id):
    """
    Patches the source BigQuery dataset with labels that correlate it with a data scan.

    The label payload comes from generate_patch_label_request_body() and is sent
    with an HTTP PATCH to the BigQuery datasets endpoint of the module-level PROJECT_ID.

    Args:
        access_token (str): The Google Cloud access token
        dataset_id (str): The dataset id
        scan_type (str): Type of scan (DATA_PROFILE_SCAN/DATA_DOCUMENTATION_SCAN/DATA_KNOWLEDGE_ENGINE_SCAN)
        scan_id (str): The scan id

    Returns:
        str | None: An error message when the access token is missing; otherwise
        None (HTTP and connection errors are printed, not raised).
    """
    if not access_token:
        msg = "Access token is missing. Cannot proceed with API call."
        # Print as well as return so the failure is visible even when the
        # caller ignores the return value.
        print(msg)
        return msg

    # No query parameters are sent, so the URL carries no trailing '?'.
    api_endpoint = f"https://bigquery.googleapis.com/bigquery/v2/projects/{PROJECT_ID}/datasets/{dataset_id}"

    patch_request_body = generate_patch_label_request_body(scan_type, scan_id)

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # Apply the patch
    try:
        response = requests.patch(api_endpoint, headers=headers, json=patch_request_body)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")
    return None

## 1. Variables/Configs

In [None]:
PROJECT_ID_LIST=!gcloud config list --format "value(core.project)" 2>/dev/null
PROJECT_ID=PROJECT_ID_LIST[0]
LOCATION="us-central1"
OLTP_DATASET_ID="rscw_ds"
DATA_SCAN_API_CREATE_ENDPOINT_PREFIX=f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/dataScans?dataScanId="
DATA_SCAN_API_EXECUTION_ENDPOINT_PREFIX=(f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/dataScans/")
SCOPES = ['https://www.googleapis.com/auth/cloud-platform']
SOURCE_BQ_DATASETS_IN_SCOPE="rscw_ds"
SCAN_RESULTS_BQ_DATASET_ID="rscw_ops_ds"

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"LOCATION: {PROJECT_ID}")
print(f"OLTP_DATASET_ID: {OLTP_DATASET_ID}")
print(f"DATA_SCAN_API_CREATE_ENDPOINT_PREFIX: {DATA_SCAN_API_CREATE_ENDPOINT_PREFIX}")
print(f"DATA_SCAN_API_EXECUTION_ENDPOINT_PREFIX: {DATA_SCAN_API_EXECUTION_ENDPOINT_PREFIX}")


## 2. Data Scan Utils

In [None]:
import requests, json, time, logging
import google.auth
import google.auth.transport.requests
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
from google.api_core.exceptions import NotFound
from google.api_core import exceptions
from urllib.parse import urlencode # Import urlencode



def get_access_token():
    """
    Obtains an OAuth access token from Application Default Credentials (ADC).

    The credentials are requested with the module-level SCOPES and refreshed
    before the token is read.

    Returns:
        The access token held by the refreshed credentials, or None when any
        step fails (the error is printed).
    """
    try:
        # ADC picks up `gcloud auth application-default login` credentials or
        # the ambient credentials when running on GCP.
        adc_credentials, _ = google.auth.default(scopes=SCOPES)

        # A refresh is needed so that the credentials carry a current token.
        adc_credentials.refresh(google.auth.transport.requests.Request())
        return adc_credentials.token
    except Exception as token_error:
        print(f"Error generating access token: {token_error}")
        return None

def patch_source_table_with_labels(access_token, dataset_id,scan_type, scan_id,  source_table_nm):
    """
    Attaches the data-scan correlation labels to a source BigQuery table.

    Sends a PATCH request to the BigQuery tables endpoint with the label
    payload produced by generate_patch_label_request_body.

    Args:
        access_token (str): The Google Cloud access token
        dataset_id (str): Dataset that contains the table
        scan_type (str): Scan type used to pick the label names
        scan_id (str): The scan id stored in the labels
        source_table_nm (str): The table to label

    Returns:
        str | None: An error message when the access token is missing;
        otherwise None. HTTP and connection errors are printed, not raised.
    """
    if not access_token:
        return "Access token is missing. Cannot proceed with API call."

    table_url = f"https://bigquery.googleapis.com/bigquery/v2/projects/{PROJECT_ID}/datasets/{dataset_id}/tables/{source_table_nm}?"
    label_payload = generate_patch_label_request_body(scan_type, scan_id)
    auth_headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    try:
        patch_response = requests.patch(table_url, headers=auth_headers, json=label_payload)
        # Turn 4xx/5xx replies into HTTPError so they are reported below.
        patch_response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {patch_response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")


def generate_scan_request_body(dataset_id,scan_id, scan_type,
                                            source_table_nm, profile_results_table_nm):
    """
    Builds the request payload used to create a data scan.

    Args:
        dataset_id (str): BigQuery dataset that holds the scanned resource
        scan_id (str): Scan id, also used as the display name
        scan_type (str): DATA_PROFILE_SCAN / DATA_DOCUMENTATION_SCAN / DATA_KNOWLEDGE_ENGINE_SCAN
        source_table_nm (str): Source table name (not used for DATA_KNOWLEDGE_ENGINE_SCAN)
        profile_results_table_nm (str): Results table name (used for DATA_PROFILE_SCAN only)

    Returns:
        dict: The request body for the data scan API call; an empty dict when
        scan_type is not one of the three supported values.
    """
    if scan_type == "DATA_PROFILE_SCAN":
        # Profile scans target a table and export results to the ops dataset.
        return {
            "displayName": f"{scan_id}",
            "data": {
                "resource": f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{dataset_id}/tables/{source_table_nm}"
            },
            "dataProfileSpec": {
                "postScanActions": {
                    "bigqueryExport": {
                        "resultsTable": f"projects/{PROJECT_ID}/datasets/{SCAN_RESULTS_BQ_DATASET_ID}/tables/{profile_results_table_nm}"
                    }
                }
            },
            "executionSpec": {"trigger": {"onDemand": {}}},  # Run on demand for this example
        }

    # The two documentation flavours differ only in the resource they point at:
    # a single table versus the whole dataset.
    if scan_type == "DATA_DOCUMENTATION_SCAN":
        scanned_resource = f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{dataset_id}/tables/{source_table_nm}"
    elif scan_type == "DATA_KNOWLEDGE_ENGINE_SCAN":
        scanned_resource = f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{dataset_id}"
    else:
        return {}

    return {
        "displayName": f"{scan_id}",
        "type": "DATA_DOCUMENTATION",
        "dataDocumentationSpec": {},
        "data": {"resource": scanned_resource},
        "executionSpec": {"trigger": {"onDemand": {}}},  # Run on demand for this example
    }

def generate_patch_label_request_body(scan_type, scan_id):
    """
    Builds the labels payload that ties a programmatically created scan to a
    BigQuery resource.

    Args:
        scan_type (str): DATA_PROFILE_SCAN / DATA_DOCUMENTATION_SCAN / DATA_KNOWLEDGE_ENGINE_SCAN
        scan_id (str): Scan id

    Returns:
        dict: {"labels": {...}} with the published scan, project and location
        labels. For an unrecognised scan_type the label names carry an empty
        type segment ("dataplex--published-...").
    """
    stub_by_scan_type = {
        "DATA_PROFILE_SCAN": "dp",
        "DATA_DOCUMENTATION_SCAN": "data-documentation",
        "DATA_KNOWLEDGE_ENGINE_SCAN": "knowledge-engine",
    }
    scan_stub = stub_by_scan_type.get(scan_type, "")

    published_labels = {
        f"dataplex-{scan_stub}-published-scan": f"{scan_id}",
        f"dataplex-{scan_stub}-published-project": f"{PROJECT_ID}",
        f"dataplex-{scan_stub}-published-location": f"{LOCATION}",
    }
    return {"labels": published_labels}


def get_scan_api_endpoint(scan_operation_type,scan_id):
    """
    Resolves the data scan API endpoint for an operation.

    Args:
        scan_operation_type (str): CREATE_SCAN or LIST_SCAN; any other value
            is treated as a run request
        scan_id (str): Scan id

    Returns:
        string: API endpoint
    """
    if scan_operation_type == "CREATE_SCAN":
        return f"{DATA_SCAN_API_CREATE_ENDPOINT_PREFIX}{scan_id}"

    scan_resource_url = f"{DATA_SCAN_API_EXECUTION_ENDPOINT_PREFIX}{scan_id}"
    if scan_operation_type == "LIST_SCAN":
        return scan_resource_url

    # Everything else runs the scan.
    return f"{scan_resource_url}:run"

def check_if_scan_already_exists(access_token,scan_id):
    """
    Looks up a data scan by id to find out whether it already exists.

    Args:
        access_token (str): token
        scan_id (str): Scan id

    Returns:
        str | None: "EXISTS_ALREADY" when the lookup succeeds, "NOT_FOUND"
        when the API answers with an HTTP error (or the payload carries a
        "status" entry), "NOT_FOUND2" when the request itself could not be
        completed, and None when the access token is missing.
    """
    if not access_token:
        print("Access token is missing. Cannot proceed with API call.")
        return

    headers = {
        "Authorization": f"Bearer {access_token}",
    }

    scan_get_api_endpoint = DATA_SCAN_API_EXECUTION_ENDPOINT_PREFIX + scan_id

    try:
        # A GET carries no request body; sending one is unnecessary here.
        response = requests.get(scan_get_api_endpoint, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors
        response_json = response.json()

        print(f"\nAPI Call Successful! Status Code: {response.status_code}")

        # A successful lookup returns the scan resource, which normally has no
        # "status" entry; use .get so that a missing key means "exists"
        # instead of raising KeyError.
        if response_json.get("status"):
            return "NOT_FOUND"
        return "EXISTS_ALREADY"

    except requests.exceptions.HTTPError as http_err:
        # NOTE(review): every HTTP error (not only 404) is reported as NOT_FOUND,
        # which matches the previous behavior - confirm this is intended.
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {response.text}")
        return "NOT_FOUND"
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")
        return "NOT_FOUND2"


def create_scan_synchronous(access_token,dataset_id, scan_id, scan_type, source_table_nm, scan_results_table_nm):
    """
    Creates a data scan (unless it already exists) and waits for the create
    operation to finish.

    Args:
        access_token (str): token
        dataset_id (str): BigQuery dataset that holds the scanned resource
        scan_id (str): Scan id
        scan_type (str): DATA_PROFILE_SCAN/DATA_DOCUMENTATION_SCAN/DATA_KNOWLEDGE_ENGINE_SCAN
        source_table_nm (str): Name of source table in BQ
        scan_results_table_nm (str): Name of profile results table in BQ

    Returns:
        None. Progress and errors are reported through print statements.
    """
    if not access_token:
        print("Access token is missing. Cannot proceed with API call.")
        return

    scan_list_status = check_if_scan_already_exists(access_token, scan_id)

    if scan_list_status == "EXISTS_ALREADY":
        print("Scan already exists; Skipping creation..")
        return
    if scan_list_status != "NOT_FOUND":
        # The lookup itself failed (e.g. connection error), so we do not know
        # whether the scan exists; do not claim that it does.
        print(f"Could not determine whether scan '{scan_id}' exists (lookup status: {scan_list_status}); Skipping creation..")
        return

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }
    scan_request_body = generate_scan_request_body(dataset_id,scan_id, scan_type, source_table_nm, scan_results_table_nm)
    scan_create_api_endpoint = get_scan_api_endpoint("CREATE_SCAN",scan_id)
    print(f"\nCalling API: {scan_create_api_endpoint}")
    print(f"Request Body: {json.dumps(scan_request_body, indent=2)}")

    try:
        # Creating a scan is a POST request. It returns a Long Running Operation (LRO).
        response = requests.post(scan_create_api_endpoint, headers=headers, json=scan_request_body)
        response.raise_for_status()  # Raise an exception for HTTP errors
        initial_response = response.json()
        # Extract the operation name to poll
        operation_name = initial_response.get("name")
        if operation_name:
            # Poll for the completion of the operation
            final_result = poll_data_scan_operation("CREATE_SCAN",operation_name, access_token)
            if final_result:
                print(f"Scan successfully completed and results obtained. Operation name: {operation_name}")
            else:
                print(f"Scan operation did not complete successfully or timed out. Operation name: {operation_name}")
        else:
            print("Could not find 'name' in the initial response. Cannot poll for completion.")

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")
def poll_data_scan_operation(operation_type: str, operation_name: str, access_token: str, poll_interval_seconds=30, timeout_minutes=30):
    """
    Polls a Dataplex resource until it reports completion, fails, or the
    timeout elapses.

    Args:
        operation_type (str): "CREATE_SCAN" polls a long running operation and
                              looks at its "done" flag; any other value polls a
                              resource that reports a "state" field.
        operation_name (str): The full resource name to poll; it is appended
                              to https://dataplex.googleapis.com/v1/.
        access_token (str): The Google Cloud access token.
        poll_interval_seconds (int): How often to poll the API, in seconds.
        timeout_minutes (int): Maximum time to wait for completion, in minutes.

    Returns:
        dict: The "response" entry of the final payload when present, else the
        full final payload. None if the operation failed, reported an error,
        a request error occurred, or polling timed out.
    """
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # The operation_name already contains the full resource path, so it is
    # used directly under the API root.
    operation_api_url = f"https://dataplex.googleapis.com/v1/{operation_name}"

    # States that end polling for state-based resources.
    succeeded_states = {"COMPLETED", "SUCCEEDED", "DONE"}
    # NOTE(review): failure state names assumed from the Dataplex DataScanJob
    # state enum - confirm against the API reference.
    failed_states = {"FAILED", "CANCELLED"}

    print(f"\nPolling operation: {operation_name}")
    start_time = time.time()

    while (time.time() - start_time) < (timeout_minutes * 60):
        try:
            response = requests.get(operation_api_url, headers=headers)
            response.raise_for_status()
            operation_status = response.json()
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred during polling: {http_err}")
            print(f"Response Body: {response.text}")
            return None
        except requests.exceptions.RequestException as req_err:
            print(f"An error occurred during polling the operation: {req_err}")
            return None

        if operation_type == "CREATE_SCAN":
            finished = bool(operation_status.get("done"))
        else:
            # .get avoids a KeyError when the payload carries no state yet.
            state = str(operation_status.get("state", "")).upper()
            if state in failed_states:
                # Stop immediately instead of polling a dead job until timeout.
                print(f"Operation {operation_name} ended in state {state}: {operation_status.get('message', 'no message provided')}")
                return None
            finished = state in succeeded_states

        if finished:
            print(f"Operation {operation_name} completed.")
            if "error" in operation_status:
                print(f"Operation failed with error: {operation_status['error']}")
                return None
            if "response" in operation_status:
                print(f"Operation succeeded. Result: {json.dumps(operation_status['response'], indent=2)}")
                return operation_status["response"]
            print("Operation finished, but no explicit response or error found.")
            return operation_status # Return the full status for further inspection

        print(f"Operation {operation_name} still running. Retrying in {poll_interval_seconds} seconds...")
        time.sleep(poll_interval_seconds)

    print(f"Polling timed out after {timeout_minutes} minutes for operation: {operation_name}")
    return None


def run_scan_synchronous(access_token, scan_id):
    """
    Runs a previously created scan and polls the resulting job until it finishes.

    Args:
        access_token (str): token
        scan_id (str): Id of the precreated scan

    Returns:
        None. The final result and any errors are reported through print
        statements.
    """

    if not access_token:
        print("Access token is missing. Cannot proceed with API call.")
        return

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # The request body for a ':run' operation is empty for on-demand execution.
    request_body = {}

    scan_run_api_endpoint = get_scan_api_endpoint("RUN_SCAN",scan_id)

    print(f"\nAttempting to run scan: {scan_id} at {scan_run_api_endpoint}")

    try:
        # The ':run' method is a POST request; the reply is expected to carry
        # the started job under the "job" key.
        response = requests.post(scan_run_api_endpoint, headers=headers, json=request_body)
        response.raise_for_status()  # Raise an exception for HTTP errors
        initial_response = response.json()

        # Read the job name defensively: a reply without "job"/"name" must end
        # in the message below rather than in an uncaught KeyError.
        operation_name = (initial_response.get("job") or {}).get("name")
        if operation_name:
            # Poll for the completion of the job
            final_result = poll_data_scan_operation("RUN_SCAN",operation_name, access_token)
            if final_result:
                print(f"\nScan successfully completed and results obtained. Operation name: {operation_name}")
                print(final_result)
            else:
                print(f"\nScan operation did not complete successfully or timed out. Operation name: {operation_name}")
        else:
            print("Could not find 'name' in the initial response. Cannot poll for completion.")

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")
def run_scan_async(access_token, scan_id):
    """
    Triggers a run of a previously created scan without waiting for it.

    The initial API reply is printed; no polling takes place.

    Args:
        access_token (str): token
        scan_id (str): Id of the precreated scan

    Returns:
        None. The reply and any errors are reported through print statements.
    """
    if not access_token:
        print("Access token is missing. Cannot proceed with API call.")
        return

    auth_headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }
    run_url = get_scan_api_endpoint("RUN_SCAN", scan_id)

    print(f"\nAttempting to run scan: {scan_id} at {run_url}")

    try:
        # ':run' is a POST with an empty JSON body for on-demand execution.
        run_response = requests.post(run_url, headers=auth_headers, json={})
        run_response.raise_for_status()
        print(f"initial_response: {run_response.json()}")
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response Body: {run_response.text}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred during the API call: {req_err}")


def list_scans(access_token, scan_type):
    """
    Calls the Dataplex API to list data scans, optionally filtering by type,
    and renders the scans that target in-scope datasets as a markdown table.

    Args:
        access_token (str): token
        scan_type (str): DATA_PROFILE_SCAN/DATA_DOCUMENTATION_SCAN/DATA_KNOWLEDGE_ENGINE_SCAN,
            or "ALL" to list without a type filter

    Returns:
        str: The markdown table if scans were returned, otherwise a
             "no scans" or error message string.
    """
    if not access_token:
        msg ="Access token is missing. Cannot proceed with API call."
        print(msg)
        return msg

    headers = {
        "Authorization": f"Bearer {access_token}",
    }

    scan_list_api_endpoint = f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/dataScans"

    scan_type_map = {
        "DATA_PROFILE_SCAN": "DATA_PROFILE",
        "DATA_DOCUMENTATION_SCAN": "DATA_DOCUMENTATION",
        "DATA_KNOWLEDGE_ENGINE_SCAN": "KNOWLEDGE_ENGINE"
    }

    # "ALL" (and any unrecognised type) lists without a filter. Resolving this
    # up front keeps api_scan_type and the table header defined on every path.
    api_scan_type = None if scan_type == "ALL" else scan_type_map.get(scan_type)
    if api_scan_type:
        params = {"filter": f'type="{api_scan_type}"'}
        scan_list_api_endpoint = f"{scan_list_api_endpoint}?{urlencode(params)}"

    # Dataset-level scans have no table column.
    dataset_level_listing = api_scan_type == scan_type_map["DATA_KNOWLEDGE_ENGINE_SCAN"]
    if dataset_level_listing:
        markdown_table = "| Dataset |  Scan |  State | \n"
        markdown_table += "|---|---|---|\n"
    else:
        markdown_table = "| Dataset | Table | Scan |  State | \n"
        markdown_table += "|---|---|---|---|\n"

    try:
        response = requests.get(scan_list_api_endpoint, headers=headers)
        response.raise_for_status()  # Raises an error for bad status codes (4xx or 5xx)
        response_json = response.json()

        data_scans = response_json.get("dataScans")
        if not data_scans:
            print("No data scans found in this location.")
            return "No data scans found in this location."

        for scan in data_scans:
            # Resource URI layout:
            # //bigquery.googleapis.com/projects/P/datasets/D[/tables/T]
            # which splits into ['', '', host, 'projects', P, 'datasets', D, 'tables', T].
            resource_parts = scan.get('data', {}).get('resource', '').split("/")
            if len(resource_parts) < 7:
                # No dataset segment; skip this scan instead of failing the whole listing.
                continue

            source_dataset_id = resource_parts[6]
            if source_dataset_id not in SOURCE_BQ_DATASETS_IN_SCOPE:
                continue

            if dataset_level_listing:
                markdown_table += (f"| {source_dataset_id} |  {scan.get('displayName', 'N/A')} |  {scan.get('state', 'N/A')} | \n")
            elif len(resource_parts) >= 9:
                # Index 8 holds the table id, so at least nine segments are required.
                source_table_id = resource_parts[8]
                markdown_table += (f"| {source_dataset_id} | {source_table_id} | {scan.get('displayName', 'N/A')} |  {scan.get('state', 'N/A')} | \n")

        # markdown_table is already one newline-delimited string.
        return markdown_table

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e.response.status_code} - {e.response.text}")
        error_message = f"HTTP Error: {e.response.status_code} - {e.response.text}"
        return error_message
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        error_message = f"An unexpected error occurred: {e}"
        return error_message

def list_scan_jobs(access_token: str, scan_id: str):
    """
    Calls the Dataplex API to list the jobs of a data scan.

    Args:
        access_token (str): token
        scan_id (str): scan id

    Returns:
        str: The job list in markdown table format if jobs were returned,
             otherwise a "no jobs" or error message string.
    """
    if not access_token:
        msg ="Access token is missing. Cannot proceed with API call."
        print(msg)
        return msg

    headers = {
        "Authorization": f"Bearer {access_token}",
    }

    scan_job_list_api_endpoint = f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/dataScans/{scan_id}/jobs"

    try:
        response = requests.get(scan_job_list_api_endpoint, headers=headers)
        response.raise_for_status()  # Raises an error for bad status codes (4xx or 5xx)
        response_json = response.json()

        scan_jobs = response_json.get("dataScanJobs")
        if not scan_jobs:
            # Return the message (as list_scans does) so callers always get a string.
            msg = "No jobs found for this data scan."
            print(msg)
            return msg

        markdown_table = "| Job_Name | UID | State | Start_Time | End_Time |\n"
        markdown_table += "|---|---|---|---|---|\n"

        for job in scan_jobs:
            # The job id is the last segment of the resource name.
            job_id = job.get('name', 'N/A').split('/')[-1]
            markdown_table += (f"| {job_id} | {job.get('uid', 'N/A')} | {job.get('state', 'N/A')} |  {job.get('startTime', 'N/A')} | {job.get('endTime', 'N/A')} | \n")

        # markdown_table is already one newline-delimited string.
        return markdown_table

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e.response.status_code} - {e.response.text}")
        error_message = f"HTTP Error: {e.response.status_code} - {e.response.text}"
        return error_message
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        error_message = f"An unexpected error occurred: {e}"
        return error_message


def fetch_scan_results(access_token: str, scan_id: str):
    """
    Retrieves a data scan with the FULL view from the Dataplex API.

    Args:
        access_token (str): token
        scan_id (str): scan id

    Returns:
        The decoded JSON reply on success. On an HTTP error, the decoded error
        body (or {"error": <raw text>} when it is not JSON). On any other
        failure, a dict with a single "error" entry. When the access token is
        missing, the error message string.
    """
    if not access_token:
        msg ="Access token is missing. Cannot proceed with API call."
        print(msg)
        return msg

    auth_headers = {"Authorization": f"Bearer {access_token}"}
    full_view_url = f"https://dataplex.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/dataScans/{scan_id}?view=FULL"

    try:
        scan_response = requests.get(full_view_url, headers=auth_headers)
        scan_response.raise_for_status()  # 4xx / 5xx become HTTPError
        return scan_response.json()
    except requests.exceptions.HTTPError as http_err:
        failed_response = http_err.response
        print(f"HTTP Error: {failed_response.status_code} - {failed_response.text}")
        # Prefer the structured error body; fall back to the raw text.
        try:
            return failed_response.json()
        except json.JSONDecodeError:
            print(f"Failed to decode JSON from response: {http_err}")
            return {"error": failed_response.text}
    except json.JSONDecodeError as decode_err:
        print(f"Failed to decode JSON from response: {decode_err}")
        return {"error": f"Failed to decode JSON from response: {decode_err}"}
    except Exception as unexpected_err:
        print(f"An unexpected error occurred: {unexpected_err}")
        return {"error": f"An unexpected error occurred: {unexpected_err}"}


def persist_documentation_scan_table_metadata(access_token: str, scan_id: str):
    """
    Persists the table metadata generated by the data documentation scan:
    writes the generated overview as the table description and the generated
    column descriptions into the table schema.

    Columns that the scan result does not mention are kept unchanged, and all
    other column attributes (type, mode, nested fields, ...) are preserved.
    Only top-level column descriptions are applied.

    Args:
        access_token: token
        scan_id: scan id

    Returns:
        A string indicating the status ("Succeeded" or an error message)
    """
    if not access_token:
        msg ="Access token is missing. Cannot proceed with API call."
        print(msg)
        return msg

    scan_result = fetch_scan_results(access_token, scan_id)

    # Check for errors from the API call BEFORE reading any result field;
    # an error payload has none of the keys accessed below.
    if not isinstance(scan_result, dict) or "error" in scan_result:
        print(f"Error fetching scan results: {scan_result}")
        return f"Failed to fetch or parse scan results: {scan_result}"

    try:
        # Extract info from scan result (missing keys are reported by the
        # KeyError handler below).
        table_resource_uri = scan_result["data"]["resource"]
        documentation_result = scan_result["dataDocumentationResult"]
        updated_table_description = documentation_result["overview"]
        generated_fields = documentation_result["schema"]["fields"]

        # Parse table URI
        # e.g., //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
        parts = table_resource_uri.split("/")
        project_id = parts[4]
        dataset_id = parts[6]
        table_id = parts[8]

        # Get BQ client and table
        bq_client = get_bq_client()
        if not bq_client:
            return "Failed to get BigQuery client."

        table = bq_client.get_table(f"{project_id}.{dataset_id}.{table_id}")

        # Generated description per column name.
        generated_descriptions = {
            item["name"]: item.get("description")
            for item in generated_fields
            if "name" in item
        }

        # Rebuild the schema in the table's own column order. Every existing
        # column is kept: a schema update must not drop columns.
        updated_schema = []
        for existing_field in table.schema:
            new_description = generated_descriptions.get(existing_field.name)
            if new_description is None:
                updated_schema.append(existing_field)
                continue
            # Round-trip through the API representation so that everything
            # except the description is carried over as is.
            field_repr = existing_field.to_api_repr()
            field_repr["description"] = new_description
            updated_schema.append(bigquery.SchemaField.from_api_repr(field_repr))

        table.description = updated_table_description
        table.schema = updated_schema

        # Update the table
        bq_client.update_table(table, ["description", "schema"])

        print(
            "Successfully updated metadata for table "
            f"{project_id}.{dataset_id}.{table_id}"
        )
        return "Succeeded"

    except (KeyError, IndexError) as e:
        print(f"Error parsing scan result: {e}")
        return f"Failed to parse scan result: {e}"
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return f"An unexpected error occurred: {e}"

## 3. BQ Utils

In [None]:
from google.cloud import bigquery
from google.api_core import exceptions
import google.auth
import itertools, json, logging
from typing import Optional


def get_bq_client() -> Optional[bigquery.Client]:
    """Creates a BigQuery client bound to PROJECT_ID.

    Returns:
        Optional[bigquery.Client]: The client, or None when it could not be
        created (the reason is printed).
    """
    try:
        return bigquery.Client(project=PROJECT_ID)
    except google.auth.exceptions.DefaultCredentialsError as credentials_error:
        # No usable Application Default Credentials were found.
        print(f"Authentication failed: {credentials_error}")
        print(
            "Please configure your GCP credentials."
            "See https://cloud.google.com/docs/authentication/provide-credentials-adc"
        )
    except Exception as client_error:
        print(f"An unexpected error occurred while creating BigQuery client: {client_error}")
    return None


def execute_bq_sql_query(
    sql_query: str
) -> Optional[bigquery.table.RowIterator]:
    """Runs a SQL query with a client from get_bq_client and waits for it.

    Args:
        sql_query: The SQL query to execute.

    Returns:
        Optional[bigquery.table.RowIterator]: An iterator for the query
        results, or None if no client is available or an error occurs.
    """
    try:
        query_client = get_bq_client()
        if query_client:
            return query_client.query_and_wait(sql_query)  # Make an API request.

        print("BigQuery client is not available.")
        return None
    except exceptions.GoogleAPICallError as api_error:
        print(f"BigQuery API call failed: {api_error}")
        return None
    except Exception as unexpected_error:
        print(f"An unexpected error occurred: {unexpected_error}")
        return None

def generate_markdown_table_from_bq_rows(row_iterator: bigquery.table.RowIterator):
    """Renders the rows of a BigQuery row iterator as a Markdown table.

    The header row comes from the iterator's schema; every cell is converted
    with str(). The finished table is also printed.

    Returns:
        The Markdown table, "No results to display." for a falsy iterator, or
        "Error generating markdown table." when rendering fails.
    """
    if not row_iterator:
        return "No results to display."

    try:
        column_names = [schema_field.name for schema_field in row_iterator.schema]

        table_lines = [
            "|" + "|".join(column_names) + "|\n",
            "|" + "|".join(["---"] * len(column_names)) + "|\n",
        ]
        for record in row_iterator:
            cells = [str(record[column]) for column in column_names]
            table_lines.append("|" + "|".join(cells) + "|\n")

        markdown_table = "".join(table_lines)
        print(f"Markdown Table: {markdown_table}")
        return markdown_table
    except Exception as render_error:
        print(f"Error generating markdown table: {render_error}")
        return "Error generating markdown table."

def get_query_results_markdown(sql_query: str) -> str:
    """Runs a SQL query and renders its result rows as a Markdown table.

    Returns a failure message instead when the query produced no row iterator.
    """
    result_rows = execute_bq_sql_query(sql_query)
    if result_rows is not None:
        return generate_markdown_table_from_bq_rows(result_rows)

    failure_message = "Failed to execute query and retrieve results."
    print(failure_message)
    return failure_message

def field_to_dict(field: bigquery.SchemaField) -> dict:
    """
    Converts a SchemaField into a dict with its name and description
    (empty string when unset); nested fields, if any, are converted the same
    way under the "fields" key.
    """
    converted = {"name": field.name, "description": field.description or ""}
    nested_fields = field.fields
    if nested_fields:
        # RECORD-style columns: descend into each child field.
        converted["fields"] = [field_to_dict(child) for child in nested_fields]
    return converted

def fetch_all_tables_metadata_json() -> str:
    """
    Retrieves detailed info about each table in the datasets listed in
    SOURCE_BQ_DATASETS_IN_SCOPE (within PROJECT_ID) and returns it as a JSON
    string. For each table, it includes:
      - table_name (full path in 'project.dataset.table')
      - description
      - columns (list of columns with name, description)
        * handles nested fields (RECORD type) recursively

    Returns:
        A JSON array with one object per table. Datasets or tables that
        cannot be found are reported and skipped. If the client cannot be
        created or a dataset listing fails for another reason, a JSON object
        with a single "error" key is returned instead.
    """

    table_items = []
    project_id = PROJECT_ID

    try:
        bq_client = bigquery.Client(project=project_id)
        bq_dataset_list = SOURCE_BQ_DATASETS_IN_SCOPE

        for ds_id in bq_dataset_list:

            try:
                # list_tables() returns a lazy iterator: the API request, and
                # therefore any NotFound, only happens on iteration. Consume it
                # here so a missing dataset is caught and skipped rather than
                # raising later from an unguarded loop.
                table_items.extend(bq_client.list_tables(f"{project_id}.{ds_id}"))
            except exceptions.NotFound:
                print(f"Dataset not found, skipping: {project_id}.{ds_id}")
    except google.auth.exceptions.DefaultCredentialsError as e:
        print(f"Authentication failed: {e}")
        return json.dumps({"error": f"Authentication failed: {e}"})
    except Exception as e:
        print(f"An unexpected error occurred during client setup: {e}")
        return json.dumps(
            {"error": f"An unexpected error occurred during client setup: {e}"}
        )

    all_tables_info = []
    for table_item in table_items:
        full_table_id = ""  # Initialize here for the except block
        try:
            full_table_id = (
                f"{table_item.project}.{table_item.dataset_id}.{table_item.table_id}"
            )

            # The listing does not carry description/schema; fetch the full table.
            table_obj = bq_client.get_table(full_table_id)
            table_info = {
                "table_name": full_table_id,
                "description": table_obj.description or "",
                "columns": [field_to_dict(f) for f in table_obj.schema],
            }
            all_tables_info.append(table_info)
        except exceptions.NotFound:
            print(f"Table not found, skipping: {full_table_id}")
            continue
        except Exception as e:
            print(f"Could not process table {full_table_id}: {e}")
            continue

    # Convert the list of table dictionaries to a JSON string
    return json.dumps(all_tables_info, indent=2)

def fetch_bq_table_schema(table_fq_resource_uri: str) -> str:
    """
    Retrieves the schema of a BigQuery table as a JSON string.

    Args:
        table_fq_resource_uri: A "/"-separated resource URI. The dataset ID is
            read from segment 6 and the table ID from segment 8 (0-based); a
            trailing double quote on the table segment is dropped. The table
            is looked up as "dataset.table" using a client created for
            PROJECT_ID.

    Returns:
        A JSON array with one object per column ("name", "type", "mode",
        "description", plus a nested "fields" list for columns that have
        subfields, at any depth). On failure, a JSON object with a single
        "error" key.
    """

    def _serialize_field(field) -> dict:
        # Recurse so RECORD columns nested inside RECORD columns keep their
        # subfields at every level.
        field_dict = {
            "name": field.name,
            "type": field.field_type,
            "mode": field.mode,
            "description": field.description,
        }
        if field.fields:
            field_dict["fields"] = [_serialize_field(sub) for sub in field.fields]
        return field_dict

    try:
        bq_client = bigquery.Client(project=PROJECT_ID)
        table_resource_uri_parts = table_fq_resource_uri.split("/")
        if len(table_resource_uri_parts) < 9:
            # Fail with a clear message instead of a bare IndexError.
            message = f"Unexpected table resource URI format: {table_fq_resource_uri}"
            print(message)
            return json.dumps({"error": message})

        full_table_id = (
            table_resource_uri_parts[6] + "." + table_resource_uri_parts[8].rstrip('"')
        )
        table_obj = bq_client.get_table(full_table_id.strip())

        serializable_schema = [_serialize_field(field) for field in table_obj.schema]
        return json.dumps(serializable_schema, indent=2)

    except google.auth.exceptions.DefaultCredentialsError as e:
        print(f"Authentication failed: {e}")
        return json.dumps({"error": f"Authentication failed: {e}"})
    except exceptions.NotFound:
        # Reachable now that the table lookup is not wrapped in its own
        # catch-all handler.
        print(f"Resource not found: {table_fq_resource_uri}")
        return json.dumps(
            {"error": "Resource not found"}
        )
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return json.dumps(
            {"error": f"An unexpected error occurred: {e}"}
        )

def update_table_schema(project_id: str, dataset_id: str, table_id: str, new_schema: list[bigquery.SchemaField]):
    """
    Updates the schema of a BigQuery table.

    Failures at any step (client creation, table lookup, update) are printed
    and the function returns without raising.

    Args:
        project_id: The project ID.
        dataset_id: The dataset ID.
        table_id: The table ID.
        new_schema: A list of bigquery.SchemaField objects for the new schema.

    Returns:
        None.
    """

    # Instantiates a BQ connection
    try:
        bq_client = bigquery.Client(project=project_id)
        print("BigQuery client initialized successfully.")
    except Exception as e:
        print(f"Error initializing BigQuery client: {e}")
        print("Please ensure you have authenticated with 'gcloud auth application-default login'")
        print("and that your PROJECT_ID is correct.")
        return

    # Build the reference explicitly; Client.dataset() is deprecated in
    # google-cloud-bigquery and resolves to this same DatasetReference.
    table_ref = bigquery.DatasetReference(bq_client.project, dataset_id).table(table_id)

    try:
        table = bq_client.get_table(table_ref)
        print(f"Fetched table: {table.project}.{table.dataset_id}.{table.table_id}")
    except NotFound:
        print(f"Error: Table {dataset_id}.{table_id} not found.")
        return
    except Exception as e:
        print(f"An error occurred while fetching the table: {e}")
        return

    # Set the table's schema to the newly constructed schema.
    table.schema = new_schema

    # Make the API call to update the table's schema.
    # The second argument to update_table() specifies which properties to update.
    try:
        table = bq_client.update_table(table, ["schema"])  # API request
        print(f"\nSuccessfully updated the table schema for {table.table_id}.")
    except Exception as e:
        print(f"\nAn error occurred while updating the table schema: {e}")


def fetch_list_of_tables_in_dataset(dataset_id: str) :
    """List the table IDs contained in a BigQuery dataset.

    Args:
        dataset_id: The ID of the dataset.

    Returns:
        A list of table IDs. On failure the list holds a single
        human-readable error message instead (client could not be
        initialized, dataset not found, or an unexpected error).
    """
    bq = get_bq_client()
    if not bq:
        print("Failed to initialize BigQuery client.")
        return ["Failed to initialize BigQuery client."]

    try:
        table_ids = []
        # Iterating the listing is what issues the API request.
        for listed_table in bq.list_tables(dataset_id):
            table_ids.append(listed_table.table_id)
        return table_ids
    except exceptions.NotFound:
        print(f"Dataset not found: {dataset_id}")
        return ["Dataset not found."]
    except Exception as err:
        print(f"An unexpected error occurred while listing tables: {err}")
        return ["An unexpected error occurred while listing tables."]



## 4. Start the data documentation scan on the dataset

Note: <br>
1. The scan will take 2-3 minutes to complete. Run this notebook and switch to the user manual.
2. This notebook must run to completion before you start Module 3.

In [None]:
# Dataset that the dataset-level documentation scan is run against.
DATASET_ID="rscw_oltp_stg_ds"


# 1. Generate the access token
token = get_access_token()


# A falsy token is treated as an authentication failure (see the else branch).
if token:
    print("Successfully generated access token.")
    # Run the scan
    print("\nStarting the scan for the entire dataset.....")

    # The dataset ID is lower-cased before it is used for the scan.
    dataset_id=DATASET_ID.lower()

    # Scan type identifier; the feature was previously called Knowledge Engine Scan.
    scan_type="DATA_KNOWLEDGE_ENGINE_SCAN"


    # 2a. Create the scan (does not automatically execute the scan)
    # The scan ID is the dataset ID with underscores replaced by hyphens,
    # plus a fixed "-dataset-documentation-scan" suffix.
    scan_id = dataset_id.replace("_","-") + "-dataset-documentation-scan"
    # NOTE(review): the two trailing empty-string arguments are presumably
    # unused for a dataset-level scan - confirm against create_scan_synchronous.
    create_scan_synchronous(token,dataset_id, scan_id,scan_type,"","")
    print(f"Successfully created (if it didnt exist) the scan: {scan_id}")

    # 2b. Run the patch below so that programmatically executed scans show up in the Cloud Console UI
    patch_source_dataset_with_labels(token, dataset_id, scan_type, scan_id)
    print(f"Successfully patched the dataset with the scan name: {scan_id}")

    # 2c. Execute the scan
    # NOTE(review): the success messages in this cell are printed
    # unconditionally after each helper call; the helpers' return values are
    # not checked here.
    run_scan_synchronous(token, scan_id)
    print(f"Successfully ran the scan: {scan_id}")

else:
    # No token was obtained, so none of the scan steps are attempted.
    error_message="Failed to get access token. Please check your credentials and permissions."
    print(error_message)

Sample command to delete the scan (just FYI - DO NOT RUN THIS):<br>
`alias gcurl='curl -H "Authorization: Bearer $(gcloud auth print-access-token)" -H "Content-Type: application/json"'`

<br>

`gcurl -X DELETE https://dataplex.googleapis.com/v1/projects/data-insights-quickstart/locations/us-central1/dataScans/rscw-oltp-stg-ds-dataset-documentation-scan`


## This concludes the lab module unit on Data Documentation Scan at the dataset level. Proceed to the User Manual for instructions on reviewing the results in the Cloud Console (BigQuery UI and catalog).

**Note:** This notebook must finish executing before you run Module 3, which focuses on the Data Engineering Agent.