### <font color='#4285f4'>Overview</font>

Overview: This notebook calls Gemini to describe a sample PySpark job as plain-language steps, and then calls the BigQuery Data Engineering agent to generate the equivalent SQL pipeline in either a BigQuery pipeline or a Dataform repository.

Cost:

Approximate cost: $1

Author:

Navjot Singh

Adam Paternostro

### <font color='#4285f4'>License</font>

In [1]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [9]:
import os
import google.auth
from PIL import Image
from IPython.display import HTML
import IPython.display
import google.auth
import requests
import json
import uuid
import base64
import os
import cv2
import random
import time
import datetime
import base64
import random
from google.auth.transport import requests
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig


import logging
from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log, retry_if_exception
# Project the notebook operates on. Read straight from the environment, so this raises KeyError if GOOGLE_CLOUD_PROJECT is not set.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# Region; reused further down as the Dataform region.
location = "us-central1"
# NOTE(review): d_location is not referenced anywhere else in the visible notebook - confirm it is still needed.
d_location="us-central1"

# PySpark Code which reads data from multiple BigQuery Tables and writes into a denormalized table

In [6]:
# Sample PySpark job held as a string; it is embedded in the Gemini prompt later in the notebook and is never executed here.
# This is an f-string, so every {project_id} below is substituted when the cell runs.
# It is not a raw string, so a backslash at the end of a line is a line continuation: the backslash and the
# newline are dropped and the next line is joined onto the current one in the resulting text.
sample_pyspark_code=f"""
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create a SparkSession
spark = SparkSession.builder.appName("DenormalizedOrderItemDetails").getOrCreate()

# Define the BigQuery project ID (replace with your actual project ID)
# Assuming 'ecommerce' is your project containing the datasets.
# You might need to adjust this if your tables are in a different project.
bigquery_project_id = "{project_id}"
temporary_gcs_bucket = "test" # Replace with your GCS temporary bucket

# Read the tables from BigQuery
order_items_df = spark.read.format("bigquery") \
  .option("table", f"{project_id}.ecommerce.order_items") \
  .load()

orders_df = spark.read.format("bigquery") \
  .option("table", f"{project_id}.ecommerce.orders") \
  .load()

users_df = spark.read.format("bigquery") \
  .option("table", f"{project_id}.ecommerce.users") \
  .load()

products_df = spark.read.format("bigquery") \
  .option("table", f"{project_id}.ecommerce.products") \
  .load()

# Perform the joins
# Join order_items with orders
joined_df = order_items_df.alias("oi").join(
    orders_df.alias("o"),
    col("oi.order_id") == col("o.order_id"),
    "inner"
)

# Join the result with users
joined_df = joined_df.join(
    users_df.alias("u"),
    col("oi.user_id") == col("u.id"),
    "inner"
)

# Join the result with products
denormalized_order_item_details_df = joined_df.join(
    products_df.alias("p"),
    col("oi.product_id") == col("p.id"),
    "inner"
).select(
    col("oi.id").alias("order_item_id"),
    col("o.order_id"),
    col("o.user_id"),
    col("p.id").alias("product_id"),
    col("u.first_name"),
    col("u.last_name"),
    col("u.email"),
    col("u.age"),
    col("u.gender"),
    col("u.state"),
    col("u.street_address"),
    col("u.postal_code"),
    col("u.city"),
    col("u.country"),
    col("u.traffic_source"),
    col("p.name").alias("product_name"),
    col("p.brand").alias("product_brand"),
    col("p.category").alias("product_category"),
    col("p.department").alias("product_department"),
    col("p.retail_price").alias("product_retail_price"),
    col("oi.status"),
    col("oi.created_at"),
    col("oi.shipped_at"),
    col("oi.delivered_at"),
    col("oi.returned_at"),
    col("oi.sale_price")
)

# Write the result to a new BigQuery table
# The target table name is 'ecommerce.denormalized_order_item_details'
output_table = f"{project_id}.ecommerce.denormalized_order_item_details"

denormalized_order_item_details_df.write.format("bigquery") \
  .option("temporaryGcsBucket", temporary_gcs_bucket) \
  .option("table", output_table) \
  .mode("overwrite").save()

spark.stop()"""

# REST API helper function

In [7]:
def rest_api_helper(url: str, http_verb: str, request_body: dict = None) -> dict:
  """Calls a Google Cloud REST API using the current user's credentials.

  Args:
      url (str): Full URL of the REST endpoint.
      http_verb (str): One of "GET", "POST", "PUT", "PATCH" or "DELETE".
      request_body (dict): JSON-serializable payload. Only sent for POST, PUT
          and PATCH; ignored for GET and DELETE.

  Returns:
      dict: The parsed JSON response body (normally a JSON object), or {} when
      a successful response carries no body.

  Raises:
      RuntimeError: If http_verb is not supported, or if the response status
          is not in the 2xx range.
  """

  import google.auth.transport.requests
  import requests
  import google.auth
  import json

  verbs_with_body = ("POST", "PUT", "PATCH")
  verbs_without_body = ("GET", "DELETE")

  # Reject an unsupported verb before spending a round trip on a token refresh.
  if http_verb not in verbs_with_body + verbs_without_body:
    raise RuntimeError(f"Unknown HTTP verb: {http_verb}")

  # Get an access token based upon the current user
  creds, _ = google.auth.default()
  creds.refresh(google.auth.transport.requests.Request())

  headers = {
    "Content-Type" : "application/json",
    "Authorization" : "Bearer " + creds.token
  }

  if http_verb in verbs_with_body:
    response = requests.request(http_verb, url, json=request_body, headers=headers)
  else:
    response = requests.request(http_verb, url, headers=headers)

  # Any 2xx status is a success, not only 200.
  if 200 <= response.status_code < 300:
    # A successful response may have no body; json.loads would fail on empty content.
    if not response.content:
      return {}
    return json.loads(response.content)

  error = f"Error rest_api_helper -> Status: '{response.status_code}' Text: '{response.text}'"
  raise RuntimeError(error)

# Account impersonation

In [8]:
# Look up the numeric project number with gcloud; the `!` capture yields the command's output lines as a list.
project_number_list = !gcloud projects describe {project_id} --format="value(projectNumber)"
# Take the first output line. NOTE(review): if gcloud fails, this may hold an error message (or be missing) instead of a number.
project_number = project_number_list[0]
# Grant the Dataform service agent (service-<number>@gcp-sa-dataform...) the Service Account Token Creator role
# on the <number>-compute@developer.gserviceaccount.com service account.
command=f"""gcloud iam service-accounts add-iam-policy-binding {project_number}-compute@developer.gserviceaccount.com --member="serviceAccount:service-{project_number}@gcp-sa-dataform.iam.gserviceaccount.com" --role='roles/iam.serviceAccountTokenCreator'"""
!{command}

Updated IAM policy for serviceAccount [879468126116-compute@developer.gserviceaccount.com].
bindings:
- members:
  - serviceAccount:service-879468126116@gcp-sa-dataform.iam.gserviceaccount.com
  role: roles/iam.serviceAccountTokenCreator
etag: BwY_Bsf6NgE=
version: 1


# Define variables (uses the project number retrieved above)

In [None]:
# Identifiers for the Dataform resources this notebook creates.
# NOTE(review): repo_name is not used by the visible cells below, which pass pipeline_name as the repository ID.
repo_name = "spark-data-eng25"
pipeline_name = display_name = "spark-pipeline25"
workspace_id = "spark_pipeline3"

# Dataform resources are created in the region chosen in the configuration cell.
dataform_region = location

# Address of the Dataform service agent for this project, built from the project number.
service_account = f"service-{project_number}@gcp-sa-dataform.iam.gserviceaccount.com"

# Function to Create BigQuery Pipeline

In [None]:
def create_bigquery_pipeline(pipeline_name: str, display_name:str,dataform_region:str, service_account:str, project_id) -> dict:
    """
    Creates a Dataform repository labelled as a BigQuery Pipeline.

    The repository is tagged with the label '{"bigquery-workflow":"preview"}', a
    temporary mechanism until an official API exists for BigQuery Pipelines.
    No existence check is performed; any error from the REST call is reported
    in the returned dict instead of being raised.

    Args:
        pipeline_name (str): Repository ID; also sent as the "name" field.
        display_name (str): The display name seen in the user interface.
        dataform_region (str): Region in which the repository is created.
        service_account (str): Service account stored on the repository.
        project_id: Google Cloud project that owns the repository.

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "create_bigquery_pipeline",
            "query": None,
            "created": True if the create call succeeded, otherwise False,
            "messages": ["List of messages during processing"],
            "results": { ... API response from Dataform ... } or None on failure
        }
    """
    progress = [f"Creating BigQuery Pipeline (Dataform Repository) '{pipeline_name}' in region '{dataform_region}'."]

    # The repositoryId travels as a query parameter; the body carries the repository settings.
    endpoint = f"https://dataform.googleapis.com/v1/projects/{project_id}/locations/{dataform_region}/repositories?repositoryId={pipeline_name}"
    payload = {
        "serviceAccount": service_account,
        "displayName": display_name,
        "name": pipeline_name,
        # Temporary marker that makes the repository a BigQuery Pipeline.
        "labels": {"bigquery-workflow": "preview"},
    }

    # Start from the failure report and upgrade it once the call succeeds.
    outcome = {
        "status": "failed",
        "tool_name": "create_bigquery_pipeline",
        "query": None,
        "created": False,
        "messages": progress,
        "results": None,
    }

    try:
        api_response = rest_api_helper(endpoint, "POST", payload)
    except Exception as e:
        progress.append(f"An error occurred while creating the BigQuery Pipeline '{pipeline_name}': {e}")
        return outcome

    progress.append(f"Successfully initiated the creation of repository '{pipeline_name}'.")
    outcome.update(status="success", created=True, results=api_response)
    return outcome


# Function to create Dataform Pipeline

In [None]:
def create_dataform_pipeline(repository_id: str, display_name: str, dataform_region, service_account,project_id) -> dict:
    """
    Creates a new, standard Dataform repository.

    No existence check is performed; any error from the REST call is reported
    in the returned dict instead of being raised.

    Args:
        repository_id (str): Repository ID; also sent as the "name" field.
        display_name (str): The display name for the repository.
        dataform_region: Region in which the repository is created.
        service_account: Service account stored on the repository.
        project_id: Google Cloud project that owns the repository.

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "create_dataform_pipeline",
            "query": None,
            "created": True if the create call succeeded, otherwise False,
            "messages": ["List of messages during processing"],
            "results": { ... API response from Dataform ... } or None on failure
        }
    """
    notes = []

    def _report(status, created, results):
        # Single place that fixes the shape of the returned dict.
        return {
            "status": status,
            "tool_name": "create_dataform_pipeline",
            "query": None,
            "created": created,
            "messages": notes,
            "results": results,
        }

    notes.append(f"Creating standard Dataform Repository '{repository_id}' in region '{dataform_region}'.")

    # The repositoryId is passed as a query parameter.
    create_url = f"https://dataform.googleapis.com/v1/projects/{project_id}/locations/{dataform_region}/repositories?repositoryId={repository_id}"
    repository_settings = dict(
        serviceAccount=service_account,
        displayName=display_name,
        name=repository_id,
    )

    try:
        created_repository = rest_api_helper(create_url, "POST", repository_settings)
        notes.append(f"Successfully initiated the creation of repository '{repository_id}'.")
        return _report("success", True, created_repository)
    except Exception as e:
        notes.append(f"An error occurred while creating the Dataform repository '{repository_id}': {e}")
        return _report("failed", False, None)

# Function to Create Workspace

In [None]:
def create_workspace(repository_id: str, workspace_id: str, dataform_region:str, project_id) -> dict:
    """
    Creates a new Dataform workspace inside a repository.

    No existence check is performed; any error from the REST call is reported
    in the returned dict instead of being raised.

    Args:
        repository_id (str): The ID of the repository where the workspace will be created.
        workspace_id (str): The ID for the new workspace; also sent as the "name" field.
        dataform_region (str): Region of the repository.
        project_id: Google Cloud project that owns the repository.

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "create_workspace",
            "query": None,
            "created": True if the create call succeeded, otherwise False,
            "messages": ["List of messages during processing"],
            "results": { ... API response from Dataform ... } or None on failure
        }
    """
    # The workspaceId is passed as a query parameter.
    workspaces_url = f"https://dataform.googleapis.com/v1/projects/{project_id}/locations/{dataform_region}/repositories/{repository_id}/workspaces?workspaceId={workspace_id}"
    workspace_spec = {"name": workspace_id}

    try:
        api_reply = rest_api_helper(workspaces_url, "POST", workspace_spec)
    except Exception as e:
        succeeded = False
        api_reply = None
        note = f"An error occurred while creating the workspace '{workspace_id}': {e}"
    else:
        succeeded = True
        note = f"Successfully initiated the creation of workspace '{workspace_id}'."

    return {
        "status": "success" if succeeded else "failed",
        "tool_name": "create_workspace",
        "query": None,
        "created": succeeded,
        "messages": [note],
        "results": api_reply,
    }

# Function to Write Workflow Settings File

In [None]:
def write_workflow_settings_file(repository_id: str, workspace_id: str,dataform_region,project_id) -> dict:
    """
    Writes 'workflow_settings.yaml' into a Dataform workspace.

    The file is produced from a fixed template whose default project and
    default location are filled in from the arguments, Base64 encoded, and sent
    to the workspace's writeFile endpoint.

    Args:
        repository_id (str): The ID of the repository containing the workspace.
        workspace_id (str): The ID of the workspace where the file will be written.
        dataform_region: Region of the repository; also written as defaultLocation.
        project_id: Google Cloud project; also written as defaultProject.

    Returns:
        dict: A dictionary containing the status and the result of the writeFile operation.
        {
            "status": "success" or "failed",
            "tool_name": "write_workflow_settings_file",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... API response from the writeFile operation ... } or None on failure
        }
    """
    settings_path = "workflow_settings.yaml"
    settings_template = """defaultProject: {project_id}
defaultLocation: {location}
defaultDataset: dataform
defaultAssertionDataset: dataform_assertions
dataformCoreVersion: 3.0.16"""

    trail = []
    status, api_reply = "failed", None

    try:
        trail.append(f"Preparing to write file '{settings_path}' to workspace '{workspace_id}'.")

        rendered = settings_template.format(project_id=project_id, location=dataform_region)
        trail.append("Successfully formatted file content template.")

        # The request carries the file contents Base64 encoded.
        payload_b64 = base64.b64encode(rendered.encode('utf-8')).decode('utf-8')
        trail.append("Successfully Base64 encoded file contents.")

        api_reply = rest_api_helper(
            f"https://dataform.googleapis.com/v1/projects/{project_id}/locations/{dataform_region}/repositories/{repository_id}/workspaces/{workspace_id}:writeFile",
            "POST",
            {"path": settings_path, "contents": payload_b64},
        )
        trail.append(f"Successfully wrote file '{settings_path}'.")
        status = "success"
    except Exception as e:
        trail.append(f"An error occurred during the write_workflow_settings_file process: {e}")

    return {
        "status": status,
        "tool_name": "write_workflow_settings_file",
        "query": None,
        "messages": trail,
        "results": api_reply,
    }

# Function to Write Actions Yaml file

In [None]:
def write_actions_yaml_file(repository_id: str, workspace_id: str,dataform_region, project_id) -> dict:
    """
    Writes a placeholder 'definitions/actions.yaml' file to a Dataform workspace.

    The file content is always 'actions: []'. It is Base64 encoded and sent to
    the workspace's writeFile endpoint.

    Args:
        repository_id (str): The ID of the repository containing the workspace.
        workspace_id (str): The ID of the workspace where the file will be written.
        dataform_region: Region of the repository.
        project_id: Google Cloud project that owns the repository.

    Returns:
        dict: A dictionary containing the status and the result of the writeFile operation.
        {
            "status": "success" or "failed",
            "tool_name": "write_actions_yaml_file",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... API response from the writeFile operation ... } or None on failure
        }
    """
    placeholder_path = "definitions/actions.yaml"
    placeholder_text = "actions: []"
    steps = []

    def _report(status, results):
        # Single place that fixes the shape of the returned dict.
        return {
            "status": status,
            "tool_name": "write_actions_yaml_file",
            "query": None,
            "messages": steps,
            "results": results,
        }

    try:
        steps.append(f"Writing placeholder file '{placeholder_path}' to workspace '{workspace_id}'.")

        endpoint = f"https://dataform.googleapis.com/v1/projects/{project_id}/locations/{dataform_region}/repositories/{repository_id}/workspaces/{workspace_id}:writeFile"

        # The request carries the file contents Base64 encoded.
        raw_bytes = placeholder_text.encode('utf-8')
        body = {
            "path": placeholder_path,
            "contents": base64.b64encode(raw_bytes).decode('utf-8'),
        }

        reply = rest_api_helper(endpoint, "POST", body)
        steps.append(f"Successfully wrote file '{placeholder_path}'.")
    except Exception as e:
        steps.append(f"An error occurred during the write_actions_yaml_file process: {e}")
        return _report("failed", None)

    return _report("success", reply)

# Function to commit code in a workspace

In [None]:
def commit_workspace(repository_id: str, workspace_id: str, author_name: str, author_email: str, commit_message: str,
                     dataform_region: str = None, project_id: str = None) -> dict:
    """
    Commits pending changes in a Dataform workspace.

    Args:
        repository_id (str): The ID of the repository containing the workspace.
        workspace_id (str): The ID of the workspace with pending changes to commit.
        author_name (str): The name of the user to be credited as the author of the commit.
        author_email (str): The email address of the commit author.
        commit_message (str): The message describing the changes being committed.
        dataform_region (str, optional): Region of the repository. When omitted,
            the module-level variable `dataform_region` is used.
        project_id (str, optional): Google Cloud project that owns the repository.
            When omitted, the module-level variable `project_id` is used.

    Returns:
        dict: A dictionary containing the status and results of the operation.
        {
            "status": "success" or "failed",
            "tool_name": "commit_workspace",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... API response from Dataform ... } or None on failure
        }
    """
    messages = []

    # Existing callers pass only five arguments and rely on notebook-level variables,
    # so fall back to those when the optional arguments are not supplied.
    module_scope = globals()
    if project_id is None:
        project_id = module_scope.get("project_id")
    if dataform_region is None:
        dataform_region = module_scope.get("dataform_region")

    if not project_id or not dataform_region:
        # Report the problem in the usual result shape rather than failing with a NameError.
        messages.append(
            f"An error occurred while committing to the workspace '{workspace_id}': "
            "project_id and dataform_region must be passed as arguments or defined as notebook variables."
        )
        return {
            "status": "failed",
            "tool_name": "commit_workspace",
            "query": None,
            "messages": messages,
            "results": None
        }

    # The API endpoint for committing to a workspace.
    url = f"https://dataform.googleapis.com/v1/projects/{project_id}/locations/{dataform_region}/repositories/{repository_id}/workspaces/{workspace_id}:commit"

    # The request body containing the author and commit message.
    request_body = {
        "author": {
            "name": author_name,
            "emailAddress": author_email
        },
        "commitMessage": commit_message
    }

    try:
        messages.append(f"Attempting to commit changes to workspace '{workspace_id}' in repository '{repository_id}'.")

        # Call the REST API helper to execute the POST request.
        json_result = rest_api_helper(url, "POST", request_body)

        messages.append(f"Successfully committed changes with message: '{commit_message}'.")

        return {
            "status": "success",
            "tool_name": "commit_workspace",
            "query": None,
            "messages": messages,
            "results": json_result
        }
    except Exception as e:
        error_message = f"An error occurred while committing to the workspace '{workspace_id}': {e}"
        messages.append(error_message)
        return {
            "status": "failed",
            "tool_name": "commit_workspace",
            "query": None,
            "messages": messages,
            "results": None
        }

# Function to Compile and Run Your Workflow

In [None]:
def compile_and_run_dataform_workflow(repository_id: str, workspace_id: str,dataform_region,project_id,dataform_service_account) -> dict:
    """
    Compiles a Dataform repository from a workspace and then runs the resulting workflow.

    This function performs two sequential operations:
    1. It creates a compilation result from the specified workspace.
    2. It starts a workflow invocation using that compilation result. This step
       is skipped, and a failure is returned, when the compilation result has no
       name or lists compilation errors.

    Args:
        repository_id (str): The ID of the Dataform repository to compile and run.
        workspace_id (str): The ID of the workspace containing the code to be compiled.
        dataform_region: Region of the repository.
        project_id: Google Cloud project that owns the repository.
        dataform_service_account: Service account placed in the invocation config.

    Returns:
        dict: A dictionary containing the status and the final response from the workflow invocation API call.
        {
            "status": "success" or "failed",
            "tool_name": "compile_and_run_dataform_workflow",
            "query": None,
            "messages": ["List of messages during processing"],
            "workflow_invocation_id": last path segment of the invocation name (success only),
            "results": { ... API response from the workflow invocation ... } or None on failure
        }
    """
    messages = []

    try:
        # --- Step 1: Compile the repository from the workspace ---
        messages.append(f"Step 1: Compiling repository '{repository_id}' from workspace '{workspace_id}'.")

        compile_url = f"https://dataform.googleapis.com/v1/projects/{project_id}/locations/{dataform_region}/repositories/{repository_id}/compilationResults"

        workspace_full_path = f"projects/{project_id}/locations/{dataform_region}/repositories/{repository_id}/workspaces/{workspace_id}"

        compile_request_body = {
            "workspace": workspace_full_path
        }

        compile_result = rest_api_helper(compile_url, "POST", compile_request_body)
        compilation_result_name = compile_result.get("name")

        if not compilation_result_name:
            raise Exception("Failed to get compilation result name from the compilation API response.")

        # A compilation result can be returned even though the code did not compile;
        # surface those errors here instead of starting a workflow from it.
        compilation_errors = compile_result.get("compilationErrors") or []
        if compilation_errors:
            details = "; ".join(
                str(err.get("message", err)) if isinstance(err, dict) else str(err)
                for err in compilation_errors
            )
            raise Exception(
                f"Compilation result '{compilation_result_name}' reported "
                f"{len(compilation_errors)} error(s); workflow was not started. Details: {details}"
            )

        messages.append(f"Successfully compiled. Compilation result name: {compilation_result_name}")

        # --- Step 2: Run the workflow using the compilation result ---
        messages.append(f"Step 2: Starting workflow execution for compilation '{compilation_result_name}'.")

        invoke_url = f"https://dataform.googleapis.com/v1/projects/{project_id}/locations/{dataform_region}/repositories/{repository_id}/workflowInvocations"

        invoke_request_body = {
            "compilationResult": compilation_result_name,
              "invocationConfig": {
                "serviceAccount": dataform_service_account
              }
        }

        invoke_result = rest_api_helper(invoke_url, "POST", invoke_request_body)

        messages.append("Successfully initiated workflow invocation.")

        return {
            "status": "success",
            "tool_name": "compile_and_run_dataform_workflow",
            "query": None,
            "messages": messages,
            # The invocation ID is the last segment of the invocation's resource name.
            "workflow_invocation_id": invoke_result["name"].rsplit('/', 1)[-1],
            "results": invoke_result
        }

    except Exception as e:
        error_message = f"An error occurred during the compile and run process: {e}"
        messages.append(error_message)
        return {
            "status": "failed",
            "tool_name": "compile_and_run_dataform_workflow",
            "query": None,
            "messages": messages,
            "results": None
        }

# Function to Call Data Eng Agent

In [None]:
def call_bigquery_data_engineering_agent(repository_name: str, workspace_name: str, prompt: str,dataform_region,project_id) -> dict:
    """
    Sends a natural language prompt to the BigQuery Data Engineering agent for a Dataform workspace.

    The prompt and the full resource name of the workspace are posted to the
    Gemini Data Analytics service's global `:run` endpoint. Any error from the
    REST call is reported in the returned dict instead of being raised.

    Args:
        repository_name (str): The ID of the Dataform repository to use for the pipeline.
        workspace_name (str): The ID of the Dataform workspace within the repository.
        prompt (str): The natural language description of the data engineering task (e.g., "uppercase the 'city' column").
        dataform_region: Region of the Dataform repository.
        project_id: Google Cloud project that owns the repository.

    Returns:
        dict: A dictionary containing the status and the response from the API.
        {
            "status": "success" or "failed",
            "tool_name": "call_bigquery_data_engineering_agent",
            "query": None,
            "messages": ["List of messages during processing"],
            "results": { ... API response from Gemini Data Analytics service ... } or None on failure
        }
    """
    # NOTE: Do not take a hard dependency on this REST API call, it will be changing in the future!
    agent_parent = f"projects/{project_id}/locations/global"
    agent_url = f"https://geminidataanalytics.googleapis.com/v1alpha1/projects/{project_id}/locations/global:run"

    # The agent identifies the pipeline by the workspace's full resource name.
    workspace_resource = f"projects/{project_id}/locations/{dataform_region}/repositories/{repository_name}/workspaces/{workspace_name}"

    agent_request = {
      "parent": agent_parent,
      "pipeline_id": workspace_resource,
      "messages": [{"user_message": {"text": prompt}}],
    }

    activity = []
    outcome = {
        "status": "failed",
        "tool_name": "call_bigquery_data_engineering_agent",
        "query": None,
        "messages": activity,
        "results": None,
    }

    try:
        activity.append(f"Attempting to generate/update data engineering code in workspace '{workspace_name}' for repository '{repository_name}' with prompt: '{prompt}'.")
        agent_reply = rest_api_helper(agent_url, "POST", agent_request)
    except Exception as e:
        activity.append(f"An error occurred while calling the BigQuery Data Engineering agent: {e}")
        return outcome

    activity.append("Successfully submitted the data engineering task to the Gemini Data Analytics service.")
    outcome["status"] = "success"
    outcome["results"] = agent_reply
    return outcome

# Function to Call Gemini

In [None]:
def GeminiLLM(prompt,model_name="gemini-2.5-pro", response_schema=None,
                 temperature=1.0, top_p=1.0, top_k=32, max_output_tokens=8192):
  """Sends a single prompt to a Gemini model and returns the response text.

  Args:
      prompt: The prompt to send; passed to the model as a one-element list.
      model_name (str): Name of the Gemini model to load.
      response_schema: Optional schema for the response. When provided, the
          response MIME type is set to "application/json"; otherwise "text/plain".
      temperature (float): Sampling temperature.
      top_p (float): Top-p sampling value.
      top_k (int): Top-k sampling value.
      max_output_tokens (int): Upper bound on generated tokens. Defaults to
          8192; raise it if responses come back truncated.

  Returns:
      The `text` attribute of the model response.
  """

  # Load the specified Gemini model.
  model = GenerativeModel(model_name)

  # Configure the generation parameters.
  generation_config = GenerationConfig(
      temperature=temperature,
      top_p=top_p,
      top_k=top_k,
      max_output_tokens=max_output_tokens,
      response_mime_type="application/json" if response_schema else "text/plain",
      response_schema=response_schema
  )

  # Generate the content.
  response = model.generate_content(
      [prompt],
      generation_config=generation_config,
  )

  return response.text

# Now call the functions above to generate the pipeline and kick off its execution

# Create BigQuery Pipeline

In [None]:
# Failures are not raised: inspect "status" and "messages" in the returned dict shown as this cell's output.
create_bigquery_pipeline(pipeline_name, display_name,dataform_region, service_account, project_id)

# Create Dataform Pipeline - run either this cell or the BigQuery Pipeline cell above, not both

In [None]:
# Uses the same repository ID (pipeline_name) as the create_bigquery_pipeline call.
# NOTE(review): presumably this reports a failure if that repository was already created - confirm.
create_dataform_pipeline(pipeline_name, display_name,dataform_region, service_account, project_id)

# Create Workspace

In [None]:
# The repository ID passed here is pipeline_name, the same ID used when creating the repository.
create_workspace(pipeline_name,workspace_id,dataform_region,project_id)

# Create Workflow Settings File

In [None]:
# Writes workflow_settings.yaml with defaultProject=project_id and defaultLocation=dataform_region.
write_workflow_settings_file(pipeline_name,workspace_id,dataform_region,project_id)

# Commit Workspace Code

In [None]:
# "n" / "n@example.com" are placeholder commit author details; the last argument is the commit message.
commit_workspace(pipeline_name,workspace_id,"n", "n@example.com","BQ pipeline")

# Construct prompt to get instructions for Data Eng Agent

In [None]:
# Prompt asking Gemini to turn the sample PySpark job into step-by-step natural-language instructions.
# This is an f-string: the full text of sample_pyspark_code is inserted at the {sample_pyspark_code} placeholder.
prompt_code = f"""I'm trying to convert below pyspark code into SQL pipeline using Data Engineering Agent, Data eng Agent takes
            instructions in natural Language and gives me SQL pipeline, take the below code and generate step by step
            explanation of the code which I can feed into Data eng Agent, ignore any steps where you are connecting to spark, give me steps in raw string
            example:
            Step 1: Read data from BQ table A, Table B, Table C
            Step 2: join tables
            Step 3: write data into a new table Table D


        Pyspark Code: {sample_pyspark_code}
        """

In [None]:
# print() is used so the multi-line prompt is displayed with real line breaks instead of as an escaped repr.
print(prompt_code)

# Call Gemini Pro to generate instructions for Data Eng agent

In [None]:
# Ask Gemini (default model and generation settings) for the step-by-step instructions; the result is the raw response text.
llm_response_raw = GeminiLLM(prompt_code)

In [None]:
# Review the generated instructions before they are sent to the Data Engineering agent in the next step.
print(llm_response_raw)

# Call Data Eng Agent to generate SQL Pipeline Code

In [None]:
# The Gemini response text is passed through unchanged as the agent prompt.
call_bigquery_data_engineering_agent(pipeline_name,workspace_id,llm_response_raw, dataform_region,project_id)

# Commit Generated Code

In [None]:
# Commit the workspace changes made by the agent call; author details are placeholders.
commit_workspace(pipeline_name,workspace_id,"n", "n@example.com","BQ dataengpipeline")

# Write Actions File

In [None]:
# Writes definitions/actions.yaml with the fixed content "actions: []".
# NOTE(review): this runs after the agent has generated code - confirm it cannot overwrite a file the agent produced.
write_actions_yaml_file(pipeline_name,workspace_id,dataform_region,project_id)

# Commit Workspace Code

In [None]:
# Commit the actions.yaml file written in the previous step; author details are placeholders.
commit_workspace(pipeline_name,workspace_id,"n", "n@example.com","BQ actionsfile")

# Compile and Run the Workflow

In [None]:
# Compiles the workspace and starts a workflow invocation; on success the returned dict includes "workflow_invocation_id".
compile_and_run_dataform_workflow(pipeline_name,workspace_id,dataform_region,project_id,service_account)

# Look at Dataform repository

In [None]:
# You can view BigQuery pipelines in the BigQuery UI or the Dataform UI (click the link printed below).
# The link is built from dataform_region rather than a fixed region, so it stays correct if the region is changed.

print(f"""https://console.cloud.google.com/bigquery/dataform/locations/{dataform_region}/repositories/{pipeline_name}/workspaces/{workspace_id}""")