In [None]:
# Copyright 2022 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# E2E ML on GCP: MLOps stage 3 : formalization: get started with Apache Airflow and Vertex AI Pipelines

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_airflow_and_vertex_pipelines.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
    <td>
        <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage3/get_started_with_airflow_and_vertex_pipelines.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
        </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/ml_ops/stage3/get_started_with_airflow_and_vertex_pipelines.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview

This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 3 : formalization: get started with Apache Airflow and Vertex AI Pipelines.

### Objective

In this tutorial, you learn how to use Apache Airflow with `Vertex AI Pipelines`.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Pipelines`
- `Vertex AI Dataset, Model and Endpoint` resources
- `BigQuery`
- `Cloud Composer`

The steps performed include:

- Create Cloud Composer environment.
- Upload Airflow DAG to Composer environment that performs data processing -- i.e., creates a BigQuery table from a CSV file.
- Create a `Vertex AI Pipeline` that triggers the Airflow DAG.
- Execute the `Vertex AI Pipeline`.

### Dataset

The dataset used for this tutorial is [Condensed Game Data](gs://example-datasets/game_data_condensed.csv), which comes from the [Apache Beam examples](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/complete/game). The version used in this tutorial is stored in a Cloud Storage bucket.

### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI
- Cloud Storage
- BigQuery
- Cloud Composer

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and [Cloud Composer pricing](https://cloud.google.com/composer/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Installations

Install the following packages for executing this MLOps notebook.

In [None]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
# (detected here as: DL_ANACONDA_HOME is set and VIRTUAL_ENV is not).
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
# True when the Deep Learning image metadata file exists; not referenced again in this cell.
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"
    
# NOTE(review): google-cloud-aiplatform is pinned to 1.0.0 although `aiplatform.PipelineJob`
# is used later in this notebook -- confirm that this version provides it.
! pip3 install {USER_FLAG} google-cloud-aiplatform==1.0.0 --upgrade -q
# NOTE(review): google-cloud-pipeline-components is pinned to 0.1.1 here while the component
# base image used later is tagged 0.1.3 -- confirm the mismatch is intended.
! pip3 install {USER_FLAG} kfp google-cloud-pipeline-components==0.1.1 --upgrade -q
# Airflow is installed against the official constraints file for Airflow 2.3.2 / Python 3.7.
! pip3 install {USER_FLAG} apache-airflow[celery]==2.3.2 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.3.2/constraints-3.7.txt" \
                           apache-airflow-providers-google --upgrade -q

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so that it can find the packages.

In [None]:
import os

# Restart the kernel so the freshly installed packages become importable.
# Skipped when IS_TESTING is set in the environment.
if not os.getenv("IS_TESTING"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

#### Check package versions

Check that you have correctly installed the packages. The KFP SDK version should be >=1.6:

In [None]:
# Print the installed KFP SDK and google_cloud_pipeline_components versions from a fresh interpreter.
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com) and [Composer API](https://console.cloud.google.com/flows/enableapi?apiid=composer.googleapis.com).

1. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
# Google Cloud project to use. Leave the placeholder (or an empty value) to have the
# following cell read the project from the gcloud configuration.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make PROJECT_ID the active project for the gcloud commands that follow.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

# Default to us-central1 when the placeholder was left unchanged.
if REGION == "[your-region]":
    REGION = "us-central1"

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp (YYYYmmddHHMMSS, local clock) used to make resource names unique.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### Authenticate your Google Cloud account

**If you are using Vertex AI Workbench Notebooks**, your environment is already authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

In the Cloud Console, go to the [Create service account key](https://console.cloud.google.com/apis/credentials/serviceaccountkey) page.

1. **Click Create service account**.

2. In the **Service account name** field, enter a name, and click **Create**.

3. In the **Grant this service account access to project** section, click the Role drop-down list. Type "Vertex" into the filter box, and select **Vertex Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

4. Click Create. A JSON file that contains your key downloads to your local environment.

5. Enter the path to your service account key as the GOOGLE_APPLICATION_CREDENTIALS variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# If on Vertex AI Workbench, then don't execute this code
IS_COLAB = "google.colab" in sys.modules
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv(
    "DL_ANACONDA_HOME"
):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you initialize the Vertex AI SDK for Python, you specify a Cloud Storage staging bucket. The staging bucket is where all the data associated with your dataset and model resources are retained across sessions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
# Staging bucket name. Leave the placeholder to have the following cell generate one.
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}
# gs:// URI form of the bucket, used by gsutil and the Vertex AI SDK in later cells.
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [None]:
# No bucket entered: derive a name from the project ID and the session timestamp.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = f"{PROJECT_ID}aip-{TIMESTAMP}"
    BUCKET_URI = f"gs://{BUCKET_NAME}"

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in REGION.
! gsutil mb -l $REGION $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents to confirm that it is accessible.
! gsutil ls -al $BUCKET_URI

#### Service Account

You use a service account to create Vertex AI Pipeline jobs. If you do not want to use your project's Compute Engine service account, set `SERVICE_ACCOUNT` to another service account ID.

In [None]:
# Service account ID. Leave the placeholder to have the following cell derive one from gcloud.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    if IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step. You only need to run this step once per service account.

In [None]:
# Allow the service account to write objects (pipeline artifacts) to the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Allow the service account to read objects from the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### Import libraries

In [None]:
from google.cloud import aiplatform
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import Artifact, Output, component

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Set the default project, region and staging bucket for subsequent Vertex AI SDK calls.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

## Introduction: Trigger Airflow DAG in Cloud Composer from a Vertex AI Pipeline

Apache Airflow is a popular choice for data pipelining in general. However, it is arguably not a good choice for running machine learning pipelines, because it lacks ML metadata tracking, artifact lineage, and tracking of ML metrics across runs. [Vertex AI Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) solves this problem: it automates, monitors, and governs your ML systems by orchestrating your ML workflow in a serverless manner and storing your workflow's artifacts using Vertex ML Metadata.

`Cloud Composer` is a fully managed workflow orchestration service built on Apache Airflow.

Learn more about [Cloud Composer](https://cloud.google.com/composer).

In this tutorial, you learn how to trigger a data pipeline (an Airflow DAG on Cloud Composer) from an ML pipeline running on Vertex AI Pipelines.

![Trigger Airflow DAG on Cloud Composer from Vertex Pipeline](images/trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png)

### Create Cloud Composer Environment

In this tutorial, you create a bare minimum `Cloud Composer` environment. 

To trigger an Airflow DAG from a `Vertex AI Pipeline`, you use the Airflow web server REST API. By default, the API authentication feature is disabled in Airflow 1.10.11 and above, which denies all requests made to the Airflow web server. To trigger a DAG, you must enable this feature by overriding the `auth_backend` configuration in the `Cloud Composer` environment to `airflow.api.auth.backend.default`.

**NOTE:** Cloud Composer environment creation may take up to 30 min. Grab your favorite beverage until then.

Learn more about [Creating Cloud Composer environment](https://cloud.google.com/composer/docs/how-to/managing/creating#).

In [None]:
# Name of the Cloud Composer environment created (and later deleted) by this notebook.
COMPOSER_ENV_NAME = "test-composer-env"
# NOTE(review): the "-f" zone suffix is hard-coded -- confirm that this zone exists in the chosen REGION.
ZONE = f"{REGION}-f"
MACHINE_TYPE = "n1-standard-2"

# Create the environment on an Airflow 1.10.15 image and override the Airflow
# `[api] auth_backend` setting with `airflow.api.auth.backend.default`.
! gcloud beta composer environments create $COMPOSER_ENV_NAME \
    --location $REGION \
    --zone $ZONE \
    --machine-type $MACHINE_TYPE \
    --image-version composer-latest-airflow-1.10.15 \
    --airflow-configs=api-auth_backend=airflow.api.auth.backend.default

### Get Composer Environment configuration

You get the Composer environment configuration, such as the web server URL, the DAG Cloud Storage location, and the client ID, to use in the Vertex AI Pipeline using the script `get_composer_config.py`.

In [None]:
%%writefile get_composer_config.py
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Get the client ID associated with a Cloud Composer environment."""

import argparse


def get_client_id(project_id, location, composer_environment):
    """Print the web server URI, DAG folder and IAP client ID of an environment.

    Exactly three lines are written to stdout, in this order:
    the Airflow web server URI, the Cloud Storage prefix of the DAG folder,
    and the OAuth client ID taken from the web server's login redirect.

    Args:
        project_id: ID of the Google Cloud project owning the environment.
        location: Region of the Cloud Composer environment.
        composer_environment: Name of the Cloud Composer environment.

    Raises:
        requests.HTTPError: If the Composer API request returns an error status.
        KeyError: If an expected field is missing from the API response or
            from the redirect of the web server.
    """
    # [START composer_get_environment_client_id]
    import urllib.parse

    import google.auth
    import google.auth.transport.requests
    import requests

    # Authenticate with Google Cloud.
    # See: https://cloud.google.com/docs/authentication/getting-started
    credentials, _ = google.auth.default(
        scopes=['https://www.googleapis.com/auth/cloud-platform'])
    authed_session = google.auth.transport.requests.AuthorizedSession(
        credentials)

    # project_id = 'YOUR_PROJECT_ID'
    # location = 'us-central1'
    # composer_environment = 'YOUR_COMPOSER_ENVIRONMENT_NAME'

    environment_url = (
        'https://composer.googleapis.com/v1beta1/projects/{}/locations/{}'
        '/environments/{}').format(project_id, location, composer_environment)
    composer_response = authed_session.request('GET', environment_url)
    # Surface API failures as an HTTP error rather than as a KeyError on
    # the error payload below.
    composer_response.raise_for_status()
    environment_data = composer_response.json()
    airflow_uri = environment_data['config']['airflowUri']
    print(airflow_uri)
    dag_gcs_prefix = environment_data['config']['dagGcsPrefix']
    print(dag_gcs_prefix)

    # The Composer environment response does not include the IAP client ID.
    # Make a second, unauthenticated HTTP request to the web server to get the
    # redirect URI. A timeout keeps the call from hanging indefinitely.
    redirect_response = requests.get(
        airflow_uri, allow_redirects=False, timeout=90)
    redirect_location = redirect_response.headers['location']

    # Extract the client_id query parameter from the redirect.
    parsed = urllib.parse.urlparse(redirect_location)
    query_string = urllib.parse.parse_qs(parsed.query)
    print(query_string['client_id'][0])
    # [END composer_get_environment_client_id]


# Usage: python get_composer_config.py your_project_id your_region your_environment_name
if __name__ == '__main__':
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Three required positional arguments, declared in call order.
    for arg_name, arg_help in (
            ('project_id', 'Your Project ID.'),
            ('location', 'Region of the Cloud Composer environment.'),
            ('composer_environment', 'Name of the Cloud Composer environment.')):
        cli.add_argument(arg_name, help=arg_help)

    parsed_args = cli.parse_args()
    get_client_id(
        parsed_args.project_id,
        parsed_args.location,
        parsed_args.composer_environment)


In [None]:
# This code is modified version of https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/composer/rest/get_client_id.py

# The script prints three lines in a fixed order: web server URI, DAG folder, client ID.
shell_output = ! python3 get_composer_config.py $PROJECT_ID $REGION $COMPOSER_ENV_NAME
COMPOSER_WEB_URI = shell_output[0]
COMPOSER_DAG_GCS = shell_output[1]
COMPOSER_CLIENT_ID = shell_output[2]

print(f"COMPOSER_WEB_URI = {COMPOSER_WEB_URI}")
print(f"COMPOSER_DAG_GCS = {COMPOSER_DAG_GCS}")
print(f"COMPOSER_CLIENT_ID = {COMPOSER_CLIENT_ID}")

### Display the Airflow webserver UI

You can navigate to Airflow webserver by going to this URL.

In [None]:
# Display the Airflow web server URL.
COMPOSER_WEB_URI

## Upload DAG to Cloud Composer environment

You have a sample data processing DAG, `data_orchestration_bq_example_dag.py`, that reads a CSV file from a Cloud Storage bucket and writes it to BigQuery. You add this file to the Cloud Storage bucket configured for the Composer environment, which Airflow watches for DAGs.

In [None]:
# DAG ID; matches the ID passed to DAG(...) in the DAG file written by the next cell.
COMPOSER_DAG_NAME = "dag_gcs_to_bq_orch"
# Local file name the DAG source is written to before it is uploaded.
COMPOSER_DAG_FILENAME = "data_orchestration_bq_example_dag.py"

In [None]:
%%writefile $COMPOSER_DAG_FILENAME

"""An example Composer workflow integrating GCS and BigQuery.

A CSV is read from a GCS bucket to a BigQuery table; a query is made, and the
result is written back to a different BigQuery table within a new dataset.
"""

from datetime import datetime, timedelta
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.operators.bash_operator import BashOperator

YESTERDAY = datetime.combine(
    datetime.today() - timedelta(days=1), datetime.min.time())
BQ_DATASET_NAME = 'bq_demos'

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': YESTERDAY,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Solution: pass a schedule_interval argument to DAG instantiation.
with DAG('dag_gcs_to_bq_orch', default_args=default_args,
         schedule_interval=None) as dag:
  create_bq_dataset_if_not_exist = """
    bq ls {0}
    if [ $? -ne 0 ]; then
      bq mk {0}
    fi
  """.format(BQ_DATASET_NAME)

  # Create destination dataset.
  t1 = BashOperator(
      task_id='create_destination_dataset',
      bash_command=create_bq_dataset_if_not_exist,
      dag=dag)

  # Create a bigquery table from a CSV file located in a GCS bucket
  # (gs://example-datasets/game_data_condensed.csv).
  # Store it in our dataset.
  t2 = GoogleCloudStorageToBigQueryOperator(
      task_id='gcs_to_bq',
      bucket='example-datasets',
      source_objects=['game_data_condensed.csv'],
      destination_project_dataset_table='{0}.composer_game_data_table'
      .format(BQ_DATASET_NAME),
      schema_fields=[
          {'name': 'name', 'type': 'string', 'mode': 'nullable'},
          {'name': 'team', 'type': 'string', 'mode': 'nullable'},
          {'name': 'total_score', 'type': 'integer', 'mode': 'nullable'},
          {'name': 'timestamp', 'type': 'integer', 'mode': 'nullable'},
          {'name': 'window_start', 'type': 'string', 'mode': 'nullable'},
      ],
      write_disposition='WRITE_TRUNCATE')

  # Run example query (http://shortn/_BdF1UTEYOb) and save result to the
  # destination table.
  t3 = BigQueryOperator(
      task_id='bq_example_query',
      bql=f"""
        SELECT
          name, team, total_score
        FROM
          {BQ_DATASET_NAME}.composer_game_data_table
        WHERE total_score > 15
        LIMIT 100;
      """,
      destination_dataset_table='{0}.gcp_example_query_result'
      .format(BQ_DATASET_NAME),
      write_disposition='WRITE_TRUNCATE')

  t1 >> t2 >> t3

In [None]:
# Copy the DAG file into the environment's DAG folder.
!gsutil cp $COMPOSER_DAG_FILENAME $COMPOSER_DAG_GCS/

In [None]:
# Confirm that the DAG file is present in the DAG folder.
!gsutil ls -l $COMPOSER_DAG_GCS/$COMPOSER_DAG_FILENAME

### View the DAG in the Composer UI

You should see the DAG in your Airflow webserver -- specified by `COMPOSER_WEB_URI`.

*Note:* There may be a momentary delay before your DAG is loaded and appears in the UI.

![](images/airflow_webserver_with_dag.png)

![](images/airflow_dag.png)

In [None]:
# View the Composer (Airflow) UI at this URL.
COMPOSER_WEB_URI

### Create a Python function based component to trigger Airflow DAG

Using the KFP SDK, you can create components based on Python functions. The component takes the name of an Airflow DAG (`dag_name`, a string) as input and returns the response from the Airflow web server as an `Artifact` that contains the Airflow DAG run information. The component makes a request to the Airflow REST API of your Cloud Composer environment. Airflow processes this request and runs the DAG. The response from the web server is logged and stored as the output artifact.

Understanding the component structure:

- The **`@component`** decorator compiles this function to a component when the pipeline is run. You'll use this anytime you write a custom component.
- The **`base_image`** parameter specifies the container image this component will use.
- The **`output_component_file`** parameter is optional, and specifies the yaml file to write the compiled component to.
- The **`packages_to_install`** parameter installs required python packages in the container to run the component

In [None]:
@component(
    base_image="gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.3",
    output_component_file="composer-trigger-dag-component.yaml",
    packages_to_install=["requests"],
)
def trigger_airflow_dag(
    dag_name: str,
    composer_client_id: str,
    composer_webserver_id: str,
    response: Output[Artifact],
):
    """Trigger a run of an Airflow DAG through the Airflow web server REST API.

    Sends a POST request to the experimental `dag_runs` endpoint of the web
    server, authenticating with an OpenID Connect token issued for the given
    client ID. The body of the web server's reply is printed and, when an
    output artifact is supplied, saved as `airflow_response.json` inside the
    artifact directory.

    Args:
        dag_name: ID of the Airflow DAG to trigger.
        composer_client_id: Client ID used to obtain the OpenID Connect token.
        composer_webserver_id: Base URL of the Airflow web server.
        response: Output artifact receiving the web server's reply. May be
            None (for example when calling the undecorated function
            locally), in which case nothing is written.

    Raises:
        Exception: If the web server answers with any status other than 200.
    """
    # [START composer_trigger]

    import json
    import os

    import requests
    from google.auth.transport.requests import Request
    from google.oauth2 import id_token

    # Sent as the `conf` value of the DAG run request.
    data = '{"replace_microseconds":"false"}'

    # Form webserver URL to make REST API calls
    webserver_url = f"{composer_webserver_id}/api/experimental/dags/{dag_name}/dag_runs"

    # This code is adapted from
    # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/iap/make_iap_request.py
    # START COPIED IAP CODE
    def make_iap_request(url, client_id, method="GET", **kwargs):
        """Makes a request to an application protected by Identity-Aware Proxy.
        Args:
          url: The Identity-Aware Proxy-protected URL to fetch.
          client_id: The client ID used by Identity-Aware Proxy.
          method: The request method to use
                  ('GET', 'OPTIONS', 'HEAD', 'POST', 'PUT', 'PATCH', 'DELETE')
          **kwargs: Any of the parameters defined for the request function:
                    https://github.com/requests/requests/blob/master/requests/api.py
                    If no timeout is provided, it is set to 90 by default.
        Returns:
          The page body, or raises an exception if the page couldn't be retrieved.
        """
        # Set the default timeout, if missing
        kwargs.setdefault("timeout", 90)

        # Obtain an OpenID Connect (OIDC) token from metadata server or using service
        # account.
        google_open_id_connect_token = id_token.fetch_id_token(Request(), client_id)

        # Fetch the Identity-Aware Proxy-protected URL, including an
        # Authorization header containing "Bearer " followed by a
        # Google-issued OpenID Connect token for the service account.
        resp = requests.request(
            method,
            url,
            headers={"Authorization": "Bearer {}".format(google_open_id_connect_token)},
            **kwargs,
        )
        if resp.status_code == 403:
            raise Exception(
                "Service account does not have permission to "
                "access the IAP-protected application."
            )
        if resp.status_code != 200:
            raise Exception(
                "Bad response from application: {!r} / {!r} / {!r}".format(
                    resp.status_code, resp.headers, resp.text
                )
            )
        return resp.text

    # END COPIED IAP CODE

    # Make a POST request to IAP which then Triggers the DAG
    response_text = make_iap_request(
        webserver_url,
        composer_client_id,
        method="POST",
        json={"conf": data, "replace_microseconds": "false"},
    )
    print(f"response = {response_text}")

    # not executed when testing locally (response is None)
    if response:
        # exist_ok: do not fail when the artifact directory is already there.
        os.makedirs(response.path, exist_ok=True)
        with open(os.path.join(response.path, "airflow_response.json"), "w") as f:
            # The reply body is stored as a JSON-encoded string.
            json.dump(response_text, f)

    # [END composer_trigger]

### Test Triggering Airflow DAG from Notebook

Next, you can optionally test your component locally. To do so, before running comment out @component decorator from the above definition of `trigger_airflow_dag`.

Afterwards, you will need to add the @component decorator back to be used as a pipeline component.

In [None]:
# Optional local check: call the function directly, without an output artifact.
try:
    trigger_airflow_dag(
        dag_name=COMPOSER_DAG_NAME,
        composer_client_id=COMPOSER_CLIENT_ID,
        composer_webserver_id=COMPOSER_WEB_URI,
        response=None,
    )
except Exception as e:
    # Report the failure and let the rest of the notebook continue.
    print(e)

### Create a pipeline with the component

Next, you create a pipeline definition for executing your custom Airflow DAG component.

`PIPELINE_ROOT` is the Cloud Storage path where the artifacts created by the pipeline will be written.

In [None]:
# Append /home/jupyter/.local/bin to the kernel's PATH
# (presumably where `pip install --user` places console scripts -- confirm).
PATH = %env PATH
%env PATH={PATH}:/home/jupyter/.local/bin

# Cloud Storage location for the pipeline's artifacts. Note the trailing slash.
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/"
print(PIPELINE_ROOT)


@dsl.pipeline(
    name="pipeline-trigger-airflow-dag",
    description="Trigger Airflow DAG from Vertex AI Pipelines",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline():
    """Single-step pipeline that triggers the data processing Airflow DAG.

    The pipeline takes no runtime parameters: the DAG name, client ID and
    web server URL are read from notebook variables when the pipeline is
    compiled.
    """
    trigger_airflow_dag(
        dag_name=COMPOSER_DAG_NAME,
        composer_client_id=COMPOSER_CLIENT_ID,
        composer_webserver_id=COMPOSER_WEB_URI,
    )

### Compile and execute the pipeline

Next, you compile the pipeline and then execute it.

In [None]:
# Compile the pipeline function into a job specification file.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="pipeline-trigger-airflow-dag.json"
)

# NOTE: from here on `pipeline` refers to the PipelineJob, no longer to the pipeline
# function. Re-run the pipeline definition cell before re-running this cell.
pipeline = aiplatform.PipelineJob(
    display_name="airflow_pipeline",
    template_path="pipeline-trigger-airflow-dag.json",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={},
    enable_caching=False,
)

pipeline.run()

# The compiled specification is no longer needed once the job has been run.
! rm -f pipeline-trigger-airflow-dag.json

### Monitor Vertex Pipeline status

From Cloud Console, you can monitor the pipeline run status and view the output artifact

![](images/pipeline_run.png)

### Monitor Airflow DAG run

Go to Airflow webserver and monitor the status of data processing DAG. Airflow webserver URL is

In [None]:
# Tree view of the triggered DAG in the Airflow web UI.
f"{COMPOSER_WEB_URI}/admin/airflow/tree?dag_id={COMPOSER_DAG_NAME}"

![](images/airflow_dag_run.png)

### View the pipeline execution results

In [None]:
import tensorflow as tf

# Second path segment of the pipeline job's resource name.
PROJECT_NUMBER = pipeline.gca_resource.name.split("/")[1]
print(PROJECT_NUMBER)


def print_pipeline_output(job, output_task_name):
    PROJECT_NUMBER = job.gca_resource.name.split("/")[1]
    print(PROJECT_NUMBER)

    JOB_ID = job.name
    print(JOB_ID)
    for _ in range(len(job.gca_resource.job_detail.task_details)):
        TASK_ID = job.gca_resource.job_detail.task_details[_].task_id
        EXECUTE_OUTPUT = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/executor_output.json"
        )
        GCP_RESOURCES = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/gcp_resources"
        )
        EVAL_METRICS = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/evaluation_metrics"
        )
        if tf.io.gfile.exists(EXECUTE_OUTPUT):
            ! gsutil cat $EXECUTE_OUTPUT
            return EXECUTE_OUTPUT
        elif tf.io.gfile.exists(GCP_RESOURCES):
            ! gsutil cat $GCP_RESOURCES
            return GCP_RESOURCES
        elif tf.io.gfile.exists(EVAL_METRICS):
            ! gsutil cat $EVAL_METRICS
            return EVAL_METRICS

    return None


# The task name is the component function name `trigger_airflow_dag` with dashes.
print_pipeline_output(pipeline, "trigger-airflow-dag")

### Verify the BigQuery dataset

Finally, verify that the BigQuery dataset was created by the execution of your Airflow DAG.

In [None]:
# List the project's BigQuery datasets and keep only the one created by the DAG.
! bq ls | grep bq_demos

## Cleaning Up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

- Cloud Storage bucket
- Cloud Composer environment
- BigQuery dataset

In [None]:
delete_bucket = False

# Delete the Cloud Composer environment
! gcloud beta composer environments delete $COMPOSER_ENV_NAME \
    --location $REGION \
    --quiet

# Delete the temporary BigQuery dataset
! bq rm -r -f $PROJECT_ID:$DATASET_NAME

if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -rf {BUCKET_URI}
    
! rm get_composer_config.py data_orchestration_bq_example_dag.py