In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Pipelines:  TPU model train, upload, and deploy using google-cloud-pipeline-components

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_TPU_model_train_upload_deploy.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fpipelines%2Fgoogle_cloud_pipeline_components_TPU_model_train_upload_deploy.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/pipelines/google_cloud_pipeline_components_TPU_model_train_upload_deploy.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_TPU_model_train_upload_deploy.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This notebook shows how to use the components defined in the [`google_cloud_pipeline_components`](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud) SDK, including its `CustomTrainingJobOp` custom training job component, to build a [Vertex AI Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines) workflow that:
1. trains a [custom model](https://cloud.google.com/vertex-ai/docs/training/containers-overview) using TPUs
1. uploads the model as a Vertex AI model resource
1. creates a Vertex AI endpoint resource, and 
1. deploys the model resource to the endpoint resource

Learn more about [Training with TPU accelerators](https://cloud.google.com/vertex-ai/docs/training/training-with-tpu-vm). 

### Dataset

The dataset used for this tutorial is the [cifar10 dataset](https://www.tensorflow.org/datasets/catalog/cifar10) from [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/overview). The version of the dataset you use is built into TensorFlow. The trained model predicts which type of class an image is from the following ten classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, or truck.

### Objective

In this tutorial, you learn how to create a custom model using a pipeline with components from `google_cloud_pipeline_components` and a custom pipeline component you build.

This tutorial uses the following Vertex AI services and resources:

- Vertex AI Training
- Vertex AI Pipelines
- Google Cloud Pipeline Components

The steps performed include:

- Build a custom container for the custom model.
- Train the custom model with TPUs.
- Upload the trained model to Vertex AI Model Registry.
- Create a Vertex AI endpoint resource.
- Deploy the model resource to the endpoint resource.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* Artifact Registry
* Cloud Build

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [Artifact Registry pricing](https://cloud.google.com/artifact-registry/pricing), [Cloud Build pricing](https://cloud.google.com/build/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-storage \
                                 google-cloud-pipeline-components \
                                 kfp

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# The restart is only performed when running inside Google Colab.
if "google.colab" in sys.modules:

    import IPython

    # Shut the current kernel down (the True argument requests a restart) so the
    # newly installed packages are used.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Interactive user authentication is only performed when running inside Google Colab.
if "google.colab" in sys.modules:

    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To run this tutorial, you must have an existing Google Cloud project. Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Google Cloud project ID and region used by the shell commands and SDK calls in this notebook.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage bucket URI; the pipeline root and the model directory are created under it.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**If your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Make the bucket in the configured region (-l) and project (-p).
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

### Service Account

**If you don't know your service account**, try to get your service account using `gcloud` command by executing the second cell below.

In [None]:
# Leave the placeholder unchanged to have the next cell look the account up with gcloud.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step. You only need to run these once per service account.

In [None]:
# Allow the service account to create objects in the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Allow the service account to read objects in the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### Import libraries and define constants

In [None]:
import os
from typing import Any, Dict, List

import kfp
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic
from google_cloud_pipeline_components.types import artifact_types
from google_cloud_pipeline_components.v1.custom_job.component import \
    custom_training_job as CustomTrainingJobOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,
                                                          ModelDeployOp)
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from kfp import compiler
from kfp.dsl import importer_node

#### Vertex AI Pipelines constants

Set up the following constants for Vertex AI Pipelines:

In [None]:
# Root folder in the bucket under which this pipeline stores its artifacts
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/tpu_cifar10_pipeline"

# Working directory passed to the training job spec
WORKING_DIR = PIPELINE_ROOT + "/model"

# Display name used for the model
MODEL_DISPLAY_NAME = "tpu_train_deploy"

### Initialize Vertex AI SDK for Python

To get started using Vertex AI, you must [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Pass the region explicitly; otherwise the SDK falls back to its default
# location and ignores the LOCATION chosen above.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### Set hardware accelerators

You can set hardware accelerators for both training and prediction.



Set the variables `TRAIN_TPU/TRAIN_NTPU` to use a container training image supporting a TPU and the number of TPUs allocated, and `DEPLOY_GPU/DEPLOY_NGPU` to use a container deployment image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance.

See the [locations where accelerators are available](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators).

Otherwise specify `(None, None)` to use a container image to run on a CPU.

In [None]:
# Accelerator type and count requested for the training job.
TRAIN_TPU, TRAIN_NTPU = (
    gapic.AcceleratorType.TPU_V2,
    8,
)  # Using TPU_V2 with 8 accelerators

# Accelerator type and count attached to the machine that serves the deployed model.
# NOTE(review): K80 GPUs may no longer be offered in every region — confirm
# availability for LOCATION or choose another supported accelerator type.
DEPLOY_GPU, DEPLOY_NGPU = (gapic.AcceleratorType.NVIDIA_TESLA_K80, 1)

### Set pre-built containers

Vertex AI provides pre-built containers to run training and prediction.

For the latest list, see [Pre-built containers for training](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers) and [Pre-built containers for prediction](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers)

In [None]:
# Tag of the pre-built TensorFlow GPU prediction container
DEPLOY_VERSION = "tf2-gpu.2-9"

# Full URI of the serving container image
DEPLOY_IMAGE = f"us-docker.pkg.dev/cloud-aiplatform/prediction/{DEPLOY_VERSION}:latest"

print("Deployment:", DEPLOY_IMAGE, DEPLOY_GPU, DEPLOY_NGPU)

### Set machine types

Next, set the machine types to use for training and prediction.

- Set the variables `TRAIN_COMPUTE` and `DEPLOY_COMPUTE` to configure your compute resources for training and prediction.
 - `machine type`
     - `cloud-tpu` : used for TPU training. See the [TPU Architecture site for details](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm).
     - `n1-standard`: 3.75GB of memory per vCPU
     - `n1-highmem`: 6.5GB of memory per vCPU
     - `n1-highcpu`: 0.9 GB of memory per vCPU
 - `vCPUs`: number of \[2, 4, 8, 16, 32, 64, 96 \]

**Note**: The following is not supported for training:

 - `standard`: 2 vCPUs
 - `highcpu`: 2, 4 and 8 vCPUs

**Note**: You may also use n2 and e2 machine types for training and deployment, but they don't support GPUs.

In [None]:
# Training: TPU VMs don't require a vCPU count in the machine type
MACHINE_TYPE = "cloud-tpu"
TRAIN_COMPUTE = MACHINE_TYPE
print("Train machine type", TRAIN_COMPUTE)

# Deployment: the machine type is "<machine family>-<vCPU count>"
MACHINE_TYPE = "n1-standard"
VCPU = "4"
DEPLOY_COMPUTE = f"{MACHINE_TYPE}-{VCPU}"
print("Deploy machine type", DEPLOY_COMPUTE)

Define the other constants required for training.

In [None]:
# Fall back to single-device training when fewer than two TPUs are configured
TRAIN_STRATEGY = "single" if (not TRAIN_NTPU or TRAIN_NTPU < 2) else "tpu"

EPOCHS = 20
STEPS = 10000

# Command-line flags passed to the training script
TRAINER_ARGS = [
    f"--epochs={EPOCHS}",
    f"--steps={STEPS}",
    f"--distribute={TRAIN_STRATEGY}",
]

## Create a custom container

Create a directory for writing the container build artifacts.

In [None]:
CONTAINER_ARTIFACTS_DIR = "tpu-container-artifacts"

!mkdir {CONTAINER_ARTIFACTS_DIR}

### Write the Dockerfile

In [None]:
# Contents of the Dockerfile for the training image: a Python 3.8 base image,
# the trainer script, tensorflow-datasets, libtpu and a TPU-enabled TensorFlow wheel.
dockerfile = """FROM python:3.8

WORKDIR /root

# Copies the trainer code to the docker image.
COPY train.py /root/train.py

RUN pip3 install tensorflow-datasets

# Install TPU Tensorflow and dependencies.
# libtpu.so must be under the '/lib' directory.
RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/20210525/libtpu.so -O /lib/libtpu.so
RUN chmod 777 /lib/libtpu.so

RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/20210525/tf_nightly-2.6.0-cp38-cp38-linux_x86_64.whl
RUN pip3 install tf_nightly-2.6.0-cp38-cp38-linux_x86_64.whl
RUN rm tf_nightly-2.6.0-cp38-cp38-linux_x86_64.whl

ENTRYPOINT ["python3", "train.py"]
"""

# Write the Dockerfile into the container artifacts directory.
with open(os.path.join(CONTAINER_ARTIFACTS_DIR, "Dockerfile"), "w") as f:
    f.write(dockerfile)

### Create the training script

In the next cell, write the contents of the training script to `train.py`. 

In summary, your training script does the following:

- Gets the directory where to save the model artifacts from the environment variable `AIP_MODEL_DIR`. This variable is set by the training service.
- Loads CIFAR10 dataset from TF Datasets (tfds).
- Builds a model using TF.Keras model API.
- Compiles the model (`compile()`).
- Sets a training distribution strategy according to the argument `args.distribute`.
- Trains the model (`fit()`) with epochs and steps according to the arguments `args.epochs` and `args.steps`
- Saves the trained model (`save(MODEL_DIR)`) to the specified model directory.
- Runs the below TPU specific tasks:
    - Finds the TPU cluster, connects to it, and sets the training strategy to TPUStrategy.
    - Saves the trained TPU model to the local device, so that it can be saved to the `AIP_MODEL_DIR`.

In [None]:
%%writefile {CONTAINER_ARTIFACTS_DIR}/train.py
# Single-device, TPU, Mirrored and Multi-Worker distributed training for CIFAR-10.
# The distribution strategy is selected with the --distribute command-line flag.

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.python.client import device_lib
import argparse
import os
import sys
tfds.disable_progress_bar()

# Command-line hyperparameters; each default applies when its flag is omitted.
parser = argparse.ArgumentParser()
parser.add_argument('--lr', dest='lr',
                    default=0.01, type=float,
                    help='Learning rate.')
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--steps', dest='steps',
                    default=200, type=int,
                    help='Number of steps per epoch.')
parser.add_argument('--distribute', dest='distribute', type=str, default='single',
                    help='distributed training strategy')
args = parser.parse_args()

# Log the runtime environment of the training job.
print('Python Version = {}'.format(sys.version))
print('TensorFlow Version = {}'.format(tf.__version__))
print('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))
print('DEVICES', device_lib.list_local_devices())

# Pick the tf.distribute strategy named by --distribute.

# Single Machine, single compute device
if args.distribute == 'single':
    # tf.test.is_gpu_available() is deprecated; query the physical devices instead.
    if tf.config.list_physical_devices('GPU'):
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
# Single Machine, multiple TPU devices
elif args.distribute == 'tpu':
    # Find the local TPU, connect to it and initialize it before creating the strategy.
    cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
    tf.config.experimental_connect_to_cluster(cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
    strategy = tf.distribute.TPUStrategy(cluster_resolver)
    print("All devices: ", tf.config.list_logical_devices('TPU'))
# Single Machine, multiple compute device
elif args.distribute == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
# Multiple Machine, multiple compute device
elif args.distribute == 'multi':
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
else:
    # Fail with a clear message instead of a NameError on `strategy` below.
    raise ValueError(
        "Unknown --distribute value {!r}; expected one of "
        "'single', 'tpu', 'mirror', 'multi'".format(args.distribute))

# Multi-worker configuration
print('num_replicas_in_sync = {}'.format(strategy.num_replicas_in_sync))

# Preparing dataset
BUFFER_SIZE = 10000
BATCH_SIZE = 64

def make_datasets_unbatched():
  """Return the CIFAR10 training split as an unbatched, repeating dataset.

  Images are cast to float32 and divided by 255; the result is cached,
  shuffled with a buffer of BUFFER_SIZE elements and repeated indefinitely.
  """
  # Scaling CIFAR10 pixel values from [0, 255] to [0., 1.]
  def scale(image, label):
    image = tf.cast(image, tf.float32)
    image /= 255.0
    return image, label

  # The dataset info object is never used, so it is not requested.
  datasets = tfds.load(name='cifar10', as_supervised=True)
  return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE).repeat()


# Build the Keras model
def build_and_compile_cnn_model():
  """Create and compile a small CNN for 32x32x3 inputs with a 10-way softmax output.

  The model is compiled with sparse categorical cross-entropy loss, an SGD
  optimizer using the --lr learning rate, and the accuracy metric.
  """
  layer_stack = [
      tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(32, 32, 3)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Conv2D(32, 3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(10, activation='softmax'),
  ]
  cnn = tf.keras.Sequential(layer_stack)

  sgd = tf.keras.optimizers.SGD(learning_rate=args.lr)
  cnn.compile(
      loss=tf.keras.losses.sparse_categorical_crossentropy,
      optimizer=sgd,
      metrics=['accuracy'])
  return cnn

# Train the model
NUM_WORKERS = strategy.num_replicas_in_sync
# Here the batch size scales up by number of workers since
# `tf.data.Dataset.batch` expects the global batch size.
GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS

# Directory the trained model is saved to, read from the environment.
MODEL_DIR = os.getenv("AIP_MODEL_DIR")
if not MODEL_DIR:
    # Fail before training starts rather than after it, when saving would break.
    raise RuntimeError("AIP_MODEL_DIR environment variable is not set")

train_dataset = make_datasets_unbatched().batch(GLOBAL_BATCH_SIZE)

with strategy.scope():
  # Model building/compiling needs to be within `strategy.scope()`.
  model = build_and_compile_cnn_model()

model.fit(x=train_dataset, epochs=args.epochs, steps_per_epoch=args.steps)
if args.distribute == "tpu":
    # For TPU training, save through the local host device.
    save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
    model.save(MODEL_DIR, options=save_locally)
else:
    model.save(MODEL_DIR)

### Build the training container image

Now, build and push the training container image to Artifact Registry using the Dockerfile.

In this section, you run the following steps:
1. Enable the Artifact Registry API.
1. Create a private repository in Artifact Registry.
1. Configure authentication to Artifact Registry.
1. Submit the training container image using Cloud Build.

#### Enable Artifact Registry API
You must enable the Artifact Registry API service for your project.

<a href="https://cloud.google.com/artifact-registry/docs/enable-service">Learn more about Enabling service</a>.

In [None]:
# Enable the Artifact Registry API for the project.
! gcloud services enable artifactregistry.googleapis.com

# Only when the IS_TESTING environment variable is set: upgrade the installed
# Google Cloud SDK packages and update the gcloud components.
if os.getenv("IS_TESTING"):
    ! sudo apt-get update --yes && sudo apt-get --only-upgrade --yes install google-cloud-sdk-cloud-run-proxy google-cloud-sdk-harbourbridge google-cloud-sdk-cbt google-cloud-sdk-gke-gcloud-auth-plugin google-cloud-sdk-kpt google-cloud-sdk-local-extract google-cloud-sdk-minikube google-cloud-sdk-app-engine-java google-cloud-sdk-app-engine-go google-cloud-sdk-app-engine-python google-cloud-sdk-spanner-emulator google-cloud-sdk-bigtable-emulator google-cloud-sdk-nomos google-cloud-sdk-package-go-module google-cloud-sdk-firestore-emulator kubectl google-cloud-sdk-datastore-emulator google-cloud-sdk-app-engine-python-extras google-cloud-sdk-cloud-build-local google-cloud-sdk-kubectl-oidc google-cloud-sdk-anthos-auth google-cloud-sdk-app-engine-grpc google-cloud-sdk-pubsub-emulator google-cloud-sdk-datalab google-cloud-sdk-skaffold google-cloud-sdk google-cloud-sdk-terraform-tools google-cloud-sdk-config-connector
    ! gcloud components update --quiet

#### Create a private repository

Run the below cell to create a private docker repository in Artifact Registry.

In [None]:
# Set the repository name
REPOSITORY = "tpu-training-repository"

# Create the repository (Docker format) in the configured region
!gcloud artifacts repositories create $REPOSITORY --repository-format=docker \
--location=$LOCATION --description="Vertex TPU training repository"

#### Configure authentication to your private repo

Before you push or pull container images, configure docker to use the gcloud command-line tool to authenticate requests to Artifact Registry for your region.

In [None]:
# Register gcloud as the Docker credential helper for the regional Artifact Registry host.
! gcloud auth configure-docker $LOCATION-docker.pkg.dev --quiet

#### Submit the training container image

Submit the training container image using Cloud Build. The image gets saved to the repository path that is provided in the tag.

In [None]:
# Set the image name in the repository
IMAGE = "tpu-train"
# Set the training container image
# (format: <location>-docker.pkg.dev/<project>/<repository>/<image>:latest)
TRAIN_IMAGE = f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:latest"
# Submit the build source: Cloud Build builds the artifacts directory and tags the image
!gcloud builds submit {CONTAINER_ARTIFACTS_DIR} --region={LOCATION} --tag={TRAIN_IMAGE}

## Define the pipeline 

Next, define the pipeline. The components required for the key tasks of the pipeline are defined using  [`google_cloud_pipeline_components`](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud). These tasks involve: upload the model, create an endpoint, and deploy the model to the endpoint.


The pipeline has four main steps:

1) The `CustomTrainingJobOp` runs the docker container which executes the training task.

2) The `ModelUploadOp` uploads the trained model to Vertex AI Model Registry.

3) The `EndpointCreateOp` creates the Vertex AI endpoint.

4) Finally, the `ModelDeployOp` deploys the model to the endpoint.

**Note:** While not shown in this example, the model-deploy component creates an endpoint if one is not provided.

In [None]:
@kfp.dsl.pipeline(name="train-endpoint-deploy")
def pipeline(
    trainer_args: list,
    working_dir: str,
    train_image: str,
    train_compute: str,
    train_tpu: str,
    train_ntpu: int,
    project: str = PROJECT_ID,
    model_display_name: str = MODEL_DISPLAY_NAME,
    serving_container_image_uri: str = DEPLOY_IMAGE,
):
    """Train a model in a custom job, upload it, create an endpoint and deploy to it.

    Args:
        trainer_args: Command-line arguments passed to the training container.
        working_dir: Directory exposed to the trainer as AIP_MODEL_DIR and
            imported afterwards as the model artifact.
        train_image: URI of the training container image.
        train_compute: Machine type of the training worker pool.
        train_tpu: Accelerator type name for the training worker pool.
        train_ntpu: Number of accelerators for the training worker pool.
        project: Project used for the model upload and the endpoint.
        model_display_name: Display name of the uploaded and deployed model.
        serving_container_image_uri: URI of the serving container image.
    """

    # Run the custom training job
    custom_job_task = CustomTrainingJobOp(
        display_name="tpu model training",
        worker_pool_specs=[
            {
                "containerSpec": {
                    "args": trainer_args,
                    "env": [{"name": "AIP_MODEL_DIR", "value": working_dir}],
                    "imageUri": train_image,
                },
                "replicaCount": "1",
                "machineSpec": {
                    "machineType": train_compute,
                    "accelerator_type": train_tpu,
                    "accelerator_count": train_ntpu,
                },
            }
        ],
    )

    # Import the trained model. Use the `working_dir` parameter (the same value
    # the trainer received as AIP_MODEL_DIR) rather than a notebook global, so
    # the import always points at the directory the job wrote to.
    import_unmanaged_model_task = importer_node.importer(
        artifact_uri=working_dir,
        artifact_class=artifact_types.UnmanagedContainerModel,
        metadata={
            "containerSpec": {
                "imageUri": serving_container_image_uri  # "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-9:latest",
            },
        },
    ).after(custom_job_task)

    # Upload the model
    model_upload_op = ModelUploadOp(
        project=project,
        display_name=model_display_name,
        unmanaged_container_model=import_unmanaged_model_task.outputs["artifact"],
    )

    # Create an endpoint
    endpoint_create_op = EndpointCreateOp(
        project=project,
        display_name="tpu-pipeline-created-endpoint",
    )

    # Deploy the model to the endpoint
    _ = ModelDeployOp(
        endpoint=endpoint_create_op.outputs["endpoint"],
        model=model_upload_op.outputs["model"],
        deployed_model_display_name=model_display_name,
        dedicated_resources_machine_type=DEPLOY_COMPUTE,
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
        dedicated_resources_accelerator_type=DEPLOY_GPU.name,
        dedicated_resources_accelerator_count=DEPLOY_NGPU,
    )

## Compile the pipeline

Next, compile the pipeline to a JSON file.

In [None]:
# Local file the compiled pipeline definition is written to
PIPELINE_PACKAGE_FILE = "tpu_train_cifar10_pipeline.json"
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=PIPELINE_PACKAGE_FILE,
)

## Run the pipeline

Next, create and run the pipeline job.

In [None]:
# Set the display name for the pipeline job
DISPLAY_NAME = "tpu_cifar10_training"

# Define the pipeline parameters. Every key must exactly match a parameter
# name of the pipeline function.
PIPELINE_PARAMS = {
    "trainer_args": TRAINER_ARGS,
    "working_dir": WORKING_DIR,
    "train_image": TRAIN_IMAGE,
    "train_compute": TRAIN_COMPUTE,
    # Pass the enum member's name (a string), not the enum object itself
    "train_tpu": TRAIN_TPU.name,
    "train_ntpu": TRAIN_NTPU,
    "project": PROJECT_ID,
    "model_display_name": MODEL_DISPLAY_NAME,
    "serving_container_image_uri": DEPLOY_IMAGE,
}

# Create the job from the compiled pipeline file
job = aiplatform.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=PIPELINE_PACKAGE_FILE,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=PIPELINE_PARAMS,
)

job.run()

Click on the generated link to see your run in the Cloud Console.

In the UI, many of the pipeline DAG nodes expand or collapse when you click on them. 

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

In [None]:
# Define a function to get details of a task


def get_task_detail(task_details: List[Any], task_name: str) -> Any:
    """Return the first task detail whose ``task_name`` equals ``task_name``.

    Args:
        task_details: Iterable of objects exposing a ``task_name`` attribute.
        task_name: Name of the pipeline task to look for.

    Returns:
        The matching task detail object, or None when no task has that name.
    """
    for task_detail in task_details:
        if task_detail.task_name == task_name:
            return task_detail
    # Make the "not found" result explicit instead of falling off the end.
    return None

In [None]:
# Get the pipeline task details
pipeline_task_details = (
    job.gca_resource.job_detail.task_details
)  # fetch pipeline task details

# Fetch endpoint from pipeline: the resource name is read from the metadata of
# the "endpoint" output artifact of the "endpoint-create" task
endpoint_task = get_task_detail(pipeline_task_details, "endpoint-create")
endpoint_resourceName = (
    endpoint_task.outputs["endpoint"].artifacts[0].metadata["resourceName"]
)
endpoint = aiplatform.Endpoint(endpoint_resourceName)

# Undeploy model from endpoint (done first, then the endpoint itself is deleted)
endpoint.undeploy_all()

# Delete the endpoint
endpoint.delete()

# Fetch model from pipeline: the resource name is read from the metadata of
# the "model" output artifact of the "model-upload" task
model_task = get_task_detail(pipeline_task_details, "model-upload")
model_resourceName = model_task.outputs["model"].artifacts[0].metadata["resourceName"]
model = aiplatform.Model(model_resourceName)

# Delete the model
model.delete()

# Delete the pipeline job
job.delete()

In [None]:
# Warning: Setting this to true deletes everything in your bucket
# (set it to False to keep the bucket and its contents)
delete_bucket = True
if delete_bucket:
    ! gsutil rm -r $BUCKET_URI

# Remove the pipeline package file
! rm $PIPELINE_PACKAGE_FILE