In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with machine management for Vertex AI Pipelines

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/get_started_with_machine_management.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/get_started_with_machine_management.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/pipelines/get_started_with_machine_management.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

## Overview

This tutorial demonstrates how to manage machine resources when training as a component in `Vertex AI Pipelines`.

### Objective

In this tutorial, you learn how to convert a self-contained custom training component into a `Vertex AI CustomJob`, so that:

    - The training job and artifacts are trackable.
    - You can set machine resources, such as machine type, CPU/GPU, memory, and disk.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Pipelines`

The steps performed in this tutorial include:

- Create a custom component with a self-contained training job.
- Execute pipeline using component-level settings for machine resources
- Convert the self-contained training component into a `Vertex AI CustomJob`.
- Execute pipeline using customjob-level settings for machine resources 

### Dataset

The dataset is the MNIST dataset. The dataset consists of 28x28 grayscale images of the digits 0 .. 9.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Installations

Install the packages required for executing this notebook.

In [None]:
import os

! pip3 install --upgrade google-cloud-aiplatform \
                         'google-cloud-pipeline-components<2' --quiet
! pip3 install --upgrade 'kfp<2' --quiet
! pip3 install --upgrade tensorflow==2.7 --quiet

### Colab only: Uncomment the following cell to restart the kernel

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project id
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
REGION = "us-central1"  # @param {type: "string"}

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
! gsutil mb -l $REGION $BUCKET_URI

#### Service Account

**If you don't know your service account**, try to get your service account using `gcloud` command by executing the second cell below.

In [None]:
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    if IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        # print("shell_output=", shell_output)
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step -- you only need to run these once per service account.

In [None]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### Set up variables

Next, set up some variables used throughout the tutorial.

### Import libraries

In [None]:
import json

import tensorflow as tf
from google.cloud import aiplatform
from google_cloud_pipeline_components.v1.custom_job import \
    create_custom_training_job_from_component
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import component

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_URI)

#### Set hardware accelerators

You can set hardware accelerators for training and prediction.

Set the variables `TRAIN_GPU/TRAIN_NGPU` and `DEPLOY_GPU/DEPLOY_NGPU` to use a container image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance. For example, to use a GPU container image with 4 Nvidia Tesla K80 GPUs allocated to each VM, you would specify:

    (aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, 4)


Otherwise specify `(None, None)` to use a container image to run on a CPU.

Learn more about [hardware accelerator support for your region](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators).

*Note*: TF releases before 2.3 for GPU support will fail to load the custom model in this tutorial. It is a known issue and fixed in TF 2.3. This is caused by static graph ops that are generated in the serving function. If you encounter this issue on your own custom models, use a container image for TF 2.3 with GPU support.

In [None]:
TRAIN_GPU, TRAIN_NGPU = (aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, 1)

DEPLOY_GPU, DEPLOY_NGPU = (None, None)

#### Set pre-built containers

Set the pre-built Docker container image for training and prediction.


For the latest list, see [Pre-built containers for training](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers).


For the latest list, see [Pre-built containers for prediction](https://cloud.google.com/ai-platform-unified/docs/predictions/pre-built-containers).

In [None]:
# Image tags spell the TensorFlow version with a dash instead of a dot.
TF = "2.5".replace(".", "-")

# Choose the GPU or CPU flavour of each image based on the accelerator settings.
TRAIN_VERSION = f"tf-gpu.{TF}" if TRAIN_GPU else f"tf-cpu.{TF}"
DEPLOY_VERSION = f"tf2-gpu.{TF}" if DEPLOY_GPU else f"tf2-cpu.{TF}"

# The registry host prefix is the first dash-separated segment of REGION.
TRAIN_IMAGE = (
    f"{REGION.split('-')[0]}-docker.pkg.dev/vertex-ai/training/{TRAIN_VERSION}:latest"
)
DEPLOY_IMAGE = (
    f"{REGION.split('-')[0]}-docker.pkg.dev/vertex-ai/prediction/{DEPLOY_VERSION}:latest"
)

print("Training:", TRAIN_IMAGE, TRAIN_GPU, TRAIN_NGPU)
print("Deployment:", DEPLOY_IMAGE, DEPLOY_GPU, DEPLOY_NGPU)

#### Set machine type

Next, set the machine type to use for training and prediction.

- Set the variables `TRAIN_COMPUTE` and `DEPLOY_COMPUTE` to configure the compute resources for the VMs you will use for training and prediction.
 - `machine type`
     - `n1-standard`: 3.75GB of memory per vCPU.
     - `n1-highmem`: 6.5GB of memory per vCPU
     - `n1-highcpu`: 0.9 GB of memory per vCPU
 - `vCPUs`: number of \[2, 4, 8, 16, 32, 64, 96 \]

*Note: The following is not supported for training:*

 - `standard`: 2 vCPUs
 - `highcpu`: 2, 4 and 8 vCPUs

*Note: You may also use n2 and e2 machine types for training and deployment, but they do not support GPUs*.

In [None]:
TRAIN_COMPUTE = "n1-standard-4"
print("Train machine type", TRAIN_COMPUTE)

DEPLOY_COMPUTE = "n1-standard-4"
print("Deploy machine type", DEPLOY_COMPUTE)

## Create a self-contained custom training component

First, you create a component that self-contains the entire training step. This component trains a simple MNIST model using TensorFlow framework. The training is wholly self-contained in the component:

    - Get and preprocess the data.
    - Get/build the model.
    - Train the model.
    - Save the model.
    
The component takes the following parameters:

- `model_dir`: The Cloud Storage location to save the trained model artifacts.
- `epochs`: The number of epochs to train the model.

In [None]:
@component(
    output_component_file="demo_componet.yaml",
    base_image=TRAIN_IMAGE,
    packages_to_install=["tensorflow"],
)
def self_contained_training_component(
    model_dir: str,
    epochs: int,
) -> str:
    """Train a small dense MNIST classifier and save it to `model_dir`.

    Data loading, model construction, training and export all happen inside
    this function, so the component needs nothing from the notebook at runtime.

    Args:
        model_dir: Location handed to `model.save()` for the trained model.
        epochs: Number of epochs passed to `model.fit()`.

    Returns:
        The `model_dir` value that was passed in.
    """
    import numpy as np
    from tensorflow.keras import Sequential
    from tensorflow.keras.datasets import mnist
    from tensorflow.keras.layers import Dense, Flatten

    def scale_pixels(images):
        # Divide by 255.0 and cast to float32.
        return (images / 255.0).astype(np.float32)

    def load_mnist():
        (train_images, train_labels), (test_images, test_labels) = mnist.load_data()
        return (
            scale_pixels(train_images),
            train_labels,
            scale_pixels(test_images),
            test_labels,
        )

    def build_classifier():
        # Flatten -> three ReLU hidden layers -> 10-way softmax.
        stack = [Flatten(input_shape=(28, 28, 1))]
        stack += [Dense(units, activation="relu") for units in (128, 256, 128)]
        stack.append(Dense(10, activation="softmax"))

        classifier = Sequential(stack)
        classifier.compile(
            optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["acc"]
        )
        return classifier

    # The test split is loaded but not used by this component.
    train_images, train_labels, _, _ = load_mnist()
    classifier = build_classifier()
    classifier.fit(train_images, train_labels, epochs=epochs)

    classifier.save(model_dir)
    return model_dir

## Create the self-contained-training pipeline

Next, you create the pipeline for training this component, consisting of the following steps:

- *Train the model*. For this component, you set the following component level resources:
    - `cpu_limit`: The number of CPUs for the container's VM instance.
    - `memory_limit`: The amount of memory for the container's VM instance.
    - `node_selector_constraint`: The type of GPU for the container's VM instance.
    - `gpu_limit`: The number of GPUs for the container's VM instance.
- *Import model artifacts into a Model Container artifact*.
- *Upload the Container artifact into a `Vertex AI Model` resource*.

In [None]:
PIPELINE_ROOT = "{}/pipeline_root/machine_settings".format(BUCKET_URI)

CPU_LIMIT = "8"  # vCPUs
MEMORY_LIMIT = "8G"


@dsl.pipeline(
    name="component-level-set-resources",
    description="A simple pipeline that requests component-level machine resource",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(epochs: int, model_dir: str, project: str = PROJECT_ID):
    """Train with component-level resource settings, then import and upload the model.

    Args:
        epochs: Number of epochs forwarded to the training component.
        model_dir: Location forwarded to the training component for the saved model.
        project: Project ID used by the model upload step.
    """
    from google_cloud_pipeline_components.types import artifact_types
    from google_cloud_pipeline_components.v1.model import ModelUploadOp
    from kfp.v2.components import importer_node

    training_job_task = (
        self_contained_training_component(epochs=epochs, model_dir=model_dir)
        .set_display_name("self-contained-training")
        .set_cpu_limit(CPU_LIMIT)
        .set_memory_limit(MEMORY_LIMIT)
    )

    # Request an accelerator only when one is configured. With the CPU-only
    # setting `TRAIN_GPU, TRAIN_NGPU = (None, None)`, `TRAIN_GPU.name` would
    # raise AttributeError while the pipeline is being compiled.
    if TRAIN_GPU:
        training_job_task.add_node_selector_constraint(
            value=TRAIN_GPU.name, label_name="cloud.google.com/gke-accelerator"
        ).set_gpu_limit(TRAIN_NGPU)

    # Wrap the training output URI as an unmanaged container model that is
    # served with DEPLOY_IMAGE.
    import_unmanaged_model_task = importer_node.importer(
        artifact_uri=training_job_task.output,
        artifact_class=artifact_types.UnmanagedContainerModel,
        metadata={
            "containerSpec": {
                "imageUri": DEPLOY_IMAGE,
            },
        },
    ).after(training_job_task)

    _ = ModelUploadOp(
        project=project,
        display_name="mnist_model",
        unmanaged_container_model=import_unmanaged_model_task.outputs["artifact"],
    ).after(import_unmanaged_model_task)

### Compile and execute the pipeline

Next, you compile the pipeline and then execute it. The pipeline takes the following parameters, which are passed as the dictionary `parameter_values`:

- `model_dir`: The Cloud Storage location to save the model artifacts.
- `epochs`: The number of epochs to train the model.
- `project`: Your project ID.

In [None]:
# Compile the pipeline function to the local file `component_level_settings.json`.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="component_level_settings.json",
)

# NOTE: this rebinds the name `pipeline` from the pipeline function to the
# PipelineJob instance. Re-running this cell without first re-running the cell
# that defines the pipeline function hands a PipelineJob to the compiler above.
pipeline = aiplatform.PipelineJob(
    display_name="component-level-settings",
    template_path="component_level_settings.json",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={"model_dir": BUCKET_URI, "epochs": 20, "project": PROJECT_ID},
    enable_caching=False,
)

pipeline.run()

# Remove the locally compiled spec file.
! rm -rf component_level_settings.json

### View the pipeline results

Once the pipeline has completed, you can view the artifact outputs for each component step.

In [None]:
PROJECT_NUMBER = pipeline.gca_resource.name.split("/")[1]
print(PROJECT_NUMBER)


def print_pipeline_output(job, output_task_name):
    JOB_ID = job.name
    print(JOB_ID)
    for _ in range(len(job.gca_resource.job_detail.task_details)):
        TASK_ID = job.gca_resource.job_detail.task_details[_].task_id
        EXECUTE_OUTPUT = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/executor_output.json"
        )
        GCP_RESOURCES = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/gcp_resources"
        )
        EVAL_METRICS = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/evaluation_metrics"
        )
        if tf.io.gfile.exists(EXECUTE_OUTPUT):
            ! gsutil cat $EXECUTE_OUTPUT
            return EXECUTE_OUTPUT
        elif tf.io.gfile.exists(GCP_RESOURCES):
            ! gsutil cat $GCP_RESOURCES
            return GCP_RESOURCES
        elif tf.io.gfile.exists(EVAL_METRICS):
            ! gsutil cat $EVAL_METRICS
            return EVAL_METRICS

    return None


# Print the output of each pipeline step in turn. `artifacts` is overwritten
# on every call, so only the model-upload path survives for the parsing below.
print("self-contained-training")
artifacts = print_pipeline_output(pipeline, "self-contained-training")
print("\n\n")
print("importer")
artifacts = print_pipeline_output(pipeline, "importer")
print("\n\n")
print("model-upload")
artifacts = print_pipeline_output(pipeline, "model-upload")
# NOTE(review): print_pipeline_output() returns None when no output file is
# found, in which case the gsutil call below has no valid path to read.
output = !gsutil cat $artifacts
# Only the first captured line is parsed as JSON.
output = json.loads(output[0])
# Resource name of the uploaded model, used later to delete it.
model_id = output["artifacts"]["model"]["artifacts"][0]["metadata"]["resourceName"]
print("\n")
print("MODEL ID", model_id)
print("\n\n")

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Delete the model

You can delete the `Model` resource generated by your pipeline with the `delete()` method.

In [None]:
model = aiplatform.Model(model_id)
model.delete()

## Convert self-contained training component to a `Vertex AI CustomJob`.

Next, you use the utility `create_custom_training_job_from_component()` to convert the self-contained training component into a `Vertex AI CustomJob`. This provides the following benefits:

- Adds additional ML Metadata tracking as a custom job.
- Can set resource controls specific to the custom job.
    - `machine_type`: The machine (VM) instance for the `CustomJob`.
    - `accelerator_type`: The type (if any) of GPU or TPU.
    - `accelerator_count`: The number of HW accelerators (GPU/TPU) or zero.
    - `replica_count`: The number of VM instances for the job (Default is 1).
    - `boot_disk_type`: Type of the boot disk (default is "pd-ssd"). 
    - `boot_disk_size_gb`:  Size in GB of the boot disk (default is 100GB).

In [None]:
# Wrap the training component as a Vertex AI CustomJob op.
# Accelerator arguments are passed only when a GPU is configured. With the
# CPU-only setting `TRAIN_GPU, TRAIN_NGPU = (None, None)`, `TRAIN_GPU.name`
# would raise AttributeError, so the utility's own defaults are used instead.
custom_job_op = create_custom_training_job_from_component(
    self_contained_training_component,
    display_name="test-component",
    machine_type=TRAIN_COMPUTE,
    **(
        {"accelerator_type": TRAIN_GPU.name, "accelerator_count": TRAIN_NGPU}
        if TRAIN_GPU
        else {}
    ),
)

### Create the CustomJob pipeline

Next, you create the pipeline for training this component, consisting of the following steps:

- *Train the model*. For this component, you set the following custom-job level resources:
    - `machine_type`: The machine (VM) instance.
    - `accelerator_type`: The type of GPU for the container's VM instance.
    - `accelerator_count`: The number of GPUs for the container's VM instance.
    - `replica_count`: The number of machine (VM) instances.
- *Import model artifacts into a Model Container artifact*.
- *Upload the Container artifact into a `Vertex AI Model` resource*.

In [None]:
@dsl.pipeline(
    name="customjob-set-resources",
    description="A simple pipeline that requests customjob-level machine resource",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    epochs: int, model_dir: str, project: str = PROJECT_ID, region: str = REGION
):
    """Train through the CustomJob-wrapped component, then import and upload the model.

    Args:
        epochs: Number of epochs forwarded to the training job.
        model_dir: Location forwarded to the training job for the saved model.
        project: Project ID used by the training job and the model upload step.
        region: Passed to the training job as its `location`.
    """
    from google_cloud_pipeline_components.types import artifact_types
    from google_cloud_pipeline_components.v1.model import ModelUploadOp
    from kfp.v2.components import importer_node

    # Step 1: run training as a custom job.
    train_step = custom_job_op(
        epochs=epochs, model_dir=model_dir, project=project, location=region
    )

    # Step 2: wrap the training output URI as an unmanaged container model
    # that is served with DEPLOY_IMAGE.
    serving_metadata = {"containerSpec": {"imageUri": DEPLOY_IMAGE}}
    import_step = importer_node.importer(
        artifact_uri=train_step.outputs["output"],
        artifact_class=artifact_types.UnmanagedContainerModel,
        metadata=serving_metadata,
    )
    import_step.after(train_step)

    # Step 3: upload the imported artifact as a model.
    upload_step = ModelUploadOp(
        project=project,
        display_name="mnist_model",
        unmanaged_container_model=import_step.outputs["artifact"],
    )
    upload_step.after(import_step)

### Compile and execute the pipeline

Next, you compile the pipeline and then execute it. The pipeline takes the following parameters, which are passed as the dictionary `parameter_values`:

- `model_dir`: The Cloud Storage location to save the model artifacts.
- `epochs`: The number of epochs to train the model.
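- `project`: Your project ID.
- `region`: (Optional) The location passed to the custom training job. It is not set in `parameter_values` below, so the pipeline's default (`REGION`) is used.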

In [None]:
# Compile the pipeline function to the local file `customjob_level_settings.json`.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="customjob_level_settings.json",
)

# NOTE: this rebinds the name `pipeline` from the pipeline function to the
# PipelineJob instance. Re-running this cell without first re-running the cell
# that defines the pipeline function hands a PipelineJob to the compiler above.
pipeline = aiplatform.PipelineJob(
    display_name="customjob-level-settings",
    template_path="customjob_level_settings.json",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={"model_dir": BUCKET_URI, "epochs": 20, "project": PROJECT_ID},
    enable_caching=False,
)

pipeline.run()

# Remove the locally compiled spec file.
! rm -rf customjob_level_settings.json

### View the pipeline results

Once the pipeline has completed, you can view the artifact outputs for each component step.

In [None]:
# Print the output of each pipeline step in turn. `artifacts` is overwritten
# on every call, so only the model-upload path survives for the parsing below.
print("self-contained-training-component")
artifacts = print_pipeline_output(pipeline, "self-contained-training-component")
print("\n\n")
print("importer")
artifacts = print_pipeline_output(pipeline, "importer")
print("\n\n")
print("model-upload")
artifacts = print_pipeline_output(pipeline, "model-upload")
# NOTE(review): print_pipeline_output() returns None when no output file is
# found, in which case the gsutil call below has no valid path to read.
output = !gsutil cat $artifacts
# Only the first captured line is parsed as JSON.
output = json.loads(output[0])
# Resource name of the uploaded model, used later to delete it.
model_id = output["artifacts"]["model"]["artifacts"][0]["metadata"]["resourceName"]
print("\n")
print("MODEL ID", model_id)
print("\n\n")

### Delete a pipeline job

After a pipeline job is completed, you can delete the pipeline job with the method `delete()`.  Prior to completion, a pipeline job can be canceled with the method `cancel()`.

In [None]:
pipeline.delete()

### Delete the model

You can delete the `Model` resource generated by your pipeline with the `delete()` method.

In [None]:
model = aiplatform.Model(model_id)
model.delete()

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

In [None]:
import os

# Set this to true only if you'd like to delete your bucket
delete_bucket = False

# The removal also runs when the IS_TESTING environment variable is set to a
# non-empty value, regardless of `delete_bucket`.
if delete_bucket or os.getenv("IS_TESTING"):
    # Recursively remove BUCKET_URI.
    ! gsutil rm -r $BUCKET_URI