In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with Model Garden Pipeline Templates for BERT models


<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_template_pipelines_bert.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_template_pipelines_bert.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_template_pipelines_bert.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

## Overview


This tutorial demonstrates how to modify, compile and execute a prebuilt Vertex AI Model Garden pipeline template with Vertex AI Pipelines.

Learn more about [Create a pipeline template](https://cloud.google.com/vertex-ai/docs/pipelines/create-pipeline-template).

### Objective

In this tutorial, you learn how to use a prebuilt pipeline template with `Vertex AI Pipelines` to fine-tune a BERT text classification model, where the model is accessed from `Vertex AI Model Garden`.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Pipelines`
- `Vertex AI Training`
- `Vertex AI Model Garden`
- `Google Cloud Pipeline Components`


The steps performed include:

- Create a user-defined repository in the `Artifact Registry`.
- Upload the prebuilt pipeline template to the `Artifact Registry`.
- Create a pipeline job with the prebuilt pipeline template to fine-tune a BERT model.
- Execute the pipeline using `Vertex AI Pipelines`.
    - Load BERT model from Vertex AI Model Garden
    - Fine-tune the model
    - Do batch prediction
    - Evaluate the model from the batch prediction results
- Obtain the Vertex AI Model resource from the pipeline artifacts.
- Deploy the model to a Vertex AI Endpoint
- Make a prediction

### Model

This tutorial uses a pre-trained BERT text classification model from `Vertex AI Model Garden`, which is then fine-tuned (transfer learning) on a dataset of text phrases which are classified as either FirstClass or SecondClass.

Learn more about [BERT pretrained encoder model](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3).

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* Dataflow

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and [Dataflow pricing](https://cloud.google.com/dataflow/pricing)
and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installations

Install the packages required for executing this notebook.

*Note:* This tutorial requires KFP 2.x.

In [None]:
import os

# Install/upgrade the Vertex AI SDK and the Google Cloud Pipeline Components.
# KFP is pinned to a 2.0.0 beta build because this tutorial requires KFP 2.x.
! pip3 install --upgrade google-cloud-aiplatform \
                         google-cloud-pipeline-components \
                         kfp==2.0.0b15

### Colab only: Uncomment the following cell to restart the kernel

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin


### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Replace the placeholder with your own Google Cloud project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the default for the gcloud commands run in this notebook.
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used for the Vertex AI SDK, the bucket and the Artifact Registry repository.
REGION = "us-central1"  # @param {type: "string"}

### Enable APIs

You can enable the required APIs using `gcloud`.

In [None]:
# Enable the Compute Engine, Container Registry, Vertex AI and Artifact Registry APIs.
! gcloud services enable compute.googleapis.com         \
                         containerregistry.googleapis.com  \
                         aiplatform.googleapis.com  \
                         artifactregistry.googleapis.com

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage bucket used as the SDK staging bucket and for pipeline artifacts.
# The project ID is embedded in the default name; edit it to a name that is unique to you.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in the region selected above.
! gsutil mb -l $REGION $BUCKET_URI

#### Service Account

**If you don't know your service account**, leave the placeholder in the first cell below unchanged and run the second cell, which looks up your service account with the `gcloud` command.

In [None]:
# Leave the placeholder unchanged to have the next cell look the account up with gcloud.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    if IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step -- you only need to run these once per service account.

In [None]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator  
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer  

### Set up variables

Next, set up some variables used throughout the tutorial.

### Import libraries and define constants

In [None]:
import json
import os

import google.cloud.aiplatform as aiplatform
from kfp.registry import RegistryClient

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Set the default project, region and staging bucket for subsequent Vertex AI SDK calls.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

### Enable Artifact Registry API
You must enable the Artifact Registry API service for your project.

<a href="https://cloud.google.com/artifact-registry/docs/enable-service">Learn more about Enabling service</a>.

In [None]:
# Enable the Artifact Registry API for the current project.
! gcloud services enable artifactregistry.googleapis.com

# Only when the IS_TESTING environment variable is set: upgrade the installed
# Cloud SDK apt packages and gcloud components before using `gcloud artifacts`.
if os.getenv("IS_TESTING"):
    ! sudo apt-get update --yes && sudo apt-get --only-upgrade --yes install google-cloud-sdk-cloud-run-proxy google-cloud-sdk-harbourbridge google-cloud-sdk-cbt google-cloud-sdk-gke-gcloud-auth-plugin google-cloud-sdk-kpt google-cloud-sdk-local-extract google-cloud-sdk-minikube google-cloud-sdk-app-engine-java google-cloud-sdk-app-engine-go google-cloud-sdk-app-engine-python google-cloud-sdk-spanner-emulator google-cloud-sdk-bigtable-emulator google-cloud-sdk-nomos google-cloud-sdk-package-go-module google-cloud-sdk-firestore-emulator kubectl google-cloud-sdk-datastore-emulator google-cloud-sdk-app-engine-python-extras google-cloud-sdk-cloud-build-local google-cloud-sdk-kubectl-oidc google-cloud-sdk-anthos-auth google-cloud-sdk-app-engine-grpc google-cloud-sdk-pubsub-emulator google-cloud-sdk-datalab google-cloud-sdk-skaffold google-cloud-sdk google-cloud-sdk-terraform-tools google-cloud-sdk-config-connector
    ! gcloud components update --quiet

## Create repo in Artifact Registry

First, you create your own (user-defined) repository in the `Artifact Registry`. You use this repository to upload and retrieve your pipeline templates.

In [None]:
REPO_NAME = "my-docker-repo-unique"

! gcloud artifacts repositories create {REPO_NAME} --location={REGION} --repository-format=KFP

### Upload the pipeline template

Next, you instantiate a client interface to the Artifact Registry. Then with the `upload_pipeline()` method you upload your pipeline template.

In [None]:
# Location of the prebuilt BERT fine-tuning pipeline template.
BERT_YAML = "gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/pipeline.yaml"

# Download the template to the working directory so it can be uploaded from a local file.
! gsutil cp {BERT_YAML} pipeline.yaml

# NOTE(review): the repository segment of the host ("quickstart-kfp-repo") is
# hard-coded here rather than taken from REPO_NAME; it has to name the repository
# created earlier with `gcloud artifacts repositories create`.
client = RegistryClient(
    host=f"https://{REGION}-kfp.pkg.dev/{PROJECT_ID}/quickstart-kfp-repo"
)

# The returned template and version names are used later to build the template path.
templateName, versionName = client.upload_pipeline(
    file_name="pipeline.yaml",
    tags=["v1", "latest"],
    extra_headers={
        "description": "This is a pipeline template for fine-tuning a BERT model."
    },
)

# The local copy is no longer needed once the template is in the registry.
! rm pipeline.yaml

### View your artifacts in your registry

Next, use the `gcloud artifacts files list` command to view the artifacts, including the pipeline template, in your Artifact Registry repository.

In [None]:
# List the files stored in the repository named by REPO_NAME.
! gcloud artifacts files list  --repository={REPO_NAME} --location={REGION}

## Load and execute the pipeline job

Next, you create a Vertex AI Pipeline job from your BERT pipeline template by instantiating a PipelineJob(), with the following parameters:

- `display_name`: The human readable name for the pipeline job.
- `template_path`: The path to the pipeline template in the Artifact Registry.
- `enable_caching`: On re-runs, use the results from previous successful and unchanged steps.
- `pipeline_root`: A Cloud storage location for storing pipeline results.
- `parameter_values`: The parameters and values that are input to the template pipeline. In this example, they are:
    - `project`: Your project ID.
    - `class_labels`: A list of valid class labels, in cardinal order.
    - `root_dir`: A Cloud Storage scratch area.
    - `training_data_path`: A Cloud Storage location to the training data.
    - `ground_truth_gcs_source_uris`: A Cloud Storage location to evaluation data.

In [None]:
# Cloud Storage prefix under which the pipeline run writes its results.
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/bert-finetuning"

# NOTE(review): the repository segment of template_path ("quickstart-kfp-repo") is
# hard-coded; it has to be the same repository the template was uploaded to.
job = aiplatform.PipelineJob(
    display_name="bert-finetuning",
    template_path=f"https://{REGION}-kfp.pkg.dev/{PROJECT_ID}/quickstart-kfp-repo/{templateName}/{versionName}",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
    parameter_values={
        "project": PROJECT_ID,
        "class_labels": ["FirstClass", "SecondClass", "[UNK]"],
        "root_dir": BUCKET_URI,
        # Note: the same JSONL file is passed as both the evaluation ground truth
        # and the training data.
        "ground_truth_gcs_source_uris": [
            "gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl"
        ],
        "training_data_path": "gs://cloud-samples-data/vertex-ai/dataset-management/datasets/bert_finetuning/wide_and_deep_trainer_container_tests_input.jsonl",
    },
)

# Submit the pipeline job for execution.
job.run()

### View the pipeline results

In [None]:
# Take the second "/"-separated segment of the job's resource name.
# Presumably the name has the form projects/<project-number>/... — TODO confirm.
PROJECT_NUMBER = job.gca_resource.name.split("/")[1]
print(PROJECT_NUMBER)


def print_pipeline_output(job, output_task_name):
    JOB_ID = job.name
    print(JOB_ID)
    artifact = ""
    for _ in range(len(job.gca_resource.job_detail.task_details)):
        TASK_ID = job.gca_resource.job_detail.task_details[_].task_id
        EXECUTE_OUTPUT = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/executor_output.json"
        )
        GCP_RESOURCES = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/gcp_resources"
        )
        EVALUATION_METRICS = (
            PIPELINE_ROOT
            + "/"
            + PROJECT_NUMBER
            + "/"
            + JOB_ID
            + "/"
            + output_task_name
            + "_"
            + str(TASK_ID)
            + "/evaluation_metrics"
        )
        # Check if file exists, 0 is success
        !gsutil -q stat $EXECUTE_OUTPUT
        if _exit_code == 0:
            ! gsutil cat $EXECUTE_OUTPUT
            artifact = EXECUTE_OUTPUT
            break
        !gsutil -q stat $GCP_RESOURCES
        if _exit_code == 0:
            ! gsutil cat $GCP_RESOURCES
            artifact = GCP_RESOURCES
            break
        !gsutil -q stat $EVALUATION_METRICS
        if _exit_code == 0:
            ! gsutil cat $EVALUATION_METRICS
            artifact = EVALUATION_METRICS
            break

    return artifact


print("get-vertex-model")
artifacts = print_pipeline_output(job, "get-vertex-model")
output = !gsutil cat $artifacts
print(output)
output = json.loads(output[0])
model_id = output["artifacts"]["model"]["artifacts"][0]["metadata"]["resourceName"]
print("\n\n")

### Delete the pipeline job

The `delete()` method deletes the pipeline job.

In [None]:
# Delete the pipeline job; the model resource name was already read into model_id.
job.delete()

### Deploy the model

Next, you deploy the model to an endpoint:

- Use the `model_id` obtained from the pipeline artifacts to instantiate a Vertex AI Model resource instance.
- Deploy the Vertex AI Model resource to a Vertex AI Endpoint resource.


In [None]:
# Load the model produced by the pipeline from its resource name.
model = aiplatform.Model(model_id)

# Deploy it on an n1-standard-4 machine type with one NVIDIA Tesla T4 accelerator.
endpoint = model.deploy(
    accelerator_count=1,
    accelerator_type=aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_T4.name,
    machine_type="n1-standard-4",
)
print(endpoint)

### Make a prediction

Finally, you make a prediction with the deployed model.

In [None]:
# Send a single text instance to the deployed model.
endpoint.predict(["this is a test"])

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

In [None]:
# Set to True to also delete the Cloud Storage bucket and everything in it.
delete_bucket = False

# Remove the serving resources: undeploy everything from the endpoint, then
# delete the endpoint and the model.
endpoint.undeploy_all()
endpoint.delete()
model.delete()

# The bucket is removed on request, or whenever IS_TESTING is set in the environment.
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI

# NOTE(review): nothing in this notebook appears to create `custom` or
# `custom.tar.gz`; this looks like leftover cleanup from another tutorial — confirm.
! rm -rf custom custom.tar.gz

# Delete the Artifact Registry repository that held the pipeline template.
! gcloud artifacts repositories delete $REPO_NAME --project {PROJECT_ID} --location {REGION} --quiet