In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Pipelines: AutoML text classification pipelines using google-cloud-pipeline-components

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_automl_text.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_automl_text.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/google_cloud_pipeline_components_automl_text.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
     </a>
  </td>
</table>
<br/><br/><br/>

## Overview

This notebook demonstrates how to use the Vertex AI classification model evaluation component to evaluate an AutoML text classification model. Model evaluation helps you determine your model performance based on the evaluation metrics and improve the model if necessary. 

### Objective

In this tutorial, you learn to use `Vertex AI Pipelines` and `Google Cloud Pipeline Components` to build an `AutoML` text classification model.


This tutorial uses the following Google Cloud ML services:

- Vertex AI `Datasets`
- Vertex AI `Training` (AutoML text classification)
- Vertex AI `Model Registry`
- Vertex AI `Pipelines`
- Vertex AI `Batch Predictions`

The steps performed include:

- Create a Vertex AI `Dataset`.
- Train an AutoML text classification model on the `Dataset` resource.
- Import the trained `AutoML model resource` into the pipeline.
- Run a `Batch Prediction` job.
- Evaluate the AutoML model using the `Classification Evaluation Component`.
- Import the classification metrics to the AutoML model resource.

The components are [documented here](https://google-cloud-pipeline-components.readthedocs.io/en/latest/google_cloud_pipeline_components.aiplatform.html#module-google_cloud_pipeline_components.aiplatform).

### Dataset

The dataset used for this tutorial is the [Happy Moments dataset](https://www.kaggle.com/ritresearch/happydb) from [Kaggle Datasets](https://www.kaggle.com/ritresearch/happydb). The version of the dataset you will use in this tutorial is stored in a public Cloud Storage bucket.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Set up your local development environment

If you are using Colab or Google Cloud Notebook, your environment already meets all the requirements to run this notebook. You can skip this step.

Otherwise, make sure your environment meets this notebook's requirements. You need the following:

- The Cloud Storage SDK
- Git
- Python 3
- virtualenv
- Jupyter notebook running in a virtual environment with Python 3

The Cloud Storage guide to [Setting up a Python development environment](https://cloud.google.com/python/setup) and the [Jupyter installation guide](https://jupyter.org/install) provide detailed instructions for meeting these requirements. The following steps provide a condensed set of instructions:

1. [Install and initialize the SDK](https://cloud.google.com/sdk/docs/).

2. [Install Python 3](https://cloud.google.com/python/setup#installing_python).

3. [Install virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv) and create a virtual environment that uses Python 3.

4. Activate that environment and run `pip3 install Jupyter` in a terminal shell to install Jupyter.

5. Run `jupyter notebook` on the command line in a terminal shell to launch Jupyter.

6. Open this notebook in the Jupyter Notebook Dashboard.


## Installation

Install the packages required for executing this notebook.

In [None]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install --upgrade google-cloud-aiplatform {USER_FLAG} -q
! pip3 install -U google-cloud-storage $USER_FLAG -q
! pip3 install $USER kfp google-cloud-pipeline-components --upgrade -q

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Under automated testing (IS_TESTING set) the kernel is left running;
# otherwise restart it so the freshly installed packages are picked up.
if not os.getenv("IS_TESTING"):
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

Check the versions of the packages you installed.  The KFP SDK version should be >=1.6.

In [None]:
# Print the installed versions of the KFP SDK and google-cloud-pipeline-components
# by importing each package in a fresh Python subprocess.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

## Before you begin

### GPU runtime

This tutorial does not require a GPU runtime.

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [Enable the Vertex AI APIs, Compute Engine APIs, and Cloud Storage.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component,storage-component.googleapis.com)

4. [The Google Cloud SDK](https://cloud.google.com/sdk) is already installed in Google Cloud Notebook.

5. Enter your project ID in the cell below. Then run the  cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$`.

In [None]:
# Replace the placeholder with your Google Cloud project ID; if it is left
# unchanged, the next cell falls back to the gcloud default project.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make PROJECT_ID the active project for subsequent gcloud commands.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

# Fall back to us-central1 when the placeholder was left unchanged.
REGION = "us-central1" if REGION == "[your-region]" else REGION

#### UUID

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a uuid for each instance session, and append the uuid onto the name of resources you create in this tutorial.

In [None]:
import random
import string


# Generate a uuid of a specified length (default=8)
def generate_uuid(length: int = 8) -> str:
    """Return a random string of `length` lowercase letters and digits.

    Despite the name, this is not an RFC 4122 UUID; it is a short random
    suffix used to keep resource names from colliding between sessions.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


UUID = generate_uuid()

### Authenticate your Google Cloud account

**If you are using Vertex AI Workbench Notebook**, your environment is already authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "Vertex AI"
into the filter box, and select
   **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# If on Vertex AI Workbench, then don't execute this code
IS_COLAB = "google.colab" in sys.modules
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv(
    "DL_ANACONDA_HOME"
):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you initialize the Vertex AI SDK for Python, you specify a Cloud Storage staging bucket. The staging bucket is where all the data associated with your dataset and model resources are retained across sessions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
# Replace the placeholder with a globally unique bucket name; BUCKET_URI is the
# matching gs:// URI used by gsutil and the Vertex AI SDK.
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [None]:
# No bucket name entered above: derive one from the project ID and session UUID.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = PROJECT_ID + "aip-" + UUID
    BUCKET_URI = f"gs://{BUCKET_NAME}"

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in the selected region.
! gsutil mb -l $REGION $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents (long format, all object versions) to confirm access.
! gsutil ls -al $BUCKET_URI

#### Service Account

**If you don't know your service account**, try to get your service account using `gcloud` command by executing the second cell below.

In [None]:
# Replace the placeholder with your service account email; if it is left
# unchanged, the next cell tries to look it up with gcloud.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [None]:
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    if IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step -- you only need to run these once per service account.

In [None]:
# Allow the service account to write objects to the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Allow the service account to read objects from the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### Set up variables

Next, set up some variables used throughout the tutorial.

### Import libraries and define constants

In [None]:
import google.cloud.aiplatform as aip
import kfp
from google.cloud import aiplatform_v1

#### Vertex AI Pipelines constants

Set up the following constants for Vertex AI Pipelines:

In [None]:
# Cloud Storage location under which the pipeline run stores its artifacts.
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/happydb"

## Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
# Pass the region explicitly: without `location`, the SDK uses its default
# region even when REGION was set to something else, while the pipeline below
# passes REGION to the endpoint component.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

## Define AutoML text classification model pipeline that uses components from `google_cloud_pipeline_components`

Next, you define the pipeline.

Create and deploy an AutoML text classification `Model` resource using a `Dataset` resource.

In [None]:
IMPORT_FILE = "gs://cloud-ml-data/NL-classification/happiness.csv"


# Pipeline: create a text dataset, train an AutoML text classification model,
# create an endpoint and deploy the model to it.
#   project:     Google Cloud project that owns the resources.
#   region:      Region used for every resource created by the pipeline.
#   import_file: Cloud Storage URI of the CSV import file for the dataset.
# (Described with comments rather than a docstring so the compiled pipeline
# spec is not affected.)
@kfp.dsl.pipeline(name="automl-text-classification" + UUID)
def pipeline(
    project: str = PROJECT_ID, region: str = REGION, import_file: str = IMPORT_FILE
):
    from google_cloud_pipeline_components import aiplatform as gcc_aip
    from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,
                                                              ModelDeployOp)

    # `location=region` keeps the dataset and the trained model in the same
    # region as the endpoint; previously only the endpoint received `region`,
    # so a non-default region split the resources across regions.
    dataset_create_task = gcc_aip.TextDatasetCreateOp(
        display_name="train-automl-happydb",
        gcs_source=import_file,
        import_schema_uri=aip.schema.dataset.ioformat.text.multi_label_classification,
        project=project,
        location=region,
    )

    training_run_task = gcc_aip.AutoMLTextTrainingJobRunOp(
        dataset=dataset_create_task.outputs["dataset"],
        display_name="train-automl-happydb",
        prediction_type="classification",
        multi_label=True,
        training_fraction_split=0.6,
        validation_fraction_split=0.2,
        test_fraction_split=0.2,
        model_display_name="train-automl-happydb",
        project=project,
        location=region,
    )

    endpoint_op = EndpointCreateOp(
        project=project,
        location=region,
        display_name="train-automl-happydb",
    )

    # The deploy task is the last step; its output is not consumed.
    _ = ModelDeployOp(
        model=training_run_task.outputs["model"],
        endpoint=endpoint_op.outputs["endpoint"],
        automatic_resources_min_replica_count=1,
        automatic_resources_max_replica_count=1,
    )

## Compile the pipeline

Next, compile the pipeline.

In [None]:
# Compile the training pipeline function into a local JSON job spec.
from kfp.v2 import compiler  # noqa: F811

compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="text_classification_pipeline.json",
)

## Run the pipeline

Next, run the pipeline.

In [None]:
# The session UUID keeps the run's display name distinct between sessions.
DISPLAY_NAME = "happydb_" + UUID

# Build the pipeline job from the compiled spec; caching is turned off for this run.
job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path="text_classification_pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
)

job.run()

# The compiled spec is no longer needed once the run has been submitted.
! rm text_classification_pipeline.json

Click on the generated link to see your run in the Cloud Console.

<!-- It should look something like this as it is running:

<a href="https://storage.googleapis.com/amy-jo/images/mp/automl_tabular_classif.png" target="_blank"><img src="https://storage.googleapis.com/amy-jo/images/mp/automl_tabular_classif.png" width="40%"/></a> -->

In the UI, many of the pipeline DAG nodes will expand or collapse when you click on them. Here is a partially-expanded view of the DAG (click image to see larger version).

<a href="https://storage.googleapis.com/amy-jo/images/mp/automl_text_classif.png" target="_blank"><img src="https://storage.googleapis.com/amy-jo/images/mp/automl_text_classif.png" width="40%"/></a>

In [None]:
# Look up the model produced by the training pipeline by its display name.
# The name must match `model_display_name` in the pipeline exactly (the
# previous value carried a trailing space), and the filter value is quoted.
model_display_name = "train-automl-happydb"
models = aip.Model.list(
    filter=f'display_name="{model_display_name}"',
    # Newest first, so models[0] is the model that was just trained rather
    # than the oldest one with the same display name.
    order_by="create_time desc",
)
if not models:
    # Fail here with a clear message instead of a NameError on `model` below.
    raise RuntimeError(
        f"No model with display name {model_display_name!r} was found; "
        "make sure the training pipeline finished successfully."
    )
model = models[0]
model

In [None]:
# For existing model, use MODEL_ID to load the model.
# MODEL_ID = '3402729003222564864'
# model = aip.Model(model_name=MODEL_ID)

In [None]:
# Get evaluations
model_evaluations = model.list_model_evaluations()

model_evaluation = list(model_evaluations)[0]
print(model_evaluation)

In [None]:
# Print the evaluation metrics
for evaluation in model_evaluations:
    evaluation = evaluation.to_dict()
    print("Model's evaluation metrics from Training:\n")
    metrics = evaluation["metrics"]
    for metric in metrics.keys():
        print(f"metric: {metric}, value: {metrics[metric]}\n")

## Get batch predictions from your model

You can get batch predictions from a text classification model without deploying it. You must first format all of your prediction instances (prediction input) in JSONL format and you must store the JSONL file in a Google Cloud Storage bucket. You must also provide a Google Cloud Storage bucket to hold your prediction output.

To start, you must first create your predictions input file in JSONL format. Each line in the JSONL document needs to be formatted like so:

```
{ "content": "gs://sourcebucket/datasets/texts/source_text.txt", "mimeType": "text/plain"}
```

The `content` field in the JSON structure must be a Google Cloud Storage URI to another document that contains the text input for prediction.
[See the documentation for more information.](https://cloud.google.com/ai-platform-unified/docs/predictions/batch-predictions#text)

In [None]:
# Sample texts with their ground-truth labels. "Text" is uploaded as the
# prediction input; "Labels" is used later as ground truth for evaluation.
instances = [
    {"Text": "I went on a successful date with someone I felt sympathy and connection with.", "Labels": "affection"},
    {"Text": "I was happy when my son got 90% marks in his examination", "Labels": "affection"},
    {"Text": "I went to the gym this morning and did yoga.", "Labels": "exercise"},
    {"Text": "We had a serious talk with some friends of ours who have been flaky lately. They understood and we had a good evening hanging out.", "Labels": "bonding"},
    {"Text": "I went with grandchildren to butterfly display at Crohn Conservatory", "Labels": "affection"},
    {"Text": "I meditated last night.", "Labels": "leisure"},
    {"Text": "I made a new recipe for peasant bread, and it came out spectacular!", "Labels": "achievement"},
    {"Text": "I got gift from my elder brother which was really surprising me", "Labels": "affection"},
    {"Text": "YESTERDAY MY MOMS BIRTHDAY SO I ENJOYED", "Labels": "enjoy_the_moment"},
    {"Text": "Watching cupcake wars with my three teen children", "Labels": "affection"},
    {"Text": "I came in 3rd place in my Call of Duty video game.", "Labels": "leisure"},
    {"Text": "I completed my 5 miles run without break. It makes me feel strong.", "Labels": "exercise"},
    {"Text": "went to movies with my friends it was fun", "Labels": "bonding"},
    {"Text": "I was shorting Gold and made $200 from the trade.", "Labels": "achievement"},
    {"Text": "Hearing Songs It can be nearly impossible to go from angry to happy, so you're just looking for the thought that eases you out of your angry feeling and moves you in the direction of happiness. It may take a while, but as long as you're headed in a more positive direction youall be doing yourself a world of good.", "Labels": "enjoy_the_moment"},
    {"Text": "My son performed very well for a test preparation.", "Labels": "affection"},
    {"Text": "I helped my neighbour to fix their car damages.", "Labels": "bonding"},
    {"Text": "Managed to get the final trophy in a game I was playing.", "Labels": "achievement"},
    {"Text": "A hot kiss with my girl friend last night made my day", "Labels": "bonding"},
    {"Text": "My new BCAAs came in the mail. Yay! Strawberry Lemonade flavored aminos make my heart happy.", "Labels": "affection"},
    {"Text": "Got A in class.", "Labels": "achievement"},
    {"Text": "My sister called me from abroad this morning after some long years. Such a happy occassion for all family members.", "Labels": "affection"},
    {"Text": "The cake I made today came out amazing. It tasted amazing as well.", "Labels": "achievement"},
    {"Text": "There are two types of people in the world: those who choose to be happy, and those who choose to be unhappy. Contrary to popular belief, happiness doesn't come from fame, fortune, other people, or material possessions", "Labels": "enjoy_the_moment"},
    {"Text": "My grandmother start to walk from the bed after a long time.", "Labels": "affection"},
    {"Text": "i was able to hit a top spin serve in tennis", "Labels": "achievement"},
    {"Text": "I napped with my husband on the bed this afternoon and it was sweet to cuddle so close to him.", "Labels": "affection"},
    {"Text": "My co-woker started playing a Carley Rae Jepsen song from her phone while ringing out customers.", "Labels": "leisure"},
    {"Text": "My son woke me up to a fantastic breakfast of eggs, his special hamburger patty and pancakes.", "Labels": "affection"},
    {"Text": "After a long time my brother gave a suprise visit to my house yesterday.", "Labels": "affection"},
]

# Name of the JSONL file that lists the uploaded prediction instances.
input_file_name = "happiness-batch-prediction-input.jsonl"

For batch prediction, you must supply the following:

+ All of your prediction instances as individual files on Google Cloud Storage, as TXT files for your instances
+ A JSONL file that lists the URIs of all your prediction instances
+ A Cloud Storage bucket to hold the output from batch prediction

For this tutorial, the following cells create a new Storage bucket, upload individual prediction instances as text files to the bucket, and then create the JSONL file with the URIs of your prediction instances.

In [None]:
import json

from google.cloud import storage

# Instantiate the Storage client and get a handle to the existing bucket
# (no bucket is created here).
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

# Iterate over the prediction instances, uploading a TXT file for each and
# recording its URI for the JSONL input file.
input_file_data = []
for count, instance in enumerate(instances):
    instance_name = f"input_{count}.txt"
    instance_file_uri = f"{BUCKET_URI}/batch-prediction-input/{instance_name}"
    # Add the data to store in the JSONL input file.
    input_file_data.append({"content": instance_file_uri, "mimeType": "text/plain"})

    # Create the new instance file
    blob = bucket.blob("batch-prediction-input/" + instance_name)
    blob.upload_from_string(instance["Text"])

# JSONL needs one valid JSON object per line. json.dumps emits double-quoted
# JSON, whereas str(dict) would emit a single-quoted Python repr.
input_str = "\n".join(json.dumps(entry) for entry in input_file_data)
file_blob = bucket.blob(input_file_name)
file_blob.upload_from_string(input_str)

In [None]:
# Run a batch prediction job on the model: read the JSONL input file uploaded
# above and write results under the bucket's "output" prefix.
job_display_name = "happiness-text-classification-batch-prediction-job"
batch_prediction_job = model.batch_predict(
    job_display_name=job_display_name,
    gcs_source=f"{BUCKET_URI}/{input_file_name}",
    gcs_destination_prefix=f"{BUCKET_URI}/output",
    sync=True,
)
# Keep the job's resource name so the job can be looked up again later.
batch_prediction_job_name = batch_prediction_job.resource_name

In [None]:
from google.cloud.aiplatform import jobs

# Re-load the batch prediction job from its resource name and report its state.
batch_job = jobs.BatchPredictionJob(batch_prediction_job_name)
print(f"Batch prediction job state: {str(batch_job.state)}")

# Get predictions for batchPredict job

In [None]:
# ndjson is used by the next cell to parse the newline-delimited JSON
# prediction output.
!pip install ndjson

In [None]:
import ndjson

bp_iter_outputs = batch_job.iter_outputs()

# Collect the names of output blobs whose file name starts with "prediction".
prediction_results = list()
for blob in bp_iter_outputs:
    if blob.name.split("/")[-1].startswith("prediction"):
        prediction_results.append(blob.name)

for prediction_result in prediction_results:
    # Build the full gs:// URI, then strip the BUCKET_URI prefix again so that
    # only the object path relative to the staging bucket remains.
    gfile_name = f"gs://{bp_iter_outputs.bucket.name}/{prediction_result}".replace(
        BUCKET_URI + "/", ""
    )
    # `bucket` is the handle created in the upload cell above.
    # NOTE(review): `data` is overwritten on every iteration, so only the last
    # prediction file's records remain after the loop; confirm this is intended.
    data = bucket.get_blob(gfile_name).download_as_string()
    data = ndjson.loads(data)
    # print(data)

# Create input file with ground truth for evaluation 

In [None]:
# Rebinds input_file_name: from here on it names the JSONL file that also
# carries the ground-truth labels.
input_file_name = "happiness-batch-prediction-input-with-groundtruth.jsonl"

In [None]:
import json

from google.cloud import storage

# Instantiate the Storage client and get a handle to the existing bucket
# (no bucket is created here).
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

# Iterate over the prediction instances, uploading a TXT file for each and
# recording its URI and label for the JSONL input file.
input_file_data = []
for count, instance in enumerate(instances):
    instance_name = f"input_{count}.txt"
    instance_file_uri = f"{BUCKET_URI}/evaluation-batch-prediction-input/{instance_name}"
    # Add the data to store in the JSONL input file.
    # out_put variable in each json instance is needed to act as ground_truth for the evaluation task
    tmp_data = {
        "content": instance_file_uri,
        "mimeType": "text/plain",
        "out_put": instance["Labels"],
    }
    input_file_data.append(tmp_data)

    # Create the new instance file
    blob = bucket.blob("evaluation-batch-prediction-input/" + instance_name)
    blob.upload_from_string(instance["Text"])

# One JSON object per line (JSONL). str.join also copes with an empty list,
# where indexing the first element would raise IndexError.
input_str = "\n".join(json.dumps(entry) for entry in input_file_data)
file_blob = bucket.blob(input_file_name)
file_blob.upload_from_string(input_str)

# Create pipeline for model evaluation

Now, you run a Vertex AI BatchPrediction job and generate evaluations of its results.

To do so, you create a Vertex AI pipeline using the components available from the [`google-cloud-pipeline-components`](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.17/index.html) Python package.

### Define the Pipeline

While defining the flow of the pipeline, you get the model resource first. Then, you sample the provided source dataset for batch predictions and create a batch prediction. The explanations are enabled while creating the batch prediction job to generate feature attributions. Once the batch prediction job is completed, you get the classification evaluation metrics and the feature attributions from the results.

The pipeline uses the following components:

- `GetVertexModelOp`: Gets a Vertex AI Model Artifact. 
- `EvaluationDataSamplerOp`: Randomly downsamples an input dataset to a specified size for computing Vertex Explainable AI feature attributions for AutoML Tabular and custom models. Creates a Dataflow job with Apache Beam to downsample the dataset. 
- `EvaluationDataSplitterOp`: Removes the Ground Truth columns from the input dataset for supporting unstructured AutoML models and custom models in Batch Prediction. Creates a Dataflow job with Apache Beam to remove the ground truth columns.
- `ModelBatchPredictOp`: Creates a Google Cloud Vertex BatchPredictionJob and waits for it to complete. 
- `ModelEvaluationClassificationOp`: Computes evaluation metrics on a trained model's batch prediction results. Creates a Dataflow job with Apache Beam and TFMA to compute evaluation metrics. Supports multiclass classification evaluation for tabular, image, video, and text data.
- `ModelImportEvaluationOp`: Imports a model evaluation artifact to an existing Vertex model with ModelService.ImportModelEvaluation. 

Learn more about [Google Cloud Pipeline Model Evaluation components](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-1.0.20/google_cloud_pipeline_components.experimental.evaluation.html).

In [None]:
# Evaluation pipeline: fetch the model, sample the ground-truth data, strip the
# ground-truth column, batch-predict on the result, compute classification
# metrics and import them into the model.
# `batch_predict_gcs_source_uris`, `batch_predict_explanation_metadata` and
# `batch_predict_explanation_parameters` are accepted but not referenced in the
# body. (Described with comments rather than a docstring so the compiled
# pipeline spec is not affected.)
@kfp.dsl.pipeline(name="automl-text-classification-evaluation")
def evaluation_automl_text_classification_evaluation_pipeline(
    project: str,
    location: str,
    root_dir: str,
    model_name: str,
    target_column_name: str,
    key_columns: list,
    ground_truth_gcs_uri: list,
    batch_predict_gcs_source_uris: list,
    batch_predict_instances_format: str,
    batch_predict_predictions_format: str = "jsonl",
    batch_predict_machine_type: str = "n1-standard-4",
    batch_predict_starting_replica_count: int = 5,
    batch_predict_max_replica_count: int = 10,
    batch_predict_explanation_metadata: dict = {},
    batch_predict_explanation_parameters: dict = {},
    batch_predict_explanation_data_sample_size: int = 10000,
    dataflow_machine_type: str = "n1-standard-4",
    dataflow_max_num_workers: int = 5,
    dataflow_disk_size_gb: int = 50,
    dataflow_service_account: str = "",
    dataflow_subnetwork: str = "",
    dataflow_use_public_ips: bool = True,
    encryption_spec_key_name: str = "",
):
    from google_cloud_pipeline_components.aiplatform import ModelBatchPredictOp
    # ModelEvaluationFeatureAttributionOp is imported but not used below.
    from google_cloud_pipeline_components.experimental.evaluation import (
        EvaluationDataSamplerOp,
        EvaluationDataSplitterOp,
        GetVertexModelOp,
        ModelEvaluationClassificationOp,
        ModelEvaluationFeatureAttributionOp,
        ModelImportEvaluationOp,
    )

    # Step 1: get the Vertex AI model resource.
    get_model_task = GetVertexModelOp(model_resource_name=model_name)

    # Step 2: sample the ground-truth dataset.
    data_sampler_task = EvaluationDataSamplerOp(
        project=project,
        location=location,
        root_dir=root_dir,
        gcs_source_uris=ground_truth_gcs_uri,
        instances_format=batch_predict_instances_format,
        sample_size=batch_predict_explanation_data_sample_size,
    )

    # Step 3: run the data splitter on the sampled data with the target column
    # as the ground-truth column.
    data_splitter_task = EvaluationDataSplitterOp(
        project=project,
        location=location,
        root_dir=root_dir,
        gcs_source_uris=data_sampler_task.outputs["gcs_output_directory"],
        instances_format=batch_predict_instances_format,
        ground_truth_column=target_column_name,
    )

    # Step 4: run batch prediction on the splitter output.
    batch_predict_task = ModelBatchPredictOp(
        project=project,
        location=location,
        model=get_model_task.outputs["model"],
        job_display_name="model-registry-batch-predict-evaluation",
        gcs_source_uris=data_splitter_task.outputs["gcs_output_directory"],
        instances_format=batch_predict_instances_format,
        predictions_format=batch_predict_predictions_format,
        gcs_destination_output_uri_prefix=root_dir,
        machine_type=batch_predict_machine_type,
        starting_replica_count=batch_predict_starting_replica_count,
        max_replica_count=batch_predict_max_replica_count,
        encryption_spec_key_name=encryption_spec_key_name,
    )

    # Step 5: compute classification metrics from the predictions and the
    # original ground-truth file(s).
    eval_task = ModelEvaluationClassificationOp(
        project=project,
        location=location,
        root_dir=root_dir,
        key_columns=key_columns,
        ground_truth_column=target_column_name,
        ground_truth_gcs_source=ground_truth_gcs_uri,
        ground_truth_format="jsonl",
        predictions_gcs_source=batch_predict_task.outputs["gcs_output_directory"],
        predictions_format=batch_predict_predictions_format,
        dataflow_machine_type=dataflow_machine_type,
        dataflow_max_workers_num=dataflow_max_num_workers,
        dataflow_disk_size=dataflow_disk_size_gb,
        dataflow_service_account=dataflow_service_account,
        dataflow_subnetwork=dataflow_subnetwork,
        dataflow_use_public_ips=dataflow_use_public_ips,
        encryption_spec_key_name=encryption_spec_key_name,
    )

    # Step 6: import the computed metrics into the Vertex AI model.
    ModelImportEvaluationOp(
        classification_metrics=eval_task.outputs["evaluation_metrics"],
        model=get_model_task.outputs["model"],
        dataset_type=batch_predict_instances_format,
    )

### Compile the pipeline

Next, compile the pipeline to the `automl_text_classification_evaluation.json` file.

In [None]:
from kfp.v2 import compiler

# Write the compiled pipeline definition to a local JSON file; the pipeline
# job further down is created from this file.
compiler.Compiler().compile(
    package_path="automl_text_classification_evaluation.json",
    pipeline_func=evaluation_automl_text_classification_evaluation_pipeline,
)

### Define the parameters to run the pipeline

Specify the required parameters to run the pipeline.

Set a display name for your pipeline.

In [None]:
# Display name for the pipeline run. Leaving the placeholder value (or an
# empty value) makes the next cell substitute a generated default.
PIPELINE_DISPLAY_NAME = "[your-pipeline-display-name]" # @param {type:"string"}

In [None]:
# Fall back to a generated display name when the placeholder was left
# untouched, or when the value is empty or unset.
if PIPELINE_DISPLAY_NAME in ("[your-pipeline-display-name]", "", None):
    PIPELINE_DISPLAY_NAME = "happiness_" + UUID

To pass the required arguments to the pipeline, define the following parameters:

- `project`: Project ID.
- `location`: Region where the pipeline is run.
- `root_dir`: The GCS directory for keeping staging files and artifacts. A random subdirectory is created under the directory to keep job info for resuming the job in case of failure.
- `model_name`: Resource name of the trained AutoML text classification model.
- `target_column_name`: Name of the column to be used as the target for classification.
- `batch_predict_gcs_source_uris`: List of the Cloud Storage bucket uris of input instances for batch prediction.
- `batch_predict_instances_format`: Format of the input instances for batch prediction. The format used here is `jsonl`.
- `batch_predict_explanation_data_sample_size`: Size of the samples to be considered for batch prediction and evaluation.
- `ground_truth_gcs_uri`: Cloud Storage URI(s) of the ground truth data that the predictions are evaluated against. The pipeline reads this data in `jsonl` format.
- `key_columns` : The list of fields in the ground truth gcs source to format the joining key. Used to merge prediction instances with ground truth data.

In [None]:
# Cloud Storage locations: the input file (used below both as the batch
# prediction source and as the ground truth) and the pipeline root directory.
DATA_SOURCE = f"{BUCKET_URI}/{input_file_name}"
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/happiness_{UUID}"

# Runtime arguments handed to the evaluation pipeline.
parameters = {
    # Where the pipeline runs and stages its artifacts.
    "project": PROJECT_ID,
    "location": REGION,
    "root_dir": PIPELINE_ROOT,
    # Model under evaluation and its target column.
    "model_name": model.resource_name,
    "target_column_name": "out_put",
    # Batch prediction inputs.
    "batch_predict_gcs_source_uris": [DATA_SOURCE],
    "batch_predict_instances_format": "jsonl",
    "batch_predict_explanation_data_sample_size": 10,
    # Ground truth and the columns used to join it with the predictions.
    "ground_truth_gcs_uri": [DATA_SOURCE],
    "key_columns": ["content", "mimeType"],
}

Create a Vertex AI pipeline job using the following parameters:

- `display_name`: The name of the pipeline, this will show up in the Google Cloud console.
- `template_path`: The path of PipelineJob or PipelineSpec JSON or YAML file. It can be a local path, a Google Cloud Storage URI or an Artifact Registry URI.
- `parameter_values`: The mapping from runtime parameter names to their values that control the pipeline run.
- `enable_caching`: Whether to turn on caching for the run.

Learn more about [Class PipelineJob](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.PipelineJob).

After creating, run the pipeline job using the configured `SERVICE_ACCOUNT`.

In [None]:
# Build the pipeline job from the compiled template, passing the runtime
# parameters defined above and enabling caching for the run.
evaluation_job = aip.PipelineJob(
    template_path="automl_text_classification_evaluation.json",
    display_name=PIPELINE_DISPLAY_NAME,
    enable_caching=True,
    parameter_values=parameters,
)

# Run the job under the configured service account.
evaluation_job.run(service_account=SERVICE_ACCOUNT)

# Model Evaluation

In the output of the previous step, click the generated link to see your run in the Google Cloud console.

In the UI, many of the pipeline directed acyclic graph (DAG) nodes expand or collapse when you click on them. Here is a partially-expanded view of the DAG (click image to see larger version).

<img src="images/automl-text-classification-evaluation-image.PNG">

### Get the Model Evaluation Results

After the evaluation pipeline finishes, run the following cell to print the evaluation metrics.

In [None]:
# Locate the metrics artifact produced by the model evaluation task of the
# pipeline run and print it together with its Cloud Storage URI.

# Reset the results first so that a stale value left over from an earlier run
# of this cell is never reported when the current job has no matching task.
evaluation_metrics = None
evaluation_metrics_gcs_uri = None

# Task states whose outputs are read. SKIPPED is accepted alongside SUCCEEDED
# (presumably to cover tasks served from the cache -- TODO confirm).
finished_states = (
    aiplatform_v1.types.PipelineTaskDetail.State.SUCCEEDED,
    aiplatform_v1.types.PipelineTaskDetail.State.SKIPPED,
)

# NOTE: `_gca_resource` is a private attribute of the pipeline job object.
for task in evaluation_job._gca_resource.job_detail.task_details:
    # Keep the evaluation task itself, not the task that imports the
    # evaluation into the model.
    is_evaluation_task = (
        "model-evaluation" in task.task_name
        and "model-evaluation-import" not in task.task_name
    )
    if not is_evaluation_task or task.state not in finished_states:
        continue

    # A matching task without the expected output is skipped instead of
    # failing with an AttributeError on `None`.
    metrics_output = task.outputs.get("evaluation_metrics")
    if metrics_output is None or not metrics_output.artifacts:
        continue

    evaluation_metrics = metrics_output.artifacts[0]
    evaluation_metrics_gcs_uri = evaluation_metrics.uri

if evaluation_metrics is None:
    raise RuntimeError(
        "No evaluation metrics artifact was found in the pipeline run. "
        "Check that the model evaluation task of the pipeline finished "
        "successfully before running this cell."
    )

print(evaluation_metrics)
print(evaluation_metrics_gcs_uri)

### Visualize the metrics

Visualize the available metrics like `auRoc` and `logLoss` using a bar-chart.

In [None]:
import matplotlib.pyplot as plt

# Split the metadata entries of the metrics artifact into bar labels (metric
# names) and bar heights (metric values).
metadata_items = list(evaluation_metrics.metadata.items())
metrics = [name for name, _ in metadata_items]
values = [value for _, value in metadata_items]

# Draw one bar per metric.
plt.figure(figsize=(15, 5))
plt.bar(x=metrics, height=values)
plt.ylabel("Value")
plt.title("Evaluation Metrics")
plt.show()

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial -- *Note:* this is auto-generated and not all resources may be applicable for this tutorial:


- Dataset
- Model
- AutoML Training Job
- Batch Job
- Evaluation Job
- Cloud Storage Bucket

In [None]:
delete_dataset = True
delete_training_pipeline = True
delete_batchpredict_job = True
delete_evaluation_pipeline = True
delete_model = True
delete_endpoint = True
delete_bucket = False

dataset_type = "text"
dataset_display_name = "train-automl-happydb"
model_display_name = "train-automl-happydb"
endpoint_display_name = "train-automl-happydb"


if delete_endpoint:
    endpoints = aip.Endpoint.list(
        filter=f"display_name={endpoint_display_name}", order_by="create_time"
    )
    if endpoints:
        endpoint = endpoints[0]
        endpoint.undeploy_all()
        endpoint.delete()
        print("Deleted endpoint:", endpoint)

if delete_model:
    models = aip.Model.list(
        filter=f"display_name={model_display_name}", order_by="create_time"
    )
    if models:
        model = models[0]
        model.delete()
        print("Deleted model:", model)

if delete_dataset:
    if dataset_type == "tabular":
        datasets = aip.TabularDataset.list(
            filter=f"display_name={dataset_display_name}", order_by="create_time"
        )
        if datasets:
            dataset = datasets[0]
            dataset.delete()
            print("Deleted dataset:", dataset)

    if dataset_type == "image":
        datasets = aip.ImageDataset.list(
            filter=f"display_name={dataset_display_name}", order_by="create_time"
        )
        if datasets:
            dataset = datasets[0]
            dataset.delete()
            print("Deleted dataset:", dataset)

    if dataset_type == "text":
        datasets = aip.TextDataset.list(
            filter=f"display_name={dataset_display_name}", order_by="create_time"
        )
        if datasets:
            dataset = datasets[0]
            dataset.delete()
            print("Deleted dataset:", dataset)

    if dataset_type == "video":
        datasets = aip.VideoDataset.list(
            filter=f"display_name={dataset_display_name}", order_by="create_time"
        )
        if datasets:
            dataset = datasets[0]
            dataset.delete()
            print("Deleted dataset:", dataset)
            
if delete_training_pipeline:
    job.delete()

if delete_batchpredict_job:
    batch_prediction_job.delete()
    
if delete_evaluation_pipeline:
    evaluation_job.delete()


if delete_bucket and os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI