In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with Vertex AI Data Labeling

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/datasets/get_started_with_data_labeling.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/datasets/get_started_with_data_labeling.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/datasets/get_started_with_data_labeling.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI in production. This tutorial covers data management: getting started with the Vertex AI Data Labeling service.

Learn more about [Vertex AI Data Labeling](https://cloud.google.com/vertex-ai/docs/datasets/data-labeling-job).

### Objective

In this tutorial, you learn how to use the `Vertex AI Data Labeling` service.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI Data Labeling`
- `Vertex AI Dataset`

The steps performed include:

- Create a Specialist Pool for data labelers.
- Create a data labeling job.
- Submit the data labeling job.
- List data labeling jobs.
- Cancel a data labeling job.

Learn more about [Request a Vertex AI Data Labeling job](https://cloud.google.com/vertex-ai/docs/datasets/data-labeling-job).

### Dataset

The dataset used for this tutorial is the [Flowers dataset](https://www.tensorflow.org/datasets/catalog/tf_flowers) from [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/overview). The version of the dataset you use in this tutorial is stored in a public Cloud Storage bucket. Each image in the dataset shows one of five types of flower: daisy, dandelion, rose, sunflower, or tulip.

### Costs 


This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage


Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Installation

Install the following packages to execute this notebook.

In [None]:
import os

# Install (or upgrade) the Vertex AI SDK and the Cloud Storage client library
# that the rest of this notebook imports.
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-storage 

# When the IS_TESTING environment variable is set, additionally install a
# pinned google-api-core release.
# NOTE(review): presumably pinned for the automated test environment — confirm.
if os.getenv("IS_TESTING"):
    ! pip3 install --upgrade --quiet google-api-core==2.10 

### Colab only: Uncomment the following cell to restart the kernel

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Replace the placeholder with your own Google Cloud project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Set the project ID in the gcloud configuration.
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used when initializing the Vertex AI SDK and building the service
# endpoint and parent resource path later in this notebook.
REGION = "us-central1"  # @param {type: "string"}

#### Email

You need an email address to send the labeling job request to. This email address becomes the manager of the data labeling specialist pool.

In this tutorial, if you don't specify an email address, the email address of the account you are authenticated with in `gcloud` is used.

In [None]:
# Email address of the specialist pool manager. Leave the placeholder
# unchanged to have the next cell look the address up with gcloud.
EMAIL = "[your-email-address]"  # @param {type: "string"}

# When the IS_TESTING environment variable is set, use a fixed address instead.
if os.getenv("IS_TESTING"):
    EMAIL = "noreply@google.com"

In [None]:
if EMAIL == "[your-email-address]":
    shell_output = ! gcloud auth list 2>/dev/null
    EMAIL = shell_output[2].replace("*", "").strip()

print(EMAIL)

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Cloud Storage bucket used by this tutorial; the project ID is embedded in
# the default name.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in the selected region (skip this cell if it already exists).
! gsutil mb -l $REGION $BUCKET_URI

### Set up variables

Next, set up some variables used throughout the tutorial.

### Import libraries and define constants


#### Import Vertex AI SDK

Import the Vertex AI SDK into our Python environment.


In [None]:
import os
import time

import google.cloud.aiplatform as aip
from google.cloud import storage
from google.cloud.aiplatform import gapic
from google.protobuf.json_format import ParseDict
from google.protobuf.struct_pb2 import Value

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and region.

In [None]:
# Initialize the SDK with the project and region chosen earlier in the notebook.
aip.init(project=PROJECT_ID, location=REGION)

#### Vertex AI constants

Set up the following constants for Vertex AI:

- `API_ENDPOINT`: The Vertex AI API service endpoint for dataset, model, job, pipeline and endpoint services.
- `PARENT`: The Vertex AI location root path for dataset, model and endpoint resources.

In [None]:
# Regional service endpoint that the Vertex AI clients connect to
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

# Location root path under which the project's Vertex AI resources live
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

#### Schema constants

Next, set up constants for the schemas related to image classification datasets:

- Data Labeling (Annotations) Schemas: Tells the managed dataset service how the data is labeled (annotated).

In [None]:
# Inputs schema for an image classification data labeling job
LABELING_SCHEMA_IMAGE = (
    "gs://google-cloud-aiplatform/schema/datalabelingjob/inputs/"
    "image_classification_1.0.0.yaml"
)

## Create clients

The Vertex AI SDK works as a client/server model. On your side (the Python script) you create a client that sends requests and receives responses from the server (Vertex AI).

You use several clients in this tutorial, so set them all up upfront.

- Specialist pool service for specialist pools
- Job Service for data labeling


In [None]:
# Every service client is pointed at the same regional API endpoint.
client_options = {"api_endpoint": API_ENDPOINT}

# Clients keyed by the service they talk to: the job service (data labeling
# jobs) and the specialist pool service.
clients = {
    "job": gapic.JobServiceClient(client_options=client_options),
    "specialist_pool": gapic.SpecialistPoolServiceClient(
        client_options=client_options
    ),
}

# Show each (key, client) pair that was created.
for client in clients.items():
    print(client)

### Create a CSV file for examples to label

Next, you will create a CSV file for the examples you are requesting to be labeled. 

In this example, the examples to label are images. For each row in the CSV file, you specify the Cloud Storage location of the image to label.

In [None]:
test_filename = "labeling.csv"
LABELING_FILES = [
    "gs://cloud-samples-data/vision/automl_classification/flowers/daisy/100080576_f52e8ee070_n.jpg",
    "gs://cloud-samples-data/vision/automl_classification/flowers/daisy/102841525_bd6628ae3c.jpg",
]

IMPORT_FILE = BUCKET_URI + "/labeling.csv"

bucket = storage.Client(project=PROJECT_ID).bucket(BUCKET_URI.replace("gs://", ""))

# creating a blob
blob = bucket.blob(blob_name=test_filename)

# creating data variable
data = LABELING_FILES[0] + "\n" + LABELING_FILES[1] + "\n"

# uploading data variable content to bucket
blob.upload_from_string(data, content_type="text/csv")

# printing path of uploaded file
print(IMPORT_FILE)

# printing content of uploaded file
! gsutil cat $IMPORT_FILE

## Create an unlabeled dataset

Next, you create a dataset for the data to be labeled.

In [None]:
# Create an empty image dataset whose display name is "labeling".
dataset = aip.ImageDataset.create(display_name="labeling")
print(dataset)

## Import the unlabeled data

Now, import the unlabeled data to the dataset, i.e., the examples to be labeled.

In [None]:
# Import the examples listed in the uploaded CSV file into the dataset, using
# the SDK's single-label image classification import schema.
dataset.import_data(
    gcs_source=[IMPORT_FILE],
    import_schema_uri=aip.schema.dataset.ioformat.image.single_label_classification,
)

## Create a new data specialist pool

Your data labeling job will be sent to a data specialist pool. You may have one or more specialist pools. 

In this next step, you create a new specialist pool with the method `create_specialist_pool()`. The request includes the parameters:

- `name`: The resource name of the specialist pool.
- `display_name`: A human readable name for the specialist pool.
- `specialist_manager_emails`: A list of the email addresses of the manager(s) for the specialist pool.

*Note:* You can use an existing specialist pool if one already exists.

In [None]:
# Specification of the pool to create; EMAIL is registered as its manager.
specialist_pool = dict(
    name="labeling",
    display_name="labeling",
    specialist_manager_emails=[EMAIL],
)

# Submit the create call; the returned object is waited on below.
request = clients["specialist_pool"].create_specialist_pool(
    parent=PARENT,
    specialist_pool=specialist_pool,
)

# Block until the pool has been created, then show it.
result = request.result()
print(result)

# Resource name of the new pool, and its trailing path segment (the pool ID).
specialist_name = result.name
specialist_id = specialist_name.rsplit("/", 1)[-1]

print(specialist_name)

## Create data labeling job

Now that you have a specialist pool, you can send a data labeling request using the `create_data_labeling_job()` method.

Your request will consist of the following:

- The Vertex AI Dataset with the unlabeled data.
- Instructions for labeling.

In [None]:
# create placeholder file for instructions for data labeling
! echo "this is instruction" >> instruction.txt | gsutil cp instruction.txt $BUCKET_URI

In [None]:
# Inputs schema for the job and the Cloud Storage location of the labeler
# instructions uploaded in the previous cell.
LABELING_SCHEMA = LABELING_SCHEMA_IMAGE
INSTRUCTION_FILE = BUCKET_URI + "/instruction.txt"

# Job inputs, converted to a protobuf Value: the label(s) labelers can assign.
inputs = ParseDict({"annotation_specs": ["rose"]}, Value())

# Specification of the data labeling job: which dataset to label, how many
# labelers per item, the instructions, and the specialist pool to use.
data_labeling_job = {
    "display_name": "labeling",
    "datasets": [dataset.resource_name],
    "labeler_count": 1,
    "instruction_uri": INSTRUCTION_FILE,
    "inputs_schema_uri": LABELING_SCHEMA,
    "inputs": inputs,
    "annotation_labels": {
        "aiplatform.googleapis.com/annotation_set_name": "data_labeling_job_specialist_pool"
    },
    "specialist_pools": [specialist_name],
}

print(data_labeling_job)

# NOTE: despite its name, `request` holds the value returned by the create call.
request = clients["job"].create_data_labeling_job(
    parent=PARENT, data_labeling_job=data_labeling_job
)

print(request)

# Resource name of the job, used by the get/cancel/delete calls that follow.
labeling_task_name = request.name

print(labeling_task_name)

### Get a data labeling job

You can get information on your data labeling job using the `get_data_labeling_job()` method, with the following parameters:

- `name`: The name of the labeling task.

In [None]:
# Look up the labeling job by its resource name and show what the service
# returns. NOTE: despite its name, `request` holds the returned value.
request = clients["job"].get_data_labeling_job(name=labeling_task_name)
print(request)

### Cancel a data labeling task

You can cancel a data labeling request using the `cancel_data_labeling_job()` method, with the following parameters:

- `name`: The name of the labeling task.

In [None]:
# Ask the service to cancel the labeling job identified by its resource name.
# NOTE(review): the cancel call presumably returns nothing, in which case the
# print below shows `None` — confirm against the client library reference.
request = clients["job"].cancel_data_labeling_job(name=labeling_task_name)
print(request)

### Wait for labeling job to be canceled

The cancel request is asynchronous. The code below polls the labeling job status until the status is CANCELED.

In [None]:
# Poll the labeling job once a minute until it reaches a final state.
while True:
    response = clients["job"].get_data_labeling_job(name=labeling_task_name)
    if response.state == gapic.JobState.JOB_STATE_CANCELLED:
        print("Labeling job CANCELED")
        break
    if response.state in (
        gapic.JobState.JOB_STATE_SUCCEEDED,
        gapic.JobState.JOB_STATE_FAILED,
    ):
        # The job finished before the cancellation took effect; it will never
        # report CANCELLED, so stop instead of polling forever.
        print("Labeling job ended without being canceled:", response.state)
        break
    print("Canceling labeling job:", response.state)
    time.sleep(60)

# Cleaning up

To clean up all GCP resources used in this project, you can [delete the GCP
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.


In [None]:
# Set this to true only if you'd like to delete your bucket
delete_bucket = False

# Delete the dataset using the Vertex AI fully qualified identifier for the dataset
dataset.delete()

# Delete the labeling job using the Vertex AI fully qualified identifier for the dataset
request = clients["job"].delete_data_labeling_job(name=labeling_task_name)

# Delete the specialist pool using the Vertex AI fully qualified identifier for the dataset
clients["specialist_pool"].delete_specialist_pool(name=specialist_name)

# Delete the bucket created
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI