In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex SDK for Python: AutoML Video Classification Example


<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/sdk/SDK_AutoML_Video_Classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/sdk/SDK_AutoML_Video_Classification.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/sdk/SDK_AutoML_Video_Classification.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

## Overview

This notebook demonstrate how to create an AutoML Video Classification Model, with a Vertex AI video dataset, and how to serve the model for batch prediction. It will require you provide a bucket where the dataset will be stored.

Note: you may incur charges for training, prediction, storage or usage of other GCP products in connection with testing this SDK.

### Dataset

##### HMDB: a large human motion database
We prepared some training data and prediction data for the demo using the [HMDB Dataset](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database).

The HMDB Dataset is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit https://creativecommons.org/licenses/by/4.0/

For more information about this dataset please visit: https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/

### Objective

The objective of this notebook is to build a AutoML Video Classification Model. The following steps have been followed:  

- Create a Dataset on Vertex AI.
- Launch a Training Job and Create a Model on Vertex AI
- Perform batch Prediction Job on the Model
- Clean Up

## Costs

This tutorial uses the following billable components of Google Cloud:

- Vertex AI
- Cloud Storage


Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Set up your local development environment

**If you are using Colab or Vertex AI Workbench Notebooks**, your environment already meets
all the requirements to run this notebook. You can skip this step.

**Otherwise**, make sure your environment meets this notebook's requirements.
You need the following:

* The Google Cloud SDK
* Git
* Python 3
* virtualenv
* Jupyter notebook running in a virtual environment with Python 3

The Google Cloud guide to [Setting up a Python development
environment](https://cloud.google.com/python/setup) and the [Jupyter
installation guide](https://jupyter.org/install) provide detailed instructions
for meeting these requirements. The following steps provide a condensed set of
instructions:

1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)

1. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)

1. [Install
   virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)
   and create a virtual environment that uses Python 3. Activate the virtual environment.

1. To install Jupyter, run `pip3 install jupyter` on the
command-line in a terminal shell.

1. To launch Jupyter, run `jupyter notebook` on the command-line in a terminal shell.

1. Open this notebook in the Jupyter Notebook Dashboard.

### Install additional packages


In [1]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

In [2]:
! pip3 install {USER_FLAG} --upgrade google-cloud-aiplatform

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.16.1-py2.py3-none-any.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 6.8 MB/s eta 0:00:01
Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3
  Downloading google_cloud_resource_manager-1.6.0-py2.py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 54.0 MB/s eta 0:00:01
[?25hCollecting google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0
  Downloading google_api_core-2.8.2-py3-none-any.whl (114 kB)
[K     |████████████████████████████████| 114 kB 54.2 MB/s eta 0:00:01
Collecting googleapis-common-protos<2.0dev,>=1.56.2
  Downloading googleapis_common_protos-1.56.4-py2.py3-none-any.whl (211 kB)
[K     |████████████████████████████████| 211 kB 63.7 MB/s eta 0:00:01
Collecting grpc-google-iam-v1<1.0.0dev,>=0.12.4
  Downloading grpc_google_iam_v1-0.12.4-py2.py3-none-any.whl (26 kB)
Installing collected packages: google

### Restart the kernel

After you install the additional packages, you need to restart the notebook kernel so it can find the packages.

In [3]:
# Automatically restart kernel after installs
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI, Cloud Storage, and Compute Engine APIs](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component,storage-component.googleapis.com). 

1. [Configure your Google Cloud project for Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project).

1. If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [23]:
import os

PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Project ID:  vertex-ai-dev


Otherwise, set your project ID here.

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None:
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

#### UUID

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a uuid for each instance session, and append it onto the name of resources you create in this tutorial.

In [2]:
import random
import string


# Generate a uuid of a specifed length(default=8)
def generate_uuid(length: int = 8) -> str:
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))


UUID = generate_uuid()

### Authenticate your Google Cloud account

**If you are using Vertex AI Workbench Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "Vertex AI"
into the filter box, and select
   **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# If on Vertex AI Workbench, then don't execute this code
IS_COLAB = "google.colab" in sys.modules
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv(
    "DL_ANACONDA_HOME"
):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

Set the name of your Cloud Storage bucket below. It must be unique across all
Cloud Storage buckets.

You may also change the `REGION` variable, which is used for operations
throughout the rest of this notebook. Make sure to [choose a region where Vertex AI services are
available](https://cloud.google.com/vertex-ai/docs/general/locations#available_regions). You may
not use a Multi-Regional Storage bucket for training with Vertex AI.

In [3]:
BUCKET_URI = "gs://[your-bucket-name]"  # @param {type:"string"}
REGION = "[your-region]"  # @param {type:"string"}

In [5]:
if BUCKET_URI == "" or BUCKET_URI is None or BUCKET_URI == "gs://[your-bucket-name]":
    BUCKET_URI = "gs://" + PROJECT_ID + "aip-" + UUID

if REGION == "[your-region]":
    REGION = "us-central1"

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [6]:
! gsutil mb -l $REGION $BUCKET_URI

Creating gs://vertex-ai-devaip-l0fpag0g/...


**Finally**, validate access to your Cloud Storage bucket by examining its contents:

In [27]:
! gsutil ls -al $BUCKET_URI

                                 gs://vertex-ai-devaip-l0fpag0g/mbsdk_automl-video-training_classification/


### Import libraries and define constants


In [47]:
from google.cloud import aiplatform
from google.cloud import storage
import json

MY_PROJECT = PROJECT_ID
MY_STAGING_BUCKET = BUCKET_URI  # bucket should be in same region as ucaip

### Set Your Task Name, and GCS Prefix

If you want to centeralize all input and output files under the gcs location.

In [38]:
TASK_TYPE = "mbsdk_automl-video-training"
PREDICTION_TYPE = "classification"
MODEL_TYPE = "CLOUD"

TASK_NAME = f"{TASK_TYPE}_{PREDICTION_TYPE}"
BUCKET_NAME = MY_STAGING_BUCKET.split("gs://")[1]
GCS_PREFIX = TASK_NAME

print(f"Bucket Name:    {BUCKET_NAME}")
print(f"Task Name:      {TASK_NAME}")

Bucket Name:    vertex-ai-devaip-l0fpag0g
Task Name:      mbsdk_automl-video-training_classification


In [10]:
automl_video_demo_train_data = (
    "gs://automl-video-demo-data/hmdb_split1_5classes_all.csv"
)
automl_video_demo_batch_prediction_data = (
    "gs://automl-video-demo-data/hmdb_split1_predict.jsonl"
)

### Copy AutoML Video Demo Train Data for Creating Managed Dataset

In [11]:
gcs_source_train = f"gs://{BUCKET_NAME}/{TASK_NAME}/data/video_classification.csv"

In [12]:
!gsutil cp $automl_video_demo_train_data $gcs_source_train

Copying gs://automl-video-demo-data/hmdb_split1_5classes_all.csv [Content-Type=text/csv]...
/ [1 files][ 52.8 KiB/ 52.8 KiB]                                                
Operation completed over 1 objects/52.8 KiB.                                     


# Run AutoML Video Training with Managed Video Dataset

## Initialize Vertex SDK for Python

Initialize the *client* for Vertex AI.

In [29]:

aiplatform.init(project=MY_PROJECT, staging_bucket=MY_STAGING_BUCKET)

## Create a Dataset on Vertex AI
We will now create a Vertex AI video dataset using the previously prepared csv files. Choose one of the options below. 

Option 1: Using MBSDK VideoDataset class

In [14]:
dataset = aiplatform.VideoDataset.create(
    display_name=f"temp-{TASK_NAME}",
    gcs_source=gcs_source_train,
    import_schema_uri=aiplatform.schema.dataset.ioformat.video.classification,
    sync=False,
)

Creating VideoDataset
Create VideoDataset backing LRO: projects/931647533046/locations/us-central1/datasets/6222780615398260736/operations/2821241627354333184
VideoDataset created. Resource name: projects/931647533046/locations/us-central1/datasets/6222780615398260736
To use this VideoDataset in another session:
ds = aiplatform.VideoDataset('projects/931647533046/locations/us-central1/datasets/6222780615398260736')
Importing VideoDataset data: projects/931647533046/locations/us-central1/datasets/6222780615398260736
Import VideoDataset data backing LRO: projects/931647533046/locations/us-central1/datasets/6222780615398260736/operations/5605592096976142336


Option 2: Using MBSDK Dataset class
```
dataset = aiplatform.Dataset.create(
    display_name=f'temp-{TASK_NAME}',
    metadata_schema_uri=aiplatform.schema.dataset.metadata.video,
    gcs_source=gcs_source_train, 
    import_schema_uri=aiplatform.schema.dataset.ioformat.video.classification,
    sync=False
)
```

In [15]:
dataset.wait()

VideoDataset data imported. Resource name: projects/931647533046/locations/us-central1/datasets/6222780615398260736


## Launch a Training Job and Create a Model on Vertex AI

### Config a Training Job

In [20]:
job = aiplatform.AutoMLVideoTrainingJob(
    display_name=f"temp-{TASK_NAME}",
    prediction_type=PREDICTION_TYPE,
    model_type=MODEL_TYPE,
)

### Run the Training Job

In [21]:
model = job.run(
    dataset=dataset,
    training_fraction_split=0.1,
    test_fraction_split=0.9,
    model_display_name=f"temp-{TASK_NAME}",
    sync=False,
)

View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6896921156079583232?project=931647533046


In [None]:
model.wait()

AutoMLVideoTrainingJob projects/931647533046/locations/us-central1/trainingPipelines/6896921156079583232 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLVideoTrainingJob projects/931647533046/locations/us-central1/trainingPipelines/6896921156079583232 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLVideoTrainingJob projects/931647533046/locations/us-central1/trainingPipelines/6896921156079583232 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLVideoTrainingJob projects/931647533046/locations/us-central1/trainingPipelines/6896921156079583232 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLVideoTrainingJob projects/931647533046/locations/us-central1/trainingPipelines/6896921156079583232 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLVideoTrainingJob projects/931647533046/locations/us-central1/trainingPipelines/6896921156079583232 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLVideoTrainingJob projects/931647533046/locations/us-

# Batch Prediction Job on the Model

### Copy AutoML Video Demo Prediction Data for Creating Batch Prediction Job

In [39]:
gcs_source_batch_prediction = (
    f"gs://{BUCKET_NAME}/{TASK_NAME}/data/video_classification_batch_prediction.jsonl"
)
gcs_destination_prefix_batch_prediction = (
    f"gs://{BUCKET_NAME}/{TASK_NAME}/batch_prediction"
)

In [40]:
!gsutil cp $automl_video_demo_batch_prediction_data $gcs_source_batch_prediction

Copying gs://automl-video-demo-data/hmdb_split1_predict.jsonl [Content-Type=application/octet-stream]...
/ [1 files][  887.0 B/  887.0 B]                                                
Operation completed over 1 objects/887.0 B.                                      


In [41]:
batch_predict_job = model.batch_predict(
    job_display_name=f"temp-{TASK_NAME}",
    gcs_source=gcs_source_batch_prediction,
    gcs_destination_prefix=gcs_destination_prefix_batch_prediction,
    sync=False,
)

Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/931647533046/locations/us-central1/batchPredictionJobs/2836081667074949120
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/931647533046/locations/us-central1/batchPredictionJobs/2836081667074949120')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/2836081667074949120?project=931647533046


In [49]:
batch_predict_job.wait()
bp_iter_outputs = batch_predict_job.iter_outputs()

prediction_results = list()
for blob in bp_iter_outputs:
    if blob.name.split("/")[-1].startswith("prediction"):
        prediction_results.append(blob.name)

In [68]:
client = storage.Client()
bucket = client.get_bucket(BUCKET_URI.replace("gs://", ""))
for prediction_result in prediction_results:
    gfile_name = f"{prediction_result}"
    data = bucket.blob(gfile_name).download_as_string()
    data = json.loads(data)
    print(data)


{'instance': {'content': 'gs://automl-video-demo-data/hmdb51/35_pull_ups_pullup_f_nm_np1_fr_goo_1.mp4', 'mimeType': 'mp4', 'timeSegmentStart': '0.0s', 'timeSegmentEnd': '2.633333s'}, 'prediction': [{'id': '2649711422509940736', 'displayName': 'pullup', 'type': 'segment-classification', 'timeSegmentStart': '0s', 'timeSegmentEnd': '2.633333s', 'confidence': 0.8607105}, {'id': '6108475936330481664', 'displayName': 'ride_horse', 'type': 'segment-classification', 'timeSegmentStart': '0s', 'timeSegmentEnd': '2.633333s', 'confidence': 0.03772395}, {'id': '4955554431723634688', 'displayName': 'golf', 'type': 'segment-classification', 'timeSegmentStart': '0s', 'timeSegmentEnd': '2.633333s', 'confidence': 0.03616389}, {'id': '1496789917903093760', 'displayName': 'kick_ball', 'type': 'segment-classification', 'timeSegmentStart': '0s', 'timeSegmentEnd': '2.633333s', 'confidence': 0.033297308}, {'id': '7261397440937328640', 'displayName': 'cartwheel', 'type': 'segment-classification', 'timeSegmentS

## Clean up
<a name="section-13"></a>

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

In [None]:

# Delete the dataset using the Vertex dataset object
dataset.delete()

# Delete the model using the Vertex model object
model.delete()

# Delete the AutoML or Pipeline training job
job.delete()

# Delete the batch prediction job using the Vertex batch prediction object
batch_predict_job.delete()

# Delete the Cloud Storage bucket

delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil -m rm -r $BUCKET_URI