In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Custom training with custom container image and automatic model upload to Vertex AI Model Registry

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/custom/custom_training_container_and_model_registry.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fcustom%2Fcustom_training_container_and_model_registry.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/custom/custom_training_container_and_model_registry.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/custom/custom_training_container_and_model_registry.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This tutorial demonstrates how to use the Vertex AI SDK for Python to train using a custom container image and automatically register the model in Vertex AI Model Registry.

Learn more about [Custom training](https://cloud.google.com/vertex-ai/docs/training/custom-training).

### Objective

In this tutorial, you train a machine learning model using the custom container image approach for custom training in Vertex AI. The trained model is then registered in the Vertex AI Model Registry automatically. You can alternatively create custom models using the `gcloud` command-line tool or online using the Cloud Console.

This tutorial uses the following Vertex AI services and resources:

- Vertex AI Model Registry
- Vertex AI Training


The steps performed include:

- Create a Vertex AI custom job for training a model.
- Train and register a TensorFlow model using a custom container.
- List the registered model in the Vertex AI Model Registry.

### Dataset

The dataset used for this tutorial is the [CIFAR10 dataset](https://www.tensorflow.org/datasets/catalog/cifar10) from [TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/overview). The version of the dataset you use is built into TensorFlow. The trained model predicts which class an image is from the ten classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage
* Artifact Registry
* Cloud Build

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), [Artifact Registry pricing](https://cloud.google.com/artifact-registry/pricing), and [Cloud Build pricing](https://cloud.google.com/build/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Get started

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade google-cloud-aiplatform --quiet

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# Only on Colab (detected through the loaded `google.colab` module):
# shut the kernel down with restart=True so the new packages can be imported.
if "google.colab" in sys.modules:
    from IPython import Application

    Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Only on Colab (detected through the loaded `google.colab` module):
# run the interactive user authentication flow.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information

To run this tutorial, you must have an existing Google Cloud project. Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Google Cloud project used for the bucket, the container image path and the Vertex AI SDK.
# Replace the placeholder with your own project ID before running the notebook.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region used for the bucket, the Artifact Registry repository and the Vertex AI SDK.
LOCATION = "us-central1"  # @param {type:"string"}

### Create a Cloud Storage bucket

Create a storage bucket to store artifacts such as datasets and trained model files.

In [None]:
# Cloud Storage URI used as the Vertex AI staging bucket and as the training output directory.
# The default embeds PROJECT_ID in the bucket name; edit it to point at your own bucket.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

**If your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
! gsutil mb -l $LOCATION -p $PROJECT_ID $BUCKET_URI

### Initialize Vertex AI SDK for Python

To get started using Vertex AI, you must enable the [Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [None]:
from google.cloud import aiplatform

# Point the SDK at the project, region and staging bucket configured earlier
aiplatform.init(
    staging_bucket=BUCKET_URI,
    location=LOCATION,
    project=PROJECT_ID,
)

### Set hardware accelerators

You can set hardware accelerators for training and prediction.

Set the variables `TRAIN_GPU/TRAIN_NGPU` to use a container image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance. For example, to use a GPU container image with 4 Nvidia Tesla T4 GPUs allocated to each VM, you would specify:

    (aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_T4, 4)


Otherwise specify `(None, None)` to use a container image to run on a CPU.

Learn more about [hardware accelerator support](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators) for your location.

In [None]:
# Accelerator type and accelerator count for training.
# Leaving both as None selects CPU-only training.
TRAIN_GPU = None
TRAIN_NGPU = None

### Set pre-built container image

Set the pre-built Docker container image for prediction.

Set the variable `TF` to the TensorFlow version of the container image. Replace dot with hyphen in the version number for specifying the image version. For example, `2-1` indicates version 2.1, and `1-15` indicates version 1.15. 

For the latest list of pre-built images, see [Pre-built containers for prediction](https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers).

In [None]:
# TensorFlow version of the serving image, written with a hyphen instead of a dot
TF = "2-7"

# Build the serving image path; the registry host prefix is the part of
# LOCATION before the first hyphen (e.g. "us" for "us-central1")
DEPLOY_VERSION = f"tf2-cpu.{TF}"
DEPLOY_IMAGE = f"{LOCATION.split('-')[0]}-docker.pkg.dev/vertex-ai/prediction/{DEPLOY_VERSION}:latest"

print(f"Deployment: {DEPLOY_IMAGE}")

### Set machine type

Next, set the machine type to use for training and prediction.

 - Set the variables `TRAIN_COMPUTE` and `DEPLOY_COMPUTE` to configure the compute of the VMs used for training and prediction respectively.
 - Set `machine type`
     - `n1-standard`: 3.75GB of memory per vCPU.
     - `n1-highmem`: 6.5GB of memory per vCPU
     - `n1-highcpu`: 0.9 GB of memory per vCPU
 - `vCPUs`: number of \[2, 4, 8, 16, 32, 64, 96 \]

**Note**: The following isn't supported for training:

 - `standard`: 2 vCPUs
 - `highcpu`: 2, 4 and 8 vCPUs

**Note**: You may also use n2 and e2 machine types for training and deployment, but they don't support GPUs.

In [None]:
# Machine type of the VM used for training
TRAIN_COMPUTE = "n1-standard-4"
print(f"Train machine type {TRAIN_COMPUTE}")

## Create the custom container image for training

Now, you're ready for training your custom model on CIFAR10 data. There are two ways you can train a custom model using a container image:

- **Use a Google Cloud prebuilt container**. If you use a prebuilt container, you provide a Python package that runs your training code inside the pre-built container.

- **Use your own custom container image**. If you use your own container, you need to build a container image that includes your code for training a custom model.

### Create a training folder

In this tutorial, you train a CIFAR10 model using your own custom container image. To run the training in your container, you create a Python training script that trains the model.

First, create a directory for storing the training scripts and other components. Then, create a subdirectory (`trainer/`) for storing your training scripts separately. This subdirectory should include an `__init__.py` to make it a module. Learn more about defining [Python modules](https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container#python-modules).

Eventually, your directory structure looks like:

```
- custom/
    - Dockerfile
    - trainer/
        - __init__.py
        - task.py
```

In [None]:
# Set the name of your app directory
APPLICATION_DIR = "custom"
# Remove if there's any such folder already
! rm -rf $APPLICATION_DIR
# Create your app directory
! mkdir $APPLICATION_DIR
# Create a subdirectory for store the training scripts
! mkdir $APPLICATION_DIR/trainer
# Create the init file
! touch $APPLICATION_DIR/trainer/__init__.py

### Create the training script

In the next cell, define your training script `task.py` inside your training folder. 

In the training script, you perform the following steps sequentially:

1. Load CIFAR10 dataset from TF Datasets (tfds).
1. Build a model using TF.Keras model API.
1. Compile the model (`compile()`).
1. Set a training distribution strategy according to the argument `args.distribute`.
1. Train the model (`fit()`) with epochs and steps according to the arguments `args.epochs` and `args.steps`
1. Save the trained model to the specified base output directory which is accessed by the environment variable `AIP_MODEL_DIR`.

In [None]:
%%writefile $APPLICATION_DIR/trainer/task.py
# Single, Mirror and Multi-Machine Distributed Training for CIFAR-10

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.python.client import device_lib
import argparse
import os
import sys
tfds.disable_progress_bar()

# Command-line interface of the training script
parser = argparse.ArgumentParser()

parser.add_argument('--lr', dest='lr',
                    default=0.01, type=float,
                    help='Learning rate.')
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
parser.add_argument('--steps', dest='steps',
                    default=200, type=int,
                    help='Number of steps per epoch.')
# Only the three strategies handled further down are accepted, so a typo is
# rejected at parse time instead of surfacing later as an undefined `strategy`.
parser.add_argument('--distribute', dest='distribute', type=str, default='single',
                    choices=['single', 'mirror', 'multi'],
                    help='distributed training strategy')
args = parser.parse_args()

# Log the runtime environment to make the training logs self-describing
print('Python Version = {}'.format(sys.version))
print('TensorFlow Version = {}'.format(tf.__version__))
print('TF_CONFIG = {}'.format(os.environ.get('TF_CONFIG', 'Not found')))
print('DEVICES', device_lib.list_local_devices())

# Single Machine, single compute device
if args.distribute == 'single':
    # `tf.test.is_gpu_available()` is deprecated; listing the physical GPU
    # devices is the documented replacement.
    if tf.config.list_physical_devices('GPU'):
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
# Single Machine, multiple compute device
elif args.distribute == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
# Multiple Machine, multiple compute device
elif args.distribute == 'multi':
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
# Anything else would leave `strategy` undefined, so fail with a clear message
else:
    raise ValueError(
        "Unsupported --distribute value {!r}; expected 'single', 'mirror' or 'multi'".format(
            args.distribute))

# Multi-worker configuration
print('num_replicas_in_sync = {}'.format(strategy.num_replicas_in_sync))

# Preparing dataset
BUFFER_SIZE = 10000  # shuffle buffer size
BATCH_SIZE = 64  # per-replica batch size; scaled to the global batch size before batching

def make_datasets_unbatched():
  """Return the CIFAR10 training split, scaled, cached, shuffled and repeated.

  The returned dataset is not batched; the caller applies `.batch()`.
  """

  # Scaling CIFAR10 data from [0, 255] to [0., 1.]
  def scale(image, label):
    image = tf.cast(image, tf.float32)
    image /= 255.0
    return image, label

  # The dataset metadata is never used, so only the splits are requested
  datasets = tfds.load(name='cifar10', as_supervised=True)
  return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE).repeat()


# Build the Keras model
def build_and_compile_cnn_model():
  """Create the CIFAR10 classifier and compile it for training.

  The network stacks two Conv2D/MaxPooling2D stages on 32x32x3 inputs,
  flattens the result and ends in a 10-unit softmax layer. It is compiled
  with SGD at the learning rate given by `--lr`, sparse categorical
  cross-entropy loss and the accuracy metric.

  Returns:
    The compiled `tf.keras.Sequential` model.
  """
  cnn_layers = [
      tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(32, 32, 3)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Conv2D(32, 3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(10, activation='softmax'),
  ]
  cnn = tf.keras.Sequential(cnn_layers)

  sgd = tf.keras.optimizers.SGD(learning_rate=args.lr)
  cnn.compile(
      optimizer=sgd,
      loss=tf.keras.losses.sparse_categorical_crossentropy,
      metrics=['accuracy'])
  return cnn


# Train the model
NUM_WORKERS = strategy.num_replicas_in_sync
# Here the batch size scales up by number of workers since
# `tf.data.Dataset.batch` expects the global batch size.
GLOBAL_BATCH_SIZE = BATCH_SIZE * NUM_WORKERS

# Resolve the output location before training, so that a missing
# AIP_MODEL_DIR fails immediately instead of after the whole training run.
MODEL_DIR = os.getenv("AIP_MODEL_DIR")
if not MODEL_DIR:
    raise RuntimeError(
        "AIP_MODEL_DIR is not set; it must point to the directory where the model is saved.")

# The dataset is created outside the strategy scope and batched with the global batch size
train_dataset = make_datasets_unbatched().batch(GLOBAL_BATCH_SIZE)

with strategy.scope():
  # Model building/compiling need to be within `strategy.scope()`.
  model = build_and_compile_cnn_model()

model.fit(x=train_dataset, epochs=args.epochs, steps_per_epoch=args.steps)
model.save(MODEL_DIR)
print("Save the model to", MODEL_DIR)

### Write the contents of Dockerfile

To containerize your code, you need to create a Dockerfile. In the Dockerfile you define all the steps needed to run your container. These steps include:

1. Fetch a pre-built TensorFlow base image from the repository for deep learning images.
2. Copy in the Python training code to the container.
3. Set the entry into the Python training script as `trainer/task.py`. 

**Note**: The extension `.py` is dropped for `task.py` in the ENTRYPOINT command, as it is implied.

In [None]:
%%writefile $APPLICATION_DIR/Dockerfile
# Fetch the base image
# NOTE(review): the image name suggests a CPU-only TensorFlow 2.3 image, while the
# notebook configures a TF 2-7 serving image and optional GPUs — confirm this is intended.
FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-3

# Set the working dir for the rest of the commands
WORKDIR /

# Copies the trainer code to the docker image.
COPY trainer /trainer

# Sets up the entry point to invoke the trainer.
# The script runs as the module `trainer.task`; arguments supplied to the
# container are appended to this command.
ENTRYPOINT ["python", "-m", "trainer.task"]

### Enable Artifact Registry API

To use your container image with Vertex AI, you need to upload your image to the Artifact Registry. Before you can push your image, you must enable the Artifact Registry API service for your project.

Learn more about [enabling the Artifact Registry service](https://cloud.google.com/artifact-registry/docs/enable-service).

In [None]:
! gcloud services enable artifactregistry.googleapis.com

### Create a repository in Artifact Registry

In [None]:
import os

# Set the repository name
REPO_NAME='custom-train-cifar10'

if os.getenv("IS_TESTING"):
    ! sudo apt-get update --yes && sudo apt-get --only-upgrade --yes install google-cloud-sdk-cloud-run-proxy google-cloud-sdk-harbourbridge google-cloud-sdk-cbt google-cloud-sdk-gke-gcloud-auth-plugin google-cloud-sdk-kpt google-cloud-sdk-local-extract google-cloud-sdk-minikube google-cloud-sdk-app-engine-java google-cloud-sdk-app-engine-go google-cloud-sdk-app-engine-python google-cloud-sdk-spanner-emulator google-cloud-sdk-bigtable-emulator google-cloud-sdk-nomos google-cloud-sdk-package-go-module google-cloud-sdk-firestore-emulator kubectl google-cloud-sdk-datastore-emulator google-cloud-sdk-app-engine-python-extras google-cloud-sdk-cloud-build-local google-cloud-sdk-kubectl-oidc google-cloud-sdk-anthos-auth google-cloud-sdk-app-engine-grpc google-cloud-sdk-pubsub-emulator google-cloud-sdk-datalab google-cloud-sdk-skaffold google-cloud-sdk google-cloud-sdk-terraform-tools google-cloud-sdk-config-connector
    ! gcloud components update --quiet

# Create a repository in the Artifact Registry
!gcloud artifacts repositories create $REPO_NAME --repository-format=docker \
--location=$LOCATION --description="Docker repository"

### Configure authentication to your private repo

Before you push or pull container images, configure Docker to use the `gcloud` command-line tool to authenticate requests to Artifact Registry for your region.

In [None]:
# Let Docker authenticate through gcloud for this region's Artifact Registry host;
# --quiet suppresses the interactive confirmation prompt
! gcloud auth configure-docker {LOCATION}-docker.pkg.dev --quiet

### Push your container image to Artifact Registry

Specify the image path in the repository as `IMAGE_URI` and use the image path as a tag with the `docker build` command. Then, use the `docker push` command to push your container image to Artifact Registry. Artifact Registry creates the image in the repository based on the tag specified when you build the image.

**Note**: As Docker is currently not fully supported in Colab, you use Cloud Build to push your container image to Artifact Registry. Learn more about [`gcloud builds submit`](https://cloud.google.com/sdk/gcloud/reference/builds/submit) command.

In [None]:
import sys

IS_COLAB = "google.colab" in sys.modules

# Specify the image uri for Artifact Registry
IMAGE_URI = f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{REPO_NAME}/cifar10:latest"

# If the env isn't Colab, run the docker commands
if not IS_COLAB:
    ! docker build -f $APPLICATION_DIR/Dockerfile -t $IMAGE_URI custom
    ! docker push $IMAGE_URI
# If using Colab, run the Cloud Build command
else:
    ! gcloud builds submit {APPLICATION_DIR}/ --region={LOCATION} --tag={IMAGE_URI}

## Create a custom training job

A custom training job is created with the `CustomContainerTrainingJob` class, with the following parameters:

- `display_name`: The human readable name for the custom training job.
- `container_uri`: The training container image.
- `model_serving_container_image_uri`: The container image uri for deploying the model.

In [None]:
# Create the custom container training job
job = aiplatform.CustomContainerTrainingJob(
    display_name="cifar10-training",
    container_uri=IMAGE_URI,
    model_serving_container_image_uri=DEPLOY_IMAGE,
)

print(job)

### Set the command-line arguments for training script

In this step, you define the command-line arguments for running your training script. 

In this example, you pass the following two arguments:

  - `epochs`: The number of epochs for training as defined in *EPOCHS*.
  - `steps`: The number of steps per epoch as defined in *STEPS*.
  
Refer to the defined training script for more supported arguments.

In [None]:
# Number of training epochs
EPOCHS = 1
# Number of steps per epoch
STEPS = 1

# Command-line arguments passed to the training script
CMDARGS = [f"--epochs={EPOCHS}", f"--steps={STEPS}"]

### Run the custom training job

Next, you run the custom job using the `run()` method, with the following parameters:

- `args`: The command-line arguments to pass to the training script.
- `replica_count`: The number of compute instances for training (replica_count = 1 is single node training).
- `machine_type`: The machine type for the compute instances.
- `accelerator_type`: The type of hardware accelerator to be used.
- `accelerator_count`: The number of accelerators to attach to a worker replica.
- `base_output_dir`: The Cloud Storage location where the model artifacts need to be saved.
- `model_display_name`: A human readable name for the registered model.
- `sync`: Whether to block until completion of the job (Boolean).

In [None]:
# Arguments shared by the CPU and the GPU run
run_kwargs = dict(
    args=CMDARGS,
    replica_count=1,
    machine_type=TRAIN_COMPUTE,
    base_output_dir=BUCKET_URI,
    model_display_name="cifar10",
    sync=True,
)

# Request accelerators only when a GPU type was selected; otherwise the
# accelerator arguments are omitted and the job runs on CPU
if TRAIN_GPU:
    run_kwargs["accelerator_type"] = TRAIN_GPU.name
    run_kwargs["accelerator_count"] = TRAIN_NGPU

# Run the training; with sync=True the call blocks until the job completes
model = job.run(**run_kwargs)

### View the model in the Model Registry

The `run()` method returns a Vertex AI model resource which indicates that your model is successfully registered in the Model Registry.

In [None]:
# Print the underlying resource of the model returned by `job.run()`
print(model.gca_resource)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial.

In [None]:
# Delete the artifact registry repo
! gcloud artifacts repositories delete $REPO_NAME --location $LOCATION --quiet

# Delete the custom training job
job.delete()

# Delete model
model.delete()

# Delete the Cloud Storage bucket
delete_bucket = True
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_URI

# Delete application directory
!rm -rf $APPLICATION_DIR