In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Model Garden - TIMM

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_timm.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_timm.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/model_garden/model_garden_pytorch_timm.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
Open in Vertex AI Workbench
    </a>
    (a Python-3 CPU notebook is recommended)
  </td>
</table>

## Overview

This notebook demonstrates running local inference using the [timm](https://github.com/rwightman/pytorch-image-models) library, finetuning the PyTorch [timm models](https://github.com/huggingface/pytorch-image-models#models), and deploying the models on [Vertex AI](https://cloud.google.com/vertex-ai).

### Objective

- Set up the environment.
- Run inference locally using the timm library.
- Create a custom training job on Vertex AI to train or finetune a model.
- Deploy the model on Vertex AI for online prediction.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Setup environment

### Setup cloud project

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project). Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

1. [Enable Artifact Registry](https://cloud.google.com/artifact-registry/docs/enable-service) and [create a repository](https://cloud.google.com/artifact-registry/docs/repositories/create-repos) for storing docker images.

1. [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs.

1. [Enable the Vertex AI API and Compute Engine API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component).

1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console) with the `Vertex AI User` and `Storage Object Admin` roles for deploying the fine-tuned model to a Vertex AI endpoint.

### Setup required libraries

It's highly recommended to run this notebook on [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench), where you don't need to manually install any additional libraries.

If you are running this notebook locally, you will need to install the [Cloud SDK](https://cloud.google.com/sdk) and [gsutil](https://cloud.google.com/storage/docs/gsutil_install).

### Install libraries

In [None]:
! pip3 install timm

### Colab Only
Run the following commands if you are using Colab; skip this section if you are using Workbench.

In [None]:
if "google.colab" in str(get_ipython()):
    ! pip3 install --upgrade google-cloud-aiplatform

    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

    from google.colab import auth as google_auth

    google_auth.authenticate_user()

### Setup environment variables

This notebook supports models in https://huggingface.co/docs/timm/models.

You can also run
`python -c "from timm import list_models; print(list_models(pretrained=True))"`
locally to see all pretrained models.

The following models have been manually verified to work with this notebook:

* vit_tiny_patch16_224
* beit_base_patch16_224
* deit3_small_patch16_224
* efficientnet_b2
* mobilenetv2_100
* resnet50
* resnest50d
* convnext_base
* cspdarknet53
* inception_v4

In [None]:
# Notebook-wide settings. Each `# @param` line is a form field; fill in the
# empty values before running the cells below.

# The cloud project id. Passed to aiplatform.init() together with REGION.
PROJECT_ID = ""  # @param {type:"string"}
# The region for running jobs.
REGION = "us-central1"  # @param {type:"string"}

# The model you want to train and serve. Please select a model from the verified model list above.
# We use a ViT model as the example. The same name is used for local inference
# (timm.create_model), for the training arguments and for the serving container.
MODEL_NAME = "vit_tiny_patch16_224"  # @param {type:"string"}

# The Cloud Storage bucket name without gs:// prefix for training outputs.
# For example: test_bucket
# Used as the staging bucket and as the root of the training output path
# (<bucket>/timm).
GCS_BUCKET = ""  # @param {type:"string"}

# The service account for deploying the fine-tuned model. It looks like:
# '<account_name>@<project>.iam.gserviceaccount.com'
# Follow the last step ("Create a service account") of "Setup cloud project"
# above to create this account.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

## Run local inference

This section runs local inference on an image using the model chosen in the section above.

### Import libraries


In [None]:
import urllib

import timm
import torch
from PIL import Image
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

### Load a pretrained model

In [None]:
# Build the architecture selected by MODEL_NAME with pretrained weights and
# switch it to evaluation mode before running inference.
model = timm.create_model(model_name=MODEL_NAME, pretrained=True)
model.eval()

### Load and preprocess the image

In [None]:
config = resolve_data_config({}, model=model)
transform = create_transform(**config)

# The example downloads a test image. You can upload and use your own images
# by changing IMAGE_FILENAME.
! wget https://github.com/pytorch/hub/raw/master/images/dog.jpg -O test.jpg
IMAGE_FILENAME = "test.jpg"  # @param {type:"string"}

# You can also copy over images stored in a GCS bucket with the line below.
# ! gsutil cp "gs://path/to/image" "test.jpg"

img = Image.open(IMAGE_FILENAME).convert("RGB")
tensor = transform(img).unsqueeze(0)  # transform and add batch dimension
display(img)

### Get the model predictions

In [None]:
# Forward pass without gradient tracking, then turn the outputs for the first
# (and only) batch element into probabilities.
with torch.no_grad():
    logits = model(tensor)
probabilities = logits[0].softmax(dim=0)
print(probabilities.shape)

### Get the top-5 predictions class names

In [None]:
# Get imagenet class mappings
url, filename = (
    "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt",
    "imagenet_classes.txt",
)
urllib.request.urlretrieve(url, filename)
with open("imagenet_classes.txt") as f:
    categories = [s.strip() for s in f.readlines()]

### Print top categories per image

In [None]:
# Select the five most probable classes and print one
# "<class name> <probability>" line per class, for example:
#   Samoyed 0.6425196528434753
#   Pomeranian 0.04062102362513542
# (the exact classes and values depend on the model and the image).
top5_prob, top5_catid = torch.topk(probabilities, 5)
for prob, catid in zip(top5_prob, top5_catid):
    print(categories[catid], prob.item())

## Run training jobs

This section runs a regular training job or a hyperparameter tuning job on Vertex AI.

Before creating a training job, you need to prepare the dataset for training and evaluation.

For example, you can use [ImageNet-1K](https://huggingface.co/datasets/imagenet-1k) held on a Cloud Storage bucket as the input dataset.

In [None]:
# The prebuilt training docker uri.
TRAIN_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-timm-train"
)

# The path to data directory on Cloud Storage without gs:// prefix.
# In the form of: <bucket-name>/path-to-data
GCS_DATA_DIR = ""  # @param {type:"string"}

### Create a training job on Vertex AI

This section creates a training job on Vertex AI. If you want to create a hyperparameter tuning job instead, you can skip to the next section.

In [None]:
from google.cloud import aiplatform

# Init common setup.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=GCS_BUCKET)

# Input and output paths handed to train.py (dataset directory as the
# positional argument, checkpoints/logs via --output).
data_dir = f"/gcs/{GCS_DATA_DIR}"
output_dir = f"/gcs/{GCS_BUCKET}/timm"

# Worker pool spec.
# Single node with multiple GPUs.
machine_type = "n1-highmem-32"
num_nodes = 1
gpu_type = "NVIDIA_TESLA_P100"  # @param {type:"string"}
num_gpus = 4  # @param {type:"integer"}

# Model specific config.
job_name = f"pytorch-{MODEL_NAME}"
batch_size = 32
epochs = 2

job = aiplatform.CustomContainerTrainingJob(
    display_name=job_name,
    container_uri=TRAIN_DOCKER_URI,
)
# The result of run() is deliberately NOT bound to `model`: no serving
# container is configured on this job, so it does not produce a Vertex AI
# Model, and rebinding `model` here would overwrite the timm model used by
# the local-inference cells above.
# The leading flags (--standalone/--nnodes/--nproc_per_node) are presumably
# consumed by a torchrun-style launcher inside the container -- confirm
# against the training image.
job.run(
    args=[
        "--standalone",
        f"--nnodes={num_nodes}",
        f"--nproc_per_node={num_gpus}",
        "train.py",
        data_dir,
        f"--model={MODEL_NAME}",
        "--pretrained",
        f"--output={output_dir}",
        f"--batch-size={batch_size}",
        f"--epochs={epochs}",
    ],
    replica_count=num_nodes,
    machine_type=machine_type,
    accelerator_type=gpu_type,
    accelerator_count=num_gpus,
)

### Create a hyperparameter tuning job on Vertex AI

You can use a [hyperparameter tuning](https://cloud.google.com/vertex-ai/docs/training/hyperparameter-tuning-overview) job to find the best configuration of your hyperparameters.

You can skip this section if you already trained a model in the previous section and do not want to tune the hyperparameters.

In [None]:
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Common SDK setup: project, region and staging bucket.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=GCS_BUCKET)

# Paths handed to train.py: dataset directory and output directory.
data_dir = f"/gcs/{GCS_DATA_DIR}"
output_dir = f"/gcs/{GCS_BUCKET}/timm"

# Job name and fixed (non-tuned) training settings.
job_name = f"pytorch-hp-{MODEL_NAME}"
batch_size = 32
epochs = 2

# Hardware used by every trial: a single node with multiple GPUs.
machine_type = "n1-highmem-16"
num_nodes = 1
gpu_type = "NVIDIA_TESLA_V100"  # @param {type:"string"}
num_gpus = 2  # @param {type:"integer"}

machine_spec = {
    "machine_type": machine_type,
    "accelerator_type": gpu_type,
    "accelerator_count": num_gpus,
}

# Arguments passed to the training container for each trial.
train_args = [
    "--standalone",
    f"--nnodes={num_nodes}",
    f"--nproc_per_node={num_gpus}",
    "train.py",
    data_dir,
    f"--model={MODEL_NAME}",
    "--pretrained",
    f"--output={output_dir}",
    f"--batch-size={batch_size}",
    f"--epochs={epochs}",
]

worker_pool_specs = [
    {
        "machine_spec": machine_spec,
        "replica_count": num_nodes,
        "container_spec": {
            "image_uri": TRAIN_DOCKER_URI,
            "args": train_args,
        },
    }
]

# Tuning objective and search space: maximize top-1 accuracy over a
# log-scaled learning-rate range.
metric_spec = {"top1_accuracy": "maximize"}
parameter_spec = {
    "lr": hpt.DoubleParameterSpec(min=0.001, max=0.05, scale="log"),
}
max_trial_count = 2
parallel_trial_count = 2

# Wrap the worker pool in a custom job and launch the tuning job on top of it.
training_job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)
hp_job = aiplatform.HyperparameterTuningJob(
    display_name=job_name,
    custom_job=training_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=max_trial_count,
    parallel_trial_count=parallel_trial_count,
)
hp_job.run()

## Deploy model for online prediction

This section uploads the model to Model Registry and deploys it on an Endpoint resource.

The model deployment step will take ~15 minutes to complete.

In [None]:
# URI of the prebuilt docker image used for serving.
SERVE_DOCKER_URI = (
    "us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/"
    "pytorch-timm-serve"
)
# Port on which the serving container (torchserve) receives traffic.
SERVE_PORT = 7080
# Full path, including the gs:// prefix, of the model checkpoint file.
MODEL_PT_PATH = "gs://path_to_model_best.pth.tar"  # @param {type:"string"}
# [Optional] Full path, including the gs:// prefix, of index_to_name.json.
INDEX_TO_NAME_FILE = "gs://path_to_index_to_name.json"  # @param {type:"string"}

### Upload and deploy model on Vertex AI

In [None]:
# Import and initialise the SDK in this cell as well, so that it also works in
# a fresh kernel where neither training cell has been run (for example when
# deploying an already trained checkpoint).
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=GCS_BUCKET)

# Upload model. The serving container is configured through these environment
# variables.
serving_env = {
    "MODEL_NAME": MODEL_NAME,
    "MODEL_PT_PATH": MODEL_PT_PATH,
    "INDEX_TO_NAME_FILE": INDEX_TO_NAME_FILE,
}
model = aiplatform.Model.upload(
    display_name=MODEL_NAME,
    serving_container_image_uri=SERVE_DOCKER_URI,
    serving_container_ports=[SERVE_PORT],
    serving_container_predict_route="/predictions/timm_serving",
    serving_container_health_route="/ping",
    serving_container_environment_variables=serving_env,
)
# Or reuse a pre-uploaded model.
# model = aiplatform.Model('projects/123456789/locations/us-central1/models/123456789@1')

# Create an endpoint.
endpoint = aiplatform.Endpoint.create(display_name="pytorch-timm-endpoint")
# Or reuse a pre-created endpoint.
# endpoint = aiplatform.Endpoint('projects/123456789/locations/us-central1/endpoints/123456789')

# Deploy model to endpoint, routing all traffic to it and running the
# deployment under SERVICE_ACCOUNT.
model.deploy(
    endpoint=endpoint,
    machine_type="n1-standard-8",
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
    traffic_percentage=100,
    service_account=SERVICE_ACCOUNT,
)

You can manage your uploaded models in the [Model Registry](https://console.cloud.google.com/vertex-ai/models) and your endpoints on the [Endpoints](https://console.cloud.google.com/vertex-ai/endpoints) page.

### Test online prediction

You will now test the deployed endpoint. Please prepare an image to predict.

In [None]:
import base64

# To reuse an already deployed endpoint, rebuild the object from the resource
# name returned by Endpoint.create(), for example:
# endpoint = aiplatform.Endpoint('projects/<project-number>/locations/us-central1/endpoints/<endpoint-id>')

# Upload an image and enter its filename below.
IMAGE_FILENAME = "test.jpg"  # @param {type:"string"}

# Alternatively, uncomment the following line to download a cat image for demonstration.
# ! wget http://images.cocodataset.org/val2017/000000039769.jpg -O test.jpg

# Read the raw image bytes and base64-encode them into a single-instance
# request payload.
with open(IMAGE_FILENAME, "rb") as image_file:
    raw_bytes = image_file.read()
image_b64 = base64.b64encode(raw_bytes).decode("utf-8")
instances = [{"data": {"b64": image_b64}}]

# Query the deployed endpoint and show the response.
prediction = endpoint.predict(instances=instances)
print(prediction)

### Clean Up Resources

In [None]:
# Undeploy every model from the endpoint, delete the endpoint itself so that
# it is not left behind in the project, then delete the uploaded model.
# NOTE: if you reused a pre-existing endpoint or model that you want to keep,
# skip the corresponding delete() call.
endpoint.undeploy_all()
endpoint.delete()
model.delete()