In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# PyTorch image classification multi-node distributed data parallel training on cpu using Vertex training with custom container

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/multi_node_ddp_gloo_vertex_training_with_custom_container.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/multi_node_ddp_gloo_vertex_training_with_custom_container.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/training/multi_node_ddp_gloo_vertex_training_with_custom_container.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

## Overview


This tutorial demonstrates how to create a distributed PyTorch training job using Vertex AI SDK for Python and custom containers. This can help your training job scale to handle a large amount of data.

Learn more about [Vertex AI Training](https://cloud.google.com/vertex-ai/docs/training/custom-training).

### Objective

In this tutorial, you learn how to create a distributed PyTorch training job using Vertex AI SDK for Python and custom containers. You set up GCP to use a custom container, a Vertex TensorBoard instance and run a custom training job.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI SDK`
- `Vertex AI TensorBoard`
- `CustomContainerTrainingJob`
- `Artifact Registry`

The steps performed include:

- Setting up your GCP project : Setting up the PROJECT_ID, REGION & SERVICE_ACCOUNT
- Creating a cloud storage bucket
- Building Custom Container using Artifact Registry and Docker
- Create a Vertex AI TensorBoard instance to store your Vertex AI experiment
- Run a Vertex AI SDK CustomContainerTrainingJob

### Dataset

The dataset used for this tutorial is the <a href="http://yann.lecun.com/exdb/mnist/">MNIST database</a>. The MNIST database of handwritten digits has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image.


### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI

* Cloud Storage

* Vertex AI TensorBoard (Note that Vertex AI TensorBoard charges a monthly fee of $300 per unique active user. Active users are measured through the Vertex AI TensorBoard UI. You also pay for Google Cloud resources you use with Vertex AI TensorBoard, such as TensorBoard logs stored in Cloud Storage. <a href='https://cloud.google.com/vertex-ai/pricing#tensorboard'>Check the link for latest prices.</a>)

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Installation

Install the following packages required to execute this notebook.

In [None]:
! pip3 install --upgrade google-cloud-aiplatform -q

### Colab only: Uncomment the following cell to restart the kernel.

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

3. [Enable the Vertex AI, Artifact Registry, and Cloud Build APIs](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com).

4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

#### Set your project ID

**If you don't know your project ID**, try the following:
* Run `gcloud config list`.
* Run `gcloud projects list`.
* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)

In [None]:
# Replace the placeholder with the ID of the Google Cloud project to use.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the default for subsequent gcloud commands.
! gcloud config set project {PROJECT_ID}

#### Region

You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used for Vertex AI, the bucket, and the Artifact Registry repository below.
REGION = "us-central1"  # @param {type: "string"}

### Authenticate your Google Cloud account

Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below.

**1. Vertex AI Workbench**
* Do nothing as you are already authenticated.

**2. Local JupyterLab instance, uncomment and run:**

In [None]:
# ! gcloud auth login

**3. Colab, uncomment and run:**

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**4. Service account or other**
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Replace with the gs:// URI of your bucket; it is used as the staging bucket
# and as the parent of the training output directory.
BUCKET_URI = "gs://your-bucket-name-unique"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket in REGION under PROJECT_ID (skip if it already exists).
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

### Import libraries

In [None]:
from google.cloud import aiplatform

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [None]:
# Set the default project, region, and staging bucket for later SDK calls.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

#### Service Account

You use a service account to run the Vertex AI CustomContainerTrainingJob. If you do not want to use your project's Compute Engine service account, set `SERVICE_ACCOUNT` to another service account ID.

In [None]:
# Replace the placeholder; this value is passed as `service_account` when the
# training job is run.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

# Create Custom Training Python Package

Before you can perform local training, you must create a source code file, a requirements file, and a Dockerfile.

You create a directory and write all of these files into it.

In [None]:
# Directory that receives task.py, requirements.txt, and the Dockerfile.
PYTHON_PACKAGE_APPLICATION_DIR = "trainer"

In [None]:
# Create the package directory; -p makes the command a no-op if it exists.
!mkdir -p $PYTHON_PACKAGE_APPLICATION_DIR

### Write the Training Script

In [None]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/task.py


"""
Main program for PyTorch distributed training.
Adapted from: https://github.com/narumiruna/pytorch-distributed-example
"""

import argparse
import os
import shutil

import torch
from torch import distributed
from torch.nn.parallel import DistributedDataParallel
from torch.utils import data
from torch.utils.tensorboard import SummaryWriter

from torchvision import datasets, transforms

def parse_args():
  """Parse the command-line arguments of the training program.

  The directory options default to the `AIP_MODEL_DIR`,
  `AIP_TENSORBOARD_LOG_DIR` and `AIP_CHECKPOINT_DIR` environment variables
  (None when unset). `--world-size` and `--rank` default to the `WORLD_SIZE`
  and `RANK` environment variables, falling back to 1 and 0.

  Returns:
    argparse.Namespace: the arguments parsed from `sys.argv`.
  """

  parser = argparse.ArgumentParser()

  # Using environment variables for Cloud Storage directories
  # see more details in https://cloud.google.com/vertex-ai/docs/training/code-requirements
  parser.add_argument(
      '--model-dir', default=os.getenv('AIP_MODEL_DIR'), type=str,
      help='a Cloud Storage URI of a directory intended for saving model artifacts')
  parser.add_argument(
      '--tensorboard-log-dir', default=os.getenv('AIP_TENSORBOARD_LOG_DIR'), type=str,
      help='a Cloud Storage URI of a directory intended for saving TensorBoard')
  parser.add_argument(
      '--checkpoint-dir', default=os.getenv('AIP_CHECKPOINT_DIR'), type=str,
      help='a Cloud Storage URI of a directory intended for saving checkpoints')

  # The two literals are concatenated, so the first one must end with a space.
  parser.add_argument(
      '--backend', type=str, default='gloo',
      help='Use the `nccl` backend for distributed GPU training. '
           'Use the `gloo` backend for distributed CPU training.')
  parser.add_argument(
      '--init-method', type=str, default='env://',
      help='URL specifying how to initialize the package.')
  # Environment values are strings; argparse runs string defaults through
  # `type`, so both the env value and the integer fallback end up as int.
  parser.add_argument(
      '--world-size', type=int, default=os.environ.get('WORLD_SIZE', 1),
      help='The total number of nodes in the cluster. '
           'This variable has the same value on every node.')
  parser.add_argument(
      '--rank', type=int, default=os.environ.get('RANK', 0),
      help='A unique identifier for each node. '
           'On the master worker, this is set to 0. '
           'On each worker, it is set to a different value from 1 to WORLD_SIZE - 1.')
  parser.add_argument(
      '--epochs', type=int, default=20)
  parser.add_argument(
      '--no-cuda', action='store_true')
  parser.add_argument(
      '-lr', '--learning-rate', type=float, default=1e-3)
  parser.add_argument(
      '--batch-size', type=int, default=128)
  parser.add_argument(
      '--local-mode', action='store_true', help='use local mode when running on your local machine')

  args = parser.parse_args()

  return args

def makedirs(model_dir):
  """Create `model_dir` as an empty directory, deleting any existing one first."""
  # isdir() is already False for a missing path, so one check is enough.
  already_there = os.path.isdir(model_dir)
  if already_there:
    shutil.rmtree(model_dir)
  os.makedirs(model_dir)

def distributed_is_initialized():
  """Return True only if torch.distributed is available and initialized."""
  # Short-circuit: is_initialized() is only consulted when the package exists.
  return distributed.is_available() and distributed.is_initialized()

class Average(object):
  """Running weighted mean of scalar values.

  `update(value, number)` adds `value` with weight `number`; `average` is the
  weighted mean of everything added so far.
  """

  def __init__(self):
    self.sum = 0
    self.count = 0

  def __str__(self):
    return '{:.6f}'.format(self.average)

  @property
  def average(self):
    """Weighted mean so far, or NaN when nothing has been recorded yet."""
    # Guard against ZeroDivisionError when queried before the first update.
    if self.count == 0:
      return float('nan')
    return self.sum / self.count

  def update(self, value, number):
    """Add `value` to the running mean with weight `number`."""
    self.sum += value * number
    self.count += number

class Accuracy(object):
  """Running classification accuracy accumulated over batches.

  `update(output, target)` compares the arg-max of `output` along dim 1 with
  `target`; `accuracy` is the fraction of matches over all rows seen.
  """

  def __init__(self):
    self.correct = 0
    self.count = 0

  def __str__(self):
    return '{:.2f}%'.format(self.accuracy * 100)

  @property
  def accuracy(self):
    """Fraction of correct predictions, or NaN when no batch was recorded."""
    # Guard against ZeroDivisionError when queried before the first update.
    if self.count == 0:
      return float('nan')
    return self.correct / self.count

  @torch.no_grad()
  def update(self, output, target):
    """Count how many rows of `output` have their arg-max equal to `target`."""
    pred = output.argmax(dim=1)
    correct = pred.eq(target).sum().item()

    self.correct += correct
    self.count += output.size(0)

class Net(torch.nn.Module):
  """Single linear layer mapping 784 flattened input features to 10 outputs."""

  def __init__(self, device):
    super().__init__()
    linear = torch.nn.Linear(784, 10)
    # The attribute name `fc` determines the state-dict keys.
    self.fc = linear.to(device)

  def forward(self, x):
    """Flatten each sample to one row and apply the linear layer."""
    batch = x.size(0)
    flat = x.view(batch, -1)
    return self.fc(flat)

class MNISTDataLoader(data.DataLoader):
  """DataLoader over the torchvision MNIST dataset (downloaded into `root`).

  Images are converted to tensors and normalized with mean 0.1307 and
  std 0.3081. The training split is wrapped in a DistributedSampler when a
  process group has been initialized.
  """

  def __init__(self, root, batch_size, train=True):
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    mnist = datasets.MNIST(root, train=train, transform=preprocessing, download=True)

    # Only the training split is sharded, and only inside a process group.
    if train and distributed_is_initialized():
      sampler = data.DistributedSampler(mnist)
    else:
      sampler = None

    # Loader-level shuffling is requested only when no sampler is supplied.
    super().__init__(
        mnist,
        batch_size=batch_size,
        shuffle=sampler is None,
        sampler=sampler,
    )

class Trainer(object):
  """Runs the train / evaluate loop for a model.

  Args:
    model: the module to optimize.
    optimizer: optimizer built over the model's parameters.
    train_loader: iterable of (data, target) training batches.
    test_loader: iterable of (data, target) evaluation batches.
    device: device every batch is moved to.
    model_name: file name used by `save`.
    checkpoint_path: file the chief overwrites after every epoch.
  """

  def __init__(self,
      model,
      optimizer,
      train_loader,
      test_loader,
      device,
      model_name,
      checkpoint_path
  ):
    self.model = model
    self.optimizer = optimizer
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.device = device
    self.model_name = model_name
    self.checkpoint_path = checkpoint_path

  def save(self, model_dir):
    """Write the model's state dict to `model_dir`/`model_name`."""
    model_path = os.path.join(model_dir, self.model_name)
    torch.save(self.model.state_dict(), model_path)

  def fit(self, epochs, is_chief, writer):
    """Train for `epochs` epochs.

    Every process trains. When `is_chief` is true the process also evaluates
    on the test loader, logs four scalars to `writer`, overwrites the
    checkpoint file and prints a summary line after each epoch.
    """

    for epoch in range(1, epochs + 1):

      # A DistributedSampler derives its shuffle order from the epoch number.
      # Without set_epoch() every epoch replays the same sample order.
      # Loaders or samplers that lack the method are left untouched.
      sampler = getattr(self.train_loader, 'sampler', None)
      if hasattr(sampler, 'set_epoch'):
        sampler.set_epoch(epoch)

      print('Epoch: {}, Training ...'.format(epoch))
      train_loss, train_acc = self.train()

      if is_chief:
        test_loss, test_acc = self.evaluate()
        writer.add_scalar('Loss/train', train_loss.average, epoch)
        writer.add_scalar('Loss/test', test_loss.average, epoch)
        writer.add_scalar('Accuracy/train', train_acc.accuracy, epoch)
        writer.add_scalar('Accuracy/test', test_acc.accuracy, epoch)
        torch.save(self.model.state_dict(), self.checkpoint_path)

        print(
            'Epoch: {}/{},'.format(epoch, epochs),
            'train loss: {}, train acc: {},'.format(train_loss, train_acc),
            'test loss: {}, test acc: {}.'.format(test_loss, test_acc),
        )

  def train(self):
    """Run one optimization pass over the training loader.

    Returns:
      (Average, Accuracy): running loss and accuracy for the pass.
    """

    self.model.train()

    train_loss = Average()
    train_acc = Accuracy()

    for data, target in self.train_loader:
      data = data.to(self.device)
      target = target.to(self.device)

      output = self.model(data)
      loss = torch.nn.functional.cross_entropy(output, target)

      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()

      # Weight the batch loss by the batch size so the mean is per sample.
      train_loss.update(loss.item(), data.size(0))
      train_acc.update(output, target)

    return train_loss, train_acc

  @torch.no_grad()
  def evaluate(self):
    """Compute loss and accuracy over the test loader without gradients.

    Returns:
      (Average, Accuracy): running loss and accuracy for the pass.
    """
    self.model.eval()

    test_loss = Average()
    test_acc = Accuracy()

    for data, target in self.test_loader:
      data = data.to(self.device)
      target = target.to(self.device)

      output = self.model(data)
      loss = torch.nn.functional.cross_entropy(output, target)

      test_loss.update(loss.item(), data.size(0))
      test_acc.update(output, target)

    return test_loss, test_acc

def main():
  """Train the MNIST classifier, optionally across several processes.

  Steps: parse the arguments, resolve the output directories, initialize the
  process group when `world_size > 1`, build the model, synchronize the
  initial weights through a checkpoint file, train, and finally let the chief
  (rank 0) save the model and remove the working checkpoint.
  """

  args = parse_args()

  local_data_dir = './tmp/data'
  local_model_dir = './tmp/model'
  local_tensorboard_log_dir = './tmp/logs'
  local_checkpoint_dir = './tmp/checkpoints'

  # Fall back to local scratch directories when no directory was supplied.
  model_dir = args.model_dir or local_model_dir
  tensorboard_log_dir = args.tensorboard_log_dir or local_tensorboard_log_dir
  checkpoint_dir = args.checkpoint_dir or local_checkpoint_dir

  # gs:// URIs are rewritten to paths under /gcs/ so that ordinary file APIs
  # can be used (the variable name suggests a Cloud Storage FUSE mount).
  gs_prefix = 'gs://'
  gcsfuse_prefix = '/gcs/'
  if model_dir and model_dir.startswith(gs_prefix):
    model_dir = model_dir.replace(gs_prefix, gcsfuse_prefix)
  if tensorboard_log_dir and tensorboard_log_dir.startswith(gs_prefix):
    tensorboard_log_dir = tensorboard_log_dir.replace(gs_prefix, gcsfuse_prefix)
  if checkpoint_dir and checkpoint_dir.startswith(gs_prefix):
    checkpoint_dir = checkpoint_dir.replace(gs_prefix, gcsfuse_prefix)

  writer = SummaryWriter(tensorboard_log_dir)

  is_chief = args.rank == 0
  if is_chief:
    makedirs(checkpoint_dir)
    print(f'Checkpoints will be saved to {checkpoint_dir}')

  checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint.pt')
  print(f'checkpoint_path is {checkpoint_path}')

  if args.world_size > 1:
    print('Initializing distributed backend with {} nodes'.format(args.world_size))
    distributed.init_process_group(
          backend=args.backend,
          init_method=args.init_method,
          world_size=args.world_size,
          rank=args.rank,
      )
    print(f'[{os.getpid()}]: '
          f'world_size = {distributed.get_world_size()}, '
          f'rank = {distributed.get_rank()}, '
          f'backend={distributed.get_backend()} \n', end='')

  if torch.cuda.is_available() and not args.no_cuda:
    # The rank is global across nodes, so wrap it to a device index that
    # exists on this machine instead of using it directly.
    device_index = args.rank % torch.cuda.device_count()
    device = torch.device('cuda:{}'.format(device_index))
  else:
    device = torch.device('cpu')

  model = Net(device=device)
  if distributed_is_initialized():
    model.to(device)
    model = DistributedDataParallel(model)

  if is_chief:
    # All processes should see same parameters as they all start from same
    # random parameters and gradients are synchronized in backward passes.
    # Therefore, saving it in one process is sufficient.
    torch.save(model.state_dict(), checkpoint_path)
    print(f'Initial chief checkpoint is saved to {checkpoint_path}')

  # Use a barrier() to make sure that process 1 loads the model after process
  # 0 saves it.
  if distributed_is_initialized():
    distributed.barrier()
    # configure map_location properly
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print(f'Initial chief checkpoint is loaded from {checkpoint_path} with map_location {device}')
  else:
    model.load_state_dict(torch.load(checkpoint_path))
    print(f'Initial chief checkpoint is loaded from {checkpoint_path}')

  optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

  train_loader = MNISTDataLoader(
      local_data_dir, args.batch_size, train=True)
  test_loader = MNISTDataLoader(
      local_data_dir, args.batch_size, train=False)

  trainer = Trainer(
      model=model,
      optimizer=optimizer,
      train_loader=train_loader,
      test_loader=test_loader,
      device=device,
      model_name='mnist.pt',
      checkpoint_path=checkpoint_path,
  )
  trainer.fit(args.epochs, is_chief, writer)

  # Persist the trained model once, from the chief. Previously the model was
  # written only when model_dir was the local fallback, so a run that was
  # given a model directory produced no model artifact at all.
  if is_chief:
    if model_dir == local_model_dir:
      # Local scratch directory: start from an empty directory.
      makedirs(model_dir)
    else:
      # Supplied directory: never wipe it, just make sure it exists.
      os.makedirs(model_dir, exist_ok=True)
    trainer.save(model_dir)
    print(f'Model is saved to {model_dir}')

  print(f'Tensorboard logs are saved to: {tensorboard_log_dir}')

  writer.close()

  # The per-epoch checkpoint is a working file; the chief removes it at the end.
  if is_chief:
    os.remove(checkpoint_path)

  if distributed_is_initialized():
    distributed.destroy_process_group()

  return

if __name__ == '__main__':
  main()


### Write requirements file

In [None]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/requirements.txt

# Packages imported by task.py; torch.utils.tensorboard needs `tensorboard`.
# NOTE(review): versions are unpinned — consider pinning for reproducible builds.
torch
torchvision
tensorboard

### Write the docker file

In [None]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/Dockerfile


# Base image; the tag indicates PyTorch 1.8.1 with CUDA 11.1 / cuDNN 8 runtime.
FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime

# Install the Google Cloud SDK from Google's apt repository. The repository
# is addressed over HTTPS, matching the URL the signing key is fetched from.
RUN apt-get update && \
    apt-get install -y curl gnupg && \
    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && \
    apt-get update -y && \
    apt-get install google-cloud-sdk -y

# Copy the build context (task.py, requirements.txt, Dockerfile) into the image.
COPY . /trainer

WORKDIR /trainer

# Install the Python dependencies listed in requirements.txt.
RUN pip install -r requirements.txt

# Arguments given to `docker run` or the training job are appended to this
# command and reach task.py's argument parser.
ENTRYPOINT ["python", "-m", "task"]


## Local Training


In [None]:
# Show the generated files, then install the training dependencies into the
# notebook environment so task.py can be run locally.
! ls trainer
! cat trainer/requirements.txt
! pip install -r trainer/requirements.txt
! cat trainer/task.py

In [None]:
# Single-process smoke test of the training script: 5 epochs with CUDA disabled.
%run trainer/task.py --epochs 5 --no-cuda --local-mode

In [None]:
# task.py falls back to directories under ./tmp when no output directory is given.
! ls ./tmp

Clean up temporary files

In [None]:
# Delete the local data, logs, and model left behind by the local run.
! rm -rf ./tmp

## Vertex AI Training using a custom container

### Build Custom Container


#### Enable Artifact Registry API
You must enable the Artifact Registry API service for your project.

<a href="https://cloud.google.com/artifact-registry/docs/enable-service">Learn more about Enabling service</a>.

In [None]:
# Enable the Artifact Registry API for the currently configured project.
! gcloud services enable artifactregistry.googleapis.com

### Create a private Docker repository
Your first step is to create your own Docker repository in Google Artifact Registry.

1 - Run the gcloud artifacts repositories create command to create a new Docker repository with your region with the description "docker repository".

2 - Run the gcloud artifacts repositories list command to verify that your repository was created.

In [None]:
# Name of the Docker repository to create in Artifact Registry.
PRIVATE_REPO = "my-docker-repo"

# Create the repository in REGION.
! gcloud artifacts repositories create {PRIVATE_REPO} --repository-format=docker --location={REGION} --description="Docker repository"

# List repositories to confirm that it was created.
! gcloud artifacts repositories list

In [None]:
# Image URI inside the private Artifact Registry repository:
# <region>-docker.pkg.dev/<project>/<repository>/<image>
DEPLOY_IMAGE = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{PRIVATE_REPO}/tf_serving"

In [None]:
# Show the image URI that the build cells below tag and push.
print("Deployment:", DEPLOY_IMAGE)

## Executes in Workbench


### Configure authentication to your private repo
Before you push or pull container images, configure Docker to use the gcloud command-line tool to authenticate requests to Artifact Registry for your region.

In [None]:
import sys

# True when the notebook runs in Colab (the google.colab module is loaded).
IS_COLAB = "google.colab" in sys.modules

# Outside Colab the image is built with the local Docker client, so register
# gcloud as Docker's credential helper for this region's registry host.
if not IS_COLAB:
    ! gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet

### Build, test, and push the training container image
Build the custom training container from `trainer/Dockerfile` and publish it to your Artifact Registry repository.

1. Build the Docker image from the `trainer` directory and tag it with the Artifact Registry image URI.
2. Run the image locally for a few epochs to verify that the training script works inside the container.
3. Push the image to Artifact Registry.

<a href="https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling">Learn more about pushing and pulling images with Artifact Registry</a>.

In [None]:
# Outside Colab: build the image with the local Docker client, run it for
# 5 epochs with CUDA disabled as a smoke test, then push it.
if not IS_COLAB:
    ! cd trainer && docker build -t $DEPLOY_IMAGE -f Dockerfile .
    ! docker run --rm $DEPLOY_IMAGE --epochs 5 --no-cuda --local-mode
    ! docker push $DEPLOY_IMAGE

## Executes in Colab

Build and push a Docker image with Cloud Build

In [None]:
# In Colab: submit the trainer directory to Cloud Build, which builds the
# image and tags it as DEPLOY_IMAGE (30-minute timeout).
if IS_COLAB:
    ! cd trainer && gcloud builds submit --timeout=1800s --region={REGION} --tag $DEPLOY_IMAGE

### Create a Vertex AI Tensorboard instance

NOTE: <a href="https://cloud.google.com/vertex-ai/pricing#tensorboard">Vertex AI TensorBoard</a> charges a monthly fee of $300 per unique active user. Active users are measured through the Vertex AI TensorBoard UI. You also pay for Google Cloud resources you use with Vertex AI TensorBoard, such as TensorBoard logs stored in Cloud Storage. Check the link above for the latest prices.

In [None]:
# Display name shared by the TensorBoard instance and the training job.
content_name = "-".join(["pt-img-cls-multi-node-ddp-cust-cont", "cpu", "unique"])

In [None]:
# Create a new Vertex AI TensorBoard instance; its resource name is passed to
# the training job below.
tensorboard = aiplatform.Tensorboard.create(
    display_name=content_name,
)

#### Option: Use a previously created Vertex AI Tensorboard instance

```
tensorboard_name = "Your Tensorboard Resource Name or Tensorboard ID"
tensorboard = aiplatform.Tensorboard(tensorboard_name=tensorboard_name)
```

### Run a Vertex AI SDK CustomContainerTrainingJob

In [None]:
display_name = content_name
# Directory under the bucket that is passed to the job as base_output_dir.
gcs_output_uri_prefix = f"{BUCKET_URI}/{display_name}"

# Number of replicas and the machine type used for each of them.
replica_count = 4
machine_type = "n1-standard-4"

# Command-line arguments forwarded to task.py inside the container.
args = [
    "--backend",
    "gloo",
    "--no-cuda",
    "--batch-size",
    "128",
    "--epochs",
    "25",
]

In [None]:
# Define the training job around the container image pushed earlier.
custom_container_training_job = aiplatform.CustomContainerTrainingJob(
    display_name=display_name,
    container_uri=DEPLOY_IMAGE,
)

In [None]:
# Launch the job with the configuration defined above.
custom_container_training_job.run(
    args=args,
    base_output_dir=gcs_output_uri_prefix,
    replica_count=replica_count,
    machine_type=machine_type,
    tensorboard=tensorboard.resource_name,
    # Must be a real service account, not the placeholder value.
    service_account=SERVICE_ACCOUNT,
)

In [None]:
# Report the job's resource name and where its output was written.
print(f"Custom Training Job Name: {custom_container_training_job.resource_name}")
print(f"GCS Output URI Prefix: {gcs_output_uri_prefix}")

### View training output artifact

In [None]:
# List what the training job wrote under its output directory.
! gsutil ls $gcs_output_uri_prefix

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Vertex AI Tensorboard
- Cloud Storage Bucket

In [None]:
# Set this to true only if you'd like to delete your bucket
delete_bucket = False
delete_tensorboard = False

! gsutil rm -rf $gcs_output_uri_prefix

if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI

if delete_tensorboard or os.getenv("IS_TESTING"):
    tensorboard.delete()