In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TF-Keras Text Classification: Distributed Single-Worker Multi-GPU Training using Vertex Training with a Local Mode Container

<table align="left">
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/master/community-content/tf_keras_text_classification_distributed_single_worker_gpus_with_gcloud_local_run_and_vertex_sdk/vertex_training_with_local_mode_container.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Setup

In [None]:
# Placeholders: replace every value below before running the rest of the notebook.
# Project ID: used to build the gcr.io image name and passed to aiplatform.init().
PROJECT_ID = "YOUR PROJECT ID"
# Cloud Storage location, including the gs:// scheme: used as the SDK staging
# bucket and as the root of the training output prefix.
BUCKET_NAME = "gs://YOUR BUCKET NAME"
# Passed as `location` to aiplatform.init().
REGION = "YOUR REGION"
# Passed as `service_account` to the training job's run() call.
SERVICE_ACCOUNT = "YOUR SERVICE ACCOUNT"

In [None]:
# Shared identifier: reused for the container image name, the Tensorboard
# display name, the training job display name, and the output folder in the bucket.
content_name = "tf-keras-txt-cls-dist-single-worker-gpus-local-mode-cont"

## Local Training with Vertex Local Mode and Auto Packaging

In [None]:
# Inputs for the `gcloud ai custom-jobs local-run` cell below.
# Image handed to --executor-image-uri.
BASE_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-5:latest"
# Training entry point handed to --script.
SCRIPT_PATH = "trainer/task.py"
# Tag for the image produced by local-run, under this project's gcr.io registry.
OUTPUT_IMAGE_NAME = f"gcr.io/{PROJECT_ID}/{content_name}:latest"
# Arguments forwarded to the training script after the `--` separator.
ARGS = "--epochs 5 --batch-size 16 --local-mode"

In [None]:
# Run the training script locally and tag the resulting image as OUTPUT_IMAGE_NAME;
# everything after the bare `--` is passed through to the script.
! gcloud ai custom-jobs local-run \
  --executor-image-uri={BASE_IMAGE_URI} \
  --script={SCRIPT_PATH} \
  --output-image-uri={OUTPUT_IMAGE_NAME} \
  -- \
  {ARGS}

## Vertex Training using Vertex SDK and Vertex Local Mode Container

### Container Built by Vertex Local Mode

In [None]:
# Reuse the image URI that was given to local-run via --output-image-uri.
custom_container_image_uri = OUTPUT_IMAGE_NAME

In [None]:
# Upload the locally built image to the registry so the training job can pull it.
! docker push {custom_container_image_uri}

In [None]:
# List the images in this project's gcr.io repository to confirm the push.
! gcloud container images list --repository gcr.io/{PROJECT_ID}

### Initialize Vertex SDK

In [None]:
# Use the %pip magic rather than `! pip`: it installs into the environment of the
# running kernel, so the packages are importable by the cells that follow.
%pip install -r requirements.txt

In [None]:
from google.cloud import aiplatform

# Set SDK-wide defaults once so later calls do not have to repeat them.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_NAME,
)

### Create a Vertex Tensorboard Instance

In [None]:
# Create a new Tensorboard instance labelled with the shared content name.
tensorboard = aiplatform.Tensorboard.create(display_name=content_name)

#### Option: Use a Previously Created Vertex Tensorboard Instance

```python
tensorboard_name = "Your Tensorboard Resource Name or Tensorboard ID"
tensorboard = aiplatform.Tensorboard(tensorboard_name=tensorboard_name)
```

### Run a Vertex SDK CustomContainerTrainingJob

In [None]:
# Hardware requested for the training job.
machine_type = "n1-standard-8"
accelerator_type = "NVIDIA_TESLA_P100"
accelerator_count = 4

# The job label doubles as the output sub-folder inside the bucket.
display_name = content_name
gcs_output_uri_prefix = f"{BUCKET_NAME}/{display_name}"

# Arguments passed to the job; --num-gpus mirrors accelerator_count so the
# two values cannot drift apart.
args = [
    "--epochs", "100",
    "--batch-size", "128",
    "--num-gpus", str(accelerator_count),
]

In [None]:
# Define the training job around the image pushed earlier; this cell does not call run().
custom_container_training_job = aiplatform.CustomContainerTrainingJob(
    container_uri=custom_container_image_uri,
    display_name=display_name,
)

In [None]:
# Run the job with the settings prepared in the configuration cell above.
custom_container_training_job.run(
    # Hardware
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    # Arguments and output location
    args=args,
    base_output_dir=gcs_output_uri_prefix,
    # Tensorboard instance and the service account for the job
    tensorboard=tensorboard.resource_name,
    service_account=SERVICE_ACCOUNT,
)

In [None]:
# Show the job's resource name and the output prefix, one per line.
print(
    f"Custom Training Job Name: {custom_container_training_job.resource_name}",
    f"GCS Output URI Prefix: {gcs_output_uri_prefix}",
    sep="\n",
)

### Training Output Artifact

In [None]:
# List what is stored under the job's output prefix.
! gsutil ls {gcs_output_uri_prefix}

## Clean Up Artifact

In [None]:
# Recursively delete everything under the job's output prefix (-f ignores missing objects).
! gsutil rm -rf {gcs_output_uri_prefix}