In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/xgboost_data_parallel_training_on_cpu_using_dask.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/xgboost_data_parallel_training_on_cpu_using_dask.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/training/xgboost_data_parallel_training_on_cpu_using_dask.ipynb">
        <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

# Create a distributed custom training job
## Overview

This tutorial shows you how to create a distributed custom training job on Vertex AI that can handle large amounts of training data. 

### Objective

In this tutorial, you learn how to create a distributed training job using Vertex AI SDK for Python. You build a custom docker container with simple Dask configuration to run a custom training job.

This tutorial uses the following Google Cloud ML services:

- `Vertex AI SDK`
- `CustomContainerTrainingJob`
- `Artifact Registry`

The steps performed include:

- Configure the `PROJECT_ID` and `REGION` variables for your Google Cloud project.
- Create a Cloud Storage bucket to store your model artifacts.
- Build a custom Docker container that hosts your training code and push the container image to Artifact Registry.
- Run a Vertex AI SDK CustomContainerTrainingJob


### Data 

This tutorial uses the <a href="https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html">IRIS dataset</a>, which consists of different types of irises. 

### Costs
 
This tutorial uses billable components of Google Cloud:

* Vertex AI

* Cloud Storage

* Artifact Registry

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and [Artifact Registry pricing](https://cloud.google.com/artifact-registry/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.


## Installation

Install the packages required for executing this notebook.

In [None]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install --upgrade google-cloud-aiplatform {USER_FLAG} -q

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# The restart is skipped whenever the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    # Restart the kernel after pip installs
    import IPython

    # Fetch the running IPython application and ask its kernel to shut down
    # with the restart flag set to True.
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)

3. [Enable the Vertex AI APIs, Compute Engine APIs, and Cloud Storage.](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,compute_component,storage-component.googleapis.com)

4. [The Google Cloud SDK](https://cloud.google.com/sdk) is already installed in Google Cloud Notebook.

5. Enter your project ID in the cell below. Then run the  cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$`.

In [None]:
# Google Cloud project ID used by every command in this notebook. If the
# placeholder is left unchanged, the next cell looks the ID up through gcloud.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Make the gcloud CLI use PROJECT_ID for the shell commands run in later cells.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

# Fall back to us-central1 when the placeholder was left unchanged or the form
# field was cleared, mirroring how PROJECT_ID and BUCKET_NAME treat empty input.
if not REGION or REGION == "[your-region]":
    REGION = "us-central1"

#### UUID

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a uuid for each instance session, and append it onto the name of resources you create in this tutorial.

In [None]:
import random
import string


# Generate a uuid of a specified length (default=8)
def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate.

    Returns:
        A string of `length` characters drawn, with replacement, from
        `string.ascii_lowercase + string.digits`.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Session-wide suffix appended to resource names to avoid collisions.
UUID = generate_uuid()

### Authenticate your Google Cloud account

**If you are using Vertex AI Workbench Notebook**, your environment is already authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via OAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the [**Create service account key**
   page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "Vertex AI"
into the filter box, and select
   **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# If on Vertex AI Workbench, then don't execute this code
IS_COLAB = "google.colab" in sys.modules
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv(
    "DL_ANACONDA_HOME"
):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you initialize the Vertex AI SDK for Python, you specify a Cloud Storage staging bucket. The staging bucket is where all the data associated with your dataset and model resources are retained across sessions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}

# When no bucket name was supplied, derive one from the project ID and the
# session UUID. The parts are hyphen-separated so the generated name reads
# "<project>-aip-<uuid>" rather than running the project ID into "aip".
if not BUCKET_NAME or BUCKET_NAME == "[your-bucket-name]":
    BUCKET_NAME = f"{PROJECT_ID}-aip-{UUID}"

# Built once, after the final bucket name is known.
BUCKET_URI = f"gs://{BUCKET_NAME}"

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket at BUCKET_URI in the location given by REGION.
! gsutil mb -l $REGION $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents; a successful listing confirms access to BUCKET_URI.
! gsutil ls -al $BUCKET_URI

### Import libraries and define constants

In [None]:
from google.cloud import aiplatform

# Create a custom training Python package 

Before you can perform local training, you must create a training script file and a Dockerfile.

Create a `trainer` directory for all of your training code.

In [None]:
# Directory that holds the training code. The %%writefile cell and the
# Dockerfile below spell this path out literally as "trainer", so keep them in
# sync if this value is changed.
PYTHON_PACKAGE_APPLICATION_DIR = "trainer"

In [None]:
# Create the training-code directory; -p makes this a no-op if it already exists.
!mkdir -p $PYTHON_PACKAGE_APPLICATION_DIR

### Write the Training Script

The `train.py` file checks whether the current node is the chief node or a worker node and runs `dask-scheduler` for the chief node and `dask-worker` for worker nodes. Worker nodes connect to the chief node through the IP address and port number specified in `CLUSTER_SPEC`.

After the Dask scheduler is set up and connected to worker nodes, call `xgb.dask.train` to train a model through Dask. Once model training is complete, the model is uploaded to `AIP_MODEL_DIR`.

In [None]:
%%writefile trainer/train.py
from dask.distributed import Client, wait
from xgboost.dask import DaskDMatrix
from google.cloud import storage
import xgboost as xgb
import dask.dataframe as dd
import sys
import os
import subprocess
import time
import json

# Cloud Storage locations of the feature and target CSV files.
IRIS_DATA_FILENAME = 'gs://cloud-samples-data/ai-platform/iris/iris_data.csv'
IRIS_TARGET_FILENAME = 'gs://cloud-samples-data/ai-platform/iris/iris_target.csv'
# Local file name the trained booster is saved under before it is uploaded.
MODEL_FILE = 'model.bst'
# Destination for the model, read from AIP_MODEL_DIR (None when it is unset).
MODEL_DIR = os.getenv("AIP_MODEL_DIR")
# Booster parameters handed to xgb.dask.train. `verbose_eval` is an argument of
# the train function rather than a booster parameter, so it is not listed here;
# passing it in this dict only triggers an "unused parameter" warning.
# NOTE(review): 'reg:squarederror' is a regression objective while the target
# file presumably holds class labels; confirm this is intended.
XGB_PARAMS = {
    'verbosity': 2,
    'learning_rate': 0.1,
    'max_depth': 8,
    'objective': 'reg:squarederror',
    'subsample': 0.6,
    'gamma': 1,
    'tree_method': 'hist',
    'nthread': 1
}


def square(x):
    """Return ``x`` raised to the power of two."""
    return pow(x, 2)

def neg(x):
    """Return the negation of ``x``."""
    negated = -x
    return negated

def launch(cmd):
    """Run ``cmd`` through the shell and wait for it to exit.

    The command's stdout and stderr are forwarded to this process's own
    streams. It is used to start both the dask scheduler and the dask workers.

    Args:
        cmd: Shell command line to execute.

    Returns:
        The command's exit status, which is 0 whenever this function returns.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    exit_status = subprocess.check_call(
        cmd,
        shell=True,
        stdout=sys.stdout,
        stderr=sys.stderr,
    )
    return exit_status

def get_chief_ip(cluster_config_dict):
    """Return the host part of the first ``workerpool0`` entry.

    Args:
        cluster_config_dict: Parsed cluster spec; its
            ``['cluster']['workerpool0'][0]`` entry is read and everything
            before the first ':' is taken as the address.

    Returns:
        The address string, which is also printed.
    """
    chief_endpoint = cluster_config_dict['cluster']['workerpool0'][0]
    ip_address, _, _ = chief_endpoint.partition(":")
    print('The ip address of workerpool 0 is : {}'.format(ip_address))
    return ip_address

def get_chief_port(cluster_config_dict):
    """Return the first entry of ``open_ports`` from the cluster spec.

    Args:
        cluster_config_dict: Parsed cluster spec containing an ``open_ports``
            list.

    Returns:
        The first listed port, unchanged; it is also printed.
    """
    open_port = cluster_config_dict['open_ports'][0]
    print("The open port is: {}".format(open_port))
    return open_port

def split_gcs_uri(uri):
    """Split a ``gs://bucket/prefix`` URI into ``(bucket_name, prefix)``.

    The returned prefix is either empty or ends with exactly one '/', so a
    file name can be appended to it directly whether or not the input URI had
    a trailing slash.

    Args:
        uri: Cloud Storage URI such as ``gs://my-bucket/output/model/``.

    Returns:
        Tuple of the bucket name and the normalized object prefix.

    Raises:
        ValueError: If ``uri`` is empty, is not a ``gs://`` URI, or has no
            bucket name.
    """
    scheme = "gs://"
    if not uri or not uri.startswith(scheme):
        raise ValueError("Expected a gs:// URI but got: {!r}".format(uri))
    bucket_name, _, prefix = uri[len(scheme):].partition("/")
    if not bucket_name:
        raise ValueError("No bucket name found in: {!r}".format(uri))
    prefix = prefix.strip("/")
    return bucket_name, (prefix + "/" if prefix else "")


if __name__ == '__main__':
    # CLUSTER_SPEC describes the cluster and which pool this replica belongs to.
    cluster_config_str = os.environ.get('CLUSTER_SPEC')
    cluster_config_dict = json.loads(cluster_config_str)
    print(json.dumps(cluster_config_dict, indent=2))
    print('The workerpool type is:', flush=True)
    print(cluster_config_dict['task']['type'], flush=True)
    workerpool_type = cluster_config_dict['task']['type']
    chief_ip = get_chief_ip(cluster_config_dict)
    chief_port = get_chief_port(cluster_config_dict)
    chief_address = "{}:{}".format(chief_ip, chief_port)

    if workerpool_type == "workerpool0":
        # Resolve the upload target first so an unusable AIP_MODEL_DIR fails
        # before any training time is spent.
        bucket_name, folder = split_gcs_uri(MODEL_DIR)
        blob_name = '{}{}'.format(folder, MODEL_FILE)

        # The trailing '&' backgrounds the scheduler, so launch() returns
        # immediately and this process goes on to drive the training.
        print('Running the dask scheduler.', flush=True)
        launch('dask-scheduler --dashboard --dashboard-address 8888 --port {} &'.format(chief_port))
        print('Done the dask scheduler.', flush=True)

        client = Client(chief_address)
        print('Waiting the scheduler to be connected.', flush=True)
        client.wait_for_workers(1)

        # persist() returns a new collection backed by cluster memory; the
        # result has to be kept, otherwise wait() has nothing to wait for and
        # the CSVs are read again during training.
        X = dd.read_csv(IRIS_DATA_FILENAME, header=None).persist()
        y = dd.read_csv(IRIS_TARGET_FILENAME, header=None).persist()
        wait(X)
        wait(y)
        dtrain = DaskDMatrix(client, X, y)

        output = xgb.dask.train(client, XGB_PARAMS, dtrain, num_boost_round=100, evals=[(dtrain, 'train')])
        print("Output: {}".format(output), flush=True)
        print("Saving file to: {}".format(MODEL_FILE), flush=True)
        output['booster'].save_model(MODEL_FILE)

        # Upload the locally saved booster to the model directory.
        bucket = storage.Client().bucket(bucket_name)
        print("Uploading file to: {}/{}".format(bucket_name, blob_name), flush=True)
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(MODEL_FILE)
        print("Saved file to: gs://{}/{}".format(bucket_name, blob_name), flush=True)

        client.shutdown()

    else:
        print('Running the dask worker.', flush=True)
        # Connect to the scheduler first (timeout=1200) before starting the
        # worker process, which then runs in the foreground.
        client = Client(chief_address, timeout=1200)
        print('client: {}.'.format(client), flush=True)
        launch('dask-worker {}'.format(chief_address))
        print('Done with the dask worker.', flush=True)


### Write the docker file

In [None]:
%%writefile Dockerfile
FROM us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-9:latest
WORKDIR /root

# Network troubleshooting tools. `apt-get update` and `apt-get install` share a
# single RUN so a cached, stale package index is never reused for the install;
# the index is removed afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y telnet netcat iputils-ping net-tools && \
    rm -rf /var/lib/apt/lists/*

# Dask, XGBoost and Cloud Storage filesystem support used by trainer/train.py.
# NOTE(review): dask-cuda looks unused by the trainer on this CPU image; confirm
# before removing it.
RUN python3.8 -m pip install dask==2022.7.1 distributed==2022.7.1 bokeh==2.1.1 dask-cuda  --upgrade
RUN python3.8 -m pip install 'xgboost>=1.4.2' 'dask-ml[complete]==2022.5.27'
RUN python3.8 -m pip install gcsfs --upgrade


## Make sure gsutil will use the default service account
RUN echo '[GoogleCompute]\nservice_account = default' > /etc/boto.cfg

# Copies the trainer code
RUN mkdir /root/trainer
COPY trainer/train.py /root/trainer/train.py

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python3.8", "trainer/train.py"]


## Create a custom training job

### Build a custom training container

#### Enable Artifact Registry API
You must enable the Artifact Registry API for your project. You will store your custom training container in Artifact Registry.

<a href="https://cloud.google.com/artifact-registry/docs/enable-service">Learn more about Enabling service</a>.


In [None]:
# Enable the Artifact Registry API for the project gcloud is configured to use.
! gcloud services enable artifactregistry.googleapis.com

### Create a private Docker repository
Your first step is to create a Docker repository in Artifact Registry.

1 - Run the `gcloud artifacts repositories create` command to create a new Docker repository in your region with the description `Docker repository`.

2 - Run the `gcloud artifacts repositories list` command to verify that your repository was created.

In [None]:
# Name of the Artifact Registry repository that will hold the training image.
PRIVATE_REPO = "my-docker-repo"

# Create a Docker-format repository named PRIVATE_REPO in REGION.
! gcloud artifacts repositories create {PRIVATE_REPO} --repository-format=docker --location={REGION} --description="Docker repository"

# List repositories so the new one can be verified.
! gcloud artifacts repositories list

In [None]:
# Full Artifact Registry path of the training image:
# <region>-docker.pkg.dev/<project>/<repository>/dask_support
DEPLOY_IMAGE = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{PRIVATE_REPO}/dask_support"
print("Deployment:", DEPLOY_IMAGE)

## Authenticate Docker to your repository
### Configure authentication to your private repo
Before you can push or pull container images to or from your Artifact Registry repository, you must configure Docker to use the gcloud command-line tool to authenticate requests to Artifact Registry for your region. On Colab, you'll have to use Cloud Build because the docker command is not available.

In [None]:
# IS_COLAB is set by the authentication cell earlier in the notebook.
# Outside Colab, register gcloud as Docker's credential helper for the
# regional Artifact Registry host; --quiet suppresses the confirmation prompt.
if not IS_COLAB:
    ! gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet

### Set the custom Docker container image
Set the custom Docker container image.

1. Build the image from the `Dockerfile`; Docker pulls the CPU base image that it references.
2. Tag the image with its Artifact Registry path (the `-t` flag does this as part of the build).
3. Push the image to Artifact Registry.

In [None]:
# Outside Colab, build the image locally from ./Dockerfile, tagged with its
# Artifact Registry path, then push it. IS_COLAB comes from the authentication
# cell earlier in the notebook.
if not IS_COLAB:
    ! docker build -t $DEPLOY_IMAGE -f Dockerfile .
    ! docker push $DEPLOY_IMAGE

## Build and push the custom docker container image by using Cloud Build

Build and push a Docker image with Cloud Build

In [None]:
# On Colab, submit the current directory to Cloud Build, which builds the image
# and tags it as DEPLOY_IMAGE. The build is given a 1800-second timeout.
if IS_COLAB:
    !  gcloud builds submit --timeout=1800s --region={REGION} --tag $DEPLOY_IMAGE

### Initialize Vertex AI SDK

In [None]:
# Set the project, region and staging bucket that the Vertex AI SDK calls in
# the following cells will use.
aiplatform.init(
    project=PROJECT_ID,
    staging_bucket=BUCKET_URI,
    location=REGION,
)

### Run a Vertex AI SDK CustomContainerTrainingJob

In [None]:
# Cloud Storage prefix passed as the job's base output directory; later cells
# list and delete the artifacts under it.
gcs_output_uri_prefix = f"{BUCKET_URI}/output"
# Number of replicas requested for the job. trainer/train.py waits for one dask
# worker before it starts training.
replica_count = 2
machine_type = "n1-standard-4"
display_name = "test_display_name"

# Define the job around the image built above.
# NOTE(review): the serving image is a TensorFlow prediction container while
# the trainer saves an XGBoost booster (model.bst); confirm this is the
# intended serving container.
custom_container_training_job = aiplatform.CustomContainerTrainingJob(
    display_name=display_name,
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest",
    container_uri=DEPLOY_IMAGE,
)

# Submit the job with the output location, replica count and machine type above.
custom_container_training_job.run(
    base_output_dir=gcs_output_uri_prefix,
    replica_count=replica_count,
    machine_type=machine_type,
)

In [None]:
# Show the job's resource name and the Cloud Storage prefix that was passed as
# its base output directory.
print(f"Custom Training Job Name: {custom_container_training_job.resource_name}")
print(f"GCS Output URI Prefix: {gcs_output_uri_prefix}")

### View training output artifact

In [None]:
# List the objects under the model/ folder of the job's output prefix.
! gsutil ls $gcs_output_uri_prefix/model/

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Cloud Storage Bucket

In [None]:
# Set this to true only if you'd like to delete your bucket
delete_bucket = False

# The training output prefix is always removed, whatever delete_bucket is.
! gsutil rm -rf $gcs_output_uri_prefix

# The bucket itself is removed only when delete_bucket is set or IS_TESTING is
# present in the environment. `os` is not imported in this cell, so it relies
# on the import made by an earlier cell.
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI