In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model Training formalization with Ray and KFP on Vertex AI

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ray_on_vertex_ai/ray_train/model_training_formalization_with_ray_and_kfp.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fray_on_vertex_ai%2Fray_train%2Fmodel_training_formalization_with_ray_and_kfp.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/ray_on_vertex_ai/ray_train/model_training_formalization_with_ray_and_kfp.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/tree/main/notebooks/official/ray_on_vertex_ai/ray_train/model_training_formalization_with_ray_and_kfp.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

## Overview

This tutorial demonstrates how to orchestrate a distributed training using Ray on Vertex AI and Vertex AI Pipelines for an XGBoost model.

Learn more about [Ray on Vertex AI overview](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/overview).

### Objective

In this notebook, you learn how to build a KFP pipeline component to preprocess, train and register an XGBoost model using Ray Train on Vertex AI.

This tutorial uses the following Google Cloud ML services and resources:

- Cloud Storage
- BigQuery
- Ray on Vertex AI
- Vertex AI Pipelines
- Vertex AI Model Registry

The steps performed include:

- Build a `VertexRayJob` pipeline component
- Preprocess data
- Build features
- Train model
- Register model on Model Registry

### Dataset

The [Chicago Taxi Trips dataset](https://cloud.google.com/bigquery/public-data/) is one of the public datasets hosted on BigQuery. It includes taxi trips from 2013 to the present, reported to the City of Chicago in its role as a regulatory agency. The taxi_trips table size is 70.72 GB and includes more than 195 million records. The dataset includes information about the trips, like pickup and dropoff datetime and location, passengers count, miles travelled, and trip toll.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* BigQuery
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing)
and [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Before you begin

### Install Vertex AI SDK for Python and other required packages


In [None]:
! pip3 install --upgrade --quiet google-cloud-bigquery google-cloud-aiplatform[ray] kfp google-cloud-pipeline-components
! pip3 install --upgrade --quiet etils

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# The restart is only triggered when the notebook runs inside Google Colab.
if "google.colab" in sys.modules:
    from IPython import Application

    # Ask the running kernel to shut down with restart enabled.
    app = Application.instance()
    app.kernel.do_shutdown(restart=True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Interactive sign-in is only performed when running inside Google Colab.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

Set your project ID.


In [None]:
# Google Cloud project ID used by the cells below (bucket, BigQuery, Vertex AI).
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

# Make this project the active one for the gcloud CLI.
! gcloud config set project {PROJECT_ID}

Indicate the project number.

In [None]:
# Project number; used below to build the Compute Engine default service account email.
PROJECT_NUMBER = "[your-project-number]"  # @param {type:"string"}

Set the region used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used when creating the bucket and when initializing the Vertex AI SDK.
REGION = "us-central1"  # @param {type: "string"}

Create a storage bucket to store intermediate artifacts such as datasets.

In [None]:
# Name of the Cloud Storage bucket that stores intermediate artifacts.
# The string must be an f-string: without the `f` prefix the literal text
# "{PROJECT_ID}" would end up in the bucket name instead of the project ID.
BUCKET_NAME = f"your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

# gs:// URI of the bucket, used by gsutil and by the Vertex AI SDK.
BUCKET_URI = f"gs://{BUCKET_NAME}"  # @param {type:"string"}

In [None]:
# Create the bucket in the selected region and project.
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

Specify a service account with the following permissions:

-   `Vertex AI User` to call Vertex AI APIs
-   `Storage Admin` to manage GCS bucket.
-   `Storage Object Admin` to read and write to your GCS bucket.

[Check out the documentation](https://cloud.google.com/iam/docs/manage-access-service-accounts#iam-view-access-sa-gcloud) to know how to grant those permissions to a single service account.


In [None]:
# Service account used in this tutorial. Leave the placeholder (or an empty
# value) to fall back to the Compute Engine default service account.
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

# Only apply the default when no account was provided, so that a value entered
# above is not silently overwritten.
if not SERVICE_ACCOUNT or SERVICE_ACCOUNT == "[your-service-account]":
    SERVICE_ACCOUNT = f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"

In [None]:
# Grant the service account the Storage Admin role on the project.
! gcloud projects add-iam-policy-binding {PROJECT_ID} \
    --member="serviceAccount:{SERVICE_ACCOUNT}" \
    --role="roles/storage.admin"

# Grant the Storage Object Admin role.
! gcloud projects add-iam-policy-binding {PROJECT_ID} \
    --member="serviceAccount:{SERVICE_ACCOUNT}" \
    --role="roles/storage.objectAdmin"

# Grant the Vertex AI Administrator role (roles/aiplatform.admin).
! gcloud projects add-iam-policy-binding {PROJECT_ID} \
    --member="serviceAccount:{SERVICE_ACCOUNT}" \
    --role="roles/aiplatform.admin"

# Grant the Service Account User role.
! gcloud projects add-iam-policy-binding {PROJECT_ID} \
    --member="serviceAccount:{SERVICE_ACCOUNT}" \
    --role="roles/iam.serviceAccountUser"

### Set workspace

Create a workspace to store pipelines deliverables.

In [None]:
from etils import epath

# Folder inside the bucket that holds every deliverable of this tutorial.
WORKSPACE_FOLDER_URI = epath.Path(BUCKET_URI).joinpath("chicago_taxitrips")

# Create the folder (and any missing parents); a pre-existing folder is accepted.
WORKSPACE_FOLDER_URI.mkdir(parents=True, exist_ok=True)

### Prepare the BigQuery dataset

You create a BigQuery dataset to extract, transform and load the ML dataset used for training the model.

In [None]:
from google.cloud import bigquery


def create_bq_dataset(dataset_id, project_id, location):
    """Create a BigQuery dataset, reusing it when it already exists.

    Args:
        dataset_id: ID of the dataset to create.
        project_id: Project used to build the BigQuery client.
        location: Location used to build the BigQuery client.

    Returns:
        The dataset object returned by the BigQuery client.
    """
    client = bigquery.Client(project=project_id, location=location)
    # The dataset is addressed as "<client project>.<dataset_id>".
    requested = bigquery.Dataset(f"{client.project}.{dataset_id}")
    created = client.create_dataset(requested, exists_ok=True)
    print(f"Created dataset {created.dataset_id}!")
    return created


def run_bq_query(sql, project_id, location):
    """Run a BigQuery query and block until it has finished.

    Args:
        sql: SQL statement to execute.
        project_id: Project used to build the BigQuery client.
        location: Location used to build the BigQuery client.

    Returns:
        None. The query result is awaited but not returned.
    """
    client = bigquery.Client(project=project_id, location=location)
    default_config = bigquery.QueryJobConfig()
    query_job = client.query(sql, job_config=default_config)
    # Wait for completion.
    query_job.result()

In [None]:
# BigQuery location: the part of REGION before the first hyphen
# (for example "us" for "us-central1").
# NOTE(review): other regions yield prefixes such as "europe" or "asia" —
# confirm these are accepted BigQuery locations before changing REGION.
LOCATION = REGION.partition("-")[0]

# Destination dataset and table for the training data.
DATASET_ID, TABLE_ID = "rov_dataset", "chicago_taxitrips"

# Year of trips to keep and maximum number of rows to copy.
YEAR, LIMIT = 2023, 100000

In [None]:
# Create the destination dataset; the returned dataset object is not used afterwards.
_ = create_bq_dataset(DATASET_ID, PROJECT_ID, LOCATION)

In [None]:
# Build the training table from the public Chicago taxi trips table:
# - time features are extracted from trip_start_timestamp,
# - pickup/dropoff points are snapped to a 0.1 grid and also concatenated (loc_cross),
# - `euclidean` is the ST_DISTANCE between pickup and dropoff points,
# - `tip_bin` is 1 when tips / fare >= 0.2 and 0 otherwise (the label),
# - `ML_use` marks roughly 80% of rows 'UNASSIGNED' and the rest 'TEST' using RAND(),
# - rows need non-null coordinates, positive miles/seconds/fare and the selected YEAR,
# - at most LIMIT rows are kept (no ORDER BY, so the selected rows are not fixed).
SQL_QUERY = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}` AS
SELECT
    trip_start_timestamp,
    EXTRACT(MONTH FROM trip_start_timestamp) AS trip_month,
    EXTRACT(DAY FROM trip_start_timestamp) AS trip_day,
    EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_day_of_week,
    EXTRACT(HOUR FROM trip_start_timestamp) AS trip_hour,
    trip_seconds,
    trip_miles,
    payment_type,
    ST_ASTEXT(ST_SNAPTOGRID(ST_GEOGPOINT(pickup_longitude, pickup_latitude), 0.1)) AS pickup_grid,
    ST_ASTEXT(ST_SNAPTOGRID(ST_GEOGPOINT(dropoff_longitude, dropoff_latitude), 0.1)) AS dropoff_grid,
    ST_DISTANCE(ST_GEOGPOINT(pickup_longitude, pickup_latitude), ST_GEOGPOINT(dropoff_longitude, dropoff_latitude)) AS euclidean,
    CONCAT(ST_ASTEXT(ST_SNAPTOGRID(ST_GEOGPOINT(pickup_longitude, pickup_latitude), 0.1)), ST_ASTEXT(ST_SNAPTOGRID(ST_GEOGPOINT(dropoff_longitude, dropoff_latitude), 0.1))) AS loc_cross,
    IF((tips / fare >= 0.2), 1, 0) AS tip_bin,
    IF(RAND() <= 0.8, 'UNASSIGNED', 'TEST') AS ML_use
FROM
    `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE
    pickup_longitude IS NOT NULL
    AND pickup_latitude IS NOT NULL
    AND dropoff_longitude IS NOT NULL
    AND dropoff_latitude IS NOT NULL
    AND trip_miles > 0
    AND trip_seconds > 0
    AND fare > 0
    AND EXTRACT(YEAR FROM trip_start_timestamp) = {YEAR}
LIMIT {LIMIT}
"""
# Run the statement and wait for it to finish (run_bq_query returns nothing).
_ = run_bq_query(SQL_QUERY, PROJECT_ID, LOCATION)

### Set a Ray cluster on Vertex AI

Before running the code below, make sure to [set up](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/set-up) Ray on Vertex AI and [create](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/create-cluster) at least one Ray cluster on Vertex AI.

In [None]:
import vertex_ray
from google.cloud import aiplatform as vertex_ai
from vertex_ray import Resources

#### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project.

In [None]:
# Set the default project, region and staging bucket for subsequent Vertex AI SDK calls.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

#### Define cluster configuration

To provision a Ray cluster on Vertex AI, you can use a default provisioning request or you can specify the replica count (number of nodes), machine type, disk_spec, and accelerator as needed.

In [None]:
# Ray component
# Container image used for the Ray cluster nodes below and as the base image
# of the Ray-based pipeline components.
VERTEX_RAY_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest"  # @param ["us-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest", "europe-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest", "asia-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest"] {allow-input: true}

In [None]:
# Head node: a single n1-standard-16 machine running the Ray image.
head_node_type = Resources(
    custom_image=VERTEX_RAY_IMAGE_URI,
    machine_type="n1-standard-16",
    node_count=1,
)

# Worker pool: two n1-standard-16 machines with no accelerators attached.
worker_node_types = [
    Resources(
        custom_image=VERTEX_RAY_IMAGE_URI,
        machine_type="n1-standard-16",
        node_count=2,
        accelerator_count=0,
        accelerator_type=None,
    )
]

#### Create the Ray cluster

Create the Ray cluster using the Vertex AI SDK for Python version used with Ray.

In [None]:
# Name given to the Ray cluster created in the next cell.
cluster_name = "ray-cluster-train-pipeline-tutorial"

In [None]:
# Create the cluster from the head/worker specifications defined above.
# The returned value is stored and later passed to get_ray_cluster.
ray_cluster_name = vertex_ray.create_ray_cluster(
    head_node_type=head_node_type,
    worker_node_types=worker_node_types,
    cluster_name=cluster_name,
)

#### Get the Ray cluster

Use the Vertex AI SDK for Python to get the Ray cluster.

In [None]:
# Look up the cluster using the value returned by create_ray_cluster.
ray_cluster = vertex_ray.get_ray_cluster(ray_cluster_name)

In [1]:
from pprint import pprint

# Pretty-print the cluster object returned by get_ray_cluster.
pprint(ray_cluster)

Pretty printing has been turned OFF


## Build a Ray-based ML pipeline using Vertex AI Pipelines

In this tutorial, you build and run a simple ML pipeline using the KFP SDK on Vertex AI Pipelines.

The pipeline covers the following main tasks:

- `create_tabular_dataset_task` to store the ML dataset as Vertex AI Tabular dataset.

- `run_ray_training_task` to submit a Ray Train job on a Ray on Vertex AI cluster.

- `get_eval_metrics_task` to collect evaluation metrics to use in a blessing condition to register the model in Vertex AI Model Registry.

- `upload_model_task` to version the model in Vertex AI Model Registry.


### Import libraries

Import the required libraries.

In [None]:
import random
import sys
# General
import uuid
from typing import NamedTuple

# ML Pipeline
import kfp
import ray
from google_cloud_pipeline_components.types.artifact_types import (
    VertexDataset, VertexModel)
from google_cloud_pipeline_components.v1.dataset import TabularDatasetCreateOp
from kfp import compiler, dsl
from kfp.dsl import Condition, Input, Metrics, Output, component

In [None]:
# Report the versions of the main libraries and of the interpreter.
for caption, version in (
    ("Ray version:", ray.__version__),
    ("Python version:", sys.version),
    ("Kfp version:", kfp.__version__),
):
    print(caption, version)

### Define constants

Define constants for the tutorial.

In [None]:
# Pipeline
# Name of the pipeline and local file path used for its definition.
PIPELINE_NAME = "chicago-taxitrips-train-pipeline"
PIPELINE_FILE_PATH = "pipeline.json"

# Workspace sub-folders: training source code, pipeline artifacts and models.
WORKING_URI = WORKSPACE_FOLDER_URI.joinpath("src")
ARTIFACT_STORE = WORKSPACE_FOLDER_URI.joinpath("artifacts")
MODEL_STORE = ARTIFACT_STORE.joinpath("models")

# Pipeline root is kept as a plain string.
PIPELINE_ROOT = str(ARTIFACT_STORE.joinpath(PIPELINE_NAME))

### Define helpers

Define helpers to use in the tutorial.

In [None]:
def get_id(n=5):
    """Generate a random string of lowercase hexadecimal letters and digits.

    Args:
        n: Number of characters to return (0 to 32).

    Returns:
        A string of `n` characters drawn without replacement from the
        characters of a freshly generated UUID4.

    Raises:
        ValueError: If `n` is negative or larger than 32.
    """
    # Sample from the hyphen-free `.hex` form so the result never contains "-",
    # which the hyphenated `str(uuid4())` form would allow.
    return "".join(random.sample(uuid.uuid4().hex, n))

### Build Ray-based ML pipeline components

You start building the components of your Ray-based ML pipeline.

#### Build Dataset component

The `GetDatasetUriOp` component gets a [`VertexDataset`](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.0.0/api/artifact_types.html), an artifact representing a Vertex AI Dataset resource, and returns its BigQuery URI.


In [None]:
@dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        "kfp",
        "google-cloud-aiplatform",
        "google-cloud-pipeline-components",
    ],
)
def GetDatasetUriOp(
    dataset: Input[VertexDataset], project: str, region: str, bucket_uri: str
) -> NamedTuple("outputs", dataset_uri=str):
    """Return the BigQuery source of a Vertex AI tabular dataset.

    Args:
        dataset: Input artifact whose metadata carries the dataset `resourceName`.
        project: Project used to initialize the Vertex AI SDK.
        region: Location used to initialize the Vertex AI SDK.
        bucket_uri: Staging bucket used to initialize the Vertex AI SDK.

    Returns:
        A named tuple with `dataset_uri`: the dataset's BigQuery source URI
        with the "bq://" prefix removed.
    """

    import logging

    from google.cloud import aiplatform as vertex_ai
    from google.protobuf.json_format import MessageToDict

    # Configure logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
    )

    # Initialize the Vertex AI SDK
    logging.info("Initializing Vertex AI SDK.")
    vertex_ai.init(project=project, location=region, staging_bucket=bucket_uri)

    # Resolve the Vertex AI dataset referenced by the input artifact
    resource_name = dataset.metadata["resourceName"]
    logging.info(
        f"Getting dataset resource name from Vertex AI dataset '{resource_name}'."
    )
    tabular_dataset = vertex_ai.TabularDataset(dataset_name=resource_name)

    # Walk the dataset resource down to its BigQuery source URI
    dataset_fields = MessageToDict(tabular_dataset.gca_resource._pb)
    bigquery_source = dataset_fields["metadata"]["inputConfig"]["bigquerySource"]
    table_id = bigquery_source["uri"].replace("bq://", "")

    outputs = NamedTuple("outputs", dataset_uri=str)
    return outputs(table_id)

#### Build a `VertexRayJobOp` component to run training job using Ray Jobs API

To orchestrate a Ray Job in a Vertex AI Pipeline, you need to create a KFP pipeline component which takes a Python script and submits the script to an existing Ray cluster on Vertex AI using the Ray Jobs API.

##### Build the `VertexRayJobOp` component

The `VertexRayJobOp` component submits the job through the Ray Jobs API using the public Ray dashboard address. **It is important to highlight that this address is accessible from outside the VPC, including the public internet. Use this component for experimentation only**. For production applications, consider setting up connectivity from Vertex AI to your VPC network and be sure to deploy the Ray on Vertex AI cluster in the same network.

In [None]:
@dsl.component(
    base_image=VERTEX_RAY_IMAGE_URI,
    packages_to_install=["google-cloud-aiplatform[ray]"],
)
def VertexRayJobOp(
    cluster_name: str,
    entrypoint: str,
    runtime_env: dict,
) -> NamedTuple("outputs", job_id=str):
    """Submit a Ray job to a Ray cluster on Vertex AI and wait until it finishes.

    Args:
        cluster_name: Cluster identifier appended to the `vertex_ray://` address.
        entrypoint: Command line executed as the Ray job.
        runtime_env: Ray runtime environment passed to the job submission.

    Returns:
        A named tuple with `job_id`: the ID of the submitted Ray job.

    Raises:
        RuntimeError: If the job ends in the FAILED or STOPPED state.
        Exception: Any error raised while polling the job status is logged and re-raised.
    """

    # Import Libraries
    import logging
    import time

    from ray.job_submission import JobStatus, JobSubmissionClient

    # Configure logging
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    poll_interval_seconds = 60
    cluster_address = f"vertex_ray://{cluster_name}"

    # Initialize Ray Job client
    ray_job_client = JobSubmissionClient(cluster_address)
    logging.info(f"Initialized Ray Job client for cluster '{cluster_name}'.")

    # Submit Ray Job
    job_id = ray_job_client.submit_job(entrypoint=entrypoint, runtime_env=runtime_env)
    logging.info(f"Submitted Ray job with ID '{job_id}'.")

    # Monitor Ray Job until it reaches a terminal state
    while True:
        try:
            # The client is re-created on every poll.
            # NOTE(review): presumably to pick up fresh credentials on long-running jobs — confirm.
            ray_job_client = JobSubmissionClient(cluster_address)
            job_status = ray_job_client.get_job_status(job_id)
        except Exception:
            logging.exception(f"An error occurred while monitoring the job '{job_id}'.")
            raise

        logging.info(f"Job '{job_id}' status: {job_status.value}")

        if job_status == JobStatus.SUCCEEDED:
            logging.info(f"Job '{job_id}' succeeded!")
            break

        # FAILED and STOPPED are both terminal: without handling STOPPED the
        # loop would keep polling forever.
        if job_status in (JobStatus.FAILED, JobStatus.STOPPED):
            job_logs = ray_job_client.get_job_logs(job_id)
            logging.error(
                f"Job '{job_id}' ended with status {job_status.value}! Logs: {job_logs}"
            )
            raise RuntimeError(f"Job '{job_id}' ended with status {job_status.value}.")

        time.sleep(poll_interval_seconds)

    component_outputs = NamedTuple("outputs", job_id=str)
    return component_outputs(job_id)

##### Prepare the training component code

Write the Python script to train the model using Ray.

In [None]:
# Source of the training script that is uploaded to the bucket and executed as a
# Ray job. It reads the training rows from BigQuery, scales the numeric features,
# ordinal-encodes the categorical features and trains an XGBoost model with Ray Train.
training_script = """

# Libraries
import argparse
from uuid import uuid4

import ray
from ray.runtime_env import RuntimeEnv
from ray.data import preprocessors
from ray.train.xgboost import XGBoostTrainer, XGBoostCheckpoint
from ray.train import ScalingConfig, RunConfig, CheckpointConfig



def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--bq_dataset_uri', type=str, required=True)
    parser.add_argument('--training_dir', type=str, required=True)
    parser.add_argument('--training_run_id', type=str, required=True)
    parser.add_argument('--project_id', type=str, required=True)
    parser.add_argument('--region', type=str, required=True)
    parser.add_argument('--bucket_uri', type=str, required=True)
    parser.add_argument('--test_size', type=float, default=0.2)
    parser.add_argument('--seed', type=int, default=8)
    parser.add_argument('--objective', type=str, default='binary:logistic')
    parser.add_argument('--eval_metric', type=str, default='error')
    parser.add_argument('--eta', type=float, default=0.01)
    parser.add_argument('--max_depth', type=int, default=30)
    parser.add_argument('--subsample', type=float, default=0.2)
    parser.add_argument('--n_estimators', type=int, default=100)
    # Default to False so that an explicit boolean (not None) reaches ScalingConfig.
    parser.add_argument('--use_gpu', action=argparse.BooleanOptionalAction, default=False)
    args = parser.parse_args()
    return args


if __name__ == '__main__':

    ###################################
    # Get arguments and set variables #
    ###################################

    args = get_args()

    ##########################
    # Initialize ray session #
    ##########################

    ray.init()
    ctx = ray.data.DataContext.get_current()
    ctx.execution_options.preserve_order = True

    #############
    # Read data #
    #############

    query = f'''
      SELECT
        IF(trip_month IS NULL, -1, trip_month) trip_month,
        IF(trip_day IS NULL, -1, trip_day) trip_day,
        IF(trip_day_of_week IS NULL, -1, trip_day_of_week) trip_day_of_week,
        IF(trip_hour IS NULL, -1, trip_hour) trip_hour,
        IF(trip_seconds IS NULL, -1, trip_seconds) trip_seconds,
        IF(trip_miles IS NULL, -1, trip_miles) trip_miles,
        IF(payment_type IS NULL, 'NA', payment_type) payment_type,
        IF(pickup_grid IS NULL, 'NA', pickup_grid) pickup_grid,
        IF(dropoff_grid IS NULL, 'NA', dropoff_grid) dropoff_grid,
        IF(euclidean IS NULL, -1, euclidean) euclidean,
        IF(loc_cross IS NULL, 'NA', loc_cross) loc_cross,
        tip_bin
        FROM `{args.bq_dataset_uri}`
        WHERE ML_use = 'UNASSIGNED'
      '''

    bq_dataset = ray.data.read_bigquery(
      query=query,
      project_id=args.project_id,
    )

    ################################################
    # Train/test splitting and feature engineering #
    ################################################

    # train/test splitting
    bq_train_dataset, bq_valid_dataset = bq_dataset.train_test_split(
        test_size=args.test_size, seed=args.seed
    )

    # min_max_scaling (fitted on the training split only)
    min_max_scaler = preprocessors.MinMaxScaler(
        columns=["trip_seconds", "trip_miles", "euclidean"]
    )
    min_max_scaler.fit(bq_train_dataset)
    train_dataset = min_max_scaler.transform(bq_train_dataset)
    valid_dataset = min_max_scaler.transform(bq_valid_dataset)

    # ordinal encoding, applied on top of the scaled datasets so that the
    # min-max scaling above is not discarded
    ordinal_encoder = preprocessors.OrdinalEncoder(
        columns=["trip_month", "trip_day", "trip_hour", "pickup_grid",
                 "dropoff_grid", "loc_cross", "payment_type", "trip_day_of_week"]
    )
    ordinal_encoder.fit(train_dataset)
    train_dataset = ordinal_encoder.transform(train_dataset)
    valid_dataset = ordinal_encoder.transform(valid_dataset)


    ###############
    # Train model #
    ###############

    # xgboost configuration
    xgboost_config = {
        'objective': args.objective,
        'eval_metric': [args.eval_metric],
        'eta': args.eta,
        'max_depth': args.max_depth,
        'subsample': args.subsample
    }

    additional_config = {
        'num_boost_round': args.n_estimators
    }

    # scaling config
    scaling_config = ScalingConfig(
        num_workers=1,
        use_gpu=args.use_gpu
    )

    # run config
    run_config = RunConfig(
      storage_path=args.training_dir,
      checkpoint_config=CheckpointConfig(
        num_to_keep=5
      ),
      name=args.training_run_id,
    )

    # train model
    trainer = XGBoostTrainer(
        scaling_config=scaling_config,
        run_config=run_config,
        label_column='tip_bin',
        params=xgboost_config,
        datasets={'train': train_dataset, 'valid': valid_dataset},
        **additional_config
    )

    xgb_result = trainer.fit()
"""

##### Upload the training code to the Cloud Bucket

Upload the training code to the Cloud Bucket.

In [None]:
# Write the training script into the workspace source folder. The `with` block
# closes the file on exit, so no explicit close() call is needed.
with (WORKING_URI / "train.py").open("w") as train_file:
    train_file.write(training_script)

#### Build Model Evaluation component

The `GetMetricsOp` component reads the evaluation metrics (error) and stores them as a Metrics artifact.

In [None]:
@dsl.component(
    base_image=VERTEX_RAY_IMAGE_URI,
    packages_to_install=[
        "google-cloud-aiplatform[ray]",
        "ray[data]==2.9.3",
        "ray[train]==2.9.3",
        "kfp",
        "google-cloud-pipeline-components",
        "etils",
        "importlib_resources",
    ],
)
def GetMetricsOp(
    training_dir: str, training_run_id: str, metrics: Output[Metrics]
) -> NamedTuple("outputs", threshold_metric=float):
    """Get evaluation metrics from a training run.

    Args:
        training_dir: Directory of the training runs; a "gs://" prefix is
            rewritten to "/gcs/" before the results are read.
        training_run_id: Name of the run folder inside `training_dir`.
        metrics: Output Metrics artifact that receives every result column
            whose name contains "error".

    Returns:
        A named tuple with `threshold_metric`: the "valid-error" value of the
        first result record, rounded to 5 decimals.

    Raises:
        ValueError: If the run has no results or the first record has no
            "valid-error" value.
    """

    import logging

    from etils import epath
    from ray.tune import ExperimentAnalysis

    metric_keyword = "error"
    threshold_metric_name = "valid-error"

    # Configure logging
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    # restore training result
    logging.info("Restoring training result.")
    training_dir = training_dir.replace("gs://", "/gcs/")
    training_id_path = epath.Path(training_dir) / training_run_id
    experiment_analysis = ExperimentAnalysis(training_id_path)
    results = experiment_analysis.results_df.reset_index().to_dict(orient="records")

    if not results:
        raise ValueError(f"No training results found in '{training_id_path}'.")

    # log metrics and store threshold metrics
    logging.info("Logging metrics and store threshold metrics.")
    for result in results:
        for key, value in result.items():
            if metric_keyword in key:
                metrics.log_metric(key, round(value, 5))

    # Keep the metric name and its value in separate variables and fail with a
    # clear message when the metric is missing.
    threshold_value = results[0].get(threshold_metric_name)
    if threshold_value is None:
        raise ValueError(
            f"Metric '{threshold_metric_name}' not found in the results of '{training_id_path}'."
        )

    component_outputs = NamedTuple("outputs", threshold_metric=float)
    return component_outputs(round(threshold_value, 5))

#### Build Model Versioning component

The `UploadRayModelOp` component collects the best training checkpoint and stores the model in Vertex AI Model Registry, with or without an explainability configuration.

In [None]:
@component(
    base_image=VERTEX_RAY_IMAGE_URI,
    packages_to_install=[
        "google-cloud-aiplatform[ray]",
        "ray[data]==2.9.3",
        "ray[train]==2.9.3",
        "xgboost==2.0.3",
        "xgboost_ray==0.1.19",
        "kfp",
        "google-cloud-pipeline-components",
        "etils",
        "importlib_resources",
    ],
)
def UploadRayModelOp(
    training_dir: str,
    training_run_id: str,
    dataset_uri: str,
    model_uri: str,
    explain: bool,
    project: str,
    region: str,
    bucket_uri: str,
    model: Output[VertexModel],
) -> NamedTuple("outputs", model_uri=str):
    """Upload the best training checkpoint to Vertex AI Model Registry.

    Args:
        training_dir: Directory of the training runs; a "gs://" prefix is
            rewritten to "/gcs/" before the results are read.
        training_run_id: Name of the run folder inside `training_dir`; also
            stored as the "experiment" label.
        dataset_uri: Dataset identifier; the part after the last "." is stored
            as the "dataset" label.
        model_uri: Artifact URI passed to the model registration.
        explain: Whether to register the model with an explanation configuration.
        project: Project used to initialize the Vertex AI SDK.
        region: Location used to initialize the Vertex AI SDK.
        bucket_uri: Staging bucket used to initialize the Vertex AI SDK.
        model: Output artifact whose `uri` is set to the registered model URI.

    Returns:
        A named tuple with `model_uri`: the URI of the registered model.

    Raises:
        RuntimeError: If no trial or checkpoint reports the "valid-error" metric.
    """

    import logging
    import shutil
    from pathlib import Path

    from etils import epath
    from google.cloud import aiplatform as vertex_ai
    from ray.train.xgboost import XGBoostCheckpoint
    from ray.tune import ExperimentAnalysis
    from vertex_ray.predict import xgboost as vertex_xgboost

    threshold_metric = "valid-error"
    mode = "min"

    def get_explanation_config():
        """Build the explanation inputs, outputs and parameters."""
        input_names = [
            "trip_month",
            "trip_day",
            "trip_day_of_week",
            "trip_hour",
            "trip_seconds",
            "trip_miles",
            "payment_type",
            "pickup_grid",
            "dropoff_grid",
            "euclidean",
            "loc_cross",
        ]
        return {
            "inputs": {input_name: {} for input_name in input_names},
            "outputs": {"predictions": {}},
            "params": {"sampled_shapley_attribution": {"path_count": 10}},
        }

    def get_model_path(checkpoint_path):
        """Copy the checkpoint's model file into ./model and return that folder."""

        model_filename = "model.json"

        # Root the checkpoint directory at the /gcs/ mount: any "/gcs/" already
        # present is removed first so the prefix is not duplicated. The path is
        # built before joining the file name (str + Path is not a valid operation).
        local_checkpoint_dir = str(checkpoint_path).replace("/gcs/", "")
        source_model_filepath = Path("/gcs/" + local_checkpoint_dir) / model_filename

        # Create the model destination path
        destination_model_path = Path.cwd() / "model"
        destination_model_path.mkdir(parents=True, exist_ok=True)

        # Copy the model
        destination_model_filepath = destination_model_path / model_filename
        shutil.copy(source_model_filepath, destination_model_filepath)

        return destination_model_path

    # Configure logging
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    # Initialize the Vertex AI SDK
    logging.info("Initializing Vertex AI SDK.")
    vertex_ai.init(project=project, location=region, staging_bucket=bucket_uri)

    # Restore training result
    logging.info("Restoring training result.")
    training_dir = training_dir.replace("gs://", "/gcs/")
    training_id_path = epath.Path(training_dir) / training_run_id
    experiment_analysis = ExperimentAnalysis(training_id_path)

    # Load the best xgb_model_checkpoint
    logging.info("Loading the best xgb_model_checkpoint.")
    best_trial = experiment_analysis.get_best_trial(metric=threshold_metric, mode=mode)
    if best_trial is None:
        raise RuntimeError(
            f"No trial reporting '{threshold_metric}' found in '{training_id_path}'."
        )
    xgb_model_checkpoint = experiment_analysis.get_best_checkpoint(
        best_trial, metric=threshold_metric, mode=mode
    )
    if xgb_model_checkpoint is None:
        raise RuntimeError(
            f"No checkpoint reporting '{threshold_metric}' found in '{training_id_path}'."
        )

    # Register the model
    logging.info("Registering the model.")
    with xgb_model_checkpoint.as_directory() as local_checkpoint_dir:

        destination_model_path = get_model_path(local_checkpoint_dir)

        # Arguments shared by both registration variants.
        register_kwargs = {
            "checkpoint": XGBoostCheckpoint.from_directory(destination_model_path),
            "artifact_uri": model_uri,
            "labels": {
                "dataset": dataset_uri.split(".")[-1],
                "experiment": training_run_id,
            },
        }

        if explain:
            explanation_config = get_explanation_config()
            register_kwargs["explanation_metadata"] = (
                vertex_ai.explain.ExplanationMetadata(
                    inputs=explanation_config["inputs"],
                    outputs=explanation_config["outputs"],
                )
            )
            register_kwargs["explanation_parameters"] = (
                vertex_ai.explain.ExplanationParameters(explanation_config["params"])
            )

        registered_model = vertex_xgboost.register_xgboost(**register_kwargs)

    model.uri = registered_model.uri

    component_outputs = NamedTuple("outputs", model_uri=str)
    return component_outputs(registered_model.uri)

### Build a ML pipeline

Define your workflow using Kubeflow Pipelines DSL package by assembling components.

In [None]:
# Chicago Taxi Trips training pipeline.
#
# Steps, in execution order:
#   1. create a Vertex AI tabular dataset from the BigQuery source,
#   2. resolve that dataset into the `dataset_uri` handed to training,
#   3. submit the training entrypoint as a job on the Ray cluster,
#   4. read back the evaluation metric of the training run,
#   5. upload the model only when the metric is below `threshold`.
#
# NOTE: documentation is kept in comments on purpose; KFP reads a pipeline
# docstring into the compiled spec's description.
@kfp.dsl.pipeline(name="Chicago Taxi Trips Pipeline", pipeline_root=str(PIPELINE_ROOT))
def pipeline(
    bq_dataset_name: str,
    bq_dataset_source: str,
    training_entrypoint: str,
    training_dir: str,
    training_run_id: str,
    cluster_name: str,
    runtime_env: dict,
    threshold: float,
    model_uri: str,
    explain: bool,
    project: str,
    region: str,
    bucket_uri: str,
):
    # Step 1: dataset creation.
    dataset_task = TabularDatasetCreateOp(
        display_name=bq_dataset_name,
        bq_source=bq_dataset_source,
    )
    dataset_task.set_display_name("Create Vertex AI BigQuery dataset")

    # Step 2: turn the dataset artifact into the URI consumed downstream.
    dataset_uri_task = GetDatasetUriOp(
        dataset=dataset_task.outputs["dataset"],
        project=project,
        region=region,
        bucket_uri=bucket_uri,
    )
    dataset_uri_task.set_display_name("Get BigQuery table")
    dataset_uri_task.after(dataset_task)
    dataset_uri = dataset_uri_task.outputs["dataset_uri"]

    # Step 3: Ray training job. The command line is the caller-supplied
    # entrypoint followed by the pipeline-level arguments.
    ray_entrypoint = f"""{training_entrypoint} --bq_dataset_uri={dataset_uri} \
                                        --training_dir={training_dir} \
                                        --training_run_id={training_run_id} \
                                        --project_id={project} \
                                        --region={region} \
                                        --bucket_uri={bucket_uri}"""
    training_task = VertexRayJobOp(
        entrypoint=ray_entrypoint,
        cluster_name=cluster_name,
        runtime_env=runtime_env,
    )
    training_task.set_display_name("Run Ray Training")
    training_task.after(dataset_uri_task)

    # Step 4: metrics lookup. There is no data dependency on the training
    # task, so the explicit ordering is what makes this wait for training.
    metrics_task = GetMetricsOp(
        training_dir=training_dir,
        training_run_id=training_run_id,
    )
    metrics_task.set_display_name("Get Metrics")
    metrics_task.after(training_task)

    # Step 5: gate the upload on the metric being strictly below the threshold.
    is_blessed = metrics_task.outputs["threshold_metric"] < threshold
    with Condition(is_blessed, name="Blessing condition"):
        upload_task = UploadRayModelOp(
            training_dir=training_dir,
            training_run_id=training_run_id,
            dataset_uri=dataset_uri,
            model_uri=model_uri,
            explain=explain,
            project=project,
            region=region,
            bucket_uri=bucket_uri,
        )
        upload_task.set_display_name("Upload Model")
        upload_task.after(metrics_task)

### Compile your pipeline into a YAML file

Compile the pipeline into YAML format. The YAML file includes all the information for executing your pipeline on Vertex AI Pipelines.


In [None]:
# Compile the `pipeline` function and write the result to PIPELINE_FILE_PATH.
compiler.Compiler().compile(pipeline_func=pipeline, package_path=PIPELINE_FILE_PATH)

### Run your pipeline

Use the Vertex AI Python client to submit and run your pipeline as a Vertex AI Pipelines job.

#### Set the pipeline parameters

You set the following parameters to run the pipeline.

In [None]:
# Unique identifier for the pipeline run, generated using `get_id()`.
ID = get_id()

# Path to the working directory used for the pipeline: the `gs://` prefix of
# WORKING_URI is rewritten to `/gcs/`.
WORKING_DIR = str(WORKING_URI).replace("gs://", "/gcs/")

# Specifies the runtime environment for the Ray Train job.
RUNTIME_ENV = {
    "working_dir": WORKING_DIR,
    "pip": [
        "google-cloud-bigquery-storage",
        "google-cloud-aiplatform[ray]",
        "ray[data]==2.9.3",
        "ray[train]==2.9.3",
        "xgboost==2.0.3",
        "xgboost_ray==0.1.19",
        "etils",
        "importlib_resources",
    ],
}

# Display name of the Vertex AI tabular dataset created by the pipeline.
VERTEX_BQ_DATASET_NAME = f"chicago_taxitrips_{ID}"

# The fully qualified BigQuery URI pointing to the data source used for training.
VERTEX_BQ_DATASET_URI = f"bq://{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"

# Directory passed as `training_dir` to the training job and to the pipeline
# components that read the training run back.
TRAINING_DIR = str(ARTIFACT_STORE / "train_jobs")

# Unique ID for the training run, generated using the `ID`.
TRAINING_RUN_ID = f"chicago-taxitrips-xgboost-run-{ID}"

# The fraction of data to be used for testing.
TEST_SIZE = 0.2

# Random seed for reproducibility.
SEED = 8

# Specifies the learning objective for the XGBoost model (in this case, binary classification).
OBJECTIVE = "binary:logistic"

# The evaluation metric used to monitor model performance during training.
EVAL_METRIC = "error"

# The learning rate for the XGBoost model.
ETA = 0.01

# Maximum depth of trees in the XGBoost model.
MAX_DEPTH = 30

# Fraction of the training data to be used for each tree.
SUBSAMPLE = 0.2

# Number of trees to build in the XGBoost model.
N_ESTIMATORS = 100

# Specifies whether to use a GPU for training on Ray on Vertex AI (set to `False` here).
USE_GPU = False

# The entrypoint command to execute the training script (`train.py`) on Ray on Vertex AI cluster.
# The GPU switch is emitted as `--use_gpu` or `--no-use_gpu` depending on USE_GPU.
TRAINING_ENTRYPOINT = f"""python3 train.py --test_size={TEST_SIZE} --seed={SEED} \
                        --objective={OBJECTIVE} --eval_metric={EVAL_METRIC} \
                        --eta={ETA} --max_depth={MAX_DEPTH} \
                        --subsample={SUBSAMPLE} --n_estimators={N_ESTIMATORS} \
                        {'--use_gpu' if USE_GPU else '--no-use_gpu'}"""

# The dashboard address of the Ray cluster where the training runs.
CLUSTER_NAME = ray_cluster.dashboard_address

# The blessing threshold on training performance: the model is registered in
# Vertex AI Model Registry only when the evaluation metric is below this value.
THRESHOLD = 0.5

# The location to store the trained XGBoost model in the Vertex AI artifact store on Google Cloud bucket.
MODEL_URI = str(MODEL_STORE / f"chicago-taxitrips-xgboost-model-{ID}")

# Flag to enable Vertex AI Prediction Explainability configuration to deploy the model.
EXPLAIN = True

#### Prepare the pipeline job

Instantiate the pipeline job with the required configuration.


In [None]:
# Runtime arguments for the compiled pipeline; the keys match the parameter
# names of the `pipeline` function one-to-one.
parameter_values = {
    "bq_dataset_name": VERTEX_BQ_DATASET_NAME,
    "bq_dataset_source": VERTEX_BQ_DATASET_URI,
    "training_entrypoint": TRAINING_ENTRYPOINT,
    "training_dir": TRAINING_DIR,
    "training_run_id": TRAINING_RUN_ID,
    "cluster_name": CLUSTER_NAME,
    "runtime_env": RUNTIME_ENV,
    "threshold": THRESHOLD,
    "model_uri": MODEL_URI,
    "explain": EXPLAIN,
    "project": PROJECT_ID,
    "region": REGION,
    "bucket_uri": BUCKET_URI,
}

# Build the job from the compiled YAML template.
pipeline_job = vertex_ai.PipelineJob(
    display_name="Chicago Taxi Trips Pipeline job",
    template_path=PIPELINE_FILE_PATH,
    # Coerce to a plain string, as the @kfp.dsl.pipeline decorator does;
    # this is a no-op when PIPELINE_ROOT is already a str.
    pipeline_root=str(PIPELINE_ROOT),
    parameter_values=parameter_values,
    enable_caching=True,
)

#### Run the pipeline

Run the pipeline job.

In [None]:
# Submit the pipeline job prepared in the previous cell.
pipeline_job.run()

### Get the pipeline run

Use the Vertex AI SDK for Python to collect information about your pipeline runs.

In [None]:
# Look up the runs by pipeline name. NOTE(review): the name used here is
# presumably the normalized form of "Chicago Taxi Trips Pipeline" given to the
# @kfp.dsl.pipeline decorator; keep the two in sync.
pipeline_runs_df = vertex_ai.get_pipeline_df(pipeline="chicago-taxi-trips-pipeline")
# Show only the first rows.
pipeline_runs_df.head()

## Cleaning up

In [None]:
# Opt-in cleanup switches: every flag defaults to False so that running this
# cell deletes nothing. Set a flag to True to remove the matching resource.
delete_pipeline_job = False
delete_ray_cluster = False
delete_bucket = False

# Delete the pipeline job created in this notebook.
if delete_pipeline_job:
    pipeline_job.delete()

# Delete the Ray cluster that was used for training.
if delete_ray_cluster:
    vertex_ray.delete_ray_cluster(ray_cluster.cluster_resource_name)

# Recursively remove everything under BUCKET_URI (IPython shell escape).
if delete_bucket:
    !gsutil -m rm -r $BUCKET_URI