In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Ray on Vertex AI cluster management


<table align="left">

  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ray_on_vertex_ai/ray_cluster_management.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> <br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fofficial%2Fray_on_vertex_ai%2Fray_cluster_management.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"> <br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/ray_on_vertex_ai/ray_cluster_management.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"> <br>
      Open in Vertex AI Workbench
    </a>
  </td>
<td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/ray_on_vertex_ai/ray_cluster_management.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"> <br>
      View on GitHub
    </a>
  </td>
</table>

## Overview

This tutorial demonstrates how to use Ray on Vertex AI SDK for cluster management.

Learn more about [Ray on Vertex AI overview](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/overview).

### Objective

In this tutorial, you learn how to create a cluster, list existing clusters, get a cluster, update (manually scale) a cluster, create an autoscaling cluster, and delete a cluster.

This tutorial uses the following Vertex AI services and resources:

- [Ray on Vertex AI](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/overview)


The steps performed include:

- Create a cluster.
- List existing clusters.
- Get a cluster.
- Manually scale up the cluster, then scale down the cluster.
- Create a cluster with autoscaling and trigger it with a Ray job.
- Delete existing clusters.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI

Learn about [Ray on Vertex AI pricing](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/overview#pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Getting Started

### Install Vertex AI SDK and other required packages

In [None]:
! pip3 install --upgrade --quiet google-cloud-aiplatform[ray]

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# The restart is only performed when the "google.colab" module is loaded,
# i.e. when this notebook runs on Colab; in any other runtime the cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    # Ask the running kernel to shut down, passing True as in the original call.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.


In [None]:
import sys

# Authentication is only triggered on Colab (detected through the loaded
# "google.colab" module); in other runtimes this cell is a no-op.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Retrieve the project number
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]

from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

### Set network information

[Set up a VPC peering network](https://cloud.google.com/vertex-ai/docs/general/vpc-peering) and private services connection to access Vertex AI.

In [None]:
VPC_NETWORK = "[your-network-name]"  # @param {type:"string"}

# Full network resource path, assembled from the project number and the
# network name; the bare name on the last line displays it as the cell output.
VPC_NETWORK_FULL = f"projects/{PROJECT_NUMBER}/global/networks/{VPC_NETWORK}"
VPC_NETWORK_FULL

### Import libraries

In [None]:
import time

import vertex_ray
from ray.job_submission import JobStatus, JobSubmissionClient

## Create a cluster

Note that within the same VPC network, IP ranges restrict the numbers of clusters and nodes you can create.

In [None]:
# Head node pool: one n1-standard-16 machine.
head_node_type = vertex_ray.Resources(machine_type="n1-standard-16", node_count=1)

# Single worker pool: n1-standard-8 machines configured with the
# NVIDIA_TESLA_T4 accelerator type and an accelerator count of 1.
gpu_worker_pool = vertex_ray.Resources(
    machine_type="n1-standard-8",
    node_count=2,  # Can be > 1
    accelerator_type="NVIDIA_TESLA_T4",
    accelerator_count=1,
)
worker_node_types = [gpu_worker_pool]

# Create the cluster on the VPC network configured earlier and keep the
# returned resource name; the following cells use it to get and update the cluster.
cluster_resource_name = vertex_ray.create_ray_cluster(
    head_node_type=head_node_type,
    worker_node_types=worker_node_types,
    network=VPC_NETWORK_FULL,
)

## List existing clusters

In [None]:
# List Ray clusters through the Ray on Vertex AI SDK; the bare name on the
# last line shows the result as the cell output.
clusters = vertex_ray.list_ray_clusters()
clusters

## Scale Ray clusters on Vertex AI

### Update an existing cluster (manually scaling)

There are two options for scaling Ray clusters on Vertex AI: Autoscaling and manual scaling.

With Manual scaling, you manually update the maximum number of worker nodes you can scale up. Manual scaling gives users more granular control of the nodes.

Note that the maximum number of worker nodes you can scale up to depends on the initial node counts (more details are in these [formulas](https://cloud.google.com/vertex-ai/docs/open-source/ray-on-vertex-ai/set-up)) and is restricted by IP ranges within the same VPC network.

Get the cluster you want to scale.

In [None]:
# Fetch the cluster by the resource name stored when it was created; the
# scaling cells below read its worker pools from `cluster.worker_node_types`.
cluster = vertex_ray.get_ray_cluster(cluster_resource_name)
cluster

Scale down workers from 2 nodes to 1 node.

In [None]:
# Take the worker pools of the fetched cluster and set each one to 1 node.
new_worker_node_types = list(cluster.worker_node_types)
for worker_pool in new_worker_node_types:
    worker_pool.node_count = 1

# Send the updated pools to the service; the returned resource name replaces
# the stored one so later cells keep working with it.
cluster_resource_name = vertex_ray.update_ray_cluster(
    worker_node_types=new_worker_node_types,
    cluster_resource_name=cluster_resource_name,
)

Verify that the cluster is successfully scaled down.

In [None]:
# Re-fetch the cluster after the update so the displayed worker pools come
# from the service rather than from the locally modified objects.
cluster = vertex_ray.get_ray_cluster(cluster_resource_name)
cluster

Scale up to 2 worker nodes.

In [None]:
# Take the worker pools of the fetched cluster and set each one to 2 nodes.
new_worker_node_types = list(cluster.worker_node_types)
for worker_pool in new_worker_node_types:
    worker_pool.node_count = 2

# Send the updated pools to the service; the returned resource name replaces
# the stored one so later cells keep working with it.
cluster_resource_name = vertex_ray.update_ray_cluster(
    worker_node_types=new_worker_node_types,
    cluster_resource_name=cluster_resource_name,
)

Verify that the cluster is successfully scaled up.

In [None]:
# Re-fetch the cluster after the update so the displayed worker pools come
# from the service rather than from the locally modified objects.
cluster = vertex_ray.get_ray_cluster(cluster_resource_name)
cluster

### Autoscaling

Autoscaling lets the cluster automatically adjust the number of worker nodes based on the resources required by, for example, Ray tasks and actors.

Autoscaling is recommended if you are running a heavy workload and are unsure of the resources needed.

#### Create a new cluster with autoscaling

To enable Ray cluster's autoscaling, set the minimum replica count (min_replica_count) and maximum replica count (max_replica_count) of a worker pool.


In [None]:
# Head node pool: one n1-standard-16 machine.
head_node_type = vertex_ray.Resources(machine_type="n1-standard-16", node_count=1)

# Replica bounds for the worker pool: minimum 1, maximum 3.
autoscaling_spec = vertex_ray.AutoscalingSpec(min_replica_count=1, max_replica_count=3)

# The worker pool carries the autoscaling spec instead of a fixed node_count.
autoscaled_worker_pool = vertex_ray.Resources(
    machine_type="n1-standard-16",
    autoscaling_spec=autoscaling_spec,
)
worker_node_types = [autoscaled_worker_pool]

# Create the Ray cluster on Vertex AI and keep the returned resource name.
cluster_resource_name = vertex_ray.create_ray_cluster(
    cluster_name="my-autoscaling-cluster",
    head_node_type=head_node_type,
    worker_node_types=worker_node_types,
)

#### Get the Ray cluster

After you create the autoscaling cluster, you use the Ray on Vertex AI API to get the cluster.

In [None]:
# Keep the full listing: the cleanup cell at the end of the notebook iterates
# over `ray_clusters`.
ray_clusters = vertex_ray.list_ray_clusters()

# Look the cluster up by the resource name returned by create_ray_cluster
# rather than by its position in the listing: the listing may contain other
# clusters and its order is not something this notebook controls, so
# `ray_clusters[-1]` could point at a different cluster (or fail on an empty list).
ray_cluster_resource_name = cluster_resource_name
ray_cluster = vertex_ray.get_ray_cluster(ray_cluster_resource_name)

### Develop an application using the Ray Jobs API

To trigger the autoscaling, you develop a Ray application representing a heavy workload.


In [None]:
%%writefile my_heavy_workload.py
import ray
import time

# Initialize Ray
ray.init()

# Define a computationally intensive task
@ray.remote(num_cpus=1)
def heavy_task(x):
    """
    Simulates a heavy workload by performing a CPU-bound operation.
    This example calculates the sum of squares for a range of numbers.
    """
    total = 0
    for i in range(x):
        total += i * i
    time.sleep(1)  # Simulate some work duration
    return total

# Generate a large number of tasks
num_tasks = 1000
results = []
for i in range(num_tasks):
    results.append(heavy_task.remote(1000000))

# Retrieve results (this will trigger autoscaling if needed)
outputs = ray.get(results)

# Print the sum of the results (optional)
print(f"Sum of results: {sum(outputs)}")

# Terminate the process
ray.shutdown()

### Submit a Ray job using the Ray Jobs API

Submit the Ray job using the Ray Jobs API through the public Ray dashboard address.

In [None]:
# Job submission client addressed through the cluster's dashboard address,
# using the vertex_ray:// scheme.
dashboard_url = f"vertex_ray://{ray_cluster.dashboard_address}"
ray_client = JobSubmissionClient(dashboard_url)

# Runtime environment for the job: the current directory is the working
# directory, and ray==2.33 is listed as a pip requirement.
job_runtime_env = {"working_dir": ".", "pip": ["ray==2.33"]}

# Submit the workload script; the returned id is used to poll the job status.
job_id = ray_client.submit_job(
    entrypoint="python3 my_heavy_workload.py",
    runtime_env=job_runtime_env,
)

As soon as you submit the job, go to the [Ray on Vertex AI page](https://console.cloud.google.com/vertex-ai/ray) to see how the cluster is being updated. You can monitor the autoscaling process using either Cloud Logging or the public Ray dashboard.

> Custom upscaling and downscaling speed is not supported. For default values, see [Upscaling and downscaling speed](https://docs.ray.io/en/latest/cluster/vms/user-guides/configuring-autoscaling.html#upscaling-and-downscaling-speed) in the Ray documentation.

### Monitor the status of the job

You can use the Ray Jobs API to monitor the status of the job.

In [None]:
# Message to print for each status that ends the polling loop. STOPPED is
# included so that a job that was stopped does not keep the loop polling forever.
terminal_messages = {
    JobStatus.SUCCEEDED: "Job succeeded!",
    JobStatus.FAILED: "Job failed!",
    JobStatus.STOPPED: "Job stopped!",
}

# Poll the job once a minute until it reaches one of the statuses above.
while True:
    job_status = ray_client.get_job_status(job_id)
    if job_status in terminal_messages:
        print(terminal_messages[job_status])
        break
    print("Job is running...")
    time.sleep(60)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

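Otherwise, you can delete the clusters used in this tutorial. Note that the cell below deletes every cluster returned by the earlier `vertex_ray.list_ray_clusters()` call, which may include clusters in the project that this tutorial did not create.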

In [None]:
# Safety switch: nothing is deleted unless this is changed to True.
delete_ray_cluster = False

if delete_ray_cluster:
    # NOTE(review): `ray_clusters` holds everything returned by the earlier
    # vertex_ray.list_ray_clusters() call, so this loop deletes every listed
    # cluster, not only the ones created by this tutorial — confirm that this
    # is intended before enabling the switch.
    for cluster in ray_clusters:
        vertex_ray.delete_ray_cluster(cluster.cluster_resource_name)