In [4]:
# Imports
import ray
from ray.job_submission import JobSubmissionClient, JobStatus
import time

In [2]:
# Job-submission endpoint of the Ray head service: host name and port of the
# HTTP address that JobSubmissionClient is pointed at.
ray_head_port = 8265
ray_head_ip = "kuberay-head-svc.kuberay.svc.cluster.local"
ray_address = f"http://{ray_head_ip}:{ray_head_port}"

# Client object used by the later cells to submit and monitor jobs.
client = JobSubmissionClient(address=ray_address)

In [7]:
# Submit the GPU example script as a Ray job through the JobSubmissionClient.
# The runtime environment uploads the current directory as the working dir and
# pip-installs a pinned TensorFlow; the entrypoint requests 1 GPU and 1 CPU.
job_id = client.submit_job(
    entrypoint="python ray-gpu-example.py",
    runtime_env={
        "working_dir": "./",
        "pip": ["tensorflow==2.13.0"],
        # Add proxy in env_vars if required
        "env_vars": {},
    },
    entrypoint_num_gpus=1,
    entrypoint_num_cpus=1,
)

print(f"Ray job submitted with job_id: {job_id}")

# Poll every 5 seconds until the job is no longer PENDING/RUNNING.
# JobStatus.is_terminal() replaces the hand-written list of non-terminal states.
status = client.get_job_status(job_id)
while not status.is_terminal():
    time.sleep(5)
    status = client.get_job_status(job_id)

# Report the outcome explicitly so a failed or stopped job is not mistaken for
# a successful one when only the logs are skimmed.
print(f"Ray job {job_id} finished with status: {status.value}")

# Fetch and print the job's logs; if retrieval fails, surface the reason
# instead of discarding the exception.
try:
    print(client.get_job_logs(job_id))
except RuntimeError as e:
    print(f"Failed to get job logs, please check logs on ray dashboard: {e}")

2024-07-15 06:07:37,381	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_b78492fdea11c7d4.zip.
2024-07-15 06:07:37,382	INFO packaging.py:530 -- Creating a file package for local directory './'.


Ray job submitted with job_id: raysubmit_SJBrv3Cb9DGXn4PN
2024-07-14 23:07:37,403	INFO job_manager.py:530 -- Runtime env is setting up.
2024-07-14 23:08:28.541383: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-14 23:08:28.582176: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Num GPUs Available:  1
TensorFlow will run on GPU.
2024-07-14 23:08:32.217517: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3234 MB memory:  -> device: 0, name: 

In [None]:
# Disconnect from the Ray cluster.
# NOTE(review): no ray.init() call appears in the visible cells — the job is
# submitted through JobSubmissionClient instead — so this call presumably has
# no active Ray connection to tear down; confirm whether it is still needed.
ray.shutdown()