In [1]:
# Imports
import ray
from ray.job_submission import JobSubmissionClient, JobStatus
import time

In [2]:
# Connection settings for the Ray head service: compose the HTTP address from
# the service hostname and port, then create the job-submission client that
# the following cells use.
ray_head_ip = "kuberay-head-svc.kuberay.svc.cluster.local"
ray_head_port = 8265
ray_address = "http://{}:{}".format(ray_head_ip, ray_head_port)
client = JobSubmissionClient(address=ray_address)

In [3]:
# Submit the example script as a Ray job through the JobSubmissionClient.
# The current directory is uploaded as the job's working_dir, and one CPU and
# one GPU are requested for the entrypoint command.
job_id = client.submit_job(
    entrypoint="python ray-gpu-example.py",
    runtime_env={
        "working_dir": "./"
    },
    entrypoint_num_gpus=1,
    entrypoint_num_cpus=1,
)

print(f"Ray job submitted with job_id: {job_id}")

# Poll every 5 seconds until the job is no longer PENDING or RUNNING.
status = client.get_job_status(job_id)
while status in (JobStatus.PENDING, JobStatus.RUNNING):
    time.sleep(5)
    status = client.get_job_status(job_id)

# Report how the job ended so a failed or stopped job is not mistaken for a
# successful one when only the logs are printed.
print(f"Ray job {job_id} finished with status: {status}")

# Fetch and print the job's logs.
try:
    print(client.get_job_logs(job_id))
except RuntimeError as e:
    # Surface the underlying error instead of discarding it, so the reader
    # knows why the logs could not be retrieved.
    print(f"Failed to get job logs, please check logs on ray dashboard: {e}")

2024-03-13 20:17:12,319	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_cadba0a68a4c4235.zip.
2024-03-13 20:17:12,321	INFO packaging.py:530 -- Creating a file package for local directory './'.


Ray job submitted with job_id: raysubmit_qLSWwYQC4S3TrxsV
2024-03-13 13:21:58.893801: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-13 13:21:59.007216: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-13 13:21:59.741023: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/lo

In [None]:
# Disconnect from the Ray cluster.
# NOTE(review): no ray.init() call appears in the visible cells (the job is
# submitted through JobSubmissionClient over HTTP), so this call may have
# nothing to disconnect — confirm whether it is still needed.
ray.shutdown()