In [1]:
import ray
import tensorflow as tf

from ray.air import session
from ray.air.integrations.keras import Callback
from ray.train.tensorflow import TensorflowTrainer
from ray.air.config import ScalingConfig

2023-04-06 23:15:46.583781: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Switch for GPU-backed training workers; leave as True when the cluster
# has GPUs, otherwise set it to False.
use_gpu = True

# NOTE(review): a, b and size are not referenced by any cell visible in this
# notebook excerpt -- confirm whether later cells use them before removing.
a, b, size = 5, 10, 100

In [3]:
def build_model() -> tf.keras.Model:
    """Create the uncompiled Keras network used for the regression example.

    The model accepts one scalar per example (input shape ``()``) and stacks
    a ``Flatten`` layer, a 10-unit ``Dense`` layer and a 1-unit ``Dense``
    output layer.

    Returns:
        A ``tf.keras.Sequential`` model. Compiling it is left to the caller.
    """
    layer_stack = [
        tf.keras.layers.InputLayer(input_shape=()),
        # Flatten supplies the feature dimension: (batch_size,) becomes
        # (batch_size, 1) before reaching the Dense layers.
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(10),
        tf.keras.layers.Dense(1),
    ]
    return tf.keras.Sequential(layer_stack)


def train_func(config: dict):
    """Training loop executed by each training worker.

    Builds and compiles the model under a ``MultiWorkerMirroredStrategy``
    scope, fetches this worker's shard of the ``"train"`` dataset from the
    session, and fits the model once per epoch.

    Args:
        config: Hyperparameter mapping. Keys read here, with the fallback
            used when a key is absent: ``"batch_size"`` (64), ``"epochs"``
            (3) and ``"lr"`` (1e-3).

    Returns:
        A list holding the ``history.history`` value of every ``fit`` call,
        one entry per epoch, in epoch order.
    """
    batch_size = config.get("batch_size", 64)
    num_epochs = config.get("epochs", 3)

    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Both building and compiling the model have to happen inside
        # `strategy.scope()`.
        model = build_model()
        sgd = tf.keras.optimizers.SGD(learning_rate=config.get("lr", 1e-3))
        model.compile(
            optimizer=sgd,
            loss=tf.keras.losses.mean_squared_error,
            metrics=[tf.keras.metrics.mean_squared_error],
        )

    train_shard = session.get_dataset_shard("train")

    def run_one_epoch():
        # The tf.data view of the shard is rebuilt for every epoch, and each
        # fit() call gets its own instance of the Ray Keras callback.
        batches = train_shard.to_tf(
            feature_columns="x", label_columns="y", batch_size=batch_size
        )
        return model.fit(batches, callbacks=[Callback()]).history

    return [run_one_epoch() for _ in range(num_epochs)]

In [4]:
# Connect to the cluster through its `ray://` address (port 10001 on the
# KubeRay head service). Kept as the last expression so the cell displays
# the connection details.
ray.init(
    address="ray://example-cluster-kuberay-head-svc.ray.svc.cluster.local:10001"
)

0,1
Python version:,3.10.9
Ray version:,2.3.0
Dashboard:,http://10.8.7.8:8265


In [5]:
# Hyperparameters passed to the trainer as its training-loop config.
config = dict(lr=1e-3, batch_size=32, epochs=4)

# 200 rows of a noiseless linear relation: x = i / 200 and y = 2 * i / 200.
training_rows = [{"x": x / 200, "y": 2 * x / 200} for x in range(200)]
train_dataset = ray.data.from_items(training_rows)

# Two training workers; whether they use GPUs follows the `use_gpu` flag.
scaling_config = ScalingConfig(use_gpu=use_gpu, num_workers=2)



In [6]:
# Wire the per-worker loop, its hyperparameters, the worker layout and the
# "train" dataset into a single trainer object.
trainer = TensorflowTrainer(
    train_loop_per_worker=train_func,
    datasets={"train": train_dataset},
    scaling_config=scaling_config,
    train_loop_config=config,
)

# Run training, then show the metrics attached to the returned result.
result = trainer.fit()
print(result.metrics)

[2m[36m(TunerInternal pid=798)[0m 2023-04-06 16:16:12.231770: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(TunerInternal pid=798)[0m 2023-04-06 16:16:12.234316: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
[2m[36m(TunerInternal pid=798)[0m 2023-04-06 16:16:12.291769: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
[2m[36m(TunerInternal pid=798)[0m 2023-04-06 16:16:12.292305: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[2m[36m(TunerInternal pid=798)[0m To enable the following instructions: AVX2 AVX512F AVX512_VNNI

0,1
Current time:,2023-04-06 16:16:33
Running for:,00:00:18.51
Memory:,3.2/83.5 GiB

Trial name,status,loc,iter,total time (s),loss,mean_squared_error,_timestamp
TensorflowTrainer_072e8_00000,TERMINATED,10.8.7.8:999,4,10.117,0.928824,0.928824,1680822991


[2m[36m(pid=999)[0m 2023-04-06 16:16:18.754198: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(pid=999)[0m 2023-04-06 16:16:18.777738: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
[2m[36m(pid=999)[0m 2023-04-06 16:16:18.830744: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
[2m[36m(pid=999)[0m 2023-04-06 16:16:18.831326: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[2m[36m(pid=999)[0m To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate com

      2/Unknown - 1s 55ms/step - loss: 3.3150 - mean_squared_error: 3.3150
      2/Unknown - 1s 55ms/step - loss: 3.3150 - mean_squared_error: 3.3150
      3/Unknown - 1s 76ms/step - loss: 3.0522 - mean_squared_error: 3.0522
      3/Unknown - 1s 75ms/step - loss: 3.0522 - mean_squared_error: 3.0522
      5/Unknown - 2s 63ms/step - loss: 2.7236 - mean_squared_error: 2.7236
      5/Unknown - 1s 63ms/step - loss: 2.7236 - mean_squared_error: 2.7236
[2m[36m(TunerInternal pid=798)[0m Result for TensorflowTrainer_072e8_00000:
[2m[36m(TunerInternal pid=798)[0m   _time_this_iter_s: 2.651543140411377
[2m[36m(TunerInternal pid=798)[0m   _timestamp: 1680822988
[2m[36m(TunerInternal pid=798)[0m   _training_iteration: 1
[2m[36m(TunerInternal pid=798)[0m   date: 2023-04-06_16-16-28
[2m[36m(TunerInternal pid=798)[0m   done: false
[2m[36m(TunerInternal pid=798)[0m   experiment_id: f2d05e60392044ee96706251fcdd2a67
[2m[36m(TunerInternal pid=798)[0m   hostname: example-cluster-kub

[2m[36m(RayTrainWorker pid=213, ip=10.8.8.8)[0m 2023-04-06 16:16:28.862638: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
[2m[36m(RayTrainWorker pid=213, ip=10.8.8.8)[0m 	 [[{{node Placeholder/_0}}]]
[2m[36m(RayTrainWorker pid=1146)[0m 2023-04-06 16:16:28.894461: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
[2m[36m(RayTrainWorker pid=1146)[0m 	 [[{{node Placeholder/_0}}]]
[2m[36m(RayTrainWorker pid=1146)[0m 2023-04-06 16:16:28.989445: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborti

      1/Unknown - 0s 85ms/step - loss: 3.0830 - mean_squared_error: 3.0830
      2/Unknown - 0s 87ms/step - loss: 2.9205 - mean_squared_error: 2.9205
      2/Unknown - 0s 87ms/step - loss: 2.9205 - mean_squared_error: 2.9205
      6/Unknown - 0s 38ms/step - loss: 2.3748 - mean_squared_error: 2.3748
      6/Unknown - 0s 38ms/step - loss: 2.3748 - mean_squared_error: 2.3748


[2m[36m(RayTrainWorker pid=213, ip=10.8.8.8)[0m 2023-04-06 16:16:29.639782: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
[2m[36m(RayTrainWorker pid=213, ip=10.8.8.8)[0m 	 [[{{node Placeholder/_0}}]]
[2m[36m(RayTrainWorker pid=1146)[0m 2023-04-06 16:16:29.678426: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
[2m[36m(RayTrainWorker pid=1146)[0m 	 [[{{node Placeholder/_0}}]]
[2m[36m(RayTrainWorker pid=213, ip=10.8.8.8)[0m 2023-04-06 16:16:29.685200: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor 

      1/Unknown - 0s 96ms/step - loss: 2.7288 - mean_squared_error: 2.7288
      1/Unknown - 0s 97ms/step - loss: 2.7288 - mean_squared_error: 2.7288
      3/Unknown - 0s 52ms/step - loss: 2.3659 - mean_squared_error: 2.3659
      6/Unknown - 0s 40ms/step - loss: 2.0840 - mean_squared_error: 2.0840
      6/Unknown - 0s 40ms/step - loss: 2.0840 - mean_squared_error: 2.0840


[2m[36m(RayTrainWorker pid=1146)[0m 2023-04-06 16:16:30.478433: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
[2m[36m(RayTrainWorker pid=1146)[0m 	 [[{{node Placeholder/_0}}]]
[2m[36m(RayTrainWorker pid=213, ip=10.8.8.8)[0m 2023-04-06 16:16:30.444526: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
[2m[36m(RayTrainWorker pid=213, ip=10.8.8.8)[0m 	 [[{{node Placeholder/_0}}]]
[2m[36m(RayTrainWorker pid=213, ip=10.8.8.8)[0m 2023-04-06 16:16:30.486778: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor 



[2m[36m(RayTrainWorker pid=1146)[0m 2023-04-06 16:16:30.620110: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype variant
[2m[36m(RayTrainWorker pid=1146)[0m 	 [[{{node Placeholder/_0}}]]


      1/Unknown - 0s 137ms/step - loss: 2.4216 - mean_squared_error: 2.4216
      1/Unknown - 0s 137ms/step - loss: 2.4216 - mean_squared_error: 2.4216
      2/Unknown - 0s 90ms/step - loss: 2.2835 - mean_squared_error: 2.2835 
      2/Unknown - 0s 90ms/step - loss: 2.2835 - mean_squared_error: 2.2835 
      4/Unknown - 0s 65ms/step - loss: 1.8886 - mean_squared_error: 1.8886
      4/Unknown - 0s 65ms/step - loss: 1.8886 - mean_squared_error: 1.8886
      7/Unknown - 0s 49ms/step - loss: 1.8287 - mean_squared_error: 1.8287
      7/Unknown - 0s 49ms/step - loss: 1.8287 - mean_squared_error: 1.8287


[2m[36m(TunerInternal pid=798)[0m 2023-04-06 16:16:33,798	INFO tune.py:798 -- Total run time: 18.59 seconds (18.50 seconds for the tuning loop).
2023-04-06 23:16:33,897	ERROR checkpoint_manager.py:170 -- The requested checkpoint is not available on this node, most likely because you are using Ray client or disabled checkpoint synchronization. To avoid this, enable checkpoint synchronization to cloud storage by specifying a `SyncConfig`. The checkpoint may be available on a different  node - please check this location on worker nodes: /home/ray/ray_results/TensorflowTrainer_2023-04-06_16-16-14/TensorflowTrainer_072e8_00000_0_2023-04-06_16-16-16/checkpoint_000003


[2m[36m(TunerInternal pid=798)[0m Result for TensorflowTrainer_072e8_00000:
[2m[36m(TunerInternal pid=798)[0m   _time_this_iter_s: 0.8873341083526611
[2m[36m(TunerInternal pid=798)[0m   _timestamp: 1680822991
[2m[36m(TunerInternal pid=798)[0m   _training_iteration: 4
[2m[36m(TunerInternal pid=798)[0m   date: 2023-04-06_16-16-31
[2m[36m(TunerInternal pid=798)[0m   done: true
[2m[36m(TunerInternal pid=798)[0m   experiment_id: f2d05e60392044ee96706251fcdd2a67
[2m[36m(TunerInternal pid=798)[0m   experiment_tag: '0'
[2m[36m(TunerInternal pid=798)[0m   hostname: example-cluster-kuberay-head-8jw8d
[2m[36m(TunerInternal pid=798)[0m   iterations_since_restore: 4
[2m[36m(TunerInternal pid=798)[0m   loss: 0.928823709487915
[2m[36m(TunerInternal pid=798)[0m   mean_squared_error: 0.928823709487915
[2m[36m(TunerInternal pid=798)[0m   node_ip: 10.8.7.8
[2m[36m(TunerInternal pid=798)[0m   pid: 999
[2m[36m(TunerInternal pid=798)[0m   should_checkpoint: true
