From e9a52c0e52c480979b0a441563abe19caa52e51b Mon Sep 17 00:00:00 2001
From: yhl48
Date: Wed, 5 Jun 2024 08:01:04 +0100
Subject: [PATCH 1/6] patched the device bug, works with 1 node

---
 src/litdata/utilities/env.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/litdata/utilities/env.py b/src/litdata/utilities/env.py
index 1005c503a..9ae03dbec 100644
--- a/src/litdata/utilities/env.py
+++ b/src/litdata/utilities/env.py
@@ -49,9 +49,11 @@ def detect(cls) -> "_DistributedEnv":
             world_size = torch.distributed.get_world_size()
             global_rank = torch.distributed.get_rank()
             # Note: On multi node CPU, the number of nodes won't be correct.
-            num_nodes = world_size // torch.cuda.device_count() if torch.cuda.is_available() else world_size
-            if torch.cuda.is_available() and world_size % torch.cuda.device_count() != 0:
-                raise RuntimeError("The world size should be divisible by the number of GPUs.")
+            if torch.cuda.is_available():
+                if world_size // torch.cuda.device_count() >= 1:
+                    num_nodes = world_size // torch.cuda.device_count()
+                else:
+                    num_nodes = 1
         else:
             world_size = None
             global_rank = 0

From 686f20082784f94e25f4e2f5a21ff95fce16eb4c Mon Sep 17 00:00:00 2001
From: tchaton
Date: Wed, 5 Jun 2024 10:00:00 +0100
Subject: [PATCH 2/6] update

---
 src/litdata/utilities/env.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/litdata/utilities/env.py b/src/litdata/utilities/env.py
index 9ae03dbec..f26104664 100644
--- a/src/litdata/utilities/env.py
+++ b/src/litdata/utilities/env.py
@@ -49,11 +49,15 @@ def detect(cls) -> "_DistributedEnv":
             world_size = torch.distributed.get_world_size()
             global_rank = torch.distributed.get_rank()
             # Note: On multi node CPU, the number of nodes won't be correct.
-            if torch.cuda.is_available():
-                if world_size // torch.cuda.device_count() >= 1:
-                    num_nodes = world_size // torch.cuda.device_count()
-                else:
-                    num_nodes = 1
+            if torch.cuda.is_available() and world_size // torch.cuda.device_count() >= 1:
+                num_nodes = world_size // torch.cuda.device_count()
+            else:
+                num_nodes = 1
+
+            # If you are using multiple nodes, we assume you are using all the GPUs.
+            # On single node, a user can be using only a few GPUs of the node.
+            if torch.cuda.is_available() and num_nodes >= 1 and world_size % torch.cuda.device_count() != 0:
+                raise RuntimeError("The world size should be divisible by the number of GPUs.")
         else:
             world_size = None
             global_rank = 0

From 7a2fab01867b8cb1a773b8f10e0831284d52ed10 Mon Sep 17 00:00:00 2001
From: tchaton
Date: Wed, 5 Jun 2024 10:50:47 +0100
Subject: [PATCH 3/6] update

---
 tests/streaming/test_dataset.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index 278b4872b..d8ceb8f05 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -255,7 +255,12 @@ def test_streaming_dataset_distributed_full_shuffle_odd(drop_last, tmpdir, compr
     "compression",
     [
         pytest.param(None),
-        pytest.param("zstd", marks=pytest.mark.skipif(condition=not _ZSTD_AVAILABLE, reason="Requires: ['zstd']")),
+        pytest.param(
+            "zstd",
+            marks=pytest.mark.skipif(
+                condition=not _ZSTD_AVAILABLE and sys.platform == "darwing", reason="Requires: ['zstd']"
+            ),
+        ),
     ],
 )
 @pytest.mark.timeout(30)

From f5c59dcc5a20df1a786eb26fc2d479c7a5b6b12f Mon Sep 17 00:00:00 2001
From: tchaton
Date: Wed, 5 Jun 2024 10:51:06 +0100
Subject: [PATCH 4/6] update

---
 tests/streaming/test_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index d8ceb8f05..3a7b6656a 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -258,7 +258,7 @@ def test_streaming_dataset_distributed_full_shuffle_odd(drop_last, tmpdir, compr
         pytest.param(
             "zstd",
             marks=pytest.mark.skipif(
-                condition=not _ZSTD_AVAILABLE and sys.platform == "darwing", reason="Requires: ['zstd']"
+                condition=not _ZSTD_AVAILABLE or sys.platform == "darwing", reason="Requires: ['zstd']"
             ),
         ),
     ],

From 66491482425e663c1f7e1ccae3aabc912c5e5dc0 Mon Sep 17 00:00:00 2001
From: tchaton
Date: Wed, 5 Jun 2024 10:51:17 +0100
Subject: [PATCH 5/6] update

---
 tests/streaming/test_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index 3a7b6656a..24b27ff41 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -258,7 +258,7 @@ def test_streaming_dataset_distributed_full_shuffle_odd(drop_last, tmpdir, compr
         pytest.param(
             "zstd",
             marks=pytest.mark.skipif(
-                condition=not _ZSTD_AVAILABLE or sys.platform == "darwing", reason="Requires: ['zstd']"
+                condition=not _ZSTD_AVAILABLE or sys.platform == "darwin", reason="Requires: ['zstd']"
             ),
         ),
     ],

From c006e0c84e84b7f696b516c2e1c567ad082166e2 Mon Sep 17 00:00:00 2001
From: tchaton
Date: Wed, 5 Jun 2024 16:08:47 +0100
Subject: [PATCH 6/6] update

---
 src/litdata/utilities/env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/litdata/utilities/env.py b/src/litdata/utilities/env.py
index f26104664..3c23e816e 100644
--- a/src/litdata/utilities/env.py
+++ b/src/litdata/utilities/env.py
@@ -56,7 +56,7 @@ def detect(cls) -> "_DistributedEnv":
 
             # If you are using multiple nodes, we assume you are using all the GPUs.
             # On single node, a user can be using only a few GPUs of the node.
-            if torch.cuda.is_available() and num_nodes >= 1 and world_size % torch.cuda.device_count() != 0:
+            if torch.cuda.is_available() and num_nodes > 1 and world_size % torch.cuda.device_count() != 0:
                 raise RuntimeError("The world size should be divisible by the number of GPUs.")
         else:
             world_size = None
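
For reference, here is how the node-detection logic in _DistributedEnv.detect reads once patches 1, 2, and 6 are applied together. This is a minimal standalone sketch under stated assumptions: the helper name and function boundary are ours for illustration, not litdata's API.

import torch


def _detect_num_nodes(world_size: int) -> int:
    # Derive the node count from the visible GPUs; fall back to a single
    # node when the integer division yields 0 (i.e. world_size is smaller
    # than the per-node device count).
    if torch.cuda.is_available() and world_size // torch.cuda.device_count() >= 1:
        num_nodes = world_size // torch.cuda.device_count()
    else:
        num_nodes = 1

    # Per patch 6, enforce divisibility only across multiple nodes
    # (num_nodes > 1): on a single node, a user may intentionally run
    # on only a subset of the node's GPUs.
    if torch.cuda.is_available() and num_nodes > 1 and world_size % torch.cuda.device_count() != 0:
        raise RuntimeError("The world size should be divisible by the number of GPUs.")
    return num_nodes

Note that world_size // torch.cuda.device_count() >= 1 is equivalent to world_size >= torch.cuda.device_count(), so the fallback covers jobs that launch fewer processes than there are GPUs on the node.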
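
Similarly, patches 3 through 5 converge on a zstd test parameter that skips when the zstd bindings are missing or when running on macOS. The following self-contained sketch shows that final mark in context; the import-based availability probe and the test body are illustrative stand-ins for litdata's _ZSTD_AVAILABLE flag and the real test.

import sys

import pytest

try:
    import zstd  # noqa: F401  (stand-in probe, assuming the 'zstd' package)
    _ZSTD_AVAILABLE = True
except ImportError:
    _ZSTD_AVAILABLE = False


@pytest.mark.parametrize(
    "compression",
    [
        pytest.param(None),
        pytest.param(
            "zstd",
            # `or` (patch 4) skips when either condition holds; with the
            # original `and`, a missing zstd off macOS would not have
            # skipped the test, and "darwing" (fixed in patch 5) meant the
            # platform check never matched at all.
            marks=pytest.mark.skipif(
                condition=not _ZSTD_AVAILABLE or sys.platform == "darwin",
                reason="Requires: ['zstd']",
            ),
        ),
    ],
)
def test_compression_param(compression):
    assert compression in (None, "zstd")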