From aa7f2522dc438d3b9099fa31bf733ea64c3d1e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 8 Mar 2023 22:36:00 +0100 Subject: [PATCH] Fix race condition in Fabric test (#17002) --- tests/tests_fabric/parity/test_parity_ddp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/tests_fabric/parity/test_parity_ddp.py b/tests/tests_fabric/parity/test_parity_ddp.py index 73933742ca069..10c7fc711c928 100644 --- a/tests/tests_fabric/parity/test_parity_ddp.py +++ b/tests/tests_fabric/parity/test_parity_ddp.py @@ -125,7 +125,6 @@ def train_fabric_ddp(fabric): return model.state_dict(), torch.tensor(iteration_timings), memory_stats -@pytest.mark.flaky(reruns=3) @RunIf(standalone=True) @pytest.mark.usefixtures("reset_deterministic_algorithm", "reset_cudnn_benchmark") @pytest.mark.parametrize( @@ -148,6 +147,9 @@ def test_parity_ddp(accelerator, devices, tolerance): fabric.barrier() cuda_reset() torch.distributed.destroy_process_group() + # sleep for a bit to avoid race conditions, since the very first call in `train_torch_ddp` + # is initializing a new process group + time.sleep(3) # Train with raw PyTorch state_dict_torch, timings_torch, memory_torch = train_torch_ddp(