From cd1a452e036401b97e60a76ee58f472807f92e2e Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 3 Sep 2021 15:18:55 +0100 Subject: [PATCH 1/4] Remove todo, ensure we only check rank 0 for deepspeed warning --- pytorch_lightning/plugins/training_type/deepspeed.py | 1 - tests/plugins/test_deepspeed_plugin.py | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 5fa8739de7b2d..dc6808ffd442d 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -669,7 +669,6 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None: filepath: write-target file's path """ if self.zero_stage_3 and self._multi_device and self.is_global_zero: - # todo (sean): Add link to docs once docs are merged. warning_cache.warn( "When saving the DeepSpeed Stage 3 checkpoint, " "each worker will save a shard of the checkpoint within a directory. " diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 7400569bd3f99..e19aa69641a6f 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -420,7 +420,7 @@ def test_deepspeed_fp32_works(tmpdir): @RunIf(min_gpus=2, deepspeed=True, special=True) def test_deepspeed_stage_3_save_warning(tmpdir): """ - Test to ensure that DeepSpeed Stage 3 gives a warning when saving. + Test to ensure that DeepSpeed Stage 3 gives a warning when saving on rank zero. """ model = BoringModel() trainer = Trainer( @@ -428,8 +428,9 @@ def test_deepspeed_stage_3_save_warning(tmpdir): ) trainer.fit(model) checkpoint_path = os.path.join(tmpdir, "model.pt") - with pytest.warns(UserWarning, match="each worker will save a shard of the checkpoint within a directory."): - trainer.save_checkpoint(checkpoint_path) + if trainer.is_global_zero: + with pytest.warns(UserWarning, match="each worker will save a shard of the checkpoint within a directory."): + trainer.save_checkpoint(checkpoint_path) @RunIf(min_gpus=1, deepspeed=True, special=True) From da517eb4e60aa781a90e80106c19a2aed60a3f7e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 6 Sep 2021 13:04:18 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/plugins/test_deepspeed_plugin.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index db6acca83d32b..978ca7dd8ff36 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -402,9 +402,7 @@ def test_deepspeed_fp32_works(tmpdir): @RunIf(min_gpus=2, deepspeed=True, special=True) def test_deepspeed_stage_3_save_warning(tmpdir): - """ - Test to ensure that DeepSpeed Stage 3 gives a warning when saving on rank zero. - """ + """Test to ensure that DeepSpeed Stage 3 gives a warning when saving on rank zero.""" model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16 From 65af5fd1cef5e24a3a7fbf189ac56e7c5fe445a8 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 7 Sep 2021 10:45:14 +0100 Subject: [PATCH 3/4] Fix test --- tests/plugins/test_deepspeed_plugin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 978ca7dd8ff36..b0268f24177ef 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -409,9 +409,13 @@ def test_deepspeed_stage_3_save_warning(tmpdir): ) trainer.fit(model) checkpoint_path = os.path.join(tmpdir, "model.pt") + with pytest.warns(UserWarning) as record: + # both ranks need to call save checkpoint + trainer.save_checkpoint(checkpoint_path) if trainer.is_global_zero: - with pytest.warns(UserWarning, match="each worker will save a shard of the checkpoint within a directory."): - trainer.save_checkpoint(checkpoint_path) + assert len(record) == 1 + match = "each worker will save a shard of the checkpoint within a directory." + assert match in str(record[0].message) @RunIf(min_gpus=1, deepspeed=True, special=True) From 0f9fba3453f15229dd0eabd9419685e6e1756b64 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Sep 2021 10:48:25 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/trainer/loops/test_training_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/loops/test_training_loop.py b/tests/trainer/loops/test_training_loop.py index c37681e4831ca..d21e8efc7a5cb 100644 --- a/tests/trainer/loops/test_training_loop.py +++ b/tests/trainer/loops/test_training_loop.py @@ -191,7 +191,7 @@ def training_epoch_end(self, outputs) -> None: def test_batch_loop_releases_loss(tmpdir): - """Test that loss/graph is released so that it can be garbage collected before the next training step""" + """Test that loss/graph is released so that it can be garbage collected before the next training step.""" class TestModel(BoringModel): def training_step(self, batch, batch_idx):