From 5046ec8bc4afb3b1f5624236c307601da43579d2 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 17 Sep 2022 16:00:08 +0200 Subject: [PATCH 1/5] fix tensorboard creating an experiment when nothing logged --- src/pytorch_lightning/loggers/tensorboard.py | 5 +++-- .../checkpointing/test_model_checkpoint.py | 15 +++++++++++---- tests/tests_pytorch/loggers/test_tensorboard.py | 10 ++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index 7ff19b8c38c89..f27ae1c015b59 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -268,8 +268,9 @@ def save(self) -> None: @rank_zero_only def finalize(self, status: str) -> None: - self.experiment.flush() - self.experiment.close() + if self._experiment is not None: + self.experiment.flush() + self.experiment.close() self.save() @property diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index 6d44bfe83be3d..2fcec5364cb03 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -869,18 +869,24 @@ def validation_step(self, batch, batch_idx): "limit_test_batches": 2, "enable_progress_bar": False, "enable_model_summary": False, + "log_every_n_steps": 1, + "default_root_dir": tmpdir, } trainer = Trainer(**trainer_kwargs, callbacks=[checkpoint_callback]) trainer.fit(model) - assert os.listdir(tmpdir) == ["epoch=00.ckpt"] + assert os.listdir(tmpdir) == ["epoch=00.ckpt", "lightning_logs"] for idx in range(4): # load from checkpoint - trainer = pl.Trainer(**trainer_kwargs, default_root_dir=tmpdir) + trainer = Trainer(**trainer_kwargs) trainer.fit(model, ckpt_path=checkpoint_callback.best_model_path) + # assert trainer.logger._experiment, str(idx) trainer.test(ckpt_path=checkpoint_callback.best_model_path, verbose=False) + assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"} - assert set(os.listdir(tmpdir / "lightning_logs")) == {f"version_{i}" for i in range(4)} + + # no new versions created after the initial fit, because the ones that resume from ckpt do not log anything + assert set(os.listdir(tmpdir / "lightning_logs")) == {f"version_0"} def test_checkpoint_repeated_strategy_extended(tmpdir): @@ -891,6 +897,7 @@ class ExtendedBoringModel(BoringModel): def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) + self.log("val_loss", loss) return {"val_loss": loss} def validation_epoch_end(self, *_): @@ -930,7 +937,7 @@ def assert_checkpoint_log_dir(idx): limit_test_batches=4, callbacks=[checkpoint_cb], ) - trainer = pl.Trainer(**trainer_config) + trainer = Trainer(**trainer_config) assert_trainer_init(trainer) model = ExtendedBoringModel() diff --git a/tests/tests_pytorch/loggers/test_tensorboard.py b/tests/tests_pytorch/loggers/test_tensorboard.py index 3793b5c58b5b6..3b25c86b87c32 100644 --- a/tests/tests_pytorch/loggers/test_tensorboard.py +++ b/tests/tests_pytorch/loggers/test_tensorboard.py @@ -275,7 +275,17 @@ def training_step(self, *args): def test_tensorboard_finalize(summary_writer, tmpdir): """Test that the SummaryWriter closes in finalize.""" logger = TensorBoardLogger(save_dir=tmpdir) + assert logger._experiment is None logger.finalize("any") + + # no log calls, no experiment created -> nothing to flush + summary_writer.assert_not_called() + + logger = TensorBoardLogger(save_dir=tmpdir) + logger.log_metrics({"flush_me": 11.1}) # trigger creation of an experiment + logger.finalize("any") + + # finalize flushes to experiment directory summary_writer().flush.assert_called() summary_writer().close.assert_called() From 35472e5edd484339b8e96e115f6ad023ba622706 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 17 Sep 2022 16:36:58 +0200 Subject: [PATCH 2/5] update changelog --- src/pytorch_lightning/CHANGELOG.md | 10 +++++++--- .../checkpointing/test_model_checkpoint.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4ca202c34edd6..d9ffd760ae4df 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -7,9 +7,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [unReleased] - 2022-MM-DD -- Added an option to configure the signal SLURM sends when a job is preempted or requeued ([#14610](https://github.com/Lightning-AI/lightning/issues/14610)) - - ### Added @@ -40,6 +37,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `WandbLogger.download_artifact` and `WandbLogger.use_artifact` for managing artifacts with Weights and Biases ([#14551](https://github.com/Lightning-AI/lightning/issues/14551)) +- Added an option to configure the signal SLURM sends when a job is preempted or requeued ([#14610](https://github.com/Lightning-AI/lightning/issues/14610)) + + ### Changed - The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892)) @@ -186,6 +186,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657), [#14724](https://github.com/Lightning-AI/lightning/pull/14724)) +- Fixed an issue with `TensorBoardLogger.finalize` creating a new experiment when none was created during the Trainer's execution ([#14762](https://github.com/Lightning-AI/lightning/pull/14762) + + + ## [1.7.6] - 2022-09-13 ### Changed diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index 2fcec5364cb03..50342b7d8ce25 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -886,7 +886,7 @@ def validation_step(self, batch, batch_idx): assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"} # no new versions created after the initial fit, because the ones that resume from ckpt do not log anything - assert set(os.listdir(tmpdir / "lightning_logs")) == {f"version_0"} + assert set(os.listdir(tmpdir / "lightning_logs")) == {"version_0"} def test_checkpoint_repeated_strategy_extended(tmpdir): From 0017f4eaa78393e8bb3b1d9c064d21b956c6ab4d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 17 Sep 2022 16:38:18 +0200 Subject: [PATCH 3/5] fix ordering issue --- tests/tests_pytorch/checkpointing/test_model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index 50342b7d8ce25..49e580307fc91 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -874,7 +874,7 @@ def validation_step(self, batch, batch_idx): } trainer = Trainer(**trainer_kwargs, callbacks=[checkpoint_callback]) trainer.fit(model) - assert os.listdir(tmpdir) == ["epoch=00.ckpt", "lightning_logs"] + assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"} for idx in range(4): # load from checkpoint From 052107d7b7bb9b78c3486947f28079ecadbe5fad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 18 Sep 2022 06:00:00 -0400 Subject: [PATCH 4/5] Update tests/tests_pytorch/checkpointing/test_model_checkpoint.py Co-authored-by: Kushashwa Ravi Shrimali --- tests/tests_pytorch/checkpointing/test_model_checkpoint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index 49e580307fc91..f8a172d27359f 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -880,7 +880,6 @@ def validation_step(self, batch, batch_idx): # load from checkpoint trainer = Trainer(**trainer_kwargs) trainer.fit(model, ckpt_path=checkpoint_callback.best_model_path) - # assert trainer.logger._experiment, str(idx) trainer.test(ckpt_path=checkpoint_callback.best_model_path, verbose=False) assert set(os.listdir(tmpdir)) == {"epoch=00.ckpt", "lightning_logs"} From 6fa7e3646c1406594f9ff355ae51a2ff153393a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 18 Sep 2022 06:00:10 -0400 Subject: [PATCH 5/5] Update src/pytorch_lightning/CHANGELOG.md Co-authored-by: Kushashwa Ravi Shrimali --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index d9ffd760ae4df..367d32210e2e4 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -186,7 +186,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed torchscript error with ensembles of LightningModules ([#14657](https://github.com/Lightning-AI/lightning/pull/14657), [#14724](https://github.com/Lightning-AI/lightning/pull/14724)) -- Fixed an issue with `TensorBoardLogger.finalize` creating a new experiment when none was created during the Trainer's execution ([#14762](https://github.com/Lightning-AI/lightning/pull/14762) +- Fixed an issue with `TensorBoardLogger.finalize` creating a new experiment when none was created during the Trainer's execution ([#14762](https://github.com/Lightning-AI/lightning/pull/14762))