From 466efda967807a152f5bdf386cea0e0a86b7197f Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 19 Nov 2021 04:04:48 +0100
Subject: [PATCH 1/4] Fix `move_metrics_to_cpu` with evaluation

---
 .../loops/epoch/evaluation_epoch_loop.py | 12 ++++++----
 .../logging_/test_eval_loop_logging.py   | 24 +++++++++++++++++++
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
index b4660c96a0989..102603f20302b 100644
--- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
+++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
@@ -24,7 +24,6 @@
 from pytorch_lightning.trainer.progress import BatchProgress
 from pytorch_lightning.utilities.auto_restart import MergedIteratorState, reload_dataloader_state_dict
 from pytorch_lightning.utilities.fetching import AbstractDataFetcher, DataFetcher
-from pytorch_lightning.utilities.memory import recursive_detach
 from pytorch_lightning.utilities.model_helpers import is_overridden
 from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT
 
@@ -134,10 +133,13 @@ def advance(
         self.trainer.logger_connector.update_eval_step_metrics()
 
         # track epoch level outputs
-        if self._should_track_batch_outputs_for_epoch_end():
-            output = recursive_detach(output, to_cpu=self.trainer.move_metrics_to_cpu)
-            if output is not None:
-                self.outputs.append(output)
+        if self._should_track_batch_outputs_for_epoch_end() and output is not None:
+            self.outputs.append(output)
+
+        if self.trainer.move_metrics_to_cpu:
+            # the evaluation step output is not moved as they are not considered "metrics"
+            assert self.trainer._results is not None
+            self.trainer._results.cpu()
 
         if not self.batch_progress.is_last_batch:
             # if fault tolerant is enabled and process has been notified, exit.
diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py
index 88229effbc8c9..894b3e7bb3659 100644
--- a/tests/trainer/logging_/test_eval_loop_logging.py
+++ b/tests/trainer/logging_/test_eval_loop_logging.py
@@ -699,3 +699,27 @@ def test_filter_metrics_for_dataloader(kwargs, expected):
     """Logged metrics should only include metrics from the concerned dataloader."""
     actual = LoggerConnector._filter_metrics_for_dataloader(**kwargs)
     assert actual == expected
+
+
+def test_evaluation_move_metrics_to_cpu_and_outputs(tmpdir):
+    class TestModel(BoringModel):
+        def validation_step(self, *args):
+            x = torch.tensor(2.0, requires_grad=True, device=self.device)
+            y = x * 2
+            assert x.requires_grad is True
+            assert y.grad_fn is None  # disabled by validation
+
+            self.log("foo", y)
+            return y
+
+        def validation_epoch_end(self, outputs):
+            # the step outputs were not moved
+            assert all(o.device == self.device for o in outputs)
+            # but the logging results were
+            assert self.trainer.callback_metrics["foo"].device.type == "cpu"
+
+    model = TestModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir, limit_val_batches=2, move_metrics_to_cpu=True, accelerator="auto", devices=1
+    )
+    trainer.validate(model, verbose=False)

From e8756f780ab7468b23831abe368cf2c2691a1030 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 19 Nov 2021 04:48:55 +0100
Subject: [PATCH 2/4] Improve debugging

---
 tests/trainer/logging_/test_eval_loop_logging.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py
index 894b3e7bb3659..570356b1e9705 100644
--- a/tests/trainer/logging_/test_eval_loop_logging.py
+++ b/tests/trainer/logging_/test_eval_loop_logging.py
@@ -714,7 +714,7 @@ def validation_step(self, *args):
 
         def validation_epoch_end(self, outputs):
             # the step outputs were not moved
-            assert all(o.device == self.device for o in outputs)
+            assert all(o.device == self.device for o in outputs), outputs
             # but the logging results were
             assert self.trainer.callback_metrics["foo"].device.type == "cpu"
 

From 3588b018e35a367ebcc6ea80a3b092d6c8f70f16 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 19 Nov 2021 05:03:34 +0100
Subject: [PATCH 3/4] Update CHANGELOG

---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae0515cf22703..ba00fb2243979 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -150,10 +150,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 -
 
 
--
+- Fixed `Trainer(move_metrics_to_cpu=True)` not moving the evaluation logged results to CPU ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631))
 
 
--
+- Fixed the `{validation,test}_step` outputs getting moved to CPU with `Trainer(move_metrics_to_cpu=True)` ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631))
 
 
 

From 67b8c26204ad72ea4aff37205e5e3a1cd718d6be Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 19 Nov 2021 15:42:47 +0100
Subject: [PATCH 4/4] Run only on GPU

---
 tests/trainer/logging_/test_eval_loop_logging.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py
index 570356b1e9705..66c91eaf15f1b 100644
--- a/tests/trainer/logging_/test_eval_loop_logging.py
+++ b/tests/trainer/logging_/test_eval_loop_logging.py
@@ -26,6 +26,7 @@
 from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers import BoringModel, RandomDataset
+from tests.helpers.runif import RunIf
 
 
 def test__validation_step__log(tmpdir):
@@ -701,6 +702,7 @@ def test_filter_metrics_for_dataloader(kwargs, expected):
     assert actual == expected
 
 
+@RunIf(min_gpus=1)
 def test_evaluation_move_metrics_to_cpu_and_outputs(tmpdir):
     class TestModel(BoringModel):
         def validation_step(self, *args):
@@ -719,7 +721,5 @@ def validation_epoch_end(self, outputs):
             assert self.trainer.callback_metrics["foo"].device.type == "cpu"
 
     model = TestModel()
-    trainer = Trainer(
-        default_root_dir=tmpdir, limit_val_batches=2, move_metrics_to_cpu=True, accelerator="auto", devices=1
-    )
+    trainer = Trainer(default_root_dir=tmpdir, limit_val_batches=2, move_metrics_to_cpu=True, gpus=1)
     trainer.validate(model, verbose=False)
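
A minimal usage sketch of the behavior the new test covers: with `Trainer(move_metrics_to_cpu=True)`, the logged results end up on CPU while the raw `validation_step` outputs keep their device. The `RandomDataset` and `SketchModel` names below are invented for illustration; a single available GPU is assumed (mirroring the `@RunIf(min_gpus=1)` guard), as is the PyTorch Lightning version of this branch, where `gpus=1` and `move_metrics_to_cpu` are valid Trainer flags.

import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    # tiny map-style dataset just to drive two validation batches
    def __len__(self):
        return 4

    def __getitem__(self, idx):
        return torch.randn(32)


class SketchModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def validation_step(self, batch, batch_idx):
        loss = self.layer(batch).sum()
        self.log("foo", loss)  # logged values are "metrics" and get moved to CPU
        return loss  # the raw step output keeps its original (GPU) device

    def validation_epoch_end(self, outputs):
        # step outputs stay on the model's device ...
        assert all(o.device == self.device for o in outputs)
        # ... while the logged results were moved to CPU
        assert self.trainer.callback_metrics["foo"].device.type == "cpu"


trainer = Trainer(limit_val_batches=2, move_metrics_to_cpu=True, gpus=1)
trainer.validate(SketchModel(), DataLoader(RandomDataset(), batch_size=2), verbose=False)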