Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not force sync_dist=True on epoch end #13364

Merged
merged 22 commits into from Jul 22, 2022
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/pytorch_lightning/CHANGELOG.md
Expand Up @@ -139,7 +139,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/PyTorchLightning/pytorch-lightning/pull/12604))


-
- Raised a warning instead of forcing `sync_dist=True` on epoch end ([#13364](https://github.com/Lightning-AI/lightning/pull/13364))


### Deprecated
Expand Down
Expand Up @@ -522,12 +522,15 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]:
cache = result_metric._forward_cache
elif not on_step and result_metric.meta.on_epoch:
if result_metric._computed is None:
# always reduce on epoch end
should = result_metric.meta.sync.should
result_metric.meta.sync.should = True
if not result_metric.meta.sync.should:
warning_cache.warn(
krishnakalyan3 marked this conversation as resolved.
Show resolved Hide resolved
f"It is recommended to use `self.log({result_metric.meta.name!r}, sync_dist=True)` when"
krishnakalyan3 marked this conversation as resolved.
Show resolved Hide resolved
" logging on epoch level in distributed setting to accumulate the metric across devices."
)
result_metric.compute()
result_metric.meta.sync.should = should

cache = result_metric._computed

if cache is not None:
if not isinstance(cache, torch.Tensor):
raise ValueError(
Expand All @@ -536,6 +539,7 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]:
)
if not result_metric.meta.enable_graph:
return cache.detach()

return cache

def valid_items(self) -> Generator:
Expand Down
17 changes: 17 additions & 0 deletions tests/tests_pytorch/trainer/logging_/test_distributed_logging.py
Expand Up @@ -15,6 +15,8 @@
from typing import Any, Dict, Optional, Union
from unittest.mock import Mock

import pytest

import pytorch_lightning as pl
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.demos.boring_classes import BoringModel
Expand Down Expand Up @@ -194,3 +196,18 @@ def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") ->
assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1}
trainer.predict(model)
assert trainer.logger.logs == {"fit": 1, "validate": 1, "test": 1, "predict": 1}


def test_logger_sync_dist():
    """Check that epoch-end logging with ``sync_dist=False`` warns rather than syncing.

    A ``BoringModel`` subclass logs ``self.global_rank`` from ``training_epoch_end``
    with ``sync_dist=False`` while training on two CPU devices with the ``"ddp"``
    strategy. The test expects a ``UserWarning`` recommending ``sync_dist=True`` and
    then checks the value stored in ``trainer.callback_metrics``.
    """

    class CustomBoringModel(BoringModel):
        def training_epoch_end(self, *args, **kwargs):
            super().training_epoch_end(*args, **kwargs)
            # Each process logs its own rank and explicitly opts out of syncing.
            self.log("global_rank", self.global_rank, sync_dist=False)

    model = CustomBoringModel()
    trainer = Trainer(fast_dev_run=1, accelerator="cpu", strategy="ddp", devices=2)

    with pytest.warns(UserWarning, match="It is recommended to use .* sync_dist=True"):
        trainer.fit(model)

    # NOTE(review): presumably this runs on rank 0, so an un-synced value is 0,
    # whereas a value reduced across both ranks would differ -- confirm.
    assert trainer.callback_metrics["global_rank"] == 0