From edc1074cc85e2a4939ff964db3bd5e4cf791ad9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 24 Jun 2021 08:50:01 +0200
Subject: [PATCH 1/9] fix message

---
 pytorch_lightning/plugins/training_type/ddp.py | 15 +++++++--------
 .../plugins/training_type/ddp_spawn.py | 15 +++++++--------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 84850b8d01b12..d848a2a99b877 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -37,7 +37,7 @@
     rank_zero_deprecation,
     rank_zero_warn,
 )
-from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
+from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available, rank_zero_info
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import reset_seed
 
@@ -233,13 +233,6 @@ def setup_distributed(self):
         # where to store ip_table
         self.init_ddp_connection()
 
-        # on world_size=0 let everyone know training is starting
-        if self.is_global_zero and not torch.distributed.is_initialized():
-            log.info("-" * 100)
-            log.info(f"distributed_backend={self.distributed_backend}")
-            log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
-            log.info("-" * 100)
-
         # set the ranks and devices
         self.dist.rank = self.global_rank
         self.dist.device = self.root_device
@@ -308,6 +301,12 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
         log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
+        # on rank=0 let everyone know training is starting
+        rank_zero_info("-" * 100)
+        rank_zero_info(f"distributed_backend={self.distributed_backend}")
+        rank_zero_info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
+        rank_zero_info("-" * 100)
+
     def pre_dispatch(self):
         # move the model to the correct device
         self.model_to_device()
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index 47f2a64c04759..18f984eb8bf85 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -36,7 +36,7 @@
 )
 from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning.utilities.cloud_io import load as pl_load
-from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
+from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available, rank_zero_info
 from pytorch_lightning.utilities.seed import reset_seed
 
 if _TORCH_GREATER_EQUAL_1_8:
@@ -182,13 +182,6 @@ def new_process(self, process_idx, trainer, mp_queue):
         # ... need to double check that it is the correct place
         # self.trainer.call_setup_hook(self.model)
 
-        # on world_size=0 let everyone know training is starting
-        if self.is_global_zero and not torch.distributed.is_initialized():
-            log.info("-" * 100)
-            log.info(f"distributed_backend={self.distributed_backend}")
-            log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
-            log.info("-" * 100)
-
         # set the ranks and devices
         self.dist.rank = self.global_rank
         self.dist.device = self.root_device
@@ -267,6 +260,12 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
         log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
+        # on rank=0 let everyone know training is starting
+        rank_zero_info("-" * 100)
+        rank_zero_info(f"distributed_backend={self.distributed_backend}")
+        rank_zero_info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
+        rank_zero_info("-" * 100)
+
     def determine_ddp_device_ids(self):
         if self.root_device.type == "cpu":
             return None

From 98a9777b65a936b7ab69a5aff6284c40ec33e525 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 24 Jun 2021 06:55:28 +0000
Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/plugins/training_type/ddp.py | 2 +-
 pytorch_lightning/plugins/training_type/ddp_spawn.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index d848a2a99b877..721685f7c4c43 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -37,7 +37,7 @@
     rank_zero_deprecation,
     rank_zero_warn,
 )
-from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available, rank_zero_info
+from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import reset_seed
 
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index 18f984eb8bf85..58a9e4d99eb4a 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -36,7 +36,7 @@
 )
 from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning.utilities.cloud_io import load as pl_load
-from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available, rank_zero_info
+from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.seed import reset_seed
 
 if _TORCH_GREATER_EQUAL_1_8:

From e97f6d50f335e2bc20d53ab257575991c67c07cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 24 Jun 2021 08:55:55 +0200
Subject: [PATCH 3/9] changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 011777002eef9..4c918b42c16f5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -308,6 +308,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Pass the `stage` argument of `Callback.{setup,teardown}` as a keyword ([#7973](https://github.com/PyTorchLightning/pytorch-lightning/pull/7973))
 
+- Fixed a DDP info message that was never shown ([#8111](https://github.com/PyTorchLightning/pytorch-lightning/pull/8111))
+
 
 ## [1.3.7] - 2021-06-22
 

From 1fe3d98f13197b4fc797da577f37d9099889ba23 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Fri, 25 Jun 2021 09:49:44 +0200
Subject: [PATCH 4/9] single rank_zero_info

---
 pytorch_lightning/plugins/training_type/ddp.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 721685f7c4c43..35f36ff3a69a5 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -302,10 +302,10 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
         # on rank=0 let everyone know training is starting
-        rank_zero_info("-" * 100)
-        rank_zero_info(f"distributed_backend={self.distributed_backend}")
-        rank_zero_info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
-        rank_zero_info("-" * 100)
+        rank_zero_info(("-" * 100,
+                        f"distributed_backend={self.distributed_backend}",
+                        f"All DDP processes registered. Starting ddp with {self.world_size} processes",
+                        "-" * 100))
 
     def pre_dispatch(self):
         # move the model to the correct device

From 0f4cca239ca6b4a5668fc13fd80e09267758f639 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 25 Jun 2021 07:50:47 +0000
Subject: [PATCH 5/9] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/plugins/training_type/ddp.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 35f36ff3a69a5..d512b5f115e5f 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -302,10 +302,10 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
         # on rank=0 let everyone know training is starting
-        rank_zero_info(("-" * 100,
-                        f"distributed_backend={self.distributed_backend}",
-                        f"All DDP processes registered. Starting ddp with {self.world_size} processes",
-                        "-" * 100))
+        rank_zero_info((
+            "-" * 100, f"distributed_backend={self.distributed_backend}",
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes", "-" * 100
+        ))
 
     def pre_dispatch(self):
         # move the model to the correct device

From cfe38e2716d601def94e285befd9751b8b89eb21 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Fri, 25 Jun 2021 09:52:17 +0200
Subject: [PATCH 6/9] Apply suggestions from code review

---
 pytorch_lightning/plugins/training_type/ddp.py | 2 +-
 pytorch_lightning/plugins/training_type/ddp_spawn.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index d512b5f115e5f..aeadb7ae5468c 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -304,7 +304,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
         # on rank=0 let everyone know training is starting
         rank_zero_info((
             "-" * 100, f"distributed_backend={self.distributed_backend}",
-            f"All DDP processes registered. Starting ddp with {self.world_size} processes", "-" * 100
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes", "-" * 100,
         ))
 
     def pre_dispatch(self):
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index 58a9e4d99eb4a..4470b97db1ecc 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -261,10 +261,10 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
         # on rank=0 let everyone know training is starting
-        rank_zero_info("-" * 100)
-        rank_zero_info(f"distributed_backend={self.distributed_backend}")
-        rank_zero_info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
-        rank_zero_info("-" * 100)
+        rank_zero_info("-" * 100,
+                       f"distributed_backend={self.distributed_backend}",
+                       f"All DDP processes registered. Starting ddp with {self.world_size} processes",
+                       "-" * 100,))
 
     def determine_ddp_device_ids(self):
         if self.root_device.type == "cpu":

From 33ede7bf85b5098c39d238bceff73bf9e40f95a1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 25 Jun 2021 07:53:15 +0000
Subject: [PATCH 7/9] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pytorch_lightning/plugins/training_type/ddp.py | 6 ++++--
 pytorch_lightning/plugins/training_type/ddp_spawn.py | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index aeadb7ae5468c..9f3f4b3fdde9b 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -303,8 +303,10 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
 
         # on rank=0 let everyone know training is starting
         rank_zero_info((
-            "-" * 100, f"distributed_backend={self.distributed_backend}",
-            f"All DDP processes registered. Starting ddp with {self.world_size} processes", "-" * 100,
+            "-" * 100,
+            f"distributed_backend={self.distributed_backend}",
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes",
+            "-" * 100,
         ))
 
     def pre_dispatch(self):
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index 4470b97db1ecc..35fa8f54bad27 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -261,9 +261,9 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
         # on rank=0 let everyone know training is starting
-        rank_zero_info("-" * 100, 
-                       f"distributed_backend={self.distributed_backend}", 
-                       f"All DDP processes registered. Starting ddp with {self.world_size} processes", 
+        rank_zero_info("-" * 100,
+                       f"distributed_backend={self.distributed_backend}",
+                       f"All DDP processes registered. Starting ddp with {self.world_size} processes",
                        "-" * 100,))
 
     def determine_ddp_device_ids(self):

From 5346ef8fec531f81a4ffb39714e86e5f5f1a212f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Fri, 25 Jun 2021 10:16:26 +0200
Subject: [PATCH 8/9] fix Jirka

---
 pytorch_lightning/plugins/training_type/ddp_spawn.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index 35fa8f54bad27..b39a44262f79b 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -261,10 +261,12 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
         # on rank=0 let everyone know training is starting
-        rank_zero_info("-" * 100,
-                       f"distributed_backend={self.distributed_backend}",
-                       f"All DDP processes registered. Starting ddp with {self.world_size} processes",
-                       "-" * 100,))
+        rank_zero_info((
+            "-" * 100,
+            f"distributed_backend={self.distributed_backend}",
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes",
+            "-" * 100,
+        ))
 
     def determine_ddp_device_ids(self):
         if self.root_device.type == "cpu":

From 7f52f71de24a10066ef2a04881a3e2719f61002b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Fri, 25 Jun 2021 15:47:18 +0200
Subject: [PATCH 9/9] tuple not working, doing f-string

---
 pytorch_lightning/plugins/training_type/ddp.py | 12 ++++++------
 pytorch_lightning/plugins/training_type/ddp_spawn.py | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 9f3f4b3fdde9b..b855d100b1f12 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -302,12 +302,12 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
         # on rank=0 let everyone know training is starting
-        rank_zero_info((
-            "-" * 100,
-            f"distributed_backend={self.distributed_backend}",
-            f"All DDP processes registered. Starting ddp with {self.world_size} processes",
-            "-" * 100,
-        ))
+        rank_zero_info(
+            f"{'-' * 100}\n"
+            f"distributed_backend={self.torch_distributed_backend}\n"
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
+            f"{'-' * 100}\n"
+        )
 
     def pre_dispatch(self):
         # move the model to the correct device
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index b39a44262f79b..8bac7e0cffbdb 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -261,12 +261,12 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
         # on rank=0 let everyone know training is starting
-        rank_zero_info((
-            "-" * 100,
-            f"distributed_backend={self.distributed_backend}",
-            f"All DDP processes registered. Starting ddp with {self.world_size} processes",
-            "-" * 100,
-        ))
+        rank_zero_info(
+            f"{'-' * 100}\n"
+            f"distributed_backend={self.torch_distributed_backend}\n"
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
+            f"{'-' * 100}\n"
+        )
 
     def determine_ddp_device_ids(self):
         if self.root_device.type == "cpu":
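
Note on the final form of the fix: rank_zero_info passes a single message argument through to the logger, so the tuple used in the intermediate commits would have been rendered as the tuple's repr rather than as a banner ("tuple not working"); joining the parts into one newline-separated f-string gives the intended multi-line message, emitted only on global rank 0. Below is a minimal, self-contained sketch of that behaviour. It is not the pytorch_lightning implementation; the helper name rank_zero_info_sketch and the rank check via the RANK environment variable are assumptions made purely for illustration.

import logging
import os

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def rank_zero_info_sketch(message: str) -> None:
    # Stand-in for a rank-zero-gated logger: emit the message only when this
    # process is global rank 0 (rank assumed to arrive via the RANK env var).
    if int(os.environ.get("RANK", "0")) == 0:
        log.info(message)


world_size = 2                # illustrative values; the plugin reads these from its own state
distributed_backend = "nccl"

# A tuple argument would be logged as its repr on a single line; concatenated
# f-strings joined with "\n" produce the banner the final commit aims for.
rank_zero_info_sketch(
    f"{'-' * 100}\n"
    f"distributed_backend={distributed_backend}\n"
    f"All DDP processes registered. Starting ddp with {world_size} processes\n"
    f"{'-' * 100}\n"
)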