From 1694278a57abb72f1ab6546517f6587324d131a1 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 17 Jun 2020 23:36:50 -0400
Subject: [PATCH 1/7] init the port using a seed that matches process id for ddp

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 53a47c130df50..13a0dec11e459 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -129,6 +129,7 @@ def train_fx(trial_hparams, cluster_manager, _):
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, rank_zero_info
+import numpy as np

 try:
     from apex import amp
@@ -377,8 +378,10 @@ def __set_random_port(self):
         try:
             default_port = os.environ['MASTER_PORT']
         except Exception:
-            import random
-            default_port = random.randint(10000, 19000)
+            import os
+            pid = os.getpid()
+            rng1 = np.random.RandomState(pid)
+            default_port = rng1.randint(10000, 19999, 1)[0]
         os.environ['MASTER_PORT'] = str(default_port)

     def spawn_ddp_children(self, model):

From cc5dbbb8f106168544b86cda858377a5e6236ed0 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 17 Jun 2020 23:38:17 -0400
Subject: [PATCH 2/7] init the port using a seed that matches process id for ddp

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 13a0dec11e459..cd023dfa61e91 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -378,11 +378,12 @@ def __set_random_port(self):
         try:
             default_port = os.environ['MASTER_PORT']
         except Exception:
-            import os
+            # use the process id as a seed to a generator for port only
             pid = os.getpid()
             rng1 = np.random.RandomState(pid)
             default_port = rng1.randint(10000, 19999, 1)[0]
-        os.environ['MASTER_PORT'] = str(default_port)
+
+        os.environ['MASTER_PORT'] = str(default_port)

     def spawn_ddp_children(self, model):
         self.__set_random_port()

From 4bbd0dd80594dff8f03c80dbc52942e725331782 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 17 Jun 2020 23:43:55 -0400
Subject: [PATCH 3/7] init the port using a seed that matches process id for ddp

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index cd023dfa61e91..2e45af4913f76 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -129,7 +129,6 @@ def train_fx(trial_hparams, cluster_manager, _):
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, rank_zero_info
-import numpy as np

 try:
     from apex import amp

From 4b673c4392862656db2922018a4e2d75af64cf47 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 17 Jun 2020 23:46:09 -0400
Subject: [PATCH 4/7] init the port using a seed that matches process id for ddp

---
 pytorch_lightning/trainer/trainer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d41f5ff3fd6b9..d2f505c4aa4dc 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -892,6 +892,7 @@ def fit(
             mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))

         elif self.distributed_backend == 'ddp_spawn':
+            self.__set_random_port()
             model.share_memory()

             # spin up peers
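
Patches 1-4 replace the unseeded random.randint(10000, 19000) port pick with a NumPy generator seeded by the launcher's process id, and call __set_random_port() before spawning ddp_spawn workers, so concurrent DDP jobs on one machine no longer race for the same MASTER_PORT. Below is a minimal standalone sketch of the same idea; the helper name pick_ddp_port and the __main__ usage are illustrative assumptions, not part of the Lightning API.

import os

import numpy as np


def pick_ddp_port(low: int = 10000, high: int = 19999) -> int:
    """Return a DDP master port; reuse MASTER_PORT if it is already set."""
    if 'MASTER_PORT' in os.environ:
        return int(os.environ['MASTER_PORT'])
    # seed with the pid: the same launcher always derives the same port,
    # while two concurrent launchers (different pids) usually get different ones
    rng = np.random.RandomState(os.getpid())
    return int(rng.randint(low, high, 1)[0])


if __name__ == '__main__':
    port = pick_ddp_port()
    os.environ['MASTER_PORT'] = str(port)  # spawned child processes inherit this value
    print(f"pid={os.getpid()} -> MASTER_PORT={port}")

Because the port is a pure function of the pid, repeated calls in one process stay on the same port while separate launchers land on different ones, which is the collision case the docstring touched in patch 5 refers to.
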
From 4bb38f56efc74f11da3fc5c041225676e78ad5f6 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 17 Jun 2020 23:46:29 -0400
Subject: [PATCH 5/7] init the port using a seed that matches process id for ddp

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 2e45af4913f76..3b94c9ae3b7cb 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -372,7 +372,6 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
     def __set_random_port(self):
         """
         When running DDP NOT managed by SLURM, the ports might collide
-        :return:
         """
         try:
             default_port = os.environ['MASTER_PORT']

From 9b5e9492b2bfff924c9b8ec58f022b60dd59dd1c Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 17 Jun 2020 23:53:00 -0400
Subject: [PATCH 6/7] init the port using a seed that matches process id for ddp

---
 pytorch_lightning/trainer/trainer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d2f505c4aa4dc..13dba2141c3a3 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -32,7 +32,7 @@
 from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
 from pytorch_lightning.trainer.lr_finder import TrainerLRFinderMixin
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities import rank_zero_warn, parsing, rank_zero_info
+from pytorch_lightning.utilities import rank_zero_warn, parsing, rank_zero_info, rank_zero_only

 try:
     from apex import amp
@@ -322,6 +322,12 @@ def __init__(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)

+        # init the default rank if exists
+        if 'LOCAL_RANK' in os.environ:
+            rank_zero_only.rank = os.environ['LOCAL_RANK']
+        if 'SLURM_JOB_ID' in os.environ:
+            rank_zero_only.rank = os.environ['SLURM_JOB_ID']
+
         # Init callbacks
         self.prepare_data_per_node = prepare_data_per_node
         self.callbacks = callbacks or []
From 8a819fe5152bbd20ca82a13552e54fb716069842 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 17 Jun 2020 23:55:41 -0400
Subject: [PATCH 7/7] init the port using a seed that matches process id for ddp

---
 pytorch_lightning/trainer/trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 13dba2141c3a3..0f76c072291f4 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -323,6 +323,8 @@ def __init__(
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)

         # init the default rank if exists
+        # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks
+        # this way we only show it on rank 0
         if 'LOCAL_RANK' in os.environ:
             rank_zero_only.rank = os.environ['LOCAL_RANK']
         if 'SLURM_JOB_ID' in os.environ:
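
Patches 6-7 set rank_zero_only.rank from the environment at the very top of Trainer.__init__ so that NVIDIA flags and other messages emitted during construction only show on rank 0. The sketch below is a simplified stand-in for the rank_zero_only utility, assumed for illustration rather than copied from pytorch_lightning.utilities; it also casts the environment value to int for the comparison, whereas the patch stores the raw string.

import os
from functools import wraps


def rank_zero_only(fn):
    """Run fn only on the process whose recorded rank is 0."""
    @wraps(fn)
    def wrapped(*args, **kwargs):
        if getattr(rank_zero_only, 'rank', 0) == 0:
            return fn(*args, **kwargs)
    return wrapped


# mirror of the added __init__ logic: read the rank from the launcher's
# environment before any rank-guarded message is emitted
if 'LOCAL_RANK' in os.environ:
    rank_zero_only.rank = int(os.environ['LOCAL_RANK'])
if 'SLURM_JOB_ID' in os.environ:
    rank_zero_only.rank = int(os.environ['SLURM_JOB_ID'])


@rank_zero_only
def rank_zero_info(msg: str) -> None:
    print(msg)


if __name__ == '__main__':
    rank_zero_info("only the rank-0 process prints this line")

Setting the rank attribute before any decorated function runs is the whole point of doing it this early in __init__: once rank_zero_only.rank is non-zero, every decorated call on that process becomes a no-op.
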