diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 356e3f02b4049..4cf5e61b5c7eb 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -410,9 +410,6 @@ def train(self):
                 self.interrupted = True
                 self.on_keyboard_interrupt()
 
-                for proc in self.interactive_ddp_procs:
-                    subprocess.Popen.kill(proc)
-
         self.run_training_teardown()
 
     def prepare_train_loop_dataloader(self, train_dataloader):
@@ -853,9 +850,7 @@ def run_training_teardown(self):
         if hasattr(self, '_teardown_already_run') and self._teardown_already_run:
             return
 
-        # clean up dist group
-        if self.use_ddp or self.use_ddp2:
-            torch_distrib.destroy_process_group()
+        self._teardown_already_run = True
 
         # Train end events
         with self.profiler.profile('on_train_end'):
@@ -869,8 +864,16 @@ def run_training_teardown(self):
            self.logger.finalize("success")
 
         # summarize profile results
-        self.profiler.describe()
-        self._teardown_already_run = True
+        if self.global_rank == 0:
+            self.profiler.describe()
+
+        if self.global_rank == 0:
+            for proc in self.interactive_ddp_procs:
+                subprocess.Popen.kill(proc)
+
+        # clean up dist group
+        if self.use_ddp or self.use_ddp2:
+            torch_distrib.destroy_process_group()
 
     def training_forward(self, batch, batch_idx, opt_idx, hiddens):
         """
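
For context, here is a minimal, illustrative sketch (not taken from the patch itself) of the teardown ordering the diff establishes: finish logger finalization while the distributed backend is still alive, restrict the profiler summary and the cleanup of `interactive_ddp_procs` to `global_rank == 0`, and only then destroy the process group. The `teardown_sketch` helper and its `trainer` argument are hypothetical stand-ins for `run_training_teardown` on the trainer mixin.

```python
# Illustrative only: mirrors the post-patch ordering of run_training_teardown.
import subprocess

import torch.distributed as torch_distrib


def teardown_sketch(trainer):
    """Hypothetical helper showing the teardown order introduced by this diff."""
    # guard against running teardown twice
    if getattr(trainer, '_teardown_already_run', False):
        return
    trainer._teardown_already_run = True

    # logger finalization runs first, while the process group is still alive
    if trainer.logger is not None:
        trainer.logger.finalize("success")

    # rank-0-only work: one profiler summary, one kill per spawned DDP process
    if trainer.global_rank == 0:
        trainer.profiler.describe()
        for proc in trainer.interactive_ddp_procs:
            subprocess.Popen.kill(proc)

    # destroy the distributed process group last
    if torch_distrib.is_available() and torch_distrib.is_initialized():
        torch_distrib.destroy_process_group()
```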