From e439d8f0bbdba66e017fdb6638622d89db05ccb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Tue, 6 Jul 2021 19:33:24 +0200
Subject: [PATCH 1/7] move device

---
 pytorch_lightning/accelerators/gpu.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 3348727a36e61..82cb9d8cb2298 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -26,16 +26,18 @@ class GPUAccelerator(Accelerator):
     """ Accelerator for GPU devices. """
 
+    def setup_environment(self) -> None:
+        if "cuda" not in str(self.root_device):
+            raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead")
+        torch.cuda.set_device(self.root_device)
+
     def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None:
         """
         Raises:
             MisconfigurationException:
                 If the selected device is not GPU.
         """
-        if "cuda" not in str(self.root_device):
-            raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead")
         self.set_nvidia_flags(trainer.local_rank)
-        torch.cuda.set_device(self.root_device)
         return super().setup(trainer, model)
 
     def on_train_start(self) -> None:

From c39dd574ed06a6320fc162c5fc67c657adb34667 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Tue, 6 Jul 2021 19:34:37 +0200
Subject: [PATCH 2/7] debug

---
 pytorch_lightning/trainer/trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index b984608c87d6d..8c698eb30029d 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -815,6 +815,8 @@ def _run(self, model: 'pl.LightningModule') -> Optional[Union[_EVALUATE_OUTPUT,
         self.call_hook("on_before_accelerator_backend_setup", model)
         self.accelerator.connect(model)
         self.accelerator.setup_environment()
+        # check if bug is fixed
+        _ = self.log_dir
         self._call_setup_hook(model)  # allow user to setup lightning_module in accelerator environment
 
         # restore modules after setup
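Note on the probe in patch 2 (commentary, not part of the patch series): the added _ = self.log_dir runs immediately after setup_environment(), and patch 3 below guards it behind torch.distributed.is_initialized(), which suggests the access exercises an early cross-rank call. The ordering this series is after can be illustrated with a standalone sketch, assuming a torchrun-style launcher that sets LOCAL_RANK; this is a hypothetical script, not Lightning code:

import os

import torch
import torch.distributed as dist


def init_rank() -> None:
    # With the NCCL backend, collectives that allocate their own buffers use the
    # current CUDA device, so each rank has to pin its GPU before the first
    # cross-rank call, or every process ends up talking through cuda:0.
    local_rank = int(os.environ["LOCAL_RANK"])  # provided by the launcher
    torch.cuda.set_device(local_rank)           # pin the device first ...
    dist.init_process_group(backend="nccl")
    payload = [None]
    dist.broadcast_object_list(payload, src=0)  # ... so an early broadcast like this is safe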
""" def setup_environment(self) -> None: + super().setup_environment() if "cuda" not in str(self.root_device): raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") torch.cuda.set_device(self.root_device) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8c698eb30029d..41ffb4ba1da9e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -816,7 +816,8 @@ def _run(self, model: 'pl.LightningModule') -> Optional[Union[_EVALUATE_OUTPUT, self.accelerator.connect(model) self.accelerator.setup_environment() # check if bug is fixed - _ = self.log_dir + if torch.distributed.is_available() and torch.distributed.is_initialized(): + _ = self.log_dir self._call_setup_hook(model) # allow user to setup lightning_module in accelerator environment # restore modules after setup From 8561edb3b95bdea1b37450e8b073ad80d6616b51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 6 Jul 2021 20:50:13 +0200 Subject: [PATCH 4/7] set_device in ddp plugin --- pytorch_lightning/plugins/training_type/ddp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index a882390b78b0d..8e4f4c0694e67 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -367,8 +367,6 @@ def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, opti prepare_for_backward(self.model, closure_loss) def model_to_device(self): - if self.root_device.type == "cuda": - torch.cuda.set_device(self.root_device) self.model.to(self.root_device) def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> torch.Tensor: From 519d01cee558886bd163cfff4b8012b08a544372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 7 Jul 2021 01:02:20 +0200 Subject: [PATCH 5/7] redundant set device in single device plugin --- pytorch_lightning/plugins/training_type/single_device.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index d4a328902eba0..c1ef9028ceb7f 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -61,9 +61,6 @@ def root_device(self) -> torch.device: return self.device def model_to_device(self) -> None: - if self.on_gpu: - torch.cuda.set_device(self.root_device) - self._model.to(self.root_device) def setup(self, model: torch.nn.Module) -> torch.nn.Module: From 83d6bfaf469776cd5d6d18132ee96e32d7280394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 7 Jul 2021 01:23:23 +0200 Subject: [PATCH 6/7] remove redundant set_device in ddp subclasses --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 -- pytorch_lightning/plugins/training_type/fully_sharded.py | 1 - 2 files changed, 3 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 4d229e4bff43a..e704b662fd6ca 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -339,8 +339,6 @@ def setup_distributed(self): if not self._config_initialized: self._format_config() self._config_initialized = True - if self.on_gpu: - 
From 83d6bfaf469776cd5d6d18132ee96e32d7280394 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 7 Jul 2021 01:23:23 +0200
Subject: [PATCH 6/7] remove redundant set_device in ddp subclasses

---
 pytorch_lightning/plugins/training_type/deepspeed.py     | 2 --
 pytorch_lightning/plugins/training_type/fully_sharded.py | 1 -
 2 files changed, 3 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 4d229e4bff43a..e704b662fd6ca 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -339,8 +339,6 @@ def setup_distributed(self):
         if not self._config_initialized:
             self._format_config()
             self._config_initialized = True
-        if self.on_gpu:
-            torch.cuda.set_device(self.root_device)
 
     def pre_dispatch(self):
         self.init_deepspeed()
diff --git a/pytorch_lightning/plugins/training_type/fully_sharded.py b/pytorch_lightning/plugins/training_type/fully_sharded.py
index 476df9be13cfe..a02be35409dc5 100644
--- a/pytorch_lightning/plugins/training_type/fully_sharded.py
+++ b/pytorch_lightning/plugins/training_type/fully_sharded.py
@@ -118,7 +118,6 @@ def setup_distributed(self) -> None:
                 "You selected accelerator to be `ddp_fully_sharded`, but GPU is not available."
             )
         super().setup_distributed()
-        torch.cuda.set_device(self.root_device)
 
     @contextlib.contextmanager
     def model_sharded_context(self) -> Generator:

From 3f9a9e2b47558a57eb92d53e9d002d1288dfbca8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 7 Jul 2021 02:56:30 +0200
Subject: [PATCH 7/7] reset debug changes

---
 pytorch_lightning/trainer/trainer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 41ffb4ba1da9e..b984608c87d6d 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -815,9 +815,6 @@ def _run(self, model: 'pl.LightningModule') -> Optional[Union[_EVALUATE_OUTPUT,
         self.call_hook("on_before_accelerator_backend_setup", model)
         self.accelerator.connect(model)
         self.accelerator.setup_environment()
-        # check if bug is fixed
-        if torch.distributed.is_available() and torch.distributed.is_initialized():
-            _ = self.log_dir
         self._call_setup_hook(model)  # allow user to setup lightning_module in accelerator environment
 
         # restore modules after setup
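Net effect of the series (an informal summary, not part of the patches): the accelerator becomes the single place where the CUDA device is pinned, the plugin-level model_to_device implementations shrink to a plain .to(self.root_device) call, and the temporary log_dir probe from patches 2 and 3 is removed again in patch 7. A sketch of the resulting GPUAccelerator, assembled from the hunks above; the imports are assumed from the diff context rather than quoted from the file, and unrelated methods are omitted:

import torch

import pytorch_lightning as pl
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.utilities.exceptions import MisconfigurationException


class GPUAccelerator(Accelerator):
    """ Accelerator for GPU devices. """

    def setup_environment(self) -> None:
        # runs once per process, before the setup hooks and before any collective call
        super().setup_environment()
        if "cuda" not in str(self.root_device):
            raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead")
        torch.cuda.set_device(self.root_device)

    def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None:
        # device selection no longer happens here; only the NVIDIA env flags remain
        self.set_nvidia_flags(trainer.local_rank)
        return super().setup(trainer, model)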