From ac09f5f1559f848a0d324320402415ae2facb304 Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Sep 2022 14:53:23 +0300 Subject: [PATCH 1/9] Break hpu graphs into two for better performance Signed-off-by: Jerome --- src/pytorch_lightning/strategies/hpu_parallel.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index e7c18d34713d9..f5e0b9a35f3ad 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -137,10 +137,15 @@ def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] - def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: - # Break lazy accumulation of graph after every step + def on_after_backward(self): + # Break lazy accumulation of graph after fwd+bwd + htcore.mark_step() + + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, + optimizer_closure, on_tpu, using_native_amp, using_lbfgs): + optimizer.step(closure=optimizer_closure) + # Break lazy accumulation of graph after optimizer htcore.mark_step() - return step_output def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: # Break lazy accumulation of graph after every step From 2b4659b1aa998d75c9996d7f52a18da595f7ae49 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 11:58:24 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/pytorch_lightning/strategies/hpu_parallel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index f5e0b9a35f3ad..888ceda57ad37 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -141,8 +141,9 @@ def on_after_backward(self): # Break lazy accumulation of graph after fwd+bwd htcore.mark_step() - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, - optimizer_closure, on_tpu, using_native_amp, using_lbfgs): + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs + ): optimizer.step(closure=optimizer_closure) # Break lazy accumulation of graph after optimizer htcore.mark_step() From 494621affd573d56054da9b7b5d756bc382f9872 Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Sep 2022 14:59:02 +0300 Subject: [PATCH 3/9] Update changelog Signed-off-by: Jerome --- src/pytorch_lightning/CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ecf1ce319aa13..f875d5f62a010 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -158,6 +158,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) +- Fixed an issue wrt performance on hpu backends ([#14656](https://github.com/Lightning-AI/lightning/pull/14656)) + + ## [1.7.5] - 2022-09-06 ### Fixed From 31b5146df0e662235791d46a6e240f7852107d0c Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Sep 2022 15:03:51 +0300 Subject: [PATCH 4/9] Update graph break for single hpu plugins Signed-off-by: Jerome --- src/pytorch_lightning/strategies/single_hpu.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index 5c29829fa6ce9..dfdc9820170c8 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -79,10 +79,15 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: def model_to_device(self) -> None: self.model.to(self.root_device) # type: ignore - def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: - # Break lazy accumulation of graph after every step + def on_after_backward(self): + # Break lazy accumulation of graph after fwd+bwd + htcore.mark_step() + + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, + optimizer_closure, on_tpu, using_native_amp, using_lbfgs): + optimizer.step(closure=optimizer_closure) + # Break lazy accumulation of graph after optimizer htcore.mark_step() - return step_output def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: # Break lazy accumulation of graph after every step From 498ca39280eee09cb449b73a45005d3fdaccc051 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 12:06:10 +0000 Subject: [PATCH 5/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/pytorch_lightning/strategies/single_hpu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index dfdc9820170c8..42b875451c66a 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -83,8 +83,9 @@ def on_after_backward(self): # Break lazy accumulation of graph after fwd+bwd htcore.mark_step() - def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, - optimizer_closure, on_tpu, using_native_amp, using_lbfgs): + def optimizer_step( + self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs + ): optimizer.step(closure=optimizer_closure) # Break lazy accumulation of graph after optimizer htcore.mark_step() From 07157255511ff8d7fe4cd13360695289860b1637 Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Sep 2022 15:26:31 +0300 Subject: [PATCH 6/9] Update optimizer step Signed-off-by: Jerome --- src/pytorch_lightning/strategies/hpu_parallel.py | 16 ++++++++++++---- src/pytorch_lightning/strategies/single_hpu.py | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index 888ceda57ad37..26dc3000cc910 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -13,9 +13,11 @@ # limitations under the License. import logging import os -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Union import torch.distributed +from torch.optim.optimizer import Optimizer +from torch.nn import Module import pytorch_lightning as pl from lightning_lite.utilities.distributed import group as _group @@ -142,11 +144,17 @@ def on_after_backward(self): htcore.mark_step() def optimizer_step( - self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs - ): - optimizer.step(closure=optimizer_closure) + self, + optimizer: Optimizer, + opt_idx: int, + closure: Callable[[], Any], + model: Optional[Union["pl.LightningModule", Module]] = None, + **kwargs: Any, + ) -> Any: + optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs) # Break lazy accumulation of graph after optimizer htcore.mark_step() + return optimizer_output def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: # Break lazy accumulation of graph after every step diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index 42b875451c66a..2b2bf8130e7c6 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, Optional +from torch.optim.optimizer import Optimizer +from typing import Any, Callable, Dict, Optional, Union +from torch.nn import Module import pytorch_lightning as pl from lightning_lite.utilities.types import _DEVICE @@ -84,11 +86,17 @@ def on_after_backward(self): htcore.mark_step() def optimizer_step( - self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs - ): - optimizer.step(closure=optimizer_closure) + self, + optimizer: Optimizer, + opt_idx: int, + closure: Callable[[], Any], + model: Optional[Union["pl.LightningModule", Module]] = None, + **kwargs: Any, + ) -> Any: + optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs) # Break lazy accumulation of graph after optimizer htcore.mark_step() + return optimizer_output def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT: # Break lazy accumulation of graph after every step From 9ab6123f92e14b54ef9996486fe6faacc5cd7448 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 12:29:17 +0000 Subject: [PATCH 7/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/pytorch_lightning/strategies/hpu_parallel.py | 2 +- src/pytorch_lightning/strategies/single_hpu.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index 26dc3000cc910..1d4d929fc7b37 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -16,8 +16,8 @@ from typing import Any, Callable, Dict, List, Optional, Union import torch.distributed -from torch.optim.optimizer import Optimizer from torch.nn import Module +from torch.optim.optimizer import Optimizer import pytorch_lightning as pl from lightning_lite.utilities.distributed import group as _group diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index 2b2bf8130e7c6..6e3504f1f4486 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from torch.optim.optimizer import Optimizer from typing import Any, Callable, Dict, Optional, Union + from torch.nn import Module +from torch.optim.optimizer import Optimizer import pytorch_lightning as pl from lightning_lite.utilities.types import _DEVICE From 6aa5dc083aa955a6f95b947e9c48c85e5139fef3 Mon Sep 17 00:00:00 2001 From: Jerome Date: Mon, 12 Sep 2022 16:23:05 +0300 Subject: [PATCH 8/9] Resolve mypy errors Signed-off-by: Jerome --- src/pytorch_lightning/strategies/hpu_parallel.py | 2 +- src/pytorch_lightning/strategies/single_hpu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index eec6e6eeaad57..fdca6813c44f3 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -139,7 +139,7 @@ def broadcast(self, obj: object, src: int = 0) -> object: # type: ignore broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] - def on_after_backward(self): + def on_after_backward(self) -> None: # Break lazy accumulation of graph after fwd+bwd htcore.mark_step() diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index 9c70ea366fcad..5d6ead0358744 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -82,7 +82,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: def model_to_device(self) -> None: self.model.to(self.root_device) # type: ignore - def on_after_backward(self): + def on_after_backward(self) -> None: # Break lazy accumulation of graph after fwd+bwd htcore.mark_step() From bc38e8ec9f7e65f8c24ffbf53733579b8b8b8813 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 13 Sep 2022 12:14:09 +0200 Subject: [PATCH 9/9] Update src/pytorch_lightning/CHANGELOG.md Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- src/pytorch_lightning/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 2cc4e28653e21..9ff8b1d6feaba 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -169,7 +169,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue to keep downscaling the batch size in case there hasn't been even a single successful optimal batch size with `mode="power"` ([#14372](https://github.com/Lightning-AI/lightning/pull/14372)) -- Fixed an issue wrt performance on hpu backends ([#14656](https://github.com/Lightning-AI/lightning/pull/14656)) +- Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656)) - Fixed compatibility when `torch.distributed` is not available ([#14454](https://github.com/Lightning-AI/lightning/pull/14454))