From 0f5218ce91f1d554c320de902e65a03357b07443 Mon Sep 17 00:00:00 2001
From: Jerome
Date: Thu, 22 Jun 2023 08:16:55 +0300
Subject: [PATCH 1/5] Add graph breaks with lazy mode in test/eval phase

Signed-off-by: Jerome
---
 CHANGELOG.md                       | 24 +++++++++++++++++++
 examples/pytorch/mnist_sample.py   | 19 ++++++++++++---
 src/lightning_habana/__about__.py  |  2 +-
 .../pytorch/strategies/parallel.py | 17 +++++++++++++
 .../pytorch/strategies/single.py   | 18 ++++++++++++++
 5 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0355c31c..2a5808cd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased] - 2023-MM-DD
+
+### Added
+
+-
+
+### Changed
+
+-
+
+### Fixed
+
+- Fixed `AttributeError` ([#43](https://github.com/Lightning-AI/lightning-Habana/pull/43))
+- Fixed graph breaks in test/val phases in lazy mode ([#45](https://github.com/Lightning-AI/lightning-Habana/pull/45))
+
+### Removed
+
+-
+
+### Deprecated
+
+-
+
+
 ## [1.0.0] - 2023-06-14
 
 ### Added
 
diff --git a/examples/pytorch/mnist_sample.py b/examples/pytorch/mnist_sample.py
index 76463e64..2a2fb920 100644
--- a/examples/pytorch/mnist_sample.py
+++ b/examples/pytorch/mnist_sample.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
 import torch
 from lightning_utilities import module_available
 from torch.nn import functional as F  # noqa: N812
@@ -24,7 +25,7 @@
 from pytorch_lightning.demos.mnist_datamodule import MNISTDataModule
 
 from lightning_habana.pytorch.accelerator import HPUAccelerator
-from lightning_habana.pytorch.strategies import SingleHPUStrategy
+from lightning_habana.pytorch.strategies import SingleHPUStrategy, HPUParallelStrategy
 
 
 class LitClassifier(LightningModule):
@@ -60,9 +61,21 @@ def configure_optimizers(self):
 
 
 if __name__ == "__main__":
-    dm = MNISTDataModule(batch_size=32)
+    parser = argparse.ArgumentParser(description="MNIST on HPU",
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--hpus', default=1, type=int, help='Number of hpus to be used for training')
+    parser.add_argument('-b', '--batch-size', default=32, type=int)
+    args = parser.parse_args()
+    dm = MNISTDataModule(batch_size=args.batch_size)
     model = LitClassifier()
-    trainer = Trainer(fast_dev_run=True, accelerator=HPUAccelerator(), devices=1, strategy=SingleHPUStrategy())
+
+    hpus = args.hpus
+    _strategy = SingleHPUStrategy()
+    if hpus > 1:
+        parallel_hpus = [torch.device("hpu")] * hpus
+        _strategy = HPUParallelStrategy(parallel_devices=parallel_hpus)
+    trainer = Trainer(fast_dev_run=True, accelerator=HPUAccelerator(), devices=hpus, strategy=_strategy)
+
     trainer.fit(model, datamodule=dm)
     trainer.test(model, datamodule=dm)
 
diff --git a/src/lightning_habana/__about__.py b/src/lightning_habana/__about__.py
index 4eae40dd..c3f97a29 100644
--- a/src/lightning_habana/__about__.py
+++ b/src/lightning_habana/__about__.py
@@ -1,4 +1,4 @@
-__version__ = "1.0.0"
+__version__ = "1.0.1.dev"
 __author__ = "Lightning-AI et al."
 __author_email__ = "name@lightning.ai"
 __license__ = "Apache-2.0"
diff --git a/src/lightning_habana/pytorch/strategies/parallel.py b/src/lightning_habana/pytorch/strategies/parallel.py
index a59a2e07..7baec9f9 100644
--- a/src/lightning_habana/pytorch/strategies/parallel.py
+++ b/src/lightning_habana/pytorch/strategies/parallel.py
@@ -28,6 +28,7 @@
     from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
     from lightning.pytorch.plugins.precision import PrecisionPlugin
     from lightning.pytorch.strategies.ddp import DDPStrategy
+    from lightning.pytorch.utilities.types import STEP_OUTPUT
 elif module_available("pytorch_lightning"):
     from lightning_fabric.plugins import CheckpointIO, ClusterEnvironment
     from lightning_fabric.utilities.distributed import _distributed_available
@@ -38,6 +39,7 @@
     from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO
     from pytorch_lightning.plugins.precision import PrecisionPlugin
     from pytorch_lightning.strategies.ddp import DDPStrategy
+    from pytorch_lightning.utilities.types import STEP_OUTPUT
 else:
     raise ModuleNotFoundError("You are missing `lightning` or `pytorch-lightning` package, please install it.")
 from torch.nn import Module
@@ -138,6 +140,21 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
+    def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
+        # Break lazy accumulation of graph after every step
+        htcore.mark_step()
+        return super().validation_step(*args, **kwargs)
+
+    def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
+        # Break lazy accumulation of graph after every step
+        htcore.mark_step()
+        return super().validation_step(*args, **kwargs)
+
+    def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
+        # Break lazy accumulation of graph after every step
+        htcore.mark_step()
+        return super().predict_step(*args, **kwargs)
+
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
         strategy_registry.register(
diff --git a/src/lightning_habana/pytorch/strategies/single.py b/src/lightning_habana/pytorch/strategies/single.py
index fbbe22c6..d92f1d1d 100644
--- a/src/lightning_habana/pytorch/strategies/single.py
+++ b/src/lightning_habana/pytorch/strategies/single.py
@@ -25,6 +25,7 @@
     from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
     from lightning.pytorch.plugins.precision import PrecisionPlugin
     from lightning.pytorch.strategies.single_device import SingleDeviceStrategy
+    from lightning.pytorch.utilities.types import STEP_OUTPUT
 elif module_available("pytorch_lightning"):
     from lightning_fabric.plugins import CheckpointIO
     from lightning_fabric.utilities.types import _DEVICE
@@ -34,6 +35,7 @@
     from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO
     from pytorch_lightning.plugins.precision import PrecisionPlugin
     from pytorch_lightning.strategies.single_device import SingleDeviceStrategy
+    from pytorch_lightning.utilities.types import STEP_OUTPUT
 else:
     raise ModuleNotFoundError("You are missing `lightning` or `pytorch-lightning` package, please install it.")
 
@@ -108,6 +110,22 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
+
+    def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
+        # Break lazy accumulation of graph after every step
+        htcore.mark_step()
+        return super().validation_step(*args, **kwargs)
+
+    def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
+        # Break lazy accumulation of graph after every step
+        htcore.mark_step()
+        return super().validation_step(*args, **kwargs)
+
+    def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
+        # Break lazy accumulation of graph after every step
+        htcore.mark_step()
+        return super().predict_step(*args, **kwargs)
+
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
         strategy_registry.register(

From 209e92b75d1f92aa9988311ccbf5ff3ca980343b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 22 Jun 2023 05:21:40 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 examples/pytorch/mnist_sample.py                  | 11 +++++------
 src/lightning_habana/pytorch/strategies/single.py |  1 -
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/examples/pytorch/mnist_sample.py b/examples/pytorch/mnist_sample.py
index 2a2fb920..75fc6fd0 100644
--- a/examples/pytorch/mnist_sample.py
+++ b/examples/pytorch/mnist_sample.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import argparse
+
 import torch
 from lightning_utilities import module_available
 from torch.nn import functional as F  # noqa: N812
@@ -25,7 +26,7 @@
 from pytorch_lightning.demos.mnist_datamodule import MNISTDataModule
 
 from lightning_habana.pytorch.accelerator import HPUAccelerator
-from lightning_habana.pytorch.strategies import SingleHPUStrategy, HPUParallelStrategy
+from lightning_habana.pytorch.strategies import HPUParallelStrategy, SingleHPUStrategy
 
 
 class LitClassifier(LightningModule):
@@ -61,10 +62,9 @@ def configure_optimizers(self):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="MNIST on HPU",
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--hpus', default=1, type=int, help='Number of hpus to be used for training')
-    parser.add_argument('-b', '--batch-size', default=32, type=int)
+    parser = argparse.ArgumentParser(description="MNIST on HPU", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--hpus", default=1, type=int, help="Number of hpus to be used for training")
+    parser.add_argument("-b", "--batch-size", default=32, type=int)
     args = parser.parse_args()
     dm = MNISTDataModule(batch_size=args.batch_size)
     model = LitClassifier()
@@ -76,6 +76,5 @@ def configure_optimizers(self):
         _strategy = HPUParallelStrategy(parallel_devices=parallel_hpus)
     trainer = Trainer(fast_dev_run=True, accelerator=HPUAccelerator(), devices=hpus, strategy=_strategy)
-
     trainer.fit(model, datamodule=dm)
     trainer.test(model, datamodule=dm)
 
diff --git a/src/lightning_habana/pytorch/strategies/single.py b/src/lightning_habana/pytorch/strategies/single.py
index d92f1d1d..f7167c76 100644
--- a/src/lightning_habana/pytorch/strategies/single.py
+++ b/src/lightning_habana/pytorch/strategies/single.py
@@ -110,7 +110,6 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-
     def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()

From 4060467637eb4802353961f03d5adcf366c883ca Mon Sep 17 00:00:00 2001
From: Jerome
Date: Thu, 22 Jun 2023 09:50:07 +0300
Subject: [PATCH 3/5] Update hooks signature

Signed-off-by: Jerome
---
 .../pytorch/strategies/parallel.py                | 14 ++++++--------
 src/lightning_habana/pytorch/strategies/single.py | 14 ++++++--------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/src/lightning_habana/pytorch/strategies/parallel.py b/src/lightning_habana/pytorch/strategies/parallel.py
index 7baec9f9..c3601e78 100644
--- a/src/lightning_habana/pytorch/strategies/parallel.py
+++ b/src/lightning_habana/pytorch/strategies/parallel.py
@@ -28,7 +28,6 @@
     from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
     from lightning.pytorch.plugins.precision import PrecisionPlugin
     from lightning.pytorch.strategies.ddp import DDPStrategy
-    from lightning.pytorch.utilities.types import STEP_OUTPUT
 elif module_available("pytorch_lightning"):
     from lightning_fabric.plugins import CheckpointIO, ClusterEnvironment
     from lightning_fabric.utilities.distributed import _distributed_available
@@ -39,7 +38,6 @@
     from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO
     from pytorch_lightning.plugins.precision import PrecisionPlugin
     from pytorch_lightning.strategies.ddp import DDPStrategy
-    from pytorch_lightning.utilities.types import STEP_OUTPUT
 else:
     raise ModuleNotFoundError("You are missing `lightning` or `pytorch-lightning` package, please install it.")
 from torch.nn import Module
@@ -140,20 +138,20 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-    def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
+    def validation_step(self, batch, batch_idx):
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().validation_step(*args, **kwargs)
+        return super().validation_step(batch, batch_idx)
 
-    def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
+    def test_step(self, batch, batch_idx):
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().validation_step(*args, **kwargs)
+        return super().test_step(batch, batch_idx)
 
-    def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
+    def predict_step(self, batch, batch_idx):
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().predict_step(*args, **kwargs)
+        return super().predict_step(batch, batch_idx)
 
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
diff --git a/src/lightning_habana/pytorch/strategies/single.py b/src/lightning_habana/pytorch/strategies/single.py
index f7167c76..5b8eba6c 100644
--- a/src/lightning_habana/pytorch/strategies/single.py
+++ b/src/lightning_habana/pytorch/strategies/single.py
@@ -25,7 +25,6 @@
     from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
     from lightning.pytorch.plugins.precision import PrecisionPlugin
     from lightning.pytorch.strategies.single_device import SingleDeviceStrategy
-    from lightning.pytorch.utilities.types import STEP_OUTPUT
 elif module_available("pytorch_lightning"):
     from lightning_fabric.plugins import CheckpointIO
     from lightning_fabric.utilities.types import _DEVICE
@@ -35,7 +34,6 @@
     from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO
     from pytorch_lightning.plugins.precision import PrecisionPlugin
     from pytorch_lightning.strategies.single_device import SingleDeviceStrategy
-    from pytorch_lightning.utilities.types import STEP_OUTPUT
 else:
     raise ModuleNotFoundError("You are missing `lightning` or `pytorch-lightning` package, please install it.")
 
@@ -108,20 +108,20 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-    def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
+    def validation_step(self, batch, batch_idx):
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().validation_step(*args, **kwargs)
+        return super().validation_step(batch, batch_idx)
 
-    def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
+    def test_step(self, batch, batch_idx):
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().validation_step(*args, **kwargs)
+        return super().test_step(batch, batch_idx)
 
-    def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
+    def predict_step(self, batch, batch_idx):
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().predict_step(*args, **kwargs)
+        return super().predict_step(batch, batch_idx)
 
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
         strategy_registry.register(

From a9aa6b65a4b11c05e71237a790547e6fc1effb31 Mon Sep 17 00:00:00 2001
From: Jerome
Date: Thu, 22 Jun 2023 10:22:53 +0300
Subject: [PATCH 4/5] Resolve mypy errors

Signed-off-by: Jerome
---
 src/lightning_habana/pytorch/strategies/parallel.py | 6 +++---
 src/lightning_habana/pytorch/strategies/single.py   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lightning_habana/pytorch/strategies/parallel.py b/src/lightning_habana/pytorch/strategies/parallel.py
index c3601e78..4279cabc 100644
--- a/src/lightning_habana/pytorch/strategies/parallel.py
+++ b/src/lightning_habana/pytorch/strategies/parallel.py
@@ -138,17 +138,17 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-    def validation_step(self, batch, batch_idx):
+    def validation_step(self, batch, batch_idx) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().validation_step(batch, batch_idx)
 
-    def test_step(self, batch, batch_idx):
+    def test_step(self, batch, batch_idx) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().test_step(batch, batch_idx)
 
-    def predict_step(self, batch, batch_idx):
+    def predict_step(self, batch, batch_idx) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().predict_step(batch, batch_idx)
diff --git a/src/lightning_habana/pytorch/strategies/single.py b/src/lightning_habana/pytorch/strategies/single.py
index 5b8eba6c..69e9c4c5 100644
--- a/src/lightning_habana/pytorch/strategies/single.py
+++ b/src/lightning_habana/pytorch/strategies/single.py
@@ -108,17 +108,17 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-    def validation_step(self, batch, batch_idx):
+    def validation_step(self, batch, batch_idx) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().validation_step(batch, batch_idx)
 
-    def test_step(self, batch, batch_idx):
+    def test_step(self, batch, batch_idx) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().test_step(batch, batch_idx)
 
-    def predict_step(self, batch, batch_idx):
+    def predict_step(self, batch, batch_idx) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().predict_step(batch, batch_idx)

From 7ff8768c40a5d7c51e31fb1ca16a3725c95328e9 Mon Sep 17 00:00:00 2001
From: Jerome
Date: Thu, 22 Jun 2023 10:34:12 +0300
Subject: [PATCH 5/5] Correct mypy errors

Signed-off-by: Jerome
---
 src/lightning_habana/pytorch/strategies/parallel.py | 6 +++---
 src/lightning_habana/pytorch/strategies/single.py   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/lightning_habana/pytorch/strategies/parallel.py b/src/lightning_habana/pytorch/strategies/parallel.py
index 4279cabc..4414d456 100644
--- a/src/lightning_habana/pytorch/strategies/parallel.py
+++ b/src/lightning_habana/pytorch/strategies/parallel.py
@@ -138,17 +138,17 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-    def validation_step(self, batch, batch_idx) -> Any:
+    def validation_step(self, batch: Any, batch_idx: int) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().validation_step(batch, batch_idx)
 
-    def test_step(self, batch, batch_idx) -> Any:
+    def test_step(self, batch: Any, batch_idx: int) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().test_step(batch, batch_idx)
 
-    def predict_step(self, batch, batch_idx) -> Any:
+    def predict_step(self, batch: Any, batch_idx: int) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().predict_step(batch, batch_idx)
diff --git a/src/lightning_habana/pytorch/strategies/single.py b/src/lightning_habana/pytorch/strategies/single.py
index 69e9c4c5..faccd67d 100644
--- a/src/lightning_habana/pytorch/strategies/single.py
+++ b/src/lightning_habana/pytorch/strategies/single.py
@@ -108,17 +108,17 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-    def validation_step(self, batch, batch_idx) -> Any:
+    def validation_step(self, batch: Any, batch_idx: int) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().validation_step(batch, batch_idx)
 
-    def test_step(self, batch, batch_idx) -> Any:
+    def test_step(self, batch: Any, batch_idx: int) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().test_step(batch, batch_idx)
 
-    def predict_step(self, batch, batch_idx) -> Any:
+    def predict_step(self, batch: Any, batch_idx: int) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
         return super().predict_step(batch, batch_idx)
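
Across all five patches the change is one pattern: each evaluation hook calls htcore.mark_step() to flush the graph that HPU lazy mode has accumulated so far, then defers to the parent strategy. A minimal self-contained sketch of that pattern follows; it assumes a Gaudi machine with SynapseAI installed so that habana_frameworks is importable, and the subclass and its name are hypothetical, not part of the patches:

    # Sketch only: mirrors the graph-break pattern from the patches above.
    from typing import Any

    import habana_frameworks.torch.core as htcore

    from lightning_habana.pytorch.strategies import SingleHPUStrategy


    class GraphBreakHPUStrategy(SingleHPUStrategy):  # hypothetical subclass
        def test_step(self, batch: Any, batch_idx: int) -> Any:
            # In lazy mode, ops are queued rather than executed; mark_step()
            # cuts the accumulated graph here so it cannot keep growing
            # across evaluation steps.
            htcore.mark_step()
            return super().test_step(batch, batch_idx)

Without the mark_step() call, every test/val/predict step would keep appending to the same lazy graph, which is the behaviour the series fixes inside the built-in strategies.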
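
The reworked example script exposes the strategy selection through its CLI. A hypothetical invocation on a machine with eight HPUs, using exactly the flags the example adds, would be:

    python examples/pytorch/mnist_sample.py --hpus 8 --batch-size 64

With the default --hpus 1 the script keeps the original single-device path via SingleHPUStrategy; any value above 1 switches to HPUParallelStrategy with one torch.device("hpu") entry per card.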