Lightning-AI · jerome-habana · Jun 26, 2023 · Jun 22, 2023 · Jun 22, 2023 · Jun 22, 2023
@@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased] - 2023-MM-DD
+
+### Added
+
+-
+
+### Changed
+
+-
+
+### Fixed
+
+- Fixed `AttributeError` ([#43](https://github.com/Lightning-AI/lightning-Habana/pull/43))
+- Fixed graph breaks in test/val phases in lazy mode ([#45](https://github.com/Lightning-AI/lightning-Habana/pull/45))
+
+### Removed
+
+-
+
+### Deprecated
+
+-
+
+
 ## [1.0.0] - 2023-06-14
 
 ### Added

@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
+
 import torch
 from lightning_utilities import module_available
 from torch.nn import functional as F  # noqa: N812
@@ -24,7 +26,7 @@
     from pytorch_lightning.demos.mnist_datamodule import MNISTDataModule
 
 from lightning_habana.pytorch.accelerator import HPUAccelerator
-from lightning_habana.pytorch.strategies import SingleHPUStrategy
+from lightning_habana.pytorch.strategies import HPUParallelStrategy, SingleHPUStrategy
 
 
 class LitClassifier(LightningModule):
@@ -60,9 +62,19 @@ def configure_optimizers(self):
 
 
 if __name__ == "__main__":
-    dm = MNISTDataModule(batch_size=32)
+    parser = argparse.ArgumentParser(description="MNIST on HPU", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--hpus", default=1, type=int, help="Number of hpus to be used for training")
+    parser.add_argument("-b", "--batch-size", default=32, type=int)
+    args = parser.parse_args()
+    dm = MNISTDataModule(batch_size=args.batch_size)
     model = LitClassifier()
-    trainer = Trainer(fast_dev_run=True, accelerator=HPUAccelerator(), devices=1, strategy=SingleHPUStrategy())
+
+    hpus = args.hpus
+    _strategy = SingleHPUStrategy()
+    if hpus > 1:
+        parallel_hpus = [torch.device("hpu")] * hpus
+        _strategy = HPUParallelStrategy(parallel_devices=parallel_hpus)
+    trainer = Trainer(fast_dev_run=True, accelerator=HPUAccelerator(), devices=hpus, strategy=_strategy)
 
     trainer.fit(model, datamodule=dm)
     trainer.test(model, datamodule=dm)
@@ -1,4 +1,4 @@
-__version__ = "1.0.0"
+__version__ = "1.0.1.dev"
 __author__ = "Lightning-AI et al."
 __author_email__ = "name@lightning.ai"
 __license__ = "Apache-2.0"

@@ -138,6 +138,21 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
+    def validation_step(self, batch: Any, batch_idx: int) -> Any:
+        # Call mark_step() before delegating so the lazy graph is broken at each validation step
+        htcore.mark_step()
+        return super().validation_step(batch, batch_idx)
+
+    def test_step(self, batch: Any, batch_idx: int) -> Any:
+        # Call mark_step() before delegating so the lazy graph is broken at each test step
+        htcore.mark_step()
+        return super().test_step(batch, batch_idx)
+
+    def predict_step(self, batch: Any, batch_idx: int) -> Any:
+        # Call mark_step() before delegating so the lazy graph is broken at each predict step
+        htcore.mark_step()
+        return super().predict_step(batch, batch_idx)
+
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
         strategy_registry.register(

@@ -108,6 +108,21 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
+    def validation_step(self, batch: Any, batch_idx: int) -> Any:
+        # Call mark_step() before delegating so the lazy graph is broken at each validation step
+        htcore.mark_step()
+        return super().validation_step(batch, batch_idx)
+
+    def test_step(self, batch: Any, batch_idx: int) -> Any:
+        # Call mark_step() before delegating so the lazy graph is broken at each test step
+        htcore.mark_step()
+        return super().test_step(batch, batch_idx)
+
+    def predict_step(self, batch: Any, batch_idx: int) -> Any:
+        # Call mark_step() before delegating so the lazy graph is broken at each predict step
+        htcore.mark_step()
+        return super().predict_step(batch, batch_idx)
+
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
         strategy_registry.register(