diff --git a/docs/source-pytorch/accelerators/hpu_basic.rst b/docs/source-pytorch/accelerators/hpu_basic.rst
index b222782dfc6f5..a6c20414a7a02 100644
--- a/docs/source-pytorch/accelerators/hpu_basic.rst
+++ b/docs/source-pytorch/accelerators/hpu_basic.rst
@@ -79,5 +79,4 @@ Known limitations
 -----------------
 
 * `Habana dataloader `__ is not supported.
-* :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is not supported.
 * :func:`torch.inference_mode` is not supported
diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index 0e08683211431..3b1c0e6b43707 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -66,3 +66,34 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins
     trainer.fit(model, datamodule=dm)
 
 For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `__.
+
+----
+
+Enabling DeviceStatsMonitor with HPUs
+----------------------------------------
+
+:class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is a callback that automatically monitors and logs device stats during training.
+It can also be used when training with HPUs, where it returns a map of the following metrics as ``uint64`` values (memory amounts are in bytes):
+
+- **Limit**: amount of total memory on HPU device.
+- **InUse**: amount of allocated memory at any instance.
+- **MaxInUse**: amount of total active memory allocated.
+- **NumAllocs**: number of allocations.
+- **NumFrees**: number of freed chunks.
+- **ActiveAllocs**: number of active allocations.
+- **MaxAllocSize**: maximum allocated size.
+- **TotalSystemAllocs**: total number of system allocations.
+- **TotalSystemFrees**: total number of system frees.
+- **TotalActiveAllocs**: total number of active allocations.
+
+The snippet below shows how to enable ``DeviceStatsMonitor``:
+
+.. code-block:: python
+
+    from pytorch_lightning import Trainer
+    from pytorch_lightning.callbacks import DeviceStatsMonitor
+
+    device_stats = DeviceStatsMonitor()
+    trainer = Trainer(accelerator="hpu", callbacks=[device_stats])
+
+For more details, please refer to `Memory Stats APIs `__.
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index ea649a9b65236..cf370dfae2f88 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -111,6 +111,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added support for async checkpointing ([#13658](https://github.com/Lightning-AI/lightning/pull/13658))
 
+- Added support for HPU Device stats monitor ([#13819](https://github.com/Lightning-AI/lightning/pull/13819))
+
+
 ### Changed
 
 - `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642))
diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py
index 686bf6bb9452d..8fc242fa55f20 100644
--- a/src/pytorch_lightning/accelerators/hpu.py
+++ b/src/pytorch_lightning/accelerators/hpu.py
@@ -39,9 +39,24 @@ def setup_environment(self, root_device: torch.device) -> None:
             raise MisconfigurationException(f"Device should be HPU, got {root_device} instead.")
 
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        """HPU device stats aren't supported yet."""
-        rank_zero_debug("HPU device stats aren't supported yet.")
-        return {}
+        """Returns a map of the following metrics with their values:
+
+        - Limit: amount of total memory on HPU device.
+        - InUse: amount of allocated memory at any instance.
+        - MaxInUse: amount of total active memory allocated.
+        - NumAllocs: number of allocations.
+        - NumFrees: number of freed chunks.
+        - ActiveAllocs: number of active allocations.
+        - MaxAllocSize: maximum allocated size.
+        - TotalSystemAllocs: total number of system allocations.
+        - TotalSystemFrees: total number of system frees.
+        - TotalActiveAllocs: total number of active allocations.
+        """
+        try:
+            return torch_hpu.hpu.memory_stats(device)
+        except (AttributeError, NameError):
+            rank_zero_debug("HPU `get_device_stats` failed")
+            return {}
 
     @staticmethod
     def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]:
diff --git a/tests/tests_pytorch/accelerators/test_hpu.py b/tests/tests_pytorch/accelerators/test_hpu.py
index 0ef63de417907..4947000b47162 100644
--- a/tests/tests_pytorch/accelerators/test_hpu.py
+++ b/tests/tests_pytorch/accelerators/test_hpu.py
@@ -303,3 +303,23 @@ def training_epoch_end(self, outputs) -> None:
 
     trainer.fit(model)
     assert all(model.optims)
+
+
+@RunIf(hpu=True)
+def test_hpu_device_stats_monitor(tmpdir):
+
+    hpu_stats = HPUAccelerator().get_device_stats("hpu")
+    fields = [
+        "Limit",
+        "InUse",
+        "MaxInUse",
+        "NumAllocs",
+        "NumFrees",
+        "ActiveAllocs",
+        "MaxAllocSize",
+        "TotalSystemAllocs",
+        "TotalSystemFrees",
+        "TotalActiveAllocs",
+    ]
+    for f in fields:
+        assert any(f in h for h in hpu_stats.keys())
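Taken together, the pieces in this diff can be exercised as in the sketch below. This is a sketch rather than part of the change itself: it assumes a Gaudi host where the ``hpu`` accelerator and the Habana PyTorch bridge are available, and it only combines the callback usage from the new docs with the direct query performed by the new test.

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.accelerators import HPUAccelerator
    from pytorch_lightning.callbacks import DeviceStatsMonitor

    # Direct query, mirroring the new test: returns the uint64 memory-stats map
    # (Limit, InUse, MaxInUse, ...), or an empty dict if the Habana bridge is missing.
    stats = HPUAccelerator().get_device_stats("hpu")
    for key, value in stats.items():
        if "Limit" in key or "InUse" in key:
            print(f"{key}: {value} bytes")

    # Callback usage, mirroring the new docs: the same map is logged to the
    # trainer's loggers during training.
    trainer = Trainer(accelerator="hpu", devices=1, callbacks=[DeviceStatsMonitor()])

Because ``get_device_stats`` catches ``AttributeError`` and ``NameError``, the direct query degrades to an empty dict on machines without the Habana bridge instead of raising.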