From a07c9f30f2fb52602c0cc2360a6c17fc76a68bd5 Mon Sep 17 00:00:00 2001
From: Jerome
Date: Mon, 25 Jul 2022 08:13:46 +0300
Subject: [PATCH 1/9] Added support for HPU device stats monitor

Signed-off-by: Jerome
---
 .../source-pytorch/accelerators/hpu_basic.rst |  1 -
 .../accelerators/hpu_intermediate.rst         | 28 +++++++++++++++++++
 src/pytorch_lightning/CHANGELOG.md            |  3 ++
 src/pytorch_lightning/accelerators/hpu.py     | 21 ++++++++++++--
 tests/tests_pytorch/accelerators/test_hpu.py  | 20 +++++++++++++
 5 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/docs/source-pytorch/accelerators/hpu_basic.rst b/docs/source-pytorch/accelerators/hpu_basic.rst
index b222782dfc6f5..a6c20414a7a02 100644
--- a/docs/source-pytorch/accelerators/hpu_basic.rst
+++ b/docs/source-pytorch/accelerators/hpu_basic.rst
@@ -79,5 +79,4 @@ Known limitations
 -----------------

 * `Habana dataloader `__ is not supported.
-* :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is not supported.
 * :func:`torch.inference_mode` is not supported
diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index 0e08683211431..46d2fe1a7bae1 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -66,3 +66,31 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins
     trainer.fit(model, datamodule=dm)

 For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `__.
+
+----
+
+Enabling DeviceStatsMonitor
+---------------------------
+
+DeviceStatsMonitor is a callback that automatically monitors and logs device stats during training stage.
+This callback can be passed for training with HPUs. It returns a map of the following metrics with their values:
+
+ **Limit** : amount of total memory on HPU device,
+ **InUse** : amount of allocated memory at any instance.,
+ **MaxInUse** : amount of total active memory allocated,
+ **NumAllocs** : number of allocations,
+ **NumFrees** : number of freed chunks,
+ **ActiveAllocs** : number of active allocations,
+ **MaxAllocSize** : maximum allocated size,
+ **TotalSystemAllocs** : total number of system allocations,
+ **TotalSystemFrees** : total number of system frees,
+ **TotalActiveAllocs** : total number of active allocations
+
+The below snippet shows how DeviceStatsMonitor can be enabled.
+
+.. code-block:: python
+
+    from pytorch_lightning import Trainer
+    from pytorch_lightning.callbacks import DeviceStatsMonitor
+    device_stats = DeviceStatsMonitor()
+    trainer = Trainer(callbacks=[device_stats])
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index af53c9b063853..d5e3fedcffebf 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -108,6 +108,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for DDP Fork ([#13405](https://github.com/PyTorchLightning/pytorch-lightning/pull/13405))

+- Added support for HPU Device stats monitor ([#13405](https://github.com/PyTorchLightning/pytorch-lightning/pull/13405))
+
+
 ### Changed

 - Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527))
diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py
index 686bf6bb9452d..a78c00ac6e3d0 100644
--- a/src/pytorch_lightning/accelerators/hpu.py
+++ b/src/pytorch_lightning/accelerators/hpu.py
@@ -39,9 +39,24 @@ def setup_environment(self, root_device: torch.device) -> None:
             raise MisconfigurationException(f"Device should be HPU, got {root_device} instead.")

     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        """HPU device stats aren't supported yet."""
-        rank_zero_debug("HPU device stats aren't supported yet.")
-        return {}
+        """
+        Returns a map of the following metrics with their values:
+            Limit,
+            InUse,
+            MaxInUse,
+            NumAllocs,
+            NumFrees,
+            ActiveAllocs,
+            MaxAllocSize,
+            TotalSystemAllocs,
+            TotalSystemFrees,
+            TotalActiveAllocs
+        """
+        try:
+            return torch_hpu.hpu.memory_stats(device)
+        except (AttributeError, NameError):
+            rank_zero_debug("HPU `get_device_stats` failed")
+            return {}

     @staticmethod
     def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]:
diff --git a/tests/tests_pytorch/accelerators/test_hpu.py b/tests/tests_pytorch/accelerators/test_hpu.py
index 0ef63de417907..5433a0ffed65d 100644
--- a/tests/tests_pytorch/accelerators/test_hpu.py
+++ b/tests/tests_pytorch/accelerators/test_hpu.py
@@ -303,3 +303,23 @@ def training_epoch_end(self, outputs) -> None:

     trainer.fit(model)
     assert all(model.optims)
+
+
+@RunIf(hpu=True)
+def test_hpu_device_stats_monitor(tmpdir):
+
+    hpu_stats = HPUAccelerator().get_device_stats("hpu")
+    fields = [
+        "Limit",
+        "InUse",
+        "MaxInUse",
+        "NumAllocs",
+        "NumFrees",
+        "ActiveAllocs",
+        "MaxAllocSize",
+        "TotalSystemAllocs",
+        "TotalSystemFrees",
+        "TotalActiveAllocs"
+    ]
+    for f in fields:
+        assert any(f in h for h in hpu_stats.keys())

From d497dea2f44e0cffdf1f40adb8a893fb7f00c29b Mon Sep 17 00:00:00 2001
From: Jerome
Date: Mon, 25 Jul 2022 08:16:18 +0300
Subject: [PATCH 2/9] Update changelog

Signed-off-by: Jerome
---
 src/pytorch_lightning/CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index d5e3fedcffebf..029396c9c95d1 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -108,7 +108,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added support for DDP Fork ([#13405](https://github.com/PyTorchLightning/pytorch-lightning/pull/13405))

-- Added support for HPU Device stats monitor ([#13405](https://github.com/PyTorchLightning/pytorch-lightning/pull/13405))
+- Added support for HPU Device stats monitor ([#13819](https://github.com/PyTorchLightning/pytorch-lightning/pull/13819))


 ### Changed

From 6850e8ae28b04bc496fdda0648353e50b3ea2aec Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 Jul 2022 05:18:13 +0000
Subject: [PATCH 3/9] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../accelerators/hpu_intermediate.rst         |  1 +
 src/pytorch_lightning/accelerators/hpu.py     | 24 +++++++++----------
 tests/tests_pytorch/accelerators/test_hpu.py  |  4 ++--
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index 46d2fe1a7bae1..fc04a84262311 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -92,5 +92,6 @@ The below snippet shows how DeviceStatsMonitor can be enabled.

     from pytorch_lightning import Trainer
     from pytorch_lightning.callbacks import DeviceStatsMonitor
+
     device_stats = DeviceStatsMonitor()
     trainer = Trainer(callbacks=[device_stats])
diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py
index a78c00ac6e3d0..934c5a1dbc4ca 100644
--- a/src/pytorch_lightning/accelerators/hpu.py
+++ b/src/pytorch_lightning/accelerators/hpu.py
@@ -39,18 +39,18 @@ def setup_environment(self, root_device: torch.device) -> None:
             raise MisconfigurationException(f"Device should be HPU, got {root_device} instead.")

     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
-        """
-        Returns a map of the following metrics with their values:
-            Limit,
-            InUse,
-            MaxInUse,
-            NumAllocs,
-            NumFrees,
-            ActiveAllocs,
-            MaxAllocSize,
-            TotalSystemAllocs,
-            TotalSystemFrees,
-            TotalActiveAllocs
+        """Returns a map of the following metrics with their values:
+
+        Limit,
+        InUse,
+        MaxInUse,
+        NumAllocs,
+        NumFrees,
+        ActiveAllocs,
+        MaxAllocSize,
+        TotalSystemAllocs,
+        TotalSystemFrees,
+        TotalActiveAllocs
         """
         try:
             return torch_hpu.hpu.memory_stats(device)
diff --git a/tests/tests_pytorch/accelerators/test_hpu.py b/tests/tests_pytorch/accelerators/test_hpu.py
index 5433a0ffed65d..4947000b47162 100644
--- a/tests/tests_pytorch/accelerators/test_hpu.py
+++ b/tests/tests_pytorch/accelerators/test_hpu.py
@@ -319,7 +319,7 @@ def test_hpu_device_stats_monitor(tmpdir):
         "MaxAllocSize",
         "TotalSystemAllocs",
         "TotalSystemFrees",
-        "TotalActiveAllocs"
-    ]
+        "TotalActiveAllocs",
+    ]
     for f in fields:
         assert any(f in h for h in hpu_stats.keys())

From f3313521769ac4394cdee849cda0e650be35efee Mon Sep 17 00:00:00 2001
From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com>
Date: Mon, 25 Jul 2022 11:09:09 +0530
Subject: [PATCH 4/9] Apply suggestions from code review

Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com>
---
 docs/source-pytorch/accelerators/hpu_intermediate.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index fc04a84262311..13ad8d6d4b5ff 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -69,8 +69,8 @@ For more details, please refer to `PyTorch Mixed Precision Training on Gaudi

 ----

-Enabling DeviceStatsMonitor
----------------------------
+Enabling DeviceStatsMonitor with HPUs
+----------------------------------------

 DeviceStatsMonitor is a callback that automatically monitors and logs device stats during training stage.
 This callback can be passed for training with HPUs. It returns a map of the following metrics with their values:
@@ -94,4 +94,4 @@ The below snippet shows how DeviceStatsMonitor can be enabled.
     from pytorch_lightning.callbacks import DeviceStatsMonitor

     device_stats = DeviceStatsMonitor()
-    trainer = Trainer(callbacks=[device_stats])
+    trainer = Trainer(accelerator="hpu", callbacks=[device_stats])

From: Jerome
Date: Mon, 25 Jul 2022 09:52:06 +0300
Subject: [PATCH 5/9] Update reference

Signed-off-by: Jerome
---
 docs/source-pytorch/accelerators/hpu_intermediate.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index 13ad8d6d4b5ff..e2ae96b323a6b 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -95,3 +95,5 @@ The below snippet shows how DeviceStatsMonitor can be enabled.

     device_stats = DeviceStatsMonitor()
     trainer = Trainer(accelerator="hpu", callbacks=[device_stats])
+
+For more details, please refer to `Memory Stats APIs `__.

From d20d34682a5c3b9c41de868abda79a98ccf0ccc3 Mon Sep 17 00:00:00 2001
From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com>
Date: Mon, 25 Jul 2022 16:28:34 +0530
Subject: [PATCH 6/9] Apply suggestions from code review

Co-authored-by: Rohit Gupta
---
 .../accelerators/hpu_intermediate.rst | 22 +++++++++----------
 src/pytorch_lightning/CHANGELOG.md    |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index e2ae96b323a6b..3a32127ff4655 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -72,19 +72,19 @@ For more details, please refer to `PyTorch Mixed Precision Training on Gaudi
 Enabling DeviceStatsMonitor with HPUs
 ----------------------------------------

-DeviceStatsMonitor is a callback that automatically monitors and logs device stats during training stage.
+:class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is a callback that automatically monitors and logs device stats during the training stage.
 This callback can be passed for training with HPUs. It returns a map of the following metrics with their values:

- **Limit** : amount of total memory on HPU device,
- **InUse** : amount of allocated memory at any instance.,
- **MaxInUse** : amount of total active memory allocated,
- **NumAllocs** : number of allocations,
- **NumFrees** : number of freed chunks,
- **ActiveAllocs** : number of active allocations,
- **MaxAllocSize** : maximum allocated size,
- **TotalSystemAllocs** : total number of system allocations,
- **TotalSystemFrees** : total number of system frees,
- **TotalActiveAllocs** : total number of active allocations
+ **Limit** : amount of total memory on HPU device.
+ **InUse** : amount of allocated memory at any instance.
+ **MaxInUse** : amount of total active memory allocated.
+ **NumAllocs** : number of allocations.
+ **NumFrees** : number of freed chunks.
+ **ActiveAllocs** : number of active allocations.
+ **MaxAllocSize** : maximum allocated size.
+ **TotalSystemAllocs** : total number of system allocations.
+ **TotalSystemFrees** : total number of system frees.
+ **TotalActiveAllocs** : total number of active allocations.

 The below snippet shows how DeviceStatsMonitor can be enabled.

Date: Mon, 1 Aug 2022 22:55:14 +0530
Subject: [PATCH 7/9] fix alignment

---
 .../accelerators/hpu_intermediate.rst     | 20 +++++++++----------
 src/pytorch_lightning/accelerators/hpu.py | 20 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index 3a32127ff4655..62581a25184dd 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -75,16 +75,16 @@ Enabling DeviceStatsMonitor with HPUs
 :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is a callback that automatically monitors and logs device stats during the training stage.
 This callback can be passed for training with HPUs. It returns a map of the following metrics with their values:

- **Limit** : amount of total memory on HPU device.
- **InUse** : amount of allocated memory at any instance.
- **MaxInUse** : amount of total active memory allocated.
- **NumAllocs** : number of allocations.
- **NumFrees** : number of freed chunks.
- **ActiveAllocs** : number of active allocations.
- **MaxAllocSize** : maximum allocated size.
- **TotalSystemAllocs** : total number of system allocations.
- **TotalSystemFrees** : total number of system frees.
- **TotalActiveAllocs** : total number of active allocations.
+- **Limit** : amount of total memory on HPU device.
+- **InUse** : amount of allocated memory at any instance.
+- **MaxInUse** : amount of total active memory allocated.
+- **NumAllocs** : number of allocations.
+- **NumFrees** : number of freed chunks.
+- **ActiveAllocs** : number of active allocations.
+- **MaxAllocSize** : maximum allocated size.
+- **TotalSystemAllocs** : total number of system allocations.
+- **TotalSystemFrees** : total number of system frees.
+- **TotalActiveAllocs** : total number of active allocations.

 The below snippet shows how DeviceStatsMonitor can be enabled.

diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py
index 934c5a1dbc4ca..6b0bf1b04b3f1 100644
--- a/src/pytorch_lightning/accelerators/hpu.py
+++ b/src/pytorch_lightning/accelerators/hpu.py
@@ -41,16 +41,16 @@ def setup_environment(self, root_device: torch.device) -> None:
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Returns a map of the following metrics with their values:

-        Limit,
-        InUse,
-        MaxInUse,
-        NumAllocs,
-        NumFrees,
-        ActiveAllocs,
-        MaxAllocSize,
-        TotalSystemAllocs,
-        TotalSystemFrees,
-        TotalActiveAllocs
+        - Limit
+        - InUse
+        - MaxInUse
+        - NumAllocs
+        - NumFrees
+        - ActiveAllocs
+        - MaxAllocSize
+        - TotalSystemAllocs
+        - TotalSystemFrees
+        - TotalActiveAllocs
         """
         try:
             return torch_hpu.hpu.memory_stats(device)

From 1729ecdf3540b1d10ce83aca5fbd4af62ac86c04 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Mon, 1 Aug 2022 23:15:43 +0530
Subject: [PATCH 8/9] add descriptions

---
 .../accelerators/hpu_intermediate.rst     | 20 +++++++++----------
 src/pytorch_lightning/accelerators/hpu.py | 20 +++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index 62581a25184dd..fcad1593c88cb 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -75,16 +75,16 @@ Enabling DeviceStatsMonitor with HPUs
 :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is a callback that automatically monitors and logs device stats during the training stage.
 This callback can be passed for training with HPUs. It returns a map of the following metrics with their values:

-- **Limit** : amount of total memory on HPU device.
-- **InUse** : amount of allocated memory at any instance.
-- **MaxInUse** : amount of total active memory allocated.
-- **NumAllocs** : number of allocations.
-- **NumFrees** : number of freed chunks.
-- **ActiveAllocs** : number of active allocations.
-- **MaxAllocSize** : maximum allocated size.
-- **TotalSystemAllocs** : total number of system allocations.
-- **TotalSystemFrees** : total number of system frees.
-- **TotalActiveAllocs** : total number of active allocations.
+- **Limit**: amount of total memory on HPU device.
+- **InUse**: amount of allocated memory at any instance.
+- **MaxInUse**: amount of total active memory allocated.
+- **NumAllocs**: number of allocations.
+- **NumFrees**: number of freed chunks.
+- **ActiveAllocs**: number of active allocations.
+- **MaxAllocSize**: maximum allocated size.
+- **TotalSystemAllocs**: total number of system allocations.
+- **TotalSystemFrees**: total number of system frees.
+- **TotalActiveAllocs**: total number of active allocations.

 The below snippet shows how DeviceStatsMonitor can be enabled.

diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py
index 6b0bf1b04b3f1..8fc242fa55f20 100644
--- a/src/pytorch_lightning/accelerators/hpu.py
+++ b/src/pytorch_lightning/accelerators/hpu.py
@@ -41,16 +41,16 @@ def setup_environment(self, root_device: torch.device) -> None:
     def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
         """Returns a map of the following metrics with their values:

-        - Limit
-        - InUse
-        - MaxInUse
-        - NumAllocs
-        - NumFrees
-        - ActiveAllocs
-        - MaxAllocSize
-        - TotalSystemAllocs
-        - TotalSystemFrees
-        - TotalActiveAllocs
+        - Limit: amount of total memory on HPU device.
+        - InUse: amount of allocated memory at any instance.
+        - MaxInUse: amount of total active memory allocated.
+        - NumAllocs: number of allocations.
+        - NumFrees: number of freed chunks.
+        - ActiveAllocs: number of active allocations.
+        - MaxAllocSize: maximum allocated size.
+        - TotalSystemAllocs: total number of system allocations.
+        - TotalSystemFrees: total number of system frees.
+        - TotalActiveAllocs: total number of active allocations.
         """
         try:
             return torch_hpu.hpu.memory_stats(device)

From 0d1bbad5e6f0dab3327b54fe91338f933f9863b1 Mon Sep 17 00:00:00 2001
From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com>
Date: Tue, 2 Aug 2022 07:59:22 +0530
Subject: [PATCH 9/9] Update hpu_intermediate.rst

---
 docs/source-pytorch/accelerators/hpu_intermediate.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst
index fcad1593c88cb..3b1c0e6b43707 100644
--- a/docs/source-pytorch/accelerators/hpu_intermediate.rst
+++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst
@@ -73,7 +73,7 @@ Enabling DeviceStatsMonitor with HPUs
 ----------------------------------------

 :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is a callback that automatically monitors and logs device stats during the training stage.
-This callback can be passed for training with HPUs. It returns a map of the following metrics with their values:
+This callback can be passed for training with HPUs. It returns a map of the following metrics with their values in bytes of type uint64:

 - **Limit**: amount of total memory on HPU device.
 - **InUse**: amount of allocated memory at any instance.
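
As a reference point, the snippet below sketches the end-to-end usage this series adds up to. It is not part of the patch series itself: it assumes a Habana Gaudi machine with the HPU-enabled PyTorch Lightning stack, and the ``CSVLogger`` choice, the ``devices=1`` setting, the ``logs/`` path, and the final print loop are illustrative only.

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.accelerators import HPUAccelerator
    from pytorch_lightning.callbacks import DeviceStatsMonitor
    from pytorch_lightning.loggers import CSVLogger

    # DeviceStatsMonitor records the stats through the Trainer's logger,
    # so a logger must be configured alongside the callback.
    trainer = Trainer(
        accelerator="hpu",
        devices=1,
        callbacks=[DeviceStatsMonitor()],
        logger=CSVLogger("logs/"),
    )
    # trainer.fit(model)  # model: any LightningModule; stats are logged per batch

    # The callback polls Accelerator.get_device_stats() on batch start/end.
    # After this series, the HPU accelerator forwards that call to
    # torch_hpu.hpu.memory_stats() instead of returning an empty dict, and
    # it can also be queried directly:
    stats = HPUAccelerator().get_device_stats("hpu")
    for name, value in stats.items():
        print(name, value)

The direct call mirrors the assertion added in ``test_hpu_device_stats_monitor``: each documented counter name (Limit, InUse, MaxInUse, and so on) appears within the returned keys.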