3 changes: 3 additions & 0 deletions src/pytorch_lightning/CHANGELOG.md
@@ -178,6 +178,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed an issue where `self.log`-ing a tensor would create a user warning from PyTorch about cloning tensors ([#14599](https://github.com/Lightning-AI/lightning/pull/14599))


+- Break HPU Graphs into two parts (forward + backward as one and optimizer as another) for better performance ([#14656](https://github.com/Lightning-AI/lightning/pull/14656))
+
+
 - Fixed compatibility when `torch.distributed` is not available ([#14454](https://github.com/Lightning-AI/lightning/pull/14454))


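For context on the changelog entry above: on Gaudi/HPU, PyTorch runs in lazy mode, so operations accumulate into a graph until `htcore.mark_step()` is called. The sketch below only illustrates the split the entry describes (forward + backward flushed as one graph, the optimizer update as another); it is not code from this PR, and `model`, `optimizer`, `loss_fn`, and `batch` are placeholders.

```python
# Illustrative sketch of the two-graph split described above (not part of this PR).
# Assumes the Habana PyTorch bridge (`habana_frameworks`) is installed and the
# model/tensors already live on the "hpu" device.
import habana_frameworks.torch.core as htcore


def training_iteration(model, optimizer, loss_fn, batch):
    inputs, targets = batch
    loss = loss_fn(model(inputs), targets)  # forward: ops are queued lazily
    loss.backward()                         # backward: still queued
    htcore.mark_step()                      # flush graph 1: forward + backward
    optimizer.step()
    optimizer.zero_grad()
    htcore.mark_step()                      # flush graph 2: optimizer update
    return loss
```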
22 changes: 18 additions & 4 deletions src/pytorch_lightning/strategies/hpu_parallel.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 import logging
 import os
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union

 import torch.distributed
+from torch.nn import Module
+from torch.optim.optimizer import Optimizer

 import pytorch_lightning as pl
 from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment
@@ -137,10 +139,22 @@ def broadcast(self, obj: object, src: int = 0) -> object:  # type: ignore
         broadcast_object_list(obj, src, group=_group.WORLD)
         return obj[0]

-    def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
-        # Break lazy accumulation of graph after every step
+    def on_after_backward(self) -> None:
+        # Break lazy accumulation of graph after fwd+bwd
         htcore.mark_step()
-        return step_output

+    def optimizer_step(
+        self,
+        optimizer: Optimizer,
+        opt_idx: int,
+        closure: Callable[[], Any],
+        model: Optional[Union["pl.LightningModule", Module]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
+        # Break lazy accumulation of graph after optimizer
+        htcore.mark_step()
+        return optimizer_output
+
     def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
         # Break lazy accumulation of graph after every step
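The `mark_step()` calls in the strategy above rely on HPU lazy execution: queued ops are only compiled and run as a single graph when a step marker is reached, so each marker defines one graph. A minimal, hedged illustration (assumes a Gaudi device with lazy mode active; not code from this PR):

```python
# Minimal illustration of lazy accumulation on HPU (assumption: lazy mode is active).
import torch
import habana_frameworks.torch.core as htcore  # registers the "hpu" device

x = torch.randn(64, 64, device="hpu")
y = torch.relu(x @ x)  # queued, not yet executed
htcore.mark_step()     # graph boundary: the queued ops execute as one graph
```

Splitting the training step at `on_after_backward` and again after `optimizer_step` therefore yields two smaller graphs per iteration instead of one large one, which is presumably where the performance gain cited in the changelog comes from.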
23 changes: 19 additions & 4 deletions src/pytorch_lightning/strategies/single_hpu.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Dict, Optional
+from typing import Any, Callable, Dict, Optional, Union
+
+from torch.nn import Module
+from torch.optim.optimizer import Optimizer

 import pytorch_lightning as pl
 from lightning_lite.plugins.io.checkpoint_plugin import CheckpointIO
@@ -79,10 +82,22 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
     def model_to_device(self) -> None:
         self.model.to(self.root_device)  # type: ignore

-    def training_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
-        # Break lazy accumulation of graph after every step
+    def on_after_backward(self) -> None:
+        # Break lazy accumulation of graph after fwd+bwd
         htcore.mark_step()
-        return step_output

+    def optimizer_step(
+        self,
+        optimizer: Optimizer,
+        opt_idx: int,
+        closure: Callable[[], Any],
+        model: Optional[Union["pl.LightningModule", Module]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
+        # Break lazy accumulation of graph after optimizer
+        htcore.mark_step()
+        return optimizer_output
+
     def validation_step_end(self, step_output: STEP_OUTPUT) -> STEP_OUTPUT:
         # Break lazy accumulation of graph after every step
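Both modified strategies are picked up through the Trainer's HPU accelerator settings; a hedged usage sketch (not from this PR; assumes a machine with Gaudi devices and the Habana integration installed, and `MyLightningModule` is a placeholder):

```python
import pytorch_lightning as pl

model = MyLightningModule()  # placeholder LightningModule

trainer = pl.Trainer(accelerator="hpu", devices=1)    # single-HPU strategy
# trainer = pl.Trainer(accelerator="hpu", devices=8)  # parallel HPU strategy
trainer.fit(model)
```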