[Feat] Add Loops Restart #8131

Closed · wants to merge 668 commits
Changes from 250 commits
Commits (668 total)
1763d8f
test
awaelchli Jun 7, 2021
d718498
update trainer
awaelchli Jun 7, 2021
6d98a07
integrate latest changes from logger connector refactor poc
awaelchli Jun 7, 2021
7ca1049
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 7, 2021
515ad9f
Minor changes
carmocca Jun 7, 2021
b03591c
update changelog
awaelchli Jun 7, 2021
0aa8428
Remove unused argument
carmocca Jun 7, 2021
24b41e3
Update CHANGELOG
carmocca Jun 7, 2021
6d71e6a
Copy call_hook changes
carmocca Jun 7, 2021
44ad4ac
Docs
carmocca Jun 7, 2021
2c74018
Fix ref
carmocca Jun 7, 2021
b15984b
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 7, 2021
e8021bb
merge
tchaton Jun 8, 2021
9747023
move to cpu
tchaton Jun 8, 2021
d9ae37a
Bad merge
carmocca Jun 8, 2021
bad51c6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 8, 2021
273bc92
remove pdb
tchaton Jun 8, 2021
f214632
remove pdb
tchaton Jun 8, 2021
5fdf3c5
merge
tchaton Jun 8, 2021
99543a7
Refactor to
carmocca Jun 8, 2021
738c810
Avoid partial
carmocca Jun 8, 2021
6a7637d
trigger ci
carmocca Jun 8, 2021
8077cf9
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 8, 2021
aff9e3d
Bad merge
carmocca Jun 8, 2021
464f581
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 8, 2021
461332b
integrate latest logger connector changes
awaelchli Jun 8, 2021
417ad31
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 8, 2021
9321b11
remove grad norm dicts list
awaelchli Jun 8, 2021
e75a958
Diff
carmocca Jun 8, 2021
007dcac
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 8, 2021
2e4bb24
Bad merge
carmocca Jun 8, 2021
f5154ae
Reuse metrics_to_scalars
carmocca Jun 8, 2021
558cdf4
Use active loop
carmocca Jun 8, 2021
90d71bf
Move to device
carmocca Jun 8, 2021
d7f1761
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 8, 2021
6ce6762
resolve test
tchaton Jun 8, 2021
fba9a87
properties first
awaelchli Jun 8, 2021
fd967af
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 8, 2021
79c73b9
define union
awaelchli Jun 8, 2021
37a0b9d
Update logger connector
carmocca Jun 8, 2021
aaea387
Update result
carmocca Jun 8, 2021
e2f69ce
Update imports
carmocca Jun 8, 2021
6037833
Update after rename
carmocca Jun 8, 2021
3804963
Merge branch 'refactor/logger-connector-poc' of https://github.com/Py…
carmocca Jun 8, 2021
499da76
Refactor reduce_fx and op
carmocca Jun 8, 2021
6eb448a
Fix test after rename
carmocca Jun 8, 2021
f871cbd
mypy
carmocca Jun 8, 2021
5631b53
manual merge poc changes
awaelchli Jun 9, 2021
d10d5c7
integrate latest changes from logger connector poc
awaelchli Jun 9, 2021
7b6803a
Fix test
carmocca Jun 9, 2021
9bfedc9
Refactor test
carmocca Jun 9, 2021
c9c7829
Deprecate `self.log(sync_dist_op)` in favor of `self.log(reduce_fx)`
carmocca Jun 9, 2021
e3dde0b
Undo field
carmocca Jun 9, 2021
bae2139
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 9, 2021
2c167cc
rename
awaelchli Jun 9, 2021
99db497
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 9, 2021
832dfb9
rename
awaelchli Jun 9, 2021
f92e01d
imports
awaelchli Jun 9, 2021
b15fc34
loop hygiene
awaelchli Jun 9, 2021
7175a50
yapf on loops
awaelchli Jun 9, 2021
59d6227
protected new loop trigger
awaelchli Jun 9, 2021
e1d4fd2
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 9, 2021
a7c3555
Replace code
carmocca Jun 9, 2021
501224d
Fix names and imports
carmocca Jun 9, 2021
dee7e5f
Remove metric_attribute
carmocca Jun 9, 2021
4eb9757
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 9, 2021
d4bb357
integrate latest logger connector changes
awaelchli Jun 9, 2021
c9b4e9e
resolve todo dataloading reset
awaelchli Jun 10, 2021
a3ef0aa
re-add notebooks
awaelchli Jun 10, 2021
b071532
Merge branch 'master' into refactor/logger-connector-poc
awaelchli Jun 10, 2021
53deef8
add missing init
awaelchli Jun 10, 2021
93fd682
bad merge
awaelchli Jun 10, 2021
80c406e
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 10, 2021
a041b6f
remove iteration count method
awaelchli Jun 10, 2021
e080be8
todo for a fix in #5007
awaelchli Jun 10, 2021
4950821
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 10, 2021
c56adc1
remove NEW_LOOP guard
awaelchli Jun 10, 2021
5e72d1d
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 10, 2021
bace4a2
flake8
awaelchli Jun 10, 2021
71bfb6f
exclude coverage
awaelchli Jun 10, 2021
acc6d4f
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 10, 2021
41e0e64
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 10, 2021
643bef0
flake8 vs yapf wars
awaelchli Jun 10, 2021
4b6bd18
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 10, 2021
536574a
integrate #7917, remove teardown from training loop
awaelchli Jun 10, 2021
b28fb09
update "accumulated_batches_reached" condition
awaelchli Jun 11, 2021
6f17688
remove public loop properties
awaelchli Jun 11, 2021
6dd4e1d
make skip backward protected again
awaelchli Jun 11, 2021
c394267
typing base loop
awaelchli Jun 11, 2021
4adae06
typing fit loop
awaelchli Jun 11, 2021
c49875d
typing training_batch_loop
awaelchli Jun 11, 2021
80edb75
typing training epoch loop
awaelchli Jun 11, 2021
8b54505
fix merge error
justusschock Jun 11, 2021
9fd8ed1
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 11, 2021
e4ffa6c
integrate train loop changes from master
awaelchli Jun 11, 2021
69ed0e7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 11, 2021
eeebc9a
fix tpipes moving model to cpu and leaving it there.
awaelchli Jun 12, 2021
ce9dd2a
don't reset fit loop
awaelchli Jun 12, 2021
80e225a
fix test iteration count <-> batch_idx reset
awaelchli Jun 14, 2021
4880b26
replace torch.Tensor -> Tensor
awaelchli Jun 14, 2021
5461f73
fix attribute error to block_ddp_sync_behaviour
awaelchli Jun 14, 2021
a2d3f0d
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 14, 2021
0fe6d9f
ignore mypy errors
awaelchli Jun 14, 2021
5497fc0
fix flake8 and yapf conflict
awaelchli Jun 14, 2021
4c51c45
remove redundant override
awaelchli Jun 14, 2021
8f68b61
Apply suggestions from code review
awaelchli Jun 14, 2021
0150f6c
Apply suggestions from code review
awaelchli Jun 14, 2021
fd90c10
Apply suggestions from code review
awaelchli Jun 14, 2021
153d264
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2021
4eb0eb1
remove all empty space between atoms
awaelchli Jun 14, 2021
70cdb14
carlos
awaelchli Jun 14, 2021
bf26aa3
Apply suggestions from code review
justusschock Jun 14, 2021
ffc4f45
Apply suggestions from code review
justusschock Jun 14, 2021
79f8c18
Merge remote-tracking branch 'origin/refactor/loops/loops_everywhere_…
awaelchli Jun 14, 2021
3373cc8
resolve a todo integrating on_train_batch_end with on_advance_end
awaelchli Jun 14, 2021
e1a40c0
clarify what is todo and what is fixme
awaelchli Jun 14, 2021
b5bb08a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2021
5d98009
shorten a docstring
awaelchli Jun 14, 2021
03bce7a
Merge remote-tracking branch 'origin/refactor/loops/loops_everywhere_…
awaelchli Jun 14, 2021
42c9ad6
wip
tchaton Jun 14, 2021
f001f81
move on_epoch_start to on_run_start of training loop
awaelchli Jun 14, 2021
24fa859
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 14, 2021
12086c5
add tracking
tchaton Jun 14, 2021
d191fe1
Update pytorch_lightning/loops/base.py
awaelchli Jun 15, 2021
1d21065
update class names in changelog
awaelchli Jun 15, 2021
d8377d5
wip
tchaton Jun 15, 2021
f249351
update
tchaton Jun 15, 2021
7d5b3f3
add zero_grad
tchaton Jun 15, 2021
1b2b251
Merge branch 'refactor/loops/loops_everywhere_train' into progress_tr…
tchaton Jun 15, 2021
743d262
add decription
tchaton Jun 15, 2021
2d8c441
add empty teardown method
awaelchli Jun 15, 2021
1ae88a4
update on comments
tchaton Jun 15, 2021
7763afd
update on comments
tchaton Jun 15, 2021
f874182
added skip property
awaelchli Jun 15, 2021
2ef0fe0
Merge branch 'refactor/loops/loops_everywhere_train' into progress_tr…
tchaton Jun 15, 2021
e2bb1d2
update on comments
tchaton Jun 15, 2021
ec25ab6
Merge branch 'progress_tracking' of https://github.com/PyTorchLightni…
tchaton Jun 15, 2021
27927c4
Merge branch 'master' into progress_tracking
tchaton Jun 15, 2021
8806b00
update
tchaton Jun 15, 2021
4011d85
update changelog
tchaton Jun 15, 2021
d08cb38
update
tchaton Jun 15, 2021
fcdfd39
resolve failing tests
tchaton Jun 15, 2021
5557beb
remove typing
tchaton Jun 15, 2021
f35c7a1
Merge branch 'master' into progress_tracking
kaushikb11 Jun 15, 2021
8c74a3b
update on comments
tchaton Jun 16, 2021
e80230c
Merge branch 'progress_tracking' of https://github.com/PyTorchLightni…
tchaton Jun 16, 2021
f036f3b
update on comments
tchaton Jun 16, 2021
48d720a
move optimizer_idx to batchProgress
tchaton Jun 16, 2021
6d653c2
update
tchaton Jun 16, 2021
741ea4d
Merge branch 'master' into progress_tracking
tchaton Jun 16, 2021
f4b91af
remove useless code
tchaton Jun 16, 2021
6f9a3b8
Merge branch 'progress_tracking' of https://github.com/PyTorchLightni…
tchaton Jun 16, 2021
769364b
add a space on docstring
tchaton Jun 17, 2021
39de7e9
Merge branch 'master' into progress_tracking
carmocca Jun 18, 2021
fda5688
Merge branch 'master' into progress_tracking
carmocca Jun 19, 2021
e2db5fe
Minor changes
carmocca Jun 19, 2021
b947a33
Unused code
carmocca Jun 19, 2021
281847a
Update CHANGELOG
carmocca Jun 19, 2021
0038219
Merge branch 'master' into progress_tracking
tchaton Jun 25, 2021
d39d524
Merge branch 'progress_tracking' of https://github.com/PyTorchLightni…
tchaton Jun 25, 2021
2a45166
update
tchaton Jun 25, 2021
bfd8ac2
update
tchaton Jun 25, 2021
66eb9e0
wip
tchaton Jun 25, 2021
f2d2f32
add FastForwardSampler
tchaton Jun 25, 2021
9d8ffdc
update
tchaton Jun 25, 2021
4d6feca
resolve a bug
tchaton Jun 25, 2021
3e0af69
resolve bug
tchaton Jun 25, 2021
4de9581
Merge branch 'master' into training_restart
tchaton Jun 27, 2021
3d183bc
add support for validation
tchaton Jun 27, 2021
94f052e
update
tchaton Jun 27, 2021
a446639
update
tchaton Jun 27, 2021
be49acc
kill processes
tchaton Jun 27, 2021
6690dcb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 27, 2021
b05617a
add mechanism to kill on deadlock detection
tchaton Jun 27, 2021
56e2763
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jun 27, 2021
3da64e5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 27, 2021
5c1a639
wip
tchaton Jun 28, 2021
1df225d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 28, 2021
6df9968
update
tchaton Jun 28, 2021
8ecd834
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 28, 2021
e62798d
add support for accumulate_grad_batches
tchaton Jun 28, 2021
e3b80ca
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jun 28, 2021
6c38083
Merge branch 'master' into training_restart
tchaton Jun 29, 2021
11fa777
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jun 29, 2021
b9be984
resolve bugs
tchaton Jun 30, 2021
a630b2e
resolve tracking
tchaton Jun 30, 2021
51671d0
Rename
carmocca Jul 1, 2021
cd28db2
Merge branch 'master' into training_restart
tchaton Jul 1, 2021
25da045
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
carmocca Jul 1, 2021
97cbd9a
wip
tchaton Jul 1, 2021
ba276b9
Comments after call
carmocca Jul 1, 2021
c08fe50
Merge branch 'master' into training_restart
carmocca Jul 1, 2021
ac7cff7
update
tchaton Jul 1, 2021
d4917e5
update
tchaton Jul 1, 2021
2ecab37
add partial support for iterative dataset
tchaton Jul 2, 2021
25b1ac2
update
tchaton Jul 2, 2021
7c9058f
Merge branch 'fast_forward_samplers' into training_restart
tchaton Jul 2, 2021
588d0a9
added some logic for samplers restart
tchaton Jul 4, 2021
7dd42c0
update
tchaton Jul 5, 2021
16a58c6
resolve bug
tchaton Jul 5, 2021
756baca
fix attribute error
awaelchli Jul 6, 2021
2f54117
add simple test for random dataset (wip)
awaelchli Jul 6, 2021
c6a774c
wip
tchaton Jul 6, 2021
a946159
update
tchaton Jul 6, 2021
a2b74f0
resolve bug
tchaton Jul 6, 2021
668b02e
wip
tchaton Jul 6, 2021
3827208
wip
tchaton Jul 6, 2021
1f4ef8c
wip
tchaton Jul 6, 2021
770a78b
resolved tests
tchaton Jul 6, 2021
76f1f53
update on comments
tchaton Jul 7, 2021
3aaf0ea
update
tchaton Jul 7, 2021
ed056aa
update
tchaton Jul 7, 2021
f1cdcdc
Merge branch 'master' into add_fast_forward_sampler
tchaton Jul 7, 2021
81bf954
Update pytorch_lightning/utilities/auto_restart.py
tchaton Jul 7, 2021
3cd6859
Merge branch 'master' into training_restart
tchaton Jul 7, 2021
54e0a24
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jul 7, 2021
76f0503
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 7, 2021
7a05094
update on comments
tchaton Jul 7, 2021
bff288c
Merge branch 'add_fast_forward_sampler' of https://github.com/PyTorch…
tchaton Jul 7, 2021
98ec265
Update pytorch_lightning/utilities/auto_restart.py
tchaton Jul 7, 2021
82b1cf1
resolve bug
tchaton Jul 7, 2021
8972d82
update
tchaton Jul 7, 2021
1fb8c02
move properties to top
awaelchli Jul 7, 2021
f086edb
update docs for fast forward sampler
awaelchli Jul 7, 2021
7450388
move public attribute to top
awaelchli Jul 7, 2021
5e43757
add missing super call
awaelchli Jul 7, 2021
eae11c3
update docs for state_dict
awaelchli Jul 7, 2021
efcb882
fix merge conflict
awaelchli Jul 7, 2021
c068704
add missing super() call
awaelchli Jul 7, 2021
79ff550
move property to top
awaelchli Jul 7, 2021
d433bb4
update
tchaton Jul 7, 2021
dfbb8eb
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jul 7, 2021
50ac617
update on comments
tchaton Jul 7, 2021
733e329
Merge branch 'add_fast_forward_sampler' of https://github.com/PyTorch…
tchaton Jul 7, 2021
f111826
resolve bug
tchaton Jul 7, 2021
67a3691
update
tchaton Jul 7, 2021
14bea6b
wip
tchaton Jul 7, 2021
4eee70a
resolve bug
tchaton Jul 7, 2021
8b93505
update
tchaton Jul 7, 2021
322600c
wip
tchaton Jul 7, 2021
028d773
update on comments
tchaton Jul 7, 2021
0be0b5d
some refactor
tchaton Jul 7, 2021
5c3e328
activate coverage for CaptureIterableDataset
tchaton Jul 7, 2021
461bee9
update on comments
tchaton Jul 7, 2021
2de5290
update
tchaton Jul 7, 2021
bee5536
Merge branch 'master' into training_restart
tchaton Jul 7, 2021
d98e0fd
Merge branch 'add_fast_forward_sampler' into training_restart
tchaton Jul 7, 2021
cb3c2f9
wip
tchaton Jul 7, 2021
07331ab
resolve training loop
tchaton Jul 7, 2021
7e2bf72
wip
tchaton Jul 8, 2021
6 changes: 3 additions & 3 deletions CHANGELOG.md
@@ -30,9 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added support for checkpointing based on a provided time interval during training ([#7515](https://github.com/PyTorchLightning/pytorch-lightning/pull/7515))


- Added dataclasses for progress tracking (
[#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603),
[#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574))
- Progress tracking
* Added dataclasses for progress tracking ([#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574))
* Integrate progress tracking with the training loops ([#7976](https://github.com/PyTorchLightning/pytorch-lightning/pull/7976))


- Added support for passing a `LightningDataModule` positionally as the second argument to `trainer.{validate,test,predict}` ([#7431](https://github.com/PyTorchLightning/pytorch-lightning/pull/7431))
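
For orientation before the file diffs: a minimal sketch of what progress-tracking dataclasses of this kind typically look like — monotonic `ready`/`started`/`processed`/`completed` counters with `increment_*` helpers and a per-epoch reset. The class and field names below are illustrative assumptions, not necessarily the exact API added by this PR.

```python
from dataclasses import dataclass, field


@dataclass
class ProgressState:
    """Counters for the four stages an event can pass through."""
    ready: int = 0
    started: int = 0
    processed: int = 0
    completed: int = 0

    def reset(self) -> None:
        self.ready = self.started = self.processed = self.completed = 0


@dataclass
class Progress:
    """Totals across the whole run plus counters for the current epoch."""
    total: ProgressState = field(default_factory=ProgressState)
    current: ProgressState = field(default_factory=ProgressState)

    def increment_ready(self) -> None:
        self.total.ready += 1
        self.current.ready += 1

    def increment_started(self) -> None:
        self.total.started += 1
        self.current.started += 1

    def increment_processed(self) -> None:
        self.total.processed += 1
        self.current.processed += 1

    def increment_completed(self) -> None:
        self.total.completed += 1
        self.current.completed += 1

    def reset_on_epoch(self) -> None:
        # totals survive a restart; only the per-epoch counters are cleared
        self.current.reset()
```

Because `total` is never reset, a resumed run can read how much work was already finished, while `current` records where inside the interrupted epoch it stopped.
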
70 changes: 69 additions & 1 deletion pytorch_lightning/loops/batch/training_batch_loop.py
@@ -23,10 +23,12 @@
from torch import Tensor
from torch.optim import Optimizer

import pytorch_lightning as pl
from pytorch_lightning.core.optimizer import LightningOptimizer
from pytorch_lightning.loops.base import Loop
from pytorch_lightning.plugins import ParallelPlugin
from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
from pytorch_lightning.trainer.progress import TrainBatchLoopProgress, TrainingProgress
from pytorch_lightning.trainer.supporters import TensorRunningAccum
from pytorch_lightning.utilities import AMPType, AttributeDict, DeviceType, grad_norm
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -53,6 +55,24 @@ def __init__(self) -> None:
self._optimizer_freq_cumsum: Optional[int] = None
self._remaining_splits: Optional[List[Any]] = None
self._skip_backward: bool = False
self._progress: Optional[TrainBatchLoopProgress] = None
self._progress_optimization: Optional[TrainingProgress] = None

@property
def progress(self) -> Optional[TrainBatchLoopProgress]:
return self._progress

@progress.setter
def progress(self, progress: TrainBatchLoopProgress):
self._progress = progress

@property
def progress_optimization(self) -> Optional[TrainingProgress]:
return self._progress_optimization

@progress_optimization.setter
def progress_optimization(self, progress_optimization: TrainingProgress):
self._progress_optimization = progress_optimization

@property
def done(self) -> bool:
@@ -66,6 +86,11 @@ def optimizer_freq_cumsum(self) -> int:
self._optimizer_freq_cumsum = np.cumsum(self.trainer.optimizer_frequencies)
return self._optimizer_freq_cumsum

def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
# TODO(@justusschock): can we make this a weakref/proxy?
void(*args, **kwargs)
self.trainer = trainer

def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict:
"""Runs all the data splits and the ``on_batch_start`` and ``on_train_batch_start`` hooks

@@ -78,6 +103,8 @@ def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict:
self.warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...")
return AttributeDict(signal=0, training_step_output=[[]])

self.progress.increment_ready()

# hook
self.trainer.logger_connector.on_batch_start()
response = self.trainer.call_hook("on_batch_start")
@@ -100,6 +127,10 @@ def reset(self) -> None:
self.batch_idx = 0
self.batch_outputs = [[] for _ in range(len(self.trainer.optimizers))]

if not self.trainer.is_restarting:
# reset tracking
self.progress_optimization.optimization.reset_on_epoch()

def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int):
"""Splits the data into tbptt splits

@@ -111,6 +142,14 @@ def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int):
void(batch_idx, dataloader_idx)
self._remaining_splits = list(enumerate(self._tbptt_split_batch(batch)))

def on_advance_start(self, *args: Any, **kwargs: Any) -> None:
self.progress.increment_started()
return super().on_advance_start(*args, **kwargs)

def on_advance_end(self) -> None:
self.progress.increment_completed()
return super().on_advance_end()

def advance(self, batch, batch_idx, dataloader_idx):
"""Runs the train step together with optimization (if necessary) on the current batch split

@@ -128,7 +167,19 @@ def advance(self, batch, batch_idx, dataloader_idx):
self.trainer.logger_connector.on_train_split_start(batch_idx, split_idx, split_batch)

if self.trainer.lightning_module.automatic_optimization:
for opt_idx, optimizer in self.get_active_optimizers(batch_idx):
active_optimizers = self.get_active_optimizers(batch_idx)
for opt_idx, optimizer in active_optimizers:

# handle optimization restart
if self.trainer.is_restarting:
if len(active_optimizers) > 1 and opt_idx < self.progress.current.completed:
continue
else:
self.trainer.is_restarting = False

# track optimizer_idx
self.progress.optimizer_idx = opt_idx

result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
if result:
self.batch_outputs[opt_idx].append(result.training_step_output)
@@ -138,6 +189,8 @@ def advance(self, batch, batch_idx, dataloader_idx):
if result:
self.batch_outputs[0].append(result.training_step_output)

self.progress.increment_processed()

def num_active_optimizers(self, batch_idx: Optional[int] = None) -> int:
"""Gets the number of active optimizers based on their frequency"""
return len(self.get_active_optimizers(batch_idx))
@@ -217,8 +270,13 @@ def _training_step_and_backward_closure(
"""

result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)

if result is not None:
return_result.update(result)

# this should be done only if result.loss exists
self.progress_optimization.optimization.optimizer.increment_started()

return return_result.loss

def _make_closure(self, *closure_args: Any, **closure_kwargs: Any) -> Callable:
@@ -250,6 +308,8 @@ def _on_after_backward(self, batch_idx: int, untouched_loss: Tensor) -> None:
# insert after step hook
self.trainer.call_hook("on_after_backward")

self.progress_optimization.optimization.optimizer.increment_ready()

# when in dev debugging track the losses
self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach())

@@ -400,14 +460,20 @@ def _optimizer_step(
using_lbfgs=is_lbfgs,
)

self.progress_optimization.optimization.optimizer.increment_completed()

def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None:
"""Calls the ``on_before_zero_grad`` hook.

Args:
optimizer: the current optimizer
"""
self.progress_optimization.optimization.zero_grad.increment_ready()

self.trainer.call_hook('on_before_zero_grad', optimizer)

self.progress_optimization.optimization.zero_grad.increment_started()

def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer, opt_idx: int) -> None:
"""Zeroes out all gradients of parameters optimized by the current optimizer.

@@ -418,6 +484,8 @@ def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer,
"""
self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)

self.progress_optimization.optimization.zero_grad.increment_completed()

def _track_and_norm_grad(self, optimizer: torch.optim.Optimizer) -> Dict[str, Tensor]:
"""Tracks gradient norms and clips the gradients of all parameters optimized by the current optimizer.

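
The `increment_ready`/`increment_started`/`increment_processed`/`increment_completed` calls added above give every batch and optimizer step a fixed lifecycle. That is what makes a restart resumable: on reload the loop compares the last completed count against the work being replayed and skips what already finished. A rough sketch of the skip-on-restart idea, reusing the illustrative `Progress` class from the sketch above (this is not the PR's exact bookkeeping):

```python
from typing import Callable, List, Tuple

from torch.optim import Optimizer


def advance_optimizers(
    progress: "Progress",  # the illustrative Progress sketched earlier
    active_optimizers: List[Tuple[int, Optimizer]],
    run_optimization: Callable[[int, Optimizer], None],
    restarting: bool,
) -> bool:
    """Rough sketch: skip optimizer steps that completed before an interruption."""
    for opt_idx, optimizer in active_optimizers:
        if restarting and opt_idx < progress.current.completed:
            # this step finished in the interrupted run; fast-forward past it
            continue
        restarting = False
        progress.increment_started()
        run_optimization(opt_idx, optimizer)
        progress.increment_completed()
    # the caller keeps the updated restarting flag for the next loop level
    return restarting
```
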
44 changes: 42 additions & 2 deletions pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -20,6 +20,7 @@
from pytorch_lightning.loops.base import Loop
from pytorch_lightning.loops.batch.training_batch_loop import TrainingBatchLoop
from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
from pytorch_lightning.trainer.progress import TrainingLoopProgress
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.model_helpers import is_overridden
from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature
@@ -48,12 +49,28 @@ def __init__(self, min_steps: int, max_steps: int):
self.is_last_batch: Optional[bool] = None

self.batch_loop: Optional[TrainingBatchLoop] = None
self._progress: Optional[TrainingLoopProgress] = None

self._dataloader_idx: Optional[int] = None
self._warning_cache: WarningCache = WarningCache()
self._epoch_output: Optional[List[List[STEP_OUTPUT]]] = None
self._results = ResultCollection(training=True)

@property
def progress(self) -> TrainingLoopProgress:
if not self._progress:
self._progress = TrainingLoopProgress()
self.batch_loop.progress = self._progress.batch
self.batch_loop.progress_optimization = self._progress.epoch
return self._progress

@progress.setter
def progress(self, progress: TrainingLoopProgress) -> None:
if progress:
self.batch_loop.progress = progress.batch
self.batch_loop.progress_optimization = progress.epoch
self._progress = progress

@property
def results(self) -> ResultCollection:
return self._results
@@ -63,13 +80,21 @@ def batch_idx(self) -> int:
"""Returns the current batch index (within this epoch)"""
return self.iteration_count

@property
def total_optimizer_step(self) -> int:
return self.progress.epoch.optimization.optimizer.total.completed

@property
def current_batch_seen(self) -> int:
return self.progress.batch.current.completed

@property
def done(self) -> bool:
"""Returns whether the training should be stopped.
The criteria are that the number of steps reached the max steps,
the last batch is reached or the trainer signals to stop (e.g. by early stopping).
"""
max_steps_reached = self.max_steps is not None and self.global_step >= self.max_steps
max_steps_reached = self.max_steps is not None and (self.total_optimizer_step) >= self.max_steps
return max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(self.is_last_batch)

def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
@@ -88,12 +113,22 @@ def reset(self) -> None:
# track epoch output
self._epoch_output = [[] for _ in range(self.batch_loop.num_active_optimizers(self.total_batch_idx))]

if not self.trainer.is_restarting:
# reset tracking
self.progress.reset_on_epoch()
else:
self.batches_seen = self.current_batch_seen

def on_run_start(self, *args: Any, **kwargs: Any) -> None:
self.progress.epoch.increment_ready()

# hook
self.trainer.logger_connector.on_epoch_start()
self.trainer.call_hook("on_epoch_start")
self.trainer.call_hook("on_train_epoch_start")

self.progress.epoch.increment_started()

def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None:
"""Runs a single training batch.

@@ -216,15 +251,20 @@ def on_run_end(self) -> List[List[STEP_OUTPUT]]:
'HINT: remove the return statement in training_epoch_end'
)

self.progress.epoch.increment_processed()

# call train epoch end hooks
self._on_train_epoch_end_hook(processed_outputs)
self.trainer.call_hook('on_epoch_end')
self.trainer.logger_connector.on_epoch_end()

self.progress.epoch.increment_completed()

return self._epoch_output

def teardown(self) -> None:
"""Frees memory of tracked epoch outputs."""
self.epoch_output = None
self._epoch_output = None

def _on_train_epoch_end_hook(self, processed_epoch_output: List[List[STEP_OUTPUT]]) -> None:
"""Runs ``on_train_epoch_end hook``."""
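
The `progress` property and setter added to the epoch loop exist so that one progress object is shared with the child batch loop and so that a restored object can be pushed down the loop hierarchy. A minimal sketch of that wiring, again using the illustrative `Progress` class from above (the class names here are assumptions, not the PR's API):

```python
from typing import Optional


class ChildLoop:
    """Placeholder child loop whose progress is assigned by its parent."""

    def __init__(self) -> None:
        self.progress: Optional[Progress] = None


class ParentLoop:
    """Owns the progress object and shares the same instance with its child."""

    def __init__(self) -> None:
        self.child = ChildLoop()
        self._progress: Optional[Progress] = None

    @property
    def progress(self) -> Progress:
        if self._progress is None:
            # created lazily on first access; the child gets the same object
            self._progress = Progress()
            self.child.progress = self._progress
        return self._progress

    @progress.setter
    def progress(self, progress: Progress) -> None:
        # a progress object restored from a checkpoint is pushed down the hierarchy
        if progress is not None:
            self.child.progress = progress
            self._progress = progress
```

Sharing one instance means counters incremented deep inside the batch loop are immediately visible to the parent loops that decide when training is done.
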
36 changes: 32 additions & 4 deletions pytorch_lightning/loops/fit_loop.py
@@ -21,6 +21,7 @@
from pytorch_lightning.loops.dataloader.evaluation_loop import EvaluationLoop
from pytorch_lightning.loops.epoch.training_epoch_loop import TrainingEpochLoop
from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
from pytorch_lightning.trainer.progress import FitLoopProgress
from pytorch_lightning.trainer.supporters import TensorRunningAccum
from pytorch_lightning.utilities import rank_zero_info

@@ -53,6 +54,7 @@ def __init__(
self.min_epochs = 1 if (min_epochs is None and min_steps is None) else min_epochs
self.epoch_loop = TrainingEpochLoop(min_steps, max_steps)
self.val_loop = EvaluationLoop()
self._progress: Optional[FitLoopProgress] = None

@property
def results(self) -> ResultCollection:
@@ -114,6 +116,16 @@ def max_steps(self, value: int) -> None:
# TODO(@awaelchli): This setter is required by debugging connector (fast dev run), should be avoided
self.epoch_loop.max_steps = value

@property
def total_epoch_completed(self) -> int:
"""Returns the total number of epoch completed"""
return self.progress.train.epoch.total.completed

@property
def total_optimizer_step_completed(self) -> int:
"""Returns the total number of optimizer step completed"""
return self.progress.train.epoch.optimization.optimizer.total.completed

@property
def running_loss(self) -> TensorRunningAccum:
"""Returns the running loss"""
@@ -137,14 +149,14 @@ def done(self) -> bool:
or if the maximum number of steps or epochs is reached.
"""
# TODO(@awaelchli): Move track steps inside training loop and move part of these condition inside training loop
stop_steps = self.max_steps is not None and self.global_step >= self.max_steps
stop_epochs = self.max_epochs is not None and self.current_epoch >= self.max_epochs
stop_steps = self.max_steps is not None and self.total_optimizer_step_completed >= self.max_steps
stop_epochs = self.max_epochs is not None and self.total_epoch_completed >= self.max_epochs

should_stop = False
if self.trainer.should_stop:
# early stopping
met_min_epochs = self.current_epoch >= self.min_epochs if self.min_epochs else True
met_min_steps = self.global_step >= self.min_steps if self.min_steps else True
met_min_epochs = self.total_epoch_completed >= self.min_epochs if self.min_epochs else True
met_min_steps = self.total_optimizer_step_completed >= self.min_steps if self.min_steps else True
if met_min_epochs and met_min_steps:
should_stop = True
else:
@@ -171,8 +183,24 @@ def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None:
def reset(self) -> None:
"""Resets the internal state of this loop"""

@property
def progress(self) -> FitLoopProgress:
if not self._progress:
self._progress = FitLoopProgress(train=self.epoch_loop.progress)
return self._progress

@progress.setter
def progress(self, progress: FitLoopProgress) -> None:
if progress:
self._progress = progress
self.epoch_loop.progress = progress.train

def on_run_start(self) -> None:
"""Calls the ``on_train_start`` hook."""

# reset current epoch counter to 0
self.progress.train.epoch.current.reset()

self.results.to(device=self.trainer.lightning_module.device)
self.trainer.call_hook("on_train_start")

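
With the counters in place, the fit loop's `done` property is expressed against completed totals from the progress object rather than `global_step` and `current_epoch`, which is roughly the change made above. A hedged sketch of that stopping logic as a standalone function (the name and signature are illustrative):

```python
from typing import Optional


def fit_should_stop(
    epochs_completed: int,
    optimizer_steps_completed: int,
    max_epochs: Optional[int] = None,
    max_steps: Optional[int] = None,
    min_epochs: Optional[int] = None,
    min_steps: Optional[int] = None,
    early_stop_requested: bool = False,
) -> bool:
    """Rough sketch: stop decisions driven entirely by completed counters."""
    if max_steps is not None and optimizer_steps_completed >= max_steps:
        return True
    if max_epochs is not None and epochs_completed >= max_epochs:
        return True
    if early_stop_requested:
        # honour early stopping only once the configured minimums are met
        met_min_epochs = epochs_completed >= min_epochs if min_epochs else True
        met_min_steps = optimizer_steps_completed >= min_steps if min_steps else True
        return met_min_epochs and met_min_steps
    return False
```

For example, `fit_should_stop(epochs_completed=3, optimizer_steps_completed=1200, max_epochs=3)` returns `True` once the third epoch has completed, regardless of the step count.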