[Feat] Add Loops Restart #8131

Closed
wants to merge 668 commits
Changes from all commits (668 commits)
1763d8f
test
awaelchli Jun 7, 2021
d718498
update trainer
awaelchli Jun 7, 2021
6d98a07
integrate latest changes from logger connector refactor poc
awaelchli Jun 7, 2021
7ca1049
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 7, 2021
515ad9f
Minor changes
carmocca Jun 7, 2021
b03591c
update changelog
awaelchli Jun 7, 2021
0aa8428
Remove unused argument
carmocca Jun 7, 2021
24b41e3
Update CHANGELOG
carmocca Jun 7, 2021
6d71e6a
Copy call_hook changes
carmocca Jun 7, 2021
44ad4ac
Docs
carmocca Jun 7, 2021
2c74018
Fix ref
carmocca Jun 7, 2021
b15984b
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 7, 2021
e8021bb
merge
tchaton Jun 8, 2021
9747023
move to cpu
tchaton Jun 8, 2021
d9ae37a
Bad merge
carmocca Jun 8, 2021
bad51c6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 8, 2021
273bc92
remove pdb
tchaton Jun 8, 2021
f214632
remove pdb
tchaton Jun 8, 2021
5fdf3c5
merge
tchaton Jun 8, 2021
99543a7
Refactor to
carmocca Jun 8, 2021
738c810
Avoid partial
carmocca Jun 8, 2021
6a7637d
trigger ci
carmocca Jun 8, 2021
8077cf9
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 8, 2021
aff9e3d
Bad merge
carmocca Jun 8, 2021
464f581
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 8, 2021
461332b
integrate latest logger connector changes
awaelchli Jun 8, 2021
417ad31
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 8, 2021
9321b11
remove grad norm dicts list
awaelchli Jun 8, 2021
e75a958
Diff
carmocca Jun 8, 2021
007dcac
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 8, 2021
2e4bb24
Bad merge
carmocca Jun 8, 2021
f5154ae
Reuse metrics_to_scalars
carmocca Jun 8, 2021
558cdf4
Use active loop
carmocca Jun 8, 2021
90d71bf
Move to device
carmocca Jun 8, 2021
d7f1761
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 8, 2021
6ce6762
resolve test
tchaton Jun 8, 2021
fba9a87
properties first
awaelchli Jun 8, 2021
fd967af
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 8, 2021
79c73b9
define union
awaelchli Jun 8, 2021
37a0b9d
Update logger connector
carmocca Jun 8, 2021
aaea387
Update result
carmocca Jun 8, 2021
e2f69ce
Update imports
carmocca Jun 8, 2021
6037833
Update after rename
carmocca Jun 8, 2021
3804963
Merge branch 'refactor/logger-connector-poc' of https://github.com/Py…
carmocca Jun 8, 2021
499da76
Refactor reduce_fx and op
carmocca Jun 8, 2021
6eb448a
Fix test after rename
carmocca Jun 8, 2021
f871cbd
mypy
carmocca Jun 8, 2021
5631b53
manual merge poc changes
awaelchli Jun 9, 2021
d10d5c7
integrate latest changes from logger connector poc
awaelchli Jun 9, 2021
7b6803a
Fix test
carmocca Jun 9, 2021
9bfedc9
Refactor test
carmocca Jun 9, 2021
c9c7829
Deprecate `self.log(sync_dist_op)` in favor of `self.log(reduce_fx)`
carmocca Jun 9, 2021
e3dde0b
Undo field
carmocca Jun 9, 2021
bae2139
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 9, 2021
2c167cc
rename
awaelchli Jun 9, 2021
99db497
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 9, 2021
832dfb9
rename
awaelchli Jun 9, 2021
f92e01d
imports
awaelchli Jun 9, 2021
b15fc34
loop hygiene
awaelchli Jun 9, 2021
7175a50
yapf on loops
awaelchli Jun 9, 2021
59d6227
protected new loop trigger
awaelchli Jun 9, 2021
e1d4fd2
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 9, 2021
a7c3555
Replace code
carmocca Jun 9, 2021
501224d
Fix names and imports
carmocca Jun 9, 2021
dee7e5f
Remove metric_attribute
carmocca Jun 9, 2021
4eb9757
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 9, 2021
d4bb357
integrate latest logger connector changes
awaelchli Jun 9, 2021
c9b4e9e
resolve todo dataloading reset
awaelchli Jun 10, 2021
a3ef0aa
re-add notebooks
awaelchli Jun 10, 2021
b071532
Merge branch 'master' into refactor/logger-connector-poc
awaelchli Jun 10, 2021
53deef8
add missing init
awaelchli Jun 10, 2021
93fd682
bad merge
awaelchli Jun 10, 2021
80c406e
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 10, 2021
a041b6f
remove iteration count method
awaelchli Jun 10, 2021
e080be8
todo for a fix in #5007
awaelchli Jun 10, 2021
4950821
Merge branch 'master' into refactor/logger-connector-poc
carmocca Jun 10, 2021
c56adc1
remove NEW_LOOP guard
awaelchli Jun 10, 2021
5e72d1d
Merge branch 'refactor/logger-connector-poc' into refactor/loops/loop…
awaelchli Jun 10, 2021
bace4a2
flake8
awaelchli Jun 10, 2021
71bfb6f
exclude coverage
awaelchli Jun 10, 2021
acc6d4f
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 10, 2021
41e0e64
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 10, 2021
643bef0
flake8 vs yapf wars
awaelchli Jun 10, 2021
4b6bd18
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 10, 2021
536574a
integrate #7917, remove teardown from training loop
awaelchli Jun 10, 2021
b28fb09
update "accumulated_batches_reached" condition
awaelchli Jun 11, 2021
6f17688
remove public loop properties
awaelchli Jun 11, 2021
6dd4e1d
make skip backward protected again
awaelchli Jun 11, 2021
c394267
typing base loop
awaelchli Jun 11, 2021
4adae06
typing fit loop
awaelchli Jun 11, 2021
c49875d
typing training_batch_loop
awaelchli Jun 11, 2021
80edb75
typing training epoch loop
awaelchli Jun 11, 2021
8b54505
fix merge error
justusschock Jun 11, 2021
9fd8ed1
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 11, 2021
e4ffa6c
integrate train loop changes from master
awaelchli Jun 11, 2021
69ed0e7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 11, 2021
eeebc9a
fix tpipes moving model to cpu and leaving it there.
awaelchli Jun 12, 2021
ce9dd2a
don't reset fit loop
awaelchli Jun 12, 2021
80e225a
fix test iteration count <-> batch_idx reset
awaelchli Jun 14, 2021
4880b26
replace torch.Tensor -> Tensor
awaelchli Jun 14, 2021
5461f73
fix attribute error to block_ddp_sync_behaviour
awaelchli Jun 14, 2021
a2d3f0d
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 14, 2021
0fe6d9f
ignore mypy errors
awaelchli Jun 14, 2021
5497fc0
fix flake8 and yapf conflict
awaelchli Jun 14, 2021
4c51c45
remove redundant override
awaelchli Jun 14, 2021
8f68b61
Apply suggestions from code review
awaelchli Jun 14, 2021
0150f6c
Apply suggestions from code review
awaelchli Jun 14, 2021
fd90c10
Apply suggestions from code review
awaelchli Jun 14, 2021
153d264
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2021
4eb0eb1
remove all empty space between atoms
awaelchli Jun 14, 2021
70cdb14
carlos
awaelchli Jun 14, 2021
bf26aa3
Apply suggestions from code review
justusschock Jun 14, 2021
ffc4f45
Apply suggestions from code review
justusschock Jun 14, 2021
79f8c18
Merge remote-tracking branch 'origin/refactor/loops/loops_everywhere_…
awaelchli Jun 14, 2021
3373cc8
resolve a todo integrating on_train_batch_end with on_advance_end
awaelchli Jun 14, 2021
e1a40c0
clarify what is todo and what is fixme
awaelchli Jun 14, 2021
b5bb08a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 14, 2021
5d98009
shorten a docstring
awaelchli Jun 14, 2021
03bce7a
Merge remote-tracking branch 'origin/refactor/loops/loops_everywhere_…
awaelchli Jun 14, 2021
42c9ad6
wip
tchaton Jun 14, 2021
f001f81
move on_epoch_start to on_run_start of training loop
awaelchli Jun 14, 2021
24fa859
Merge branch 'master' into refactor/loops/loops_everywhere_train
awaelchli Jun 14, 2021
12086c5
add tracking
tchaton Jun 14, 2021
d191fe1
Update pytorch_lightning/loops/base.py
awaelchli Jun 15, 2021
1d21065
update class names in changelog
awaelchli Jun 15, 2021
d8377d5
wip
tchaton Jun 15, 2021
f249351
update
tchaton Jun 15, 2021
7d5b3f3
add zero_grad
tchaton Jun 15, 2021
1b2b251
Merge branch 'refactor/loops/loops_everywhere_train' into progress_tr…
tchaton Jun 15, 2021
743d262
add decription
tchaton Jun 15, 2021
2d8c441
add empty teardown method
awaelchli Jun 15, 2021
1ae88a4
update on comments
tchaton Jun 15, 2021
7763afd
update on comments
tchaton Jun 15, 2021
f874182
added skip property
awaelchli Jun 15, 2021
2ef0fe0
Merge branch 'refactor/loops/loops_everywhere_train' into progress_tr…
tchaton Jun 15, 2021
e2bb1d2
update on comments
tchaton Jun 15, 2021
ec25ab6
Merge branch 'progress_tracking' of https://github.com/PyTorchLightni…
tchaton Jun 15, 2021
27927c4
Merge branch 'master' into progress_tracking
tchaton Jun 15, 2021
8806b00
update
tchaton Jun 15, 2021
4011d85
update changelog
tchaton Jun 15, 2021
d08cb38
update
tchaton Jun 15, 2021
fcdfd39
resolve failing tests
tchaton Jun 15, 2021
5557beb
remove typing
tchaton Jun 15, 2021
f35c7a1
Merge branch 'master' into progress_tracking
kaushikb11 Jun 15, 2021
8c74a3b
update on comments
tchaton Jun 16, 2021
e80230c
Merge branch 'progress_tracking' of https://github.com/PyTorchLightni…
tchaton Jun 16, 2021
f036f3b
update on comments
tchaton Jun 16, 2021
48d720a
move optimizer_idx to batchProgress
tchaton Jun 16, 2021
6d653c2
update
tchaton Jun 16, 2021
741ea4d
Merge branch 'master' into progress_tracking
tchaton Jun 16, 2021
f4b91af
remove useless code
tchaton Jun 16, 2021
6f9a3b8
Merge branch 'progress_tracking' of https://github.com/PyTorchLightni…
tchaton Jun 16, 2021
769364b
add a space on docstring
tchaton Jun 17, 2021
39de7e9
Merge branch 'master' into progress_tracking
carmocca Jun 18, 2021
fda5688
Merge branch 'master' into progress_tracking
carmocca Jun 19, 2021
e2db5fe
Minor changes
carmocca Jun 19, 2021
b947a33
Unused code
carmocca Jun 19, 2021
281847a
Update CHANGELOG
carmocca Jun 19, 2021
0038219
Merge branch 'master' into progress_tracking
tchaton Jun 25, 2021
d39d524
Merge branch 'progress_tracking' of https://github.com/PyTorchLightni…
tchaton Jun 25, 2021
2a45166
update
tchaton Jun 25, 2021
bfd8ac2
update
tchaton Jun 25, 2021
66eb9e0
wip
tchaton Jun 25, 2021
f2d2f32
add FastForwardSampler
tchaton Jun 25, 2021
9d8ffdc
update
tchaton Jun 25, 2021
4d6feca
resolve a bug
tchaton Jun 25, 2021
3e0af69
resolve bug
tchaton Jun 25, 2021
4de9581
Merge branch 'master' into training_restart
tchaton Jun 27, 2021
3d183bc
add support for validation
tchaton Jun 27, 2021
94f052e
update
tchaton Jun 27, 2021
a446639
update
tchaton Jun 27, 2021
be49acc
kill processes
tchaton Jun 27, 2021
6690dcb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 27, 2021
b05617a
add mechanism to kill on deadlock detection
tchaton Jun 27, 2021
56e2763
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jun 27, 2021
3da64e5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 27, 2021
5c1a639
wip
tchaton Jun 28, 2021
1df225d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 28, 2021
6df9968
update
tchaton Jun 28, 2021
8ecd834
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 28, 2021
e62798d
add support for accumulate_grad_batches
tchaton Jun 28, 2021
e3b80ca
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jun 28, 2021
6c38083
Merge branch 'master' into training_restart
tchaton Jun 29, 2021
11fa777
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jun 29, 2021
b9be984
resolve bugs
tchaton Jun 30, 2021
a630b2e
resolve tracking
tchaton Jun 30, 2021
51671d0
Rename
carmocca Jul 1, 2021
cd28db2
Merge branch 'master' into training_restart
tchaton Jul 1, 2021
25da045
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
carmocca Jul 1, 2021
97cbd9a
wip
tchaton Jul 1, 2021
ba276b9
Comments after call
carmocca Jul 1, 2021
c08fe50
Merge branch 'master' into training_restart
carmocca Jul 1, 2021
ac7cff7
update
tchaton Jul 1, 2021
d4917e5
update
tchaton Jul 1, 2021
2ecab37
add partial support for iterative dataset
tchaton Jul 2, 2021
25b1ac2
update
tchaton Jul 2, 2021
7c9058f
Merge branch 'fast_forward_samplers' into training_restart
tchaton Jul 2, 2021
588d0a9
added some logic for samplers restart
tchaton Jul 4, 2021
7dd42c0
update
tchaton Jul 5, 2021
16a58c6
resolve bug
tchaton Jul 5, 2021
756baca
fix attribute error
awaelchli Jul 6, 2021
2f54117
add simple test for random dataset (wip)
awaelchli Jul 6, 2021
c6a774c
wip
tchaton Jul 6, 2021
a946159
update
tchaton Jul 6, 2021
a2b74f0
resolve bug
tchaton Jul 6, 2021
668b02e
wip
tchaton Jul 6, 2021
3827208
wip
tchaton Jul 6, 2021
1f4ef8c
wip
tchaton Jul 6, 2021
770a78b
resolved tests
tchaton Jul 6, 2021
76f1f53
update on comments
tchaton Jul 7, 2021
3aaf0ea
update
tchaton Jul 7, 2021
ed056aa
update
tchaton Jul 7, 2021
f1cdcdc
Merge branch 'master' into add_fast_forward_sampler
tchaton Jul 7, 2021
81bf954
Update pytorch_lightning/utilities/auto_restart.py
tchaton Jul 7, 2021
3cd6859
Merge branch 'master' into training_restart
tchaton Jul 7, 2021
54e0a24
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jul 7, 2021
76f0503
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 7, 2021
7a05094
update on comments
tchaton Jul 7, 2021
bff288c
Merge branch 'add_fast_forward_sampler' of https://github.com/PyTorch…
tchaton Jul 7, 2021
98ec265
Update pytorch_lightning/utilities/auto_restart.py
tchaton Jul 7, 2021
82b1cf1
resolve bug
tchaton Jul 7, 2021
8972d82
update
tchaton Jul 7, 2021
1fb8c02
move properties to top
awaelchli Jul 7, 2021
f086edb
update docs for fast forward sampler
awaelchli Jul 7, 2021
7450388
move public attribute to top
awaelchli Jul 7, 2021
5e43757
add missing super call
awaelchli Jul 7, 2021
eae11c3
update docs for state_dict
awaelchli Jul 7, 2021
efcb882
fix merge conflict
awaelchli Jul 7, 2021
c068704
add missing super() call
awaelchli Jul 7, 2021
79ff550
move property to top
awaelchli Jul 7, 2021
d433bb4
update
tchaton Jul 7, 2021
dfbb8eb
Merge branch 'training_restart' of https://github.com/PyTorchLightnin…
tchaton Jul 7, 2021
50ac617
update on comments
tchaton Jul 7, 2021
733e329
Merge branch 'add_fast_forward_sampler' of https://github.com/PyTorch…
tchaton Jul 7, 2021
f111826
resolve bug
tchaton Jul 7, 2021
67a3691
update
tchaton Jul 7, 2021
14bea6b
wip
tchaton Jul 7, 2021
4eee70a
resolve bug
tchaton Jul 7, 2021
8b93505
update
tchaton Jul 7, 2021
322600c
wip
tchaton Jul 7, 2021
028d773
update on comments
tchaton Jul 7, 2021
0be0b5d
some refactor
tchaton Jul 7, 2021
5c3e328
activate coverage for CaptureIterableDataset
tchaton Jul 7, 2021
461bee9
update on comments
tchaton Jul 7, 2021
2de5290
update
tchaton Jul 7, 2021
bee5536
Merge branch 'master' into training_restart
tchaton Jul 7, 2021
d98e0fd
Merge branch 'add_fast_forward_sampler' into training_restart
tchaton Jul 7, 2021
cb3c2f9
wip
tchaton Jul 7, 2021
07331ab
resolve training loop
tchaton Jul 7, 2021
7e2bf72
wip
tchaton Jul 8, 2021
CHANGELOG.md: 6 changes (5 additions, 1 deletion)
@@ -31,7 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


- Progress tracking
* Added dataclasses for progress tracking ([#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574), [#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140))
* Added dataclasses for progress tracking([#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574))
* Integrate progress tracking with the training loops ([#7976](https://github.com/PyTorchLightning/pytorch-lightning/pull/7976))
* Add `{,load_}state_dict` to the progress tracking dataclasses ([#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140))
* Connect the progress tracking dataclasses to the loops ([#8244](https://github.com/PyTorchLightning/pytorch-lightning/pull/8244))

@@ -137,6 +138,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `restore` function and `restarting` attribute to base `Loop` ([#8247](https://github.com/PyTorchLightning/pytorch-lightning/pull/8247))


- Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307))


### Changed


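For context on the `FastForwardSampler` entry above: it is the utility (living in `pytorch_lightning/utilities/auto_restart.py`, touched by several commits in this PR) that lets a dataloader resume mid-epoch by skipping the samples already consumed before the failure. The snippet below is only a minimal, self-contained sketch of that idea; the class name, fields, and methods are assumptions for illustration, not the actual Lightning API.

```python
from typing import Dict, Iterator

from torch.utils.data import SequentialSampler


class FastForwardSamplerSketch:
    """Hypothetical stand-in: wraps a sampler and skips indices consumed before a restart."""

    def __init__(self, sampler) -> None:
        self._sampler = sampler
        self._current_iteration = 0  # number of indices already handed out

    def state_dict(self) -> Dict:
        return {"current_iteration": self._current_iteration}

    def load_state_dict(self, state_dict: Dict) -> None:
        self._current_iteration = state_dict["current_iteration"]

    def __iter__(self) -> Iterator[int]:
        for i, index in enumerate(self._sampler):
            if i < self._current_iteration:
                continue  # fast-forward past what was already seen
            self._current_iteration += 1
            yield index


# Simulate a crash after 3 samples and a restart from the saved state.
sampler = FastForwardSamplerSketch(SequentialSampler(range(8)))
it = iter(sampler)
consumed = [next(it) for _ in range(3)]    # [0, 1, 2]
checkpoint = sampler.state_dict()          # {"current_iteration": 3}

restored = FastForwardSamplerSketch(SequentialSampler(range(8)))
restored.load_state_dict(checkpoint)
assert list(restored) == [3, 4, 5, 6, 7]
```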
pytorch_lightning/core/optimizer.py: 1 change (1 addition, 0 deletions)
@@ -211,6 +211,7 @@ def closure_dis():
profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}"

self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
self._trainer.fit_loop.epoch_loop.batch_loop.optim_progress.optimizer.step.increment_processed()
self._total_optimizer_step_calls += 1

def __repr__(self):
pytorch_lightning/loops/base.py: 90 changes (82 additions, 8 deletions)
@@ -13,11 +13,12 @@
# limitations under the License.

from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, OrderedDict

from deprecate import void

import pytorch_lightning as pl
from pytorch_lightning.trainer.progress import BaseProgress
from pytorch_lightning.utilities.exceptions import MisconfigurationException


@@ -46,7 +47,44 @@ class Loop(ABC):
def __init__(self) -> None:
self.iteration_count: int = 0
self.trainer: Optional['pl.Trainer'] = None
self._cached_state: Optional[Dict] = None
self.restarting = False
self._loops = OrderedDict()
self._progress = OrderedDict()

def __setattr__(self, name: str, value: Any) -> None:
if isinstance(value, Loop):
self._loops[name] = value
elif isinstance(value, BaseProgress):
self._progress[name] = value
else:
object.__setattr__(self, name, value)

def __getattr__(self, name) -> Any:
loops = self.__dict__.get('_loops')
if loops is None:
raise MisconfigurationException("The parent Loop's `__init__` method was not called.")

if name in loops:
return loops[name]

progress = self.__dict__.get('_progress')

if name in progress:
return progress[name]

if name not in self.__dict__:
raise AttributeError(f"{self.__class__.__name__} Loop doesn't have attribute {name}.")

return self.__dict__[name]

def __delattr__(self, name) -> None:
if name in self._loops:
del self._loops[name]
elif name in self._progress:
del self._progress[name]
else:
object.__delattr__(self, name)

@property
@abstractmethod
@@ -89,7 +127,8 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]:
return self.on_skip()

if self.restarting:
self.restore()
self.restore(self._cached_state)
self._cached_state = None
self.restarting = False
else:
self.reset()
@@ -108,7 +147,8 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]:
output = self.on_run_end()
return output

def restore(self) -> None:
@abstractmethod
def restore(self, state: Optional[Dict] = None) -> None:
"""Restore the internal state of the loop the beginning of run if restarting is ``True``."""

@abstractmethod
@@ -142,9 +182,43 @@ def on_run_end(self) -> Any:
def teardown(self) -> None:
"""Use to release memory etc."""

def load_state_dict(self, state_dict: Dict) -> None:
"""Restore the loop state from the provided state_dict."""

@abstractmethod
def state_dict(self) -> Dict:
"""Return the loop current states."""
return {}
"""Current Loop state"""

def get_state_dict(self, destination: Optional[OrderedDict] = None, prefix: Optional[str] = '') -> OrderedDict:
if destination is None:
destination = OrderedDict()

destination[prefix + "state_dict"] = self.state_dict()

for name, progress in self._progress.items():
destination[prefix + name] = progress.state_dict()

for name, loop in self._loops.items():
loop.get_state_dict(destination, prefix + name + '.')
return destination

def _load_from_state_dict(self, state_dict, prefix, strict, missing_keys, unexpected_keys, error_msgs):
self._cached_state = state_dict[prefix + "state_dict"]

for name, progress in self._progress.items():
progress.load_state_dict(state_dict[prefix + name])

def load_state_dict(self, state_dict: Dict, strict: bool = True):

missing_keys = []
unexpected_keys = []
error_msgs = []

state_dict = state_dict.copy()

def load(loop, prefix=''):
loop._load_from_state_dict(state_dict, prefix, True, missing_keys, unexpected_keys, error_msgs)
loop.restarting = True
for name, loop_children in loop._loops.items():
if loop_children is not None:
load(loop_children, prefix + name + '.')

load(self)
load = None # break load->load reference cycle
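To make the new `__setattr__`/`__getattr__` registration and the prefixed `get_state_dict`/`load_state_dict` walk above easier to follow, here is a minimal, self-contained sketch of the same mechanism. It deliberately does not import `pytorch_lightning`; the `LoopSketch`/`ProgressSketch` names and the trimmed-down bodies are assumptions for illustration only.

```python
from collections import OrderedDict
from typing import Any, Dict, Optional


class ProgressSketch:
    """Hypothetical stand-in for a progress-tracking dataclass."""

    def __init__(self) -> None:
        self.ready = 0
        self.completed = 0

    def state_dict(self) -> Dict:
        return {"ready": self.ready, "completed": self.completed}


class LoopSketch:
    """Hypothetical stand-in for `Loop`: child loops and progress objects are
    intercepted on assignment and registered, so the state dict can be built
    recursively with dotted prefixes."""

    def __init__(self) -> None:
        self.restarting = False
        self._loops: "OrderedDict[str, LoopSketch]" = OrderedDict()
        self._progress: "OrderedDict[str, ProgressSketch]" = OrderedDict()

    def __setattr__(self, name: str, value: Any) -> None:
        if isinstance(value, LoopSketch):
            self._loops[name] = value
        elif isinstance(value, ProgressSketch):
            self._progress[name] = value
        else:
            object.__setattr__(self, name, value)

    def __getattr__(self, name: str) -> Any:
        # only called when normal lookup fails, i.e. for registered children
        for registry in ("_loops", "_progress"):
            container = self.__dict__.get(registry, {})
            if name in container:
                return container[name]
        raise AttributeError(f"{type(self).__name__} has no attribute {name!r}")

    def state_dict(self) -> Dict:
        return {}  # loop-specific payload would go here

    def get_state_dict(self, destination: Optional[Dict] = None, prefix: str = "") -> Dict:
        if destination is None:
            destination = OrderedDict()
        destination[prefix + "state_dict"] = self.state_dict()
        for name, progress in self._progress.items():
            destination[prefix + name] = progress.state_dict()
        for name, loop in self._loops.items():
            loop.get_state_dict(destination, prefix + name + ".")
        return destination


# A tiny fit_loop -> epoch_loop nesting produces a flat dict with dotted keys:
fit_loop = LoopSketch()
fit_loop.epoch_progress = ProgressSketch()
fit_loop.epoch_loop = LoopSketch()
fit_loop.epoch_loop.batch_progress = ProgressSketch()
assert list(fit_loop.get_state_dict()) == [
    "state_dict", "epoch_progress", "epoch_loop.state_dict", "epoch_loop.batch_progress"
]
```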
pytorch_lightning/loops/batch/training_batch_loop.py: 67 changes (61 additions, 6 deletions)
@@ -69,10 +69,8 @@ def connect(
) -> None:
"""Connects the loop with necessary arguments like the trainer"""
super().connect(trainer, *args, **kwargs)
if progress is not None:
self.progress = progress
if optim_progress is not None:
self.optim_progress = optim_progress
self.progress = progress or self.progress
self.optim_progress = optim_progress or self.optim_progress

@property
def done(self) -> bool:
@@ -98,6 +96,8 @@ def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict:
self._warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...")
return AttributeDict(signal=0, training_step_output=[[]])

self.progress.increment_ready()

# hook
self.trainer.logger_connector.on_batch_start()
response = self.trainer.call_hook("on_batch_start")
@@ -114,12 +114,23 @@ def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict:
self.batch_outputs = None # free memory
return output

def reset(self) -> None:
def _initialize(self):
"""Resets the loop state"""
self._hiddens = None
self.batch_idx = 0
self.batch_outputs = [[] for _ in range(len(self.trainer.optimizers))]

def restore(self) -> None:
"""Restore the loop state"""
self._initialize()

def reset(self) -> None:
"""Resets the loop state"""
self._initialize()

# reset tracking
self.optim_progress.optimizer.reset_on_epoch()

def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int):
"""Splits the data into tbptt splits

@@ -131,6 +142,14 @@ def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int):
void(batch_idx, dataloader_idx)
self._remaining_splits = list(enumerate(self._tbptt_split_batch(batch)))

def on_advance_start(self, *args: Any, **kwargs: Any) -> None:
self.progress.increment_started()
return super().on_advance_start(*args, **kwargs)

def on_advance_end(self) -> None:
self.progress.increment_completed()
return super().on_advance_end()

def advance(self, batch, batch_idx, dataloader_idx):
"""Runs the train step together with optimization (if necessary) on the current batch split

@@ -148,7 +167,17 @@ def advance(self, batch, batch_idx, dataloader_idx):
self.trainer.logger_connector.on_train_split_start(batch_idx, split_idx, split_batch)

if self.trainer.lightning_module.automatic_optimization:
for opt_idx, optimizer in self.get_active_optimizers(batch_idx):
active_optimizers = self.get_active_optimizers(batch_idx)
for opt_idx, optimizer in active_optimizers:

# handle optimization restart
if self.trainer.is_restarting:
if len(active_optimizers) > 1 and opt_idx < self.progress.current.completed:
continue

# track optimizer_idx
self.optim_progress.optimizer_idx = opt_idx

result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
if result:
self.batch_outputs[opt_idx].append(result.training_step_output)
Expand All @@ -158,6 +187,8 @@ def advance(self, batch, batch_idx, dataloader_idx):
if result:
self.batch_outputs[0].append(result.training_step_output)

self.progress.increment_processed()

def teardown(self) -> None:
# release memory
self._remaining_splits = None
@@ -238,8 +269,14 @@ def _training_step_and_backward_closure(
"""

result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)

if result is not None:
return_result.update(result)

# this should be done only if result.loss exists
if not self.should_accumulate():
self.optim_progress.optimizer.step.increment_started()

return return_result.loss

def _make_closure(self, *closure_args: Any, **closure_kwargs: Any) -> Callable:
@@ -409,6 +446,8 @@ def _optimizer_step(
# wraps into LightningOptimizer only for running step
optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx)

self.optim_progress.optimizer.step.increment_ready()

# model hook
model_ref.optimizer_step(
self.trainer.current_epoch,
@@ -421,13 +460,17 @@ def _optimizer_step(
using_lbfgs=is_lbfgs,
)

self.optim_progress.optimizer.step.increment_completed()

def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None:
"""Calls the ``on_before_zero_grad`` hook.

Args:
optimizer: the current optimizer
"""
self.optim_progress.optimizer.zero_grad.increment_started()
self.trainer.call_hook('on_before_zero_grad', optimizer)
self.optim_progress.optimizer.zero_grad.increment_ready()

def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer, opt_idx: int) -> None:
"""Zeroes out all gradients of parameters optimized by the current optimizer.
@@ -437,8 +480,11 @@ def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer,
optimizer: the current optimizer
opt_idx: the index of the current optimizer
"""

self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)

self.optim_progress.optimizer.zero_grad.increment_completed()

def _track_and_norm_grad(self, optimizer: torch.optim.Optimizer) -> Dict[str, Tensor]:
"""Tracks gradient norms and clips the gradients of all parameters optimized by the current optimizer.

@@ -700,3 +746,12 @@ def _truncated_bptt_steps(self) -> int:
if lightning_module.truncated_bptt_steps > 0:
return lightning_module.truncated_bptt_steps
return self.trainer.truncated_bptt_steps or 0

def state_dict(self) -> Dict:
return {"progress": self.progress.state_dict(), "optim_progress": self.optim_progress.state_dict()}

def load_state_dict(self, state_dict: Dict) -> None:
if "progress" in state_dict:
self.progress.load_state_dict(state_dict['progress'])
if "optim_progress" in state_dict:
self.optim_progress.load_state_dict(state_dict['optim_progress'])
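
The `increment_ready/started/processed/completed` calls sprinkled through the diff above act on the progress-tracking dataclasses introduced in #6603/#7574/#8140, which are not part of this diff. Below is a hypothetical stand-in, just to show the lifecycle the batch loop drives for a single optimizer step; the field names and the `state_dict` round-trip are assumptions, not the actual `pytorch_lightning/trainer/progress.py` API.

```python
from dataclasses import asdict, dataclass
from typing import Dict


@dataclass
class TrackerSketch:
    """Hypothetical stand-in for a progress tracker with four counters."""

    ready: int = 0
    started: int = 0
    processed: int = 0
    completed: int = 0

    def increment_ready(self) -> None:
        self.ready += 1

    def increment_started(self) -> None:
        self.started += 1

    def increment_processed(self) -> None:
        self.processed += 1

    def increment_completed(self) -> None:
        self.completed += 1

    def state_dict(self) -> Dict:
        return asdict(self)

    def load_state_dict(self, state_dict: Dict) -> None:
        for key, value in state_dict.items():
            setattr(self, key, value)


# Rough order of the optimizer-step increments in the hunks above:
step = TrackerSketch()
step.increment_ready()      # `_optimizer_step`, before the `optimizer_step` model hook
step.increment_started()    # `_training_step_and_backward_closure`, when not accumulating
step.increment_processed()  # `LightningOptimizer`, after `__optimizer_step` returns
step.increment_completed()  # `_optimizer_step`, after the model hook returns
assert step.state_dict() == {"ready": 1, "started": 1, "processed": 1, "completed": 1}

# On restart, the counters come back via the loop's `load_state_dict` path shown above.
restored = TrackerSketch()
restored.load_state_dict(step.state_dict())
assert restored.completed == 1
```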