diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e1517ebc2b36..e6367977d237b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.3.5] - 2021-06-08 + +### Added + +- Added warning to Training Step output ([#7779](https://github.com/PyTorchLightning/pytorch-lightning/pull/7779)) + +### Fixed + +- Fixed LearningRateMonitor + BackboneFinetuning ([#7835](https://github.com/PyTorchLightning/pytorch-lightning/pull/7835)) +- Minor improvements to `apply_to_collection` and type signature of `log_dict` ([#7851](https://github.com/PyTorchLightning/pytorch-lightning/pull/7851)) +- Fixed docker versions ([#7834](https://github.com/PyTorchLightning/pytorch-lightning/pull/7834)) +- Fixed sharded training check for fp16 precision ([#7825](https://github.com/PyTorchLightning/pytorch-lightning/pull/7825)) +- Fixed support for torch Module type hints in LightningCLI ([#7807](https://github.com/PyTorchLightning/pytorch-lightning/pull/7807)) + +### Changed + +- Move `training_output` validation to after `train_step_end` ([#7868](https://github.com/PyTorchLightning/pytorch-lightning/pull/7868)) + + ## [1.3.4] - 2021-06-01 ### Fixed @@ -12,6 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed info message when max training time reached ([#7780](https://github.com/PyTorchLightning/pytorch-lightning/pull/7780)) - Fixed missing `__len__` method to `IndexBatchSamplerWrapper` ([#7681](https://github.com/PyTorchLightning/pytorch-lightning/pull/7681)) + ## [1.3.3] - 2021-05-27 ### Changed @@ -41,8 +61,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed setting correct `DistribType` for `ddp_cpu` (spawn) backend ([#7492](https://github.com/PyTorchLightning/pytorch-lightning/pull/7492)) - Fixed incorrect number of calls to LR scheduler when `check_val_every_n_epoch > 1` ([#7032](https://github.com/PyTorchLightning/pytorch-lightning/pull/7032)) -## [1.3.1] - 2021-05-11 +## [1.3.1] - 2021-05-11 ### Fixed @@ -55,11 +75,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added -- Added support for the `EarlyStopping` callback to run at the end of the training epoch ([#6944](https://github.com/PyTorchLightning/pytorch-lightning/pull/6944/)) +- Added support for the `EarlyStopping` callback to run at the end of the training epoch ([#6944](https://github.com/PyTorchLightning/pytorch-lightning/pull/6944)) - Added synchronization points before and after `setup` hooks are run ([#7202](https://github.com/PyTorchLightning/pytorch-lightning/pull/7202)) - Added a `teardown` hook to `ClusterEnvironment` ([#6942](https://github.com/PyTorchLightning/pytorch-lightning/pull/6942)) - Added utils for metrics to scalar conversions ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180)) -- Added utils for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834/)) +- Added utils for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834)) - Added more explicit exception message when trying to execute `trainer.test()` or `trainer.validate()` with `fast_dev_run=True` ([#6667](https://github.com/PyTorchLightning/pytorch-lightning/pull/6667)) - Added `LightningCLI` class to provide simple reproducibility with minimum boilerplate training CLI ( [#4492](https://github.com/PyTorchLightning/pytorch-lightning/pull/4492), @@ -85,7 +105,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120)) - Added `configure_sharded_model` hook ([#6679](https://github.com/PyTorchLightning/pytorch-lightning/pull/6679)) - Added support for `precision=64`, enabling training with double precision ([#6595](https://github.com/PyTorchLightning/pytorch-lightning/pull/6595)) -- Added support for DDP communication hooks ([#6736](https://github.com/PyTorchLightning/pytorch-lightning/issues/6736)) +- Added support for DDP communication hooks ([#6736](https://github.com/PyTorchLightning/pytorch-lightning/pull/6736)) - Added `artifact_location` argument to `MLFlowLogger` which will be passed to the `MlflowClient.create_experiment` call ([#6677](https://github.com/PyTorchLightning/pytorch-lightning/pull/6677)) - Added `model` parameter to precision plugins' `clip_gradients` signature ( [#6764](https://github.com/PyTorchLightning/pytorch-lightning/pull/6764), @@ -104,14 +124,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `tpu_distributed` check for TPU Spawn barrier ([#7241](https://github.com/PyTorchLightning/pytorch-lightning/pull/7241)) - Added device updates to TPU Spawn for Pod training ([#7243](https://github.com/PyTorchLightning/pytorch-lightning/pull/7243)) - Added warning when missing `Callback` and using `resume_from_checkpoint` ([#7254](https://github.com/PyTorchLightning/pytorch-lightning/pull/7254)) -- DeepSpeed single file saving ([#6900](https://github.com/PyTorchLightning/pytorch-lightning/issues/6900)) +- DeepSpeed single file saving ([#6900](https://github.com/PyTorchLightning/pytorch-lightning/pull/6900)) - Added Training type Plugins Registry ( - [#6982](https://github.com/PyTorchLightning/pytorch-lightning/issues/6982), - [#7063](https://github.com/PyTorchLightning/pytorch-lightning/issues/7063), - [#7214](https://github.com/PyTorchLightning/pytorch-lightning/issues/7214), - [#7224](https://github.com/PyTorchLightning/pytorch-lightning/issues/7224) + [#6982](https://github.com/PyTorchLightning/pytorch-lightning/pull/6982), + [#7063](https://github.com/PyTorchLightning/pytorch-lightning/pull/7063), + [#7214](https://github.com/PyTorchLightning/pytorch-lightning/pull/7214), + [#7224](https://github.com/PyTorchLightning/pytorch-lightning/pull/7224) ) -- Add `ignore` param to `save_hyperparameters` ([#6056](https://github.com/PyTorchLightning/pytorch-lightning/issues/6056)) +- Add `ignore` param to `save_hyperparameters` ([#6056](https://github.com/PyTorchLightning/pytorch-lightning/pull/6056)) ### Changed @@ -129,7 +149,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - The trainer no longer tries to save a checkpoint on exception or run callback's `on_train_end` functions ([#6864](https://github.com/PyTorchLightning/pytorch-lightning/pull/6864)) - Changed `PyTorchProfiler` to use `torch.autograd.profiler.record_function` to record functions ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) - Disabled `lr_scheduler.step()` in manual optimization ([#6825](https://github.com/PyTorchLightning/pytorch-lightning/pull/6825)) -- Changed warnings and recommendations for dataloaders in `ddp_spawn` ([#6762](https://github.com/PyTorchLightning/pytorch-lightning/pull/6762/)) +- Changed warnings and recommendations for dataloaders in `ddp_spawn` ([#6762](https://github.com/PyTorchLightning/pytorch-lightning/pull/6762)) - `pl.seed_everything` will now also set the seed on the `DistributedSampler` ([#7024](https://github.com/PyTorchLightning/pytorch-lightning/pull/7024)) - Changed default setting for communication of multi-node training using `DDPShardedPlugin` ([#6937](https://github.com/PyTorchLightning/pytorch-lightning/pull/6937)) - `trainer.tune()` now returns the tuning result ([#7258](https://github.com/PyTorchLightning/pytorch-lightning/pull/7258)) @@ -164,7 +184,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated the `save_function` property from the `ModelCheckpoint` callback ([#7201](https://github.com/PyTorchLightning/pytorch-lightning/pull/7201)) - Deprecated `LightningModule.write_predictions` and `LightningModule.write_predictions_dict` ([#7066](https://github.com/PyTorchLightning/pytorch-lightning/pull/7066)) - Deprecated `TrainerLoggingMixin` in favor of a separate utilities module for metric handling ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180)) -- Deprecated `TrainerTrainingTricksMixin` in favor of a separate utilities module for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834/)) +- Deprecated `TrainerTrainingTricksMixin` in favor of a separate utilities module for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834)) - `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) - Deprecated `trainer.running_sanity_check` in favor of `trainer.sanity_checking` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945)) - Deprecated `Profiler(output_filename)` in favor of `dirpath` and `filename` ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) @@ -270,7 +290,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed auto-scaling mode when calling tune method on trainer ([#7321](https://github.com/PyTorchLightning/pytorch-lightning/pull/7321)) - Fixed finetuning complex models correctly unfreezes ([#6880](https://github.com/PyTorchLightning/pytorch-lightning/pull/6880)) - Ensure we set the eval/train flag correctly on accelerator model ([#6877](https://github.com/PyTorchLightning/pytorch-lightning/pull/6877)) -- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/)) +- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802)) - Fixed matching the number of outputs of backward with forward for AllGatherGrad ([#6625](https://github.com/PyTorchLightning/pytorch-lightning/pull/6625)) - Fixed the `gradient_clip_algorithm` has no effect ([#6928](https://github.com/PyTorchLightning/pytorch-lightning/pull/6928)) - Fixed CUDA OOM detection and handling ([#6934](https://github.com/PyTorchLightning/pytorch-lightning/pull/6934)) @@ -486,7 +506,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Changed `computer_vision_fine_tunning` example to use `BackboneLambdaFinetuningCallback` ([#5377](https://github.com/PyTorchLightning/pytorch-lightning/pull/5377)) - Changed `automatic casting` for LoggerConnector `metrics` ([#5218](https://github.com/PyTorchLightning/pytorch-lightning/pull/5218)) - Changed `iou` [func] to allow float input ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704)) -- Metric `compute()` method will no longer automatically call `reset()` ([#5409](https://github.com/PyTorchLightning/pytorch-lightning/pull/5409/)) +- Metric `compute()` method will no longer automatically call `reset()` ([#5409](https://github.com/PyTorchLightning/pytorch-lightning/pull/5409)) - Set PyTorch 1.4 as min requirements, also for testing and examples `torchvision>=0.5` and `torchtext>=0.5` ([#5418](https://github.com/PyTorchLightning/pytorch-lightning/pull/5418)) - Changed `callbacks` argument in `Trainer` to allow `Callback` input ([#5446](https://github.com/PyTorchLightning/pytorch-lightning/pull/5446)) - Changed the default of `find_unused_parameters` to `False` in DDP ([#5185](https://github.com/PyTorchLightning/pytorch-lightning/pull/5185)) @@ -1253,7 +1273,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed getting `experiment_id` from MLFlow only once instead of each training loop ([#3394](https://github.com/PyTorchLightning/pytorch-lightning/pull/3394)) - Fixed `overfit_batches` which now correctly disables shuffling for the training loader. ([#3501](https://github.com/PyTorchLightning/pytorch-lightning/pull/3501)) - Fixed gradient norm tracking for `row_log_interval > 1` ([#3489](https://github.com/PyTorchLightning/pytorch-lightning/pull/3489)) -- Fixed `ModelCheckpoint` name formatting ([3164](https://github.com/PyTorchLightning/pytorch-lightning/pull/3163)) +- Fixed `ModelCheckpoint` name formatting ([#3164](https://github.com/PyTorchLightning/pytorch-lightning/pull/3163)) - Fixed example implementation of AutoEncoder ([#3190](https://github.com/PyTorchLightning/pytorch-lightning/pull/3190)) - Fixed invalid paths when remote logging with TensorBoard ([#3236](https://github.com/PyTorchLightning/pytorch-lightning/pull/3236)) - Fixed change `t()` to `transpose()` as XLA devices do not support `.t()` on 1-dim tensor ([#3252](https://github.com/PyTorchLightning/pytorch-lightning/pull/3252)) @@ -1513,8 +1533,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added option `save_last` to save the model at the end of every epoch in `ModelCheckpoint` ([#1908](https://github.com/PyTorchLightning/pytorch-lightning/pull/1908)) - Early stopping checks `on_validation_end` ([#1458](https://github.com/PyTorchLightning/pytorch-lightning/pull/1458)) - Speed up single-core TPU training by loading data using `ParallelLoader` ([#2033](https://github.com/PyTorchLightning/pytorch-lightning/pull/2033)) -- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)) -- Added [black](https://black.readthedocs.io/en/stable/) formatter for the code with code-checker on pull ([1610](https://github.com/PyTorchLightning/pytorch-lightning/pull/1610)) +- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([#1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)) +- Added [black](https://black.readthedocs.io/en/stable/) formatter for the code with code-checker on pull ([#1610](https://github.com/PyTorchLightning/pytorch-lightning/pull/1610)) - Added back the slow spawn ddp implementation as `ddp_spawn` ([#2115](https://github.com/PyTorchLightning/pytorch-lightning/pull/2115)) - Added loading checkpoints from URLs ([#1667](https://github.com/PyTorchLightning/pytorch-lightning/pull/1667)) - Added a callback method `on_keyboard_interrupt` for handling KeyboardInterrupt events during training ([#2134](https://github.com/PyTorchLightning/pytorch-lightning/pull/2134)) diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile index fbfd2224a66a9..528e7561c1e96 100644 --- a/dockers/nvidia/Dockerfile +++ b/dockers/nvidia/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes -FROM nvcr.io/nvidia/pytorch:21.04-py3 +FROM nvcr.io/nvidia/pytorch:21.05-py3 LABEL maintainer="PyTorchLightning " @@ -39,14 +39,16 @@ RUN \ # Installations python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + pip install "Pillow>=8.2" "cryptography>=3.4" "py>=1.10" --no-cache-dir --upgrade-strategy only-if-needed && \ pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \ pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \ pip install ./pytorch-lightning --no-cache-dir && \ - pip install "Pillow>=8.1" --no-cache-dir --upgrade-strategy only-if-needed && \ rm -rf pytorch-lightning && \ + pip install jupyterlab[all] -U && \ pip list -RUN pip install lightning-grid -U +RUN pip install lightning-grid -U && \ + pip install "py>=1.10" "protobuf>=3.15.6" --upgrade-strategy only-if-needed ENV PYTHONPATH="/workspace" diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index d8a5e153425a2..9471cf85b2b4b 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = '1.3.4' +__version__ = '1.3.5' __author__ = 'William Falcon et al.' 
__author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index 7530bfaa9d21e..61b0b31155994 100644 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -19,8 +19,10 @@ Monitor and logs learning rate for lr schedulers during training. """ +from collections import defaultdict +from typing import Any, DefaultDict, Dict, List, Optional, Type -from typing import Dict, List, Optional +from torch.optim.optimizer import Optimizer from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities import rank_zero_warn @@ -53,7 +55,7 @@ class LearningRateMonitor(Callback): In case of multiple optimizers of same type, they will be named ``Adam``, ``Adam-1`` etc. If a optimizer has multiple parameter groups they will be named ``Adam/pg1``, ``Adam/pg2`` etc. To control naming, pass in a - ``name`` keyword in the construction of the learning rate schdulers + ``name`` keyword in the construction of the learning rate schedulers Example:: @@ -138,6 +140,9 @@ def on_train_epoch_start(self, trainer, *args, **kwargs): def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: latest_stat = {} + names = self._find_names(trainer.lr_schedulers, add_lr_sch_names=False) + self._remap_keys(names) + for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers): if scheduler['interval'] == interval or interval == 'any': opt = scheduler['scheduler'].optimizer @@ -146,7 +151,7 @@ def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: for i, pg in enumerate(param_groups): suffix = f'/pg{i + 1}' if len(param_groups) > 1 else '' - lr = self._extract_lr(param_group=pg, name=f'{name}{suffix}') + lr = self._extract_lr(pg, f'{name}{suffix}') latest_stat.update(lr) momentum = self._extract_momentum( param_group=pg, name=f'{name}-momentum{suffix}', use_betas=use_betas @@ -155,12 +160,23 @@ def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: return latest_stat - def _extract_lr(self, param_group, name: str) -> Dict[str, float]: + def _extract_lr(self, param_group: Dict[str, Any], name: str) -> Dict[str, Any]: lr = param_group.get('lr') self.lrs[name].append(lr) return {name: lr} - def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str, float]: + def _remap_keys(self, names: List[str], token: str = '/pg1') -> None: + """ + This function is used to remap the keys if the number of param groups for a given optimizer has increased. 
+ """ + for new_name in names: + old_name = new_name.replace(token, '') + if token in new_name and old_name in self.lrs: + self.lrs[new_name] = self.lrs.pop(old_name) + elif new_name not in self.lrs: + self.lrs[new_name] = [] + + def _extract_momentum(self, param_group: Dict[str, Any], name: str, use_betas: bool) -> Dict[str, float]: if not self.log_momentum: return {} @@ -168,35 +184,46 @@ def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str self.last_momentum_values[name] = momentum return {name: momentum} - def _find_names(self, lr_schedulers) -> List[str]: - # Create uniqe names in the case we have multiple of the same learning - # rate schduler + multiple parameter groups + def _add_prefix( + self, name: str, optimizer_cls: Type[Optimizer], seen_optimizer_types: DefaultDict[Type[Optimizer], int] + ) -> str: + if optimizer_cls not in seen_optimizer_types: + return name + count = seen_optimizer_types[optimizer_cls] + return name + f'-{count - 1}' if count > 1 else name + + def _find_names(self, lr_schedulers: List, add_lr_sch_names: bool = True) -> List[str]: + # Create unique names in the case we have multiple of the same learning + # rate scheduler + multiple parameter groups names = [] + seen_optimizers = [] + seen_optimizer_types = defaultdict(int) for scheduler in lr_schedulers: sch = scheduler['scheduler'] if scheduler['name'] is not None: name = scheduler['name'] else: - opt_name = 'lr-' + sch.optimizer.__class__.__name__ - i, name = 1, opt_name + name = 'lr-' + sch.optimizer.__class__.__name__ - # Multiple schduler of the same type - while True: - if name not in names: - break - i, name = i + 1, f'{opt_name}-{i}' + seen_optimizers.append(sch.optimizer) + optimizer_cls = type(sch.optimizer) + if scheduler['name'] is None: + seen_optimizer_types[optimizer_cls] += 1 - # Multiple param groups for the same schduler + # Multiple param groups for the same scheduler param_groups = sch.optimizer.param_groups + name = self._add_prefix(name, optimizer_cls, seen_optimizer_types) + if len(param_groups) != 1: - for i, pg in enumerate(param_groups): + for i in range(len(param_groups)): temp = f'{name}/pg{i + 1}' names.append(temp) else: names.append(name) - self.lr_sch_names.append(name) + if add_lr_sch_names: + self.lr_sch_names.append(name) return names diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 109b8fd8104b5..b324582fe7602 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -25,7 +25,7 @@ from argparse import Namespace from functools import partial from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union import torch from torch import ScriptModule, Tensor @@ -347,7 +347,7 @@ def log( def log_dict( self, - dictionary: dict, + dictionary: Mapping[str, Any], prog_bar: bool = False, logger: bool = True, on_step: Optional[bool] = None, diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index 02da937286dcc..fceafddd66ec0 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -54,7 +54,8 @@ def _reinit_optimizers_with_oss(self): optim_class = type(optimizer) zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) if _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE: - is_fp16 = 
self.lightning_module.trainer.precision == 16 + precision = self.lightning_module.trainer.precision + is_fp16 = precision in ("mixed", 16) # For multi-node training, compressing the model shards in fp16 before broadcasting # improves performance. When using PyTorch AMP, it will not degrade # the model performance. diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 943864138e371..d596bc8a43831 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -14,7 +14,7 @@ from contextlib import contextmanager, suppress from copy import copy, deepcopy -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Mapping, Optional, Union import numpy as np import torch @@ -265,6 +265,16 @@ def _check_training_step_output(self, training_step_output): if training_step_output.grad_fn is None: # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") + elif self.trainer.lightning_module.automatic_optimization: + if not any(( + isinstance(training_step_output, torch.Tensor), + (isinstance(training_step_output, Mapping) + and 'loss' in training_step_output), training_step_output is None + )): + raise MisconfigurationException( + "In automatic optimization, `training_step` must either return a Tensor, " + "a dict with key 'loss' or None (where the step will be skipped)." + ) def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging @@ -282,10 +292,10 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): self.trainer.logger_connector.cache_logged_metrics() - self._check_training_step_output(training_step_output) - training_step_output = self.trainer.call_hook("training_step_end", training_step_output) + self._check_training_step_output(training_step_output) + training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( training_step_output, split_batch ) diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 1cbab2fb8dee9..61739cd25d1d2 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -13,6 +13,7 @@ # limitations under the License. import operator from abc import ABC +from collections import OrderedDict from collections.abc import Mapping, Sequence from copy import copy from functools import partial @@ -85,10 +86,12 @@ def apply_to_collection( # Recursively apply to collection items if isinstance(data, Mapping): - return elem_type({ - k: apply_to_collection(v, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) - for k, v in data.items() - }) + return elem_type( + OrderedDict({ + k: apply_to_collection(v, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) + for k, v in data.items() + }) + ) if isinstance(data, tuple) and hasattr(data, '_fields'): # named tuple return elem_type( diff --git a/requirements.txt b/requirements.txt index c3a4caaf6429d..d985004a2ca9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ torch>=1.4 future>=0.17.1 # required for builtins in setup.py tqdm>=4.41.0 PyYAML>=5.1,<=5.4.1 -fsspec[http]>=2021.4.0 +fsspec[http]>=2021.05.0, !=2021.06.0 tensorboard>=2.2.0, !=2.5.0 # 2.5.0 GPU CI error: 'Couldn't build proto file into descriptor pool!' 
torchmetrics>=0.2.0 pyDeprecate==0.3.0 diff --git a/requirements/extra.txt b/requirements/extra.txt index db2e66540eef1..cb9515beefb9a 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -7,4 +7,4 @@ torchtext>=0.5 # onnx>=1.7.0 onnxruntime>=1.3.0 hydra-core>=1.0 -jsonargparse[signatures]>=3.11.1 +jsonargparse[signatures]>=3.13.1 diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index bea6c45e95ced..808165d61b053 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytest +import torch from torch import optim import tests.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import LearningRateMonitor +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.callbacks.finetuning import BackboneFinetuning from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel from tests.helpers.datamodules import ClassifDataModule @@ -278,3 +281,102 @@ def configure_optimizers(self): ) trainer.fit(TestModel()) assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['my_logging_name'] + + +def test_multiple_optimizers_basefinetuning(tmpdir): + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.backbone = torch.nn.Sequential( + torch.nn.Linear(32, 32), + torch.nn.Linear(32, 32), + torch.nn.Linear(32, 32), + torch.nn.ReLU(True), + ) + self.layer = torch.nn.Linear(32, 2) + + def training_step(self, batch, batch_idx, optimizer_idx): + return super().training_step(batch, batch_idx) + + def forward(self, x): + return self.layer(self.backbone(x)) + + def configure_optimizers(self): + parameters = list(filter(lambda p: p.requires_grad, self.parameters())) + opt = optim.Adam(parameters, lr=0.1) + opt_2 = optim.Adam(parameters, lr=0.1) + opt_3 = optim.Adam(parameters, lr=0.1) + optimizers = [opt, opt_2, opt_3] + schedulers = [ + optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.5), + optim.lr_scheduler.StepLR(opt_2, step_size=1, gamma=0.5), + ] + return optimizers, schedulers + + class Check(Callback): + + def on_train_epoch_start(self, trainer, pl_module) -> None: + num_param_groups = sum([len(opt.param_groups) for opt in trainer.optimizers]) + assert lr_monitor.lr_sch_names == ['lr-Adam', 'lr-Adam-1'] + if trainer.current_epoch == 0: + assert num_param_groups == 3 + elif trainer.current_epoch == 1: + assert num_param_groups == 4 + assert list(lr_monitor.lrs) == ['lr-Adam-1', 'lr-Adam/pg1', 'lr-Adam/pg2'] + elif trainer.current_epoch == 2: + assert num_param_groups == 5 + assert list(lr_monitor.lrs) == ['lr-Adam/pg1', 'lr-Adam/pg2', 'lr-Adam-1/pg1', 'lr-Adam-1/pg2'] + else: + expected = ['lr-Adam/pg1', 'lr-Adam/pg2', 'lr-Adam-1/pg1', 'lr-Adam-1/pg2', 'lr-Adam-1/pg3'] + assert list(lr_monitor.lrs) == expected + + class TestFinetuning(BackboneFinetuning): + + def freeze_before_training(self, pl_module): + self.freeze(pl_module.backbone[0]) + self.freeze(pl_module.backbone[1]) + self.freeze(pl_module.layer) + + def finetune_function(self, pl_module, epoch: int, optimizer, opt_idx: int): + """Called when the epoch begins.""" + if epoch == 1 and opt_idx == 0: + self.unfreeze_and_add_param_group(pl_module.backbone[0], optimizer, lr=0.1) + if epoch == 2 and opt_idx == 1: + self.unfreeze_and_add_param_group(pl_module.layer, optimizer, lr=0.1) + + 
if epoch == 3 and opt_idx == 1: + assert len(optimizer.param_groups) == 2 + self.unfreeze_and_add_param_group(pl_module.backbone[1], optimizer, lr=0.1) + assert len(optimizer.param_groups) == 3 + + lr_monitor = LearningRateMonitor() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=5, + limit_val_batches=0, + limit_train_batches=2, + callbacks=[TestFinetuning(), lr_monitor, Check()], + progress_bar_refresh_rate=0, + weights_summary=None, + checkpoint_callback=False + ) + model = TestModel() + model.training_epoch_end = None + trainer.fit(model) + + expected = [0.1, 0.05, 0.025, 0.0125, 0.00625] + assert lr_monitor.lrs['lr-Adam/pg1'] == expected + + expected = [0.1, 0.05, 0.025, 0.0125] + assert lr_monitor.lrs['lr-Adam/pg2'] == expected + + expected = [0.1, 0.05, 0.025, 0.0125, 0.00625] + assert lr_monitor.lrs['lr-Adam-1/pg1'] == expected + + expected = [0.1, 0.05, 0.025] + assert lr_monitor.lrs['lr-Adam-1/pg2'] == expected + + expected = [0.1, 0.05] + assert lr_monitor.lrs['lr-Adam-1/pg3'] == expected diff --git a/tests/trainer/loops/test_training_loop.py b/tests/trainer/loops/test_training_loop.py index 94becf6488fc3..99eedf377f902 100644 --- a/tests/trainer/loops/test_training_loop.py +++ b/tests/trainer/loops/test_training_loop.py @@ -11,10 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re + import pytest import torch from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel @@ -222,3 +225,75 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx): else: assert trainer.batch_idx == batch_idx_ assert trainer.global_step == batch_idx_ * max_epochs + + +def test_should_stop_mid_epoch(tmpdir): + """Test that training correctly stops mid epoch and that validation is still called at the right time""" + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.validation_called_at = None + + def training_step(self, batch, batch_idx): + if batch_idx == 4: + self.trainer.should_stop = True + return super().training_step(batch, batch_idx) + + def validation_step(self, *args): + self.validation_called_at = (self.trainer.current_epoch, self.trainer.global_step) + return super().validation_step(*args) + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=10, + limit_val_batches=1, + ) + trainer.fit(model) + + assert trainer.current_epoch == 0 + assert trainer.global_step == 5 + assert model.validation_called_at == (0, 4) + + +@pytest.mark.parametrize(['output'], [(5., ), ({'a': 5}, )]) +def test_warning_invalid_trainstep_output(tmpdir, output): + + class InvalidTrainStepModel(BoringModel): + + def training_step(self, batch, batch_idx): + return output + + model = InvalidTrainStepModel() + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + with pytest.raises( + MisconfigurationException, + match=re.escape( + "In automatic optimization, `training_step` must either return a Tensor, " + "a dict with key 'loss' or None (where the step will be skipped)." 
+ ) + ): + trainer.fit(model) + + +def test_warning_valid_train_step_end(tmpdir): + + class ValidTrainStepEndModel(BoringModel): + + def training_step(self, batch, batch_idx): + output = self(batch) + return {'output': output, 'batch': batch} + + def training_step_end(self, outputs): + loss = self.loss(outputs['batch'], outputs['output']) + return loss + + # No error is raised + model = ValidTrainStepEndModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + + trainer.fit(model) diff --git a/tests/utilities/test_apply_func.py b/tests/utilities/test_apply_func.py index a7eea3a749f26..7454ce01d3bee 100644 --- a/tests/utilities/test_apply_func.py +++ b/tests/utilities/test_apply_func.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import numbers -from collections import namedtuple +from collections import namedtuple, OrderedDict import numpy as np import torch @@ -76,3 +76,19 @@ def test_recursive_application_to_collection(): assert isinstance(reduced['g'], numbers.Number), 'Reduction of a number should result in a tensor' assert reduced['g'] == expected_result['g'], 'Reduction of a number did not yield the desired result' + + # mapping support + reduced = apply_to_collection({'a': 1, 'b': 2}, int, lambda x: str(x)) + assert reduced == {'a': '1', 'b': '2'} + reduced = apply_to_collection(OrderedDict([('b', 2), ('a', 1)]), int, lambda x: str(x)) + assert reduced == OrderedDict([('b', '2'), ('a', '1')]) + + # custom mappings + class _CustomCollection(dict): + + def __init__(self, initial_dict): + super().__init__(initial_dict) + + to_reduce = _CustomCollection({'a': 1, 'b': 2, 'c': 3}) + reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) + assert reduced == _CustomCollection({'a': '1', 'b': '2', 'c': '3'}) diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 5780a83e75db8..c1eabca5d663d 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -20,18 +20,26 @@ from argparse import Namespace from contextlib import redirect_stdout from io import StringIO +from typing import List, Optional from unittest import mock import pytest +import torch import yaml +from packaging import version from pytorch_lightning import LightningDataModule, LightningModule, Trainer from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.cli import LightningArgumentParser, LightningCLI, SaveConfigCallback +from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE from tests.helpers import BoringDataModule, BoringModel +torchvision_version = version.parse('0') +if _TORCHVISION_AVAILABLE: + torchvision_version = version.parse(__import__('torchvision').__version__) + @mock.patch('argparse.ArgumentParser.parse_args') def test_default_args(mock_argparse, tmpdir): @@ -443,3 +451,49 @@ def __init__( assert cli.model.submodule2 == cli.config_init['model']['submodule2'] assert isinstance(cli.config_init['model']['submodule1'], BoringModel) assert isinstance(cli.config_init['model']['submodule2'], BoringModel) + + +@pytest.mark.skipif(torchvision_version < version.parse('0.8.0'), reason='torchvision>=0.8.0 is required') +def test_lightning_cli_torch_modules(tmpdir): + + class TestModule(BoringModel): + + def __init__( + self, + activation: torch.nn.Module = None, + transform: 
Optional[List[torch.nn.Module]] = None, + ): + super().__init__() + self.activation = activation + self.transform = transform + + config = """model: + activation: + class_path: torch.nn.LeakyReLU + init_args: + negative_slope: 0.2 + transform: + - class_path: torchvision.transforms.Resize + init_args: + size: 64 + - class_path: torchvision.transforms.CenterCrop + init_args: + size: 64 + """ + config_path = tmpdir / 'config.yaml' + with open(config_path, 'w') as f: + f.write(config) + + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.max_epochs=1', + f'--config={str(config_path)}', + ] + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = LightningCLI(TestModule) + + assert isinstance(cli.model.activation, torch.nn.LeakyReLU) + assert cli.model.activation.negative_slope == 0.2 + assert len(cli.model.transform) == 2 + assert all(isinstance(v, torch.nn.Module) for v in cli.model.transform)
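
For reference, the rule enforced by the new `_check_training_step_output` hunk above (and exercised by `test_warning_invalid_trainstep_output`) is that, under automatic optimization, `training_step` may only return a Tensor, a mapping containing a `'loss'` key, or `None` (in which case the step is skipped). Below is a minimal sketch of that condition; the helper name `is_valid_training_step_output` is hypothetical and only illustrates the check added in this patch.

```python
from collections.abc import Mapping

import torch


def is_valid_training_step_output(output) -> bool:
    """Mirror the acceptance check added for automatic optimization."""
    return any((
        isinstance(output, torch.Tensor),                   # plain loss tensor
        isinstance(output, Mapping) and 'loss' in output,   # dict with a 'loss' key
        output is None,                                      # skip this optimization step
    ))


assert is_valid_training_step_output(torch.tensor(0.5))
assert is_valid_training_step_output({'loss': torch.tensor(0.5)})
assert is_valid_training_step_output(None)
assert not is_valid_training_step_output(5.0)  # the trainer raises MisconfigurationException for outputs like this
```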