diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e1517ebc2b36..e6367977d237b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.3.5] - 2021-06-08 + +### Added + +- Added warning to Training Step output ([#7779](https://github.com/PyTorchLightning/pytorch-lightning/pull/7779)) + +### Fixed + +- Fixed LearningRateMonitor + BackboneFinetuning ([#7835](https://github.com/PyTorchLightning/pytorch-lightning/pull/7835)) +- Minor improvements to `apply_to_collection` and type signature of `log_dict` ([#7851](https://github.com/PyTorchLightning/pytorch-lightning/pull/7851)) +- Fixed docker versions ([#7834](https://github.com/PyTorchLightning/pytorch-lightning/pull/7834)) +- Fixed sharded training check for fp16 precision ([#7825](https://github.com/PyTorchLightning/pytorch-lightning/pull/7825)) +- Fixed support for torch Module type hints in LightningCLI ([#7807](https://github.com/PyTorchLightning/pytorch-lightning/pull/7807)) + +### Changed + +- Move `training_output` validation to after `train_step_end` ([#7868](https://github.com/PyTorchLightning/pytorch-lightning/pull/7868)) + + ## [1.3.4] - 2021-06-01 ### Fixed @@ -12,6 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed info message when max training time reached ([#7780](https://github.com/PyTorchLightning/pytorch-lightning/pull/7780)) - Fixed missing `__len__` method to `IndexBatchSamplerWrapper` ([#7681](https://github.com/PyTorchLightning/pytorch-lightning/pull/7681)) + ## [1.3.3] - 2021-05-27 ### Changed @@ -41,8 +61,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed setting correct `DistribType` for `ddp_cpu` (spawn) backend ([#7492](https://github.com/PyTorchLightning/pytorch-lightning/pull/7492)) - Fixed incorrect number of calls to LR scheduler when `check_val_every_n_epoch > 1` ([#7032](https://github.com/PyTorchLightning/pytorch-lightning/pull/7032)) -## [1.3.1] - 2021-05-11 +## [1.3.1] - 2021-05-11 ### Fixed @@ -55,11 +75,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added -- Added support for the `EarlyStopping` callback to run at the end of the training epoch ([#6944](https://github.com/PyTorchLightning/pytorch-lightning/pull/6944/)) +- Added support for the `EarlyStopping` callback to run at the end of the training epoch ([#6944](https://github.com/PyTorchLightning/pytorch-lightning/pull/6944)) - Added synchronization points before and after `setup` hooks are run ([#7202](https://github.com/PyTorchLightning/pytorch-lightning/pull/7202)) - Added a `teardown` hook to `ClusterEnvironment` ([#6942](https://github.com/PyTorchLightning/pytorch-lightning/pull/6942)) - Added utils for metrics to scalar conversions ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180)) -- Added utils for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834/)) +- Added utils for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834)) - Added more explicit exception message when trying to execute `trainer.test()` or `trainer.validate()` with `fast_dev_run=True` ([#6667](https://github.com/PyTorchLightning/pytorch-lightning/pull/6667)) - Added `LightningCLI` class to provide simple reproducibility with minimum boilerplate training CLI ( [#4492](https://github.com/PyTorchLightning/pytorch-lightning/pull/4492), @@ -85,7 +105,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120)) - Added `configure_sharded_model` hook ([#6679](https://github.com/PyTorchLightning/pytorch-lightning/pull/6679)) - Added support for `precision=64`, enabling training with double precision ([#6595](https://github.com/PyTorchLightning/pytorch-lightning/pull/6595)) -- Added support for DDP communication hooks ([#6736](https://github.com/PyTorchLightning/pytorch-lightning/issues/6736)) +- Added support for DDP communication hooks ([#6736](https://github.com/PyTorchLightning/pytorch-lightning/pull/6736)) - Added `artifact_location` argument to `MLFlowLogger` which will be passed to the `MlflowClient.create_experiment` call ([#6677](https://github.com/PyTorchLightning/pytorch-lightning/pull/6677)) - Added `model` parameter to precision plugins' `clip_gradients` signature ( [#6764](https://github.com/PyTorchLightning/pytorch-lightning/pull/6764), @@ -104,14 +124,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `tpu_distributed` check for TPU Spawn barrier ([#7241](https://github.com/PyTorchLightning/pytorch-lightning/pull/7241)) - Added device updates to TPU Spawn for Pod training ([#7243](https://github.com/PyTorchLightning/pytorch-lightning/pull/7243)) - Added warning when missing `Callback` and using `resume_from_checkpoint` ([#7254](https://github.com/PyTorchLightning/pytorch-lightning/pull/7254)) -- DeepSpeed single file saving ([#6900](https://github.com/PyTorchLightning/pytorch-lightning/issues/6900)) +- DeepSpeed single file saving ([#6900](https://github.com/PyTorchLightning/pytorch-lightning/pull/6900)) - Added Training type Plugins Registry ( - [#6982](https://github.com/PyTorchLightning/pytorch-lightning/issues/6982), - [#7063](https://github.com/PyTorchLightning/pytorch-lightning/issues/7063), - [#7214](https://github.com/PyTorchLightning/pytorch-lightning/issues/7214), - [#7224](https://github.com/PyTorchLightning/pytorch-lightning/issues/7224) + [#6982](https://github.com/PyTorchLightning/pytorch-lightning/pull/6982), + [#7063](https://github.com/PyTorchLightning/pytorch-lightning/pull/7063), + [#7214](https://github.com/PyTorchLightning/pytorch-lightning/pull/7214), + [#7224](https://github.com/PyTorchLightning/pytorch-lightning/pull/7224) ) -- Add `ignore` param to `save_hyperparameters` ([#6056](https://github.com/PyTorchLightning/pytorch-lightning/issues/6056)) +- Add `ignore` param to `save_hyperparameters` ([#6056](https://github.com/PyTorchLightning/pytorch-lightning/pull/6056)) ### Changed @@ -129,7 +149,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - The trainer no longer tries to save a checkpoint on exception or run callback's `on_train_end` functions ([#6864](https://github.com/PyTorchLightning/pytorch-lightning/pull/6864)) - Changed `PyTorchProfiler` to use `torch.autograd.profiler.record_function` to record functions ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) - Disabled `lr_scheduler.step()` in manual optimization ([#6825](https://github.com/PyTorchLightning/pytorch-lightning/pull/6825)) -- Changed warnings and recommendations for dataloaders in `ddp_spawn` ([#6762](https://github.com/PyTorchLightning/pytorch-lightning/pull/6762/)) +- Changed warnings and recommendations for dataloaders in `ddp_spawn` ([#6762](https://github.com/PyTorchLightning/pytorch-lightning/pull/6762)) - `pl.seed_everything` will now also set the seed on the `DistributedSampler` ([#7024](https://github.com/PyTorchLightning/pytorch-lightning/pull/7024)) - Changed default setting for communication of multi-node training using `DDPShardedPlugin` ([#6937](https://github.com/PyTorchLightning/pytorch-lightning/pull/6937)) - `trainer.tune()` now returns the tuning result ([#7258](https://github.com/PyTorchLightning/pytorch-lightning/pull/7258)) @@ -164,7 +184,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated the `save_function` property from the `ModelCheckpoint` callback ([#7201](https://github.com/PyTorchLightning/pytorch-lightning/pull/7201)) - Deprecated `LightningModule.write_predictions` and `LightningModule.write_predictions_dict` ([#7066](https://github.com/PyTorchLightning/pytorch-lightning/pull/7066)) - Deprecated `TrainerLoggingMixin` in favor of a separate utilities module for metric handling ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180)) -- Deprecated `TrainerTrainingTricksMixin` in favor of a separate utilities module for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834/)) +- Deprecated `TrainerTrainingTricksMixin` in favor of a separate utilities module for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834)) - `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) - Deprecated `trainer.running_sanity_check` in favor of `trainer.sanity_checking` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945)) - Deprecated `Profiler(output_filename)` in favor of `dirpath` and `filename` ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) @@ -270,7 +290,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed auto-scaling mode when calling tune method on trainer ([#7321](https://github.com/PyTorchLightning/pytorch-lightning/pull/7321)) - Fixed finetuning complex models correctly unfreezes ([#6880](https://github.com/PyTorchLightning/pytorch-lightning/pull/6880)) - Ensure we set the eval/train flag correctly on accelerator model ([#6877](https://github.com/PyTorchLightning/pytorch-lightning/pull/6877)) -- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/)) +- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802)) - Fixed matching the number of outputs of backward with forward for AllGatherGrad ([#6625](https://github.com/PyTorchLightning/pytorch-lightning/pull/6625)) - Fixed the `gradient_clip_algorithm` has no effect ([#6928](https://github.com/PyTorchLightning/pytorch-lightning/pull/6928)) - Fixed CUDA OOM detection and handling ([#6934](https://github.com/PyTorchLightning/pytorch-lightning/pull/6934)) @@ -486,7 +506,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Changed `computer_vision_fine_tunning` example to use `BackboneLambdaFinetuningCallback` ([#5377](https://github.com/PyTorchLightning/pytorch-lightning/pull/5377)) - Changed `automatic casting` for LoggerConnector `metrics` ([#5218](https://github.com/PyTorchLightning/pytorch-lightning/pull/5218)) - Changed `iou` [func] to allow float input ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704)) -- Metric `compute()` method will no longer automatically call `reset()` ([#5409](https://github.com/PyTorchLightning/pytorch-lightning/pull/5409/)) +- Metric `compute()` method will no longer automatically call `reset()` ([#5409](https://github.com/PyTorchLightning/pytorch-lightning/pull/5409)) - Set PyTorch 1.4 as min requirements, also for testing and examples `torchvision>=0.5` and `torchtext>=0.5` ([#5418](https://github.com/PyTorchLightning/pytorch-lightning/pull/5418)) - Changed `callbacks` argument in `Trainer` to allow `Callback` input ([#5446](https://github.com/PyTorchLightning/pytorch-lightning/pull/5446)) - Changed the default of `find_unused_parameters` to `False` in DDP ([#5185](https://github.com/PyTorchLightning/pytorch-lightning/pull/5185)) @@ -1253,7 +1273,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed getting `experiment_id` from MLFlow only once instead of each training loop ([#3394](https://github.com/PyTorchLightning/pytorch-lightning/pull/3394)) - Fixed `overfit_batches` which now correctly disables shuffling for the training loader. ([#3501](https://github.com/PyTorchLightning/pytorch-lightning/pull/3501)) - Fixed gradient norm tracking for `row_log_interval > 1` ([#3489](https://github.com/PyTorchLightning/pytorch-lightning/pull/3489)) -- Fixed `ModelCheckpoint` name formatting ([3164](https://github.com/PyTorchLightning/pytorch-lightning/pull/3163)) +- Fixed `ModelCheckpoint` name formatting ([#3164](https://github.com/PyTorchLightning/pytorch-lightning/pull/3163)) - Fixed example implementation of AutoEncoder ([#3190](https://github.com/PyTorchLightning/pytorch-lightning/pull/3190)) - Fixed invalid paths when remote logging with TensorBoard ([#3236](https://github.com/PyTorchLightning/pytorch-lightning/pull/3236)) - Fixed change `t()` to `transpose()` as XLA devices do not support `.t()` on 1-dim tensor ([#3252](https://github.com/PyTorchLightning/pytorch-lightning/pull/3252)) @@ -1513,8 +1533,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added option `save_last` to save the model at the end of every epoch in `ModelCheckpoint` ([#1908](https://github.com/PyTorchLightning/pytorch-lightning/pull/1908)) - Early stopping checks `on_validation_end` ([#1458](https://github.com/PyTorchLightning/pytorch-lightning/pull/1458)) - Speed up single-core TPU training by loading data using `ParallelLoader` ([#2033](https://github.com/PyTorchLightning/pytorch-lightning/pull/2033)) -- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)) -- Added [black](https://black.readthedocs.io/en/stable/) formatter for the code with code-checker on pull ([1610](https://github.com/PyTorchLightning/pytorch-lightning/pull/1610)) +- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([#1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)) +- Added [black](https://black.readthedocs.io/en/stable/) formatter for the code with code-checker on pull ([#1610](https://github.com/PyTorchLightning/pytorch-lightning/pull/1610)) - Added back the slow spawn ddp implementation as `ddp_spawn` ([#2115](https://github.com/PyTorchLightning/pytorch-lightning/pull/2115)) - Added loading checkpoints from URLs ([#1667](https://github.com/PyTorchLightning/pytorch-lightning/pull/1667)) - Added a callback method `on_keyboard_interrupt` for handling KeyboardInterrupt events during training ([#2134](https://github.com/PyTorchLightning/pytorch-lightning/pull/2134)) diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile index fbfd2224a66a9..528e7561c1e96 100644 --- a/dockers/nvidia/Dockerfile +++ b/dockers/nvidia/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes -FROM nvcr.io/nvidia/pytorch:21.04-py3 +FROM nvcr.io/nvidia/pytorch:21.05-py3 LABEL maintainer="PyTorchLightning " @@ -39,14 +39,16 @@ RUN \ # Installations python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + pip install "Pillow>=8.2" "cryptography>=3.4" "py>=1.10" --no-cache-dir --upgrade-strategy only-if-needed && \ pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \ pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \ pip install ./pytorch-lightning --no-cache-dir && \ - pip install "Pillow>=8.1" --no-cache-dir --upgrade-strategy only-if-needed && \ rm -rf pytorch-lightning && \ + pip install jupyterlab[all] -U && \ pip list -RUN pip install lightning-grid -U +RUN pip install lightning-grid -U && \ + pip install "py>=1.10" "protobuf>=3.15.6" --upgrade-strategy only-if-needed ENV PYTHONPATH="/workspace" diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index d8a5e153425a2..9471cf85b2b4b 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = '1.3.4' +__version__ = '1.3.5' __author__ = 'William Falcon et al.' 
__author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index 7530bfaa9d21e..61b0b31155994 100644 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -19,8 +19,10 @@ Monitor and logs learning rate for lr schedulers during training. """ +from collections import defaultdict +from typing import Any, DefaultDict, Dict, List, Optional, Type -from typing import Dict, List, Optional +from torch.optim.optimizer import Optimizer from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.utilities import rank_zero_warn @@ -53,7 +55,7 @@ class LearningRateMonitor(Callback): In case of multiple optimizers of same type, they will be named ``Adam``, ``Adam-1`` etc. If a optimizer has multiple parameter groups they will be named ``Adam/pg1``, ``Adam/pg2`` etc. To control naming, pass in a - ``name`` keyword in the construction of the learning rate schdulers + ``name`` keyword in the construction of the learning rate schedulers Example:: @@ -138,6 +140,9 @@ def on_train_epoch_start(self, trainer, *args, **kwargs): def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: latest_stat = {} + names = self._find_names(trainer.lr_schedulers, add_lr_sch_names=False) + self._remap_keys(names) + for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers): if scheduler['interval'] == interval or interval == 'any': opt = scheduler['scheduler'].optimizer @@ -146,7 +151,7 @@ def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: for i, pg in enumerate(param_groups): suffix = f'/pg{i + 1}' if len(param_groups) > 1 else '' - lr = self._extract_lr(param_group=pg, name=f'{name}{suffix}') + lr = self._extract_lr(pg, f'{name}{suffix}') latest_stat.update(lr) momentum = self._extract_momentum( param_group=pg, name=f'{name}-momentum{suffix}', use_betas=use_betas @@ -155,12 +160,23 @@ def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: return latest_stat - def _extract_lr(self, param_group, name: str) -> Dict[str, float]: + def _extract_lr(self, param_group: Dict[str, Any], name: str) -> Dict[str, Any]: lr = param_group.get('lr') self.lrs[name].append(lr) return {name: lr} - def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str, float]: + def _remap_keys(self, names: List[str], token: str = '/pg1') -> None: + """ + This function is used to remap the keys if the number of param groups for a given optimizer has increased. 
+ """ + for new_name in names: + old_name = new_name.replace(token, '') + if token in new_name and old_name in self.lrs: + self.lrs[new_name] = self.lrs.pop(old_name) + elif new_name not in self.lrs: + self.lrs[new_name] = [] + + def _extract_momentum(self, param_group: Dict[str, Any], name: str, use_betas: bool) -> Dict[str, float]: if not self.log_momentum: return {} @@ -168,35 +184,46 @@ def _extract_momentum(self, param_group, name: str, use_betas: bool) -> Dict[str self.last_momentum_values[name] = momentum return {name: momentum} - def _find_names(self, lr_schedulers) -> List[str]: - # Create uniqe names in the case we have multiple of the same learning - # rate schduler + multiple parameter groups + def _add_prefix( + self, name: str, optimizer_cls: Type[Optimizer], seen_optimizer_types: DefaultDict[Type[Optimizer], int] + ) -> str: + if optimizer_cls not in seen_optimizer_types: + return name + count = seen_optimizer_types[optimizer_cls] + return name + f'-{count - 1}' if count > 1 else name + + def _find_names(self, lr_schedulers: List, add_lr_sch_names: bool = True) -> List[str]: + # Create unique names in the case we have multiple of the same learning + # rate scheduler + multiple parameter groups names = [] + seen_optimizers = [] + seen_optimizer_types = defaultdict(int) for scheduler in lr_schedulers: sch = scheduler['scheduler'] if scheduler['name'] is not None: name = scheduler['name'] else: - opt_name = 'lr-' + sch.optimizer.__class__.__name__ - i, name = 1, opt_name + name = 'lr-' + sch.optimizer.__class__.__name__ - # Multiple schduler of the same type - while True: - if name not in names: - break - i, name = i + 1, f'{opt_name}-{i}' + seen_optimizers.append(sch.optimizer) + optimizer_cls = type(sch.optimizer) + if scheduler['name'] is None: + seen_optimizer_types[optimizer_cls] += 1 - # Multiple param groups for the same schduler + # Multiple param groups for the same scheduler param_groups = sch.optimizer.param_groups + name = self._add_prefix(name, optimizer_cls, seen_optimizer_types) + if len(param_groups) != 1: - for i, pg in enumerate(param_groups): + for i in range(len(param_groups)): temp = f'{name}/pg{i + 1}' names.append(temp) else: names.append(name) - self.lr_sch_names.append(name) + if add_lr_sch_names: + self.lr_sch_names.append(name) return names diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 109b8fd8104b5..b324582fe7602 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -25,7 +25,7 @@ from argparse import Namespace from functools import partial from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union import torch from torch import ScriptModule, Tensor @@ -347,7 +347,7 @@ def log( def log_dict( self, - dictionary: dict, + dictionary: Mapping[str, Any], prog_bar: bool = False, logger: bool = True, on_step: Optional[bool] = None, diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index 02da937286dcc..fceafddd66ec0 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -54,7 +54,8 @@ def _reinit_optimizers_with_oss(self): optim_class = type(optimizer) zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) if _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE: - is_fp16 = 
self.lightning_module.trainer.precision == 16 + precision = self.lightning_module.trainer.precision + is_fp16 = precision in ("mixed", 16) # For multi-node training, compressing the model shards in fp16 before broadcasting # improves performance. When using PyTorch AMP, it will not degrade # the model performance. diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 943864138e371..d596bc8a43831 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -14,7 +14,7 @@ from contextlib import contextmanager, suppress from copy import copy, deepcopy -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Mapping, Optional, Union import numpy as np import torch @@ -265,6 +265,16 @@ def _check_training_step_output(self, training_step_output): if training_step_output.grad_fn is None: # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") + elif self.trainer.lightning_module.automatic_optimization: + if not any(( + isinstance(training_step_output, torch.Tensor), + (isinstance(training_step_output, Mapping) + and 'loss' in training_step_output), training_step_output is None + )): + raise MisconfigurationException( + "In automatic optimization, `training_step` must either return a Tensor, " + "a dict with key 'loss' or None (where the step will be skipped)." + ) def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging @@ -282,10 +292,10 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): self.trainer.logger_connector.cache_logged_metrics() - self._check_training_step_output(training_step_output) - training_step_output = self.trainer.call_hook("training_step_end", training_step_output) + self._check_training_step_output(training_step_output) + training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( training_step_output, split_batch ) diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 1cbab2fb8dee9..61739cd25d1d2 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -13,6 +13,7 @@ # limitations under the License. import operator from abc import ABC +from collections import OrderedDict from collections.abc import Mapping, Sequence from copy import copy from functools import partial @@ -85,10 +86,12 @@ def apply_to_collection( # Recursively apply to collection items if isinstance(data, Mapping): - return elem_type({ - k: apply_to_collection(v, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) - for k, v in data.items() - }) + return elem_type( + OrderedDict({ + k: apply_to_collection(v, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs) + for k, v in data.items() + }) + ) if isinstance(data, tuple) and hasattr(data, '_fields'): # named tuple return elem_type( diff --git a/requirements.txt b/requirements.txt index c3a4caaf6429d..d985004a2ca9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ torch>=1.4 future>=0.17.1 # required for builtins in setup.py tqdm>=4.41.0 PyYAML>=5.1,<=5.4.1 -fsspec[http]>=2021.4.0 +fsspec[http]>=2021.05.0, !=2021.06.0 tensorboard>=2.2.0, !=2.5.0 # 2.5.0 GPU CI error: 'Couldn't build proto file into descriptor pool!' 
torchmetrics>=0.2.0 pyDeprecate==0.3.0 diff --git a/requirements/extra.txt b/requirements/extra.txt index db2e66540eef1..cb9515beefb9a 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -7,4 +7,4 @@ torchtext>=0.5 # onnx>=1.7.0 onnxruntime>=1.3.0 hydra-core>=1.0 -jsonargparse[signatures]>=3.11.1 +jsonargparse[signatures]>=3.13.1 diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index bea6c45e95ced..808165d61b053 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytest +import torch from torch import optim import tests.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import LearningRateMonitor +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.callbacks.finetuning import BackboneFinetuning from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel from tests.helpers.datamodules import ClassifDataModule @@ -278,3 +281,102 @@ def configure_optimizers(self): ) trainer.fit(TestModel()) assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['my_logging_name'] + + +def test_multiple_optimizers_basefinetuning(tmpdir): + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.backbone = torch.nn.Sequential( + torch.nn.Linear(32, 32), + torch.nn.Linear(32, 32), + torch.nn.Linear(32, 32), + torch.nn.ReLU(True), + ) + self.layer = torch.nn.Linear(32, 2) + + def training_step(self, batch, batch_idx, optimizer_idx): + return super().training_step(batch, batch_idx) + + def forward(self, x): + return self.layer(self.backbone(x)) + + def configure_optimizers(self): + parameters = list(filter(lambda p: p.requires_grad, self.parameters())) + opt = optim.Adam(parameters, lr=0.1) + opt_2 = optim.Adam(parameters, lr=0.1) + opt_3 = optim.Adam(parameters, lr=0.1) + optimizers = [opt, opt_2, opt_3] + schedulers = [ + optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.5), + optim.lr_scheduler.StepLR(opt_2, step_size=1, gamma=0.5), + ] + return optimizers, schedulers + + class Check(Callback): + + def on_train_epoch_start(self, trainer, pl_module) -> None: + num_param_groups = sum([len(opt.param_groups) for opt in trainer.optimizers]) + assert lr_monitor.lr_sch_names == ['lr-Adam', 'lr-Adam-1'] + if trainer.current_epoch == 0: + assert num_param_groups == 3 + elif trainer.current_epoch == 1: + assert num_param_groups == 4 + assert list(lr_monitor.lrs) == ['lr-Adam-1', 'lr-Adam/pg1', 'lr-Adam/pg2'] + elif trainer.current_epoch == 2: + assert num_param_groups == 5 + assert list(lr_monitor.lrs) == ['lr-Adam/pg1', 'lr-Adam/pg2', 'lr-Adam-1/pg1', 'lr-Adam-1/pg2'] + else: + expected = ['lr-Adam/pg1', 'lr-Adam/pg2', 'lr-Adam-1/pg1', 'lr-Adam-1/pg2', 'lr-Adam-1/pg3'] + assert list(lr_monitor.lrs) == expected + + class TestFinetuning(BackboneFinetuning): + + def freeze_before_training(self, pl_module): + self.freeze(pl_module.backbone[0]) + self.freeze(pl_module.backbone[1]) + self.freeze(pl_module.layer) + + def finetune_function(self, pl_module, epoch: int, optimizer, opt_idx: int): + """Called when the epoch begins.""" + if epoch == 1 and opt_idx == 0: + self.unfreeze_and_add_param_group(pl_module.backbone[0], optimizer, lr=0.1) + if epoch == 2 and opt_idx == 1: + self.unfreeze_and_add_param_group(pl_module.layer, optimizer, lr=0.1) + + 
if epoch == 3 and opt_idx == 1: + assert len(optimizer.param_groups) == 2 + self.unfreeze_and_add_param_group(pl_module.backbone[1], optimizer, lr=0.1) + assert len(optimizer.param_groups) == 3 + + lr_monitor = LearningRateMonitor() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=5, + limit_val_batches=0, + limit_train_batches=2, + callbacks=[TestFinetuning(), lr_monitor, Check()], + progress_bar_refresh_rate=0, + weights_summary=None, + checkpoint_callback=False + ) + model = TestModel() + model.training_epoch_end = None + trainer.fit(model) + + expected = [0.1, 0.05, 0.025, 0.0125, 0.00625] + assert lr_monitor.lrs['lr-Adam/pg1'] == expected + + expected = [0.1, 0.05, 0.025, 0.0125] + assert lr_monitor.lrs['lr-Adam/pg2'] == expected + + expected = [0.1, 0.05, 0.025, 0.0125, 0.00625] + assert lr_monitor.lrs['lr-Adam-1/pg1'] == expected + + expected = [0.1, 0.05, 0.025] + assert lr_monitor.lrs['lr-Adam-1/pg2'] == expected + + expected = [0.1, 0.05] + assert lr_monitor.lrs['lr-Adam-1/pg3'] == expected diff --git a/tests/trainer/loops/test_training_loop.py b/tests/trainer/loops/test_training_loop.py index 94becf6488fc3..99eedf377f902 100644 --- a/tests/trainer/loops/test_training_loop.py +++ b/tests/trainer/loops/test_training_loop.py @@ -11,10 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re + import pytest import torch from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel @@ -222,3 +225,75 @@ def on_train_batch_start(self, batch, batch_idx, dataloader_idx): else: assert trainer.batch_idx == batch_idx_ assert trainer.global_step == batch_idx_ * max_epochs + + +def test_should_stop_mid_epoch(tmpdir): + """Test that training correctly stops mid epoch and that validation is still called at the right time""" + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.validation_called_at = None + + def training_step(self, batch, batch_idx): + if batch_idx == 4: + self.trainer.should_stop = True + return super().training_step(batch, batch_idx) + + def validation_step(self, *args): + self.validation_called_at = (self.trainer.current_epoch, self.trainer.global_step) + return super().validation_step(*args) + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=10, + limit_val_batches=1, + ) + trainer.fit(model) + + assert trainer.current_epoch == 0 + assert trainer.global_step == 5 + assert model.validation_called_at == (0, 4) + + +@pytest.mark.parametrize(['output'], [(5., ), ({'a': 5}, )]) +def test_warning_invalid_trainstep_output(tmpdir, output): + + class InvalidTrainStepModel(BoringModel): + + def training_step(self, batch, batch_idx): + return output + + model = InvalidTrainStepModel() + + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + with pytest.raises( + MisconfigurationException, + match=re.escape( + "In automatic optimization, `training_step` must either return a Tensor, " + "a dict with key 'loss' or None (where the step will be skipped)." 
+ ) + ): + trainer.fit(model) + + +def test_warning_valid_train_step_end(tmpdir): + + class ValidTrainStepEndModel(BoringModel): + + def training_step(self, batch, batch_idx): + output = self(batch) + return {'output': output, 'batch': batch} + + def training_step_end(self, outputs): + loss = self.loss(outputs['batch'], outputs['output']) + return loss + + # No error is raised + model = ValidTrainStepEndModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1) + + trainer.fit(model) diff --git a/tests/utilities/test_apply_func.py b/tests/utilities/test_apply_func.py index a7eea3a749f26..7454ce01d3bee 100644 --- a/tests/utilities/test_apply_func.py +++ b/tests/utilities/test_apply_func.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import numbers -from collections import namedtuple +from collections import namedtuple, OrderedDict import numpy as np import torch @@ -76,3 +76,19 @@ def test_recursive_application_to_collection(): assert isinstance(reduced['g'], numbers.Number), 'Reduction of a number should result in a tensor' assert reduced['g'] == expected_result['g'], 'Reduction of a number did not yield the desired result' + + # mapping support + reduced = apply_to_collection({'a': 1, 'b': 2}, int, lambda x: str(x)) + assert reduced == {'a': '1', 'b': '2'} + reduced = apply_to_collection(OrderedDict([('b', 2), ('a', 1)]), int, lambda x: str(x)) + assert reduced == OrderedDict([('b', '2'), ('a', '1')]) + + # custom mappings + class _CustomCollection(dict): + + def __init__(self, initial_dict): + super().__init__(initial_dict) + + to_reduce = _CustomCollection({'a': 1, 'b': 2, 'c': 3}) + reduced = apply_to_collection(to_reduce, int, lambda x: str(x)) + assert reduced == _CustomCollection({'a': '1', 'b': '2', 'c': '3'}) diff --git a/tests/utilities/test_cli.py b/tests/utilities/test_cli.py index 5780a83e75db8..c1eabca5d663d 100644 --- a/tests/utilities/test_cli.py +++ b/tests/utilities/test_cli.py @@ -20,18 +20,26 @@ from argparse import Namespace from contextlib import redirect_stdout from io import StringIO +from typing import List, Optional from unittest import mock import pytest +import torch import yaml +from packaging import version from pytorch_lightning import LightningDataModule, LightningModule, Trainer from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.cli import LightningArgumentParser, LightningCLI, SaveConfigCallback +from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE from tests.helpers import BoringDataModule, BoringModel +torchvision_version = version.parse('0') +if _TORCHVISION_AVAILABLE: + torchvision_version = version.parse(__import__('torchvision').__version__) + @mock.patch('argparse.ArgumentParser.parse_args') def test_default_args(mock_argparse, tmpdir): @@ -443,3 +451,49 @@ def __init__( assert cli.model.submodule2 == cli.config_init['model']['submodule2'] assert isinstance(cli.config_init['model']['submodule1'], BoringModel) assert isinstance(cli.config_init['model']['submodule2'], BoringModel) + + +@pytest.mark.skipif(torchvision_version < version.parse('0.8.0'), reason='torchvision>=0.8.0 is required') +def test_lightning_cli_torch_modules(tmpdir): + + class TestModule(BoringModel): + + def __init__( + self, + activation: torch.nn.Module = None, + transform: 
Optional[List[torch.nn.Module]] = None, + ): + super().__init__() + self.activation = activation + self.transform = transform + + config = """model: + activation: + class_path: torch.nn.LeakyReLU + init_args: + negative_slope: 0.2 + transform: + - class_path: torchvision.transforms.Resize + init_args: + size: 64 + - class_path: torchvision.transforms.CenterCrop + init_args: + size: 64 + """ + config_path = tmpdir / 'config.yaml' + with open(config_path, 'w') as f: + f.write(config) + + cli_args = [ + f'--trainer.default_root_dir={tmpdir}', + '--trainer.max_epochs=1', + f'--config={str(config_path)}', + ] + + with mock.patch('sys.argv', ['any.py'] + cli_args): + cli = LightningCLI(TestModule) + + assert isinstance(cli.model.activation, torch.nn.LeakyReLU) + assert cli.model.activation.negative_slope == 0.2 + assert len(cli.model.transform) == 2 + assert all(isinstance(v, torch.nn.Module) for v in cli.model.transform)
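
For reference, the rule enforced by the new `_check_training_step_output` hunk above (and exercised by `test_warning_invalid_trainstep_output`) is that, under automatic optimization, `training_step` may only return a Tensor, a mapping containing a `'loss'` key, or `None` (in which case the step is skipped). Below is a minimal sketch of that condition; the helper name `is_valid_training_step_output` is hypothetical and only illustrates the check added in this patch.

```python
from collections.abc import Mapping

import torch


def is_valid_training_step_output(output) -> bool:
    """Mirror the acceptance check added for automatic optimization."""
    return any((
        isinstance(output, torch.Tensor),                   # plain loss tensor
        isinstance(output, Mapping) and 'loss' in output,   # dict with a 'loss' key
        output is None,                                      # skip this optimization step
    ))


assert is_valid_training_step_output(torch.tensor(0.5))
assert is_valid_training_step_output({'loss': torch.tensor(0.5)})
assert is_valid_training_step_output(None)
assert not is_valid_training_step_output(5.0)  # the trainer raises MisconfigurationException for outputs like this
```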