From f7e76a90851ad056f92b41d1fa5dc93e514123d3 Mon Sep 17 00:00:00 2001
From: Nikolas Wolke
Date: Fri, 21 Jul 2023 13:58:56 +0200
Subject: [PATCH 1/3] Support None from training_step in LRFinder

Lightning modules may return None from training_step in order to skip the
current batch when using automatic optimization. However, the learning rate
finder failed in that case. This fix

* modifies the _LRCallback.on_train_batch_end method to support None from
  the training step by also handling an empty dict (since
  _AutomaticOptimization.run turns a None STEP_OUTPUT into an empty dict)
* makes sure that the lists of losses and lrs have the same length, as
  otherwise the selected learning rate would not be correct. If the
  lightning module returns None in the training step, NaN is appended to
  the list of losses; NaN values are ignored when computing the suggestion.
---
 src/lightning/pytorch/tuner/lr_finder.py    |  6 +++-
 src/lightning/pytorch/tuner/tuning.py       |  2 +-
 tests/tests_pytorch/tuner/test_lr_finder.py | 37 ++++++++++++++++++++-
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/src/lightning/pytorch/tuner/lr_finder.py b/src/lightning/pytorch/tuner/lr_finder.py
index ed326b4c8f384..bd703cda82b2b 100644
--- a/src/lightning/pytorch/tuner/lr_finder.py
+++ b/src/lightning/pytorch/tuner/lr_finder.py
@@ -393,7 +393,11 @@ def on_train_batch_end(
         if (trainer.fit_loop.batch_idx + 1) % trainer.accumulate_grad_batches != 0:
             return
 
-        if outputs is None:
+        # _AutomaticOptimization.run turns None STEP_OUTPUT into an empty dict
+        if outputs is None or not outputs:
+            # need to add an element, because we also added one element to lrs in on_train_batch_start
+            # so add nan, because they are not considered when computing the suggestion
+            self.losses.append(float("nan"))
             return
 
         if self.progress_bar:
diff --git a/src/lightning/pytorch/tuner/tuning.py b/src/lightning/pytorch/tuner/tuning.py
index 64065b9576faa..53b7b45210ef9 100644
--- a/src/lightning/pytorch/tuner/tuning.py
+++ b/src/lightning/pytorch/tuner/tuning.py
@@ -113,7 +113,7 @@ def lr_find(
         max_lr: float = 1,
         num_training: int = 100,
         mode: str = "exponential",
-        early_stop_threshold: float = 4.0,
+        early_stop_threshold: Optional[float] = 4.0,
         update_attr: bool = True,
         attr_name: str = "",
     ) -> Optional["pl.tuner.lr_finder._LRFinder"]:
diff --git a/tests/tests_pytorch/tuner/test_lr_finder.py b/tests/tests_pytorch/tuner/test_lr_finder.py
index bc8e529def2cb..87a7412396b91 100644
--- a/tests/tests_pytorch/tuner/test_lr_finder.py
+++ b/tests/tests_pytorch/tuner/test_lr_finder.py
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import math
 import os
 from copy import deepcopy
+from typing import Any
 from unittest import mock
 
 import pytest
@@ -26,6 +28,7 @@
 from lightning.pytorch.tuner.lr_finder import _LRFinder
 from lightning.pytorch.tuner.tuning import Tuner
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
+from lightning.pytorch.utilities.types import STEP_OUTPUT
 from tests_pytorch.helpers.datamodules import ClassifDataModule
 from tests_pytorch.helpers.runif import RunIf
 from tests_pytorch.helpers.simple_models import ClassificationModel
@@ -229,7 +232,7 @@ def __init__(self):
     lr_finder = tuner.lr_find(model, early_stop_threshold=None)
 
     assert lr_finder.suggestion() != 1e-3
-    assert len(lr_finder.results["lr"]) == 100
+    assert len(lr_finder.results["lr"]) == len(lr_finder.results["loss"]) == 100
     assert lr_finder._total_batch_idx == 199
 
 
@@ -503,3 +506,35 @@ def configure_optimizers(self):
 
     assert trainer.num_val_batches[0] == len(trainer.val_dataloaders)
     assert trainer.num_val_batches[0] != num_lr_tuner_training_steps
+
+
+def test_lr_finder_training_step_none_output(tmpdir):
+    # add some nans into the skipped steps (first 10) but also into the steps used to compute the lr
+    none_steps = [5, 12, 17]
+
+    class CustomBoringModel(BoringModel):
+        def __init__(self):
+            super().__init__()
+            self.lr = 0.123
+
+        def training_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT:
+            if self.trainer.global_step in none_steps:
+                return None
+
+            return super().training_step(batch, batch_idx)
+
+    seed_everything(1)
+    model = CustomBoringModel()
+
+    trainer = Trainer(default_root_dir=tmpdir)
+
+    tuner = Tuner(trainer)
+    # restrict number of steps for faster test execution
+    # and disable early stopping to easily check expected number of lrs and losses
+    lr_finder = tuner.lr_find(model=model, update_attr=True, num_training=20, early_stop_threshold=None)
+    assert len(lr_finder.results["lr"]) == len(lr_finder.results["loss"]) == 20
+    assert torch.isnan(torch.tensor(lr_finder.results["loss"])[none_steps]).all()
+
+    suggested_lr = lr_finder.suggestion()
+    assert math.isfinite(suggested_lr)
+    assert math.isclose(model.lr, suggested_lr)

From 4b53cc648931ee202f682553b05e17c3d147f693 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 3 Aug 2023 13:25:26 +0000
Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/lightning/pytorch/loops/progress.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lightning/pytorch/loops/progress.py b/src/lightning/pytorch/loops/progress.py
index d2e52f44d7ba7..367ae79530ed5 100644
--- a/src/lightning/pytorch/loops/progress.py
+++ b/src/lightning/pytorch/loops/progress.py
@@ -124,7 +124,7 @@ class _Progress(_BaseProgress):
     current: _ReadyCompletedTracker = field(default_factory=_ProcessedTracker)
 
     def __post_init__(self) -> None:
-        if type(self.total) is not type(self.current):  # noqa: E721
+        if type(self.total) is not type(self.current):
             raise ValueError("The `total` and `current` instances should be of the same class")
 
     def increment_ready(self) -> None:

From 8f831e53ec1585ce0231ded6a24f33149b641c84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 3 Aug 2023 15:22:27 -0400
Subject: [PATCH 3/3] Update src/lightning/pytorch/tuner/lr_finder.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
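Note: `not outputs` subsumes the previous `outputs is None or not outputs`
check, since None and an empty dict are both falsy in Python. A minimal
sketch of that equivalence:

    # Both values that can reach on_train_batch_end for a skipped step are
    # falsy: None from a direct hook call, and the empty dict produced when
    # _AutomaticOptimization.run converts a None STEP_OUTPUT.
    for outputs in (None, {}):
        assert (not outputs) == (outputs is None or not outputs)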
 src/lightning/pytorch/tuner/lr_finder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lightning/pytorch/tuner/lr_finder.py b/src/lightning/pytorch/tuner/lr_finder.py
index bd703cda82b2b..f83c390d271c3 100644
--- a/src/lightning/pytorch/tuner/lr_finder.py
+++ b/src/lightning/pytorch/tuner/lr_finder.py
@@ -394,7 +394,7 @@ def on_train_batch_end(
             return
 
         # _AutomaticOptimization.run turns None STEP_OUTPUT into an empty dict
-        if outputs is None or not outputs:
+        if not outputs:
             # need to add an element, because we also added one element to lrs in on_train_batch_start
             # so add nan, because they are not considered when computing the suggestion
             self.losses.append(float("nan"))
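Note on why appending NaN is safe: the lists of lrs and losses stay
index-aligned, and the suggestion step can drop non-finite entries from both
lists in lockstep before locating the steepest descent of the loss curve.
The sketch below illustrates that idea only; suggest_lr and its defaults are
hypothetical and not Lightning's actual implementation:

    import torch

    def suggest_lr(lrs, losses, skip_begin=10, skip_end=1):
        # Trim warm-up/tail points, then drop non-finite losses (the NaN
        # placeholders appended for skipped batches) together with their lrs.
        losses_t = torch.tensor(losses[skip_begin:-skip_end])
        lrs_t = torch.tensor(lrs[skip_begin:-skip_end])
        finite = torch.isfinite(losses_t)
        losses_t, lrs_t = losses_t[finite], lrs_t[finite]
        # Suggest the lr where the loss curve falls most steeply.
        return lrs_t[torch.gradient(losses_t)[0].argmin()].item()

    # NaNs at skipped steps do not break the suggestion:
    lrs = [1e-5 * 10 ** (i / 4) for i in range(20)]
    losses = [float("nan") if i in (5, 12, 17) else 1.0 / (i + 1) for i in range(20)]
    print(suggest_lr(lrs, losses, skip_begin=2))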