Add support for reloading the last checkpoint saved by passing ckpt_path="last" #12816

Merged: 28 commits from feature/last_checkpoint into master on May 5, 2022
Changes shown are from 18 of the 28 commits.

Commits
2dc6d11 - last checkpoint WIP (Apr 19, 2022)
69c64b5 - Merge branch 'master' into feature/last_checkpoint (Apr 20, 2022)
ac24724 - check timestamps of all available callbacks (Apr 20, 2022)
6846fb6 - fix unbound error (Apr 20, 2022)
e3ac8e5 - Merge branch 'master' into feature/last_checkpoint (Apr 20, 2022)
d40ea7c - change the logic and pass old tests (Apr 20, 2022)
b4eade3 - simple test to check last model is loaded (Apr 21, 2022)
3c9475c - Merge branch 'master' into feature/last_checkpoint (Apr 21, 2022)
dc59d60 - Merge branch 'master' into feature/last_checkpoint (Apr 21, 2022)
ec62fcc - changelog + docs (Apr 21, 2022)
0923aa2 - merge (Apr 21, 2022)
37cce26 - add last checkpoint as a parameter to an existing test (Apr 21, 2022)
5e66d4b - Remove unused getattr (carmocca, Apr 21, 2022)
e4ab941 - new test (Apr 22, 2022)
bb10f0a - Merge branch 'master' into feature/last_checkpoint (Apr 22, 2022)
46c6c2d - Merge branch 'master' into feature/last_checkpoint (Apr 25, 2022)
f62982f - Apply Adrian's suggestions and split a test (Apr 25, 2022)
f111aba - merge (Apr 27, 2022)
645cae2 - Minor change (carmocca, Apr 28, 2022)
f78ef81 - Delay ft checkpoint exists check (carmocca, Apr 28, 2022)
40b78bc - Simplify and speed up test (carmocca, Apr 28, 2022)
7be3b9e - Simplify test (carmocca, Apr 28, 2022)
70b38ab - Update CHANGELOG.md (otaj, Apr 28, 2022)
901e990 - Merge branch 'master' into feature/last_checkpoint (Apr 28, 2022)
018058d - fix fault tolerant test (Apr 28, 2022)
d4f5693 - Merge branch 'master' into feature/last_checkpoint (May 2, 2022)
05df471 - Merge branch 'master' into feature/last_checkpoint (carmocca, May 4, 2022)
43b505e - Merge branch 'master' into feature/last_checkpoint (carmocca, May 4, 2022)
CHANGELOG.md (2 additions, 0 deletions)

@@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added

- Allow to load last checkpoint using `ckpt_path="last"` ([#12816](https://github.com/PyTorchLightning/pytorch-lightning/pull/12816))


- Added a friendly error message when attempting to call `Trainer.save_checkpoint()` without a model attached ([#12772](https://github.com/PyTorchLightning/pytorch-lightning/pull/12772))

docs/source/common/evaluation_intermediate.rst (5 additions, 2 deletions)

@@ -44,10 +44,13 @@ To run the test set after training completes, use this method.
# (1) load the best checkpoint automatically (lightning tracks this for you)
trainer.test(ckpt_path="best")

# (2) test using a specific checkpoint
# (2) load the last available checkpoint
trainer.test(ckpt_path="last")

# (3) test using a specific checkpoint
trainer.test(ckpt_path="/path/to/my_checkpoint.ckpt")

# (3) test with an explicit model (will use this model and not load a checkpoint)
# (4) test with an explicit model (will use this model and not load a checkpoint)
trainer.test(model)

.. warning::
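To make the documented option concrete, here is a minimal, self-contained sketch of the workflow this PR enables: resuming `fit` from the most recent checkpoint. `TinyModel`, the random dataset, and all hyperparameters are illustrative assumptions rather than part of this diff, and it assumes a Lightning version that includes this change.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint


class TinyModel(pl.LightningModule):
    """A stand-in model; any LightningModule works the same way."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


train_loader = DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=8)

# save_last=True makes ModelCheckpoint also maintain a `last.ckpt`,
# which is what `ckpt_path="last"` resolves to.
mc = ModelCheckpoint(save_last=True)
pl.Trainer(max_epochs=1, callbacks=[mc]).fit(TinyModel(), train_loader)

# Reusing the same callback instance keeps its `last_model_path` populated,
# so the new trainer can resolve "last" without a hard-coded path.
pl.Trainer(max_epochs=2, callbacks=[mc]).fit(TinyModel(), train_loader, ckpt_path="last")
```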
pytorch_lightning/trainer/trainer.py (55 additions, 13 deletions)

@@ -15,13 +15,15 @@
import inspect
import logging
import math
import operator
import os
import traceback
import warnings
from argparse import ArgumentParser, Namespace
from contextlib import contextmanager
from copy import deepcopy
from datetime import timedelta
from functools import partial
from pathlib import Path
from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Type, Union
from weakref import proxy
@@ -1386,28 +1388,53 @@ def __set_ckpt_path(self, ckpt_path: Optional[str], model_provided: bool, model_
from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint

ft_checkpoints = [cb for cb in self.callbacks if isinstance(cb, _FaultToleranceCheckpoint)]
ft_ckpt_path = None
if ft_checkpoints:
ft_ckpt_path = ft_checkpoints[0].ckpt_path
fs = get_filesystem(ft_ckpt_path)
if fs.exists(ft_ckpt_path):
return ft_ckpt_path

if model_provided and ckpt_path is None:
# use passed model to function without loading weights
return
tmp_ft_ckpt_path = ft_checkpoints[0].ckpt_path
fs = get_filesystem(tmp_ft_ckpt_path)
if fs.exists(tmp_ft_ckpt_path):
ft_ckpt_path = tmp_ft_ckpt_path

fn = self.state.fn.value

if model_connected and ckpt_path is None:
if ckpt_path is None and ft_ckpt_path is not None and self.state.fn == TrainerFn.FITTING:
ckpt_path = "last"
rank_zero_warn(
f"`.{fn}(ckpt_path=None)` was called without a model."
" The best model of the previous `fit` call will be used."
f" You can pass `{fn}(ckpt_path='best')` to use and best model"
" checkpoint and avoid this warning or"
" `ckpt_path=trainer.checkpoint_callback.last_model_path` to use the last model."
" Because fault tolerance is enabled, the last model of the previous `fit` call will be used."
f" You can pass `{fn}(ckpt_path='best')` to use the best model or"
f" `{fn}(ckpt_path='last')` to use the last model."
" If you pass a value, this warning will be silenced."
)

if model_provided and ckpt_path is None:
# use passed model to function without loading weights
return

if model_connected and ckpt_path is None:
if ft_ckpt_path:
full_msg = (
f"`.{fn}(ckpt_path=None)` was called without a model."
" The best model of the previous `fit` call will be used."
" There is also a fault-tolerant checkpoint available,"
" however it is default only when fitting."
f" You can pass `{fn}(ckpt_path='best')` to use the best model or"
f" `{fn}(ckpt_path='last')` to use the last model."
" If you pass a value, this warning will be silenced."
)
else:
full_msg = (
f"`.{fn}(ckpt_path=None)` was called without a model."
" The best model of the previous `fit` call will be used."
f" You can pass `{fn}(ckpt_path='best')` to use the best model or"
f" `{fn}(ckpt_path='last')` to use the last model."
" If you pass a value, this warning will be silenced."
)

ckpt_path = "best"

rank_zero_warn(full_msg)

if ckpt_path == "best":
if len(self.checkpoint_callbacks) > 1:
rank_zero_warn(
@@ -1432,6 +1459,21 @@ def __set_ckpt_path(self, ckpt_path: Optional[str], model_provided: bool, model_
# load best weights
ckpt_path = self.checkpoint_callback.best_model_path

if ckpt_path == "last":
candidates = [ft.ckpt_path for ft in ft_checkpoints] + [
cb.last_model_path for cb in self.checkpoint_callbacks
]
candidates_fs = {path: get_filesystem(path) for path in candidates if path}
candidates_ts = {path: fs.modified(path) for path, fs in candidates_fs.items() if fs.exists(path)}
if not candidates_ts:
rank_zero_warn(
f'.{fn}(ckpt_path="last") is set, but there is no fault tolerant'
" or last checkpoint available. No checkpoint will be loaded."
)
return

ckpt_path = max(candidates_ts.keys(), key=partial(operator.getitem, candidates_ts))

if not ckpt_path:
raise MisconfigurationException(
f"`.{fn}()` found no path for the best weights: {ckpt_path!r}. Please"
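Summarizing the new `ckpt_path == "last"` branch above: collect candidate paths (any fault-tolerant checkpoint plus each checkpoint callback's `last_model_path`), drop the ones that do not exist, and pick the newest by filesystem modification time. Below is a simplified standalone sketch of that selection, using `os.path` in place of the fsspec-backed `get_filesystem` abstraction the real code goes through:

```python
import operator
import os
from functools import partial
from typing import List, Optional


def resolve_last(candidates: List[str]) -> Optional[str]:
    """Pick the most recently modified existing checkpoint, or None."""
    # Skip empty strings (callbacks that never saved) and missing files.
    timestamps = {p: os.path.getmtime(p) for p in candidates if p and os.path.exists(p)}
    if not timestamps:
        # The real code emits a rank-zero warning and loads nothing here.
        return None
    # Same selection as `max(candidates_ts.keys(), key=partial(operator.getitem, candidates_ts))`.
    return max(timestamps.keys(), key=partial(operator.getitem, timestamps))
```

A call like `resolve_last([ft_cb.ckpt_path] + [cb.last_model_path for cb in checkpoint_callbacks])` (names illustrative) mirrors how the diff assembles its candidate list.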
tests/trainer/test_trainer.py (136 additions, 10 deletions)

@@ -18,6 +18,7 @@
import pickle
import sys
from argparse import Namespace
from contextlib import nullcontext
from copy import deepcopy
from pathlib import Path
from unittest import mock
@@ -664,6 +665,131 @@ def test_benchmark_option(benchmark_, deterministic, expected):
torch.backends.cudnn.benchmark = original_val


@pytest.mark.parametrize("ckpt_path", (None, "last"))
@pytest.mark.parametrize("fn", ("fit", "validate"))
@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
def test_checkpoint_path_input_last_fault_tolerant(tmpdir, ckpt_path, fn):
should_signal = True

class ExitGracefullyException(Exception):
pass

class TestModel(BoringModel):
def validation_step(self, batch, batch_idx):
self.log("foo", -batch_idx)
if should_signal and batch_idx == 1:
raise ExitGracefullyException
return super().validation_step(batch, batch_idx)

def training_step(self, batch, batch_idx):
if should_signal and batch_idx == 1:
raise ExitGracefullyException
return super().training_step(batch, batch_idx)

model = TestModel()
model.test_epoch_end = None
mc = ModelCheckpoint(monitor="foo")
trainer = Trainer(
max_epochs=2,
limit_val_batches=3,
enable_progress_bar=False,
default_root_dir=tmpdir,
callbacks=[mc],
)
assert trainer.ckpt_path is None
trainer_fn = getattr(trainer, fn)

from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint

ft_checkpoints = [cb for cb in trainer.callbacks if isinstance(cb, _FaultToleranceCheckpoint)]
ft_ckpt_path = ft_checkpoints[0].ckpt_path

if fn == "validate":
should_signal = False
trainer.fit(model)
should_signal = True

with pytest.raises(ExitGracefullyException):
trainer_fn(model)

should_signal = False

if ckpt_path == "last":
ctxt = nullcontext()
final_path = ft_ckpt_path

elif fn == "fit": # and ckpt_path == best
ctxt = pytest.warns(UserWarning, match="Because fault tolerance is enabled")
final_path = ft_ckpt_path
else: # ckpt_path == best and fn == validate
ctxt = pytest.warns(UserWarning, match="There is also a fault-tolerant checkpoint available")
final_path = mc.best_model_path

with ctxt:
if fn == "fit":
trainer_fn(model, ckpt_path=ckpt_path)
else:
trainer_fn(ckpt_path=ckpt_path)
assert trainer.ckpt_path == final_path


@pytest.mark.parametrize("ckpt_path", (None, "last"))
@pytest.mark.parametrize("save_last", (True, False))
@pytest.mark.parametrize("fn", ("fit", "validate"))
def test_checkpoint_path_input_last(tmpdir, ckpt_path, save_last, fn):
class TestModel(BoringModel):
def validation_step(self, batch, batch_idx):
self.log("foo", -batch_idx)
return super().validation_step(batch, batch_idx)

def training_step(self, batch, batch_idx):
return super().training_step(batch, batch_idx)

model = TestModel()
model.test_epoch_end = None
mc = ModelCheckpoint(monitor="foo", save_last=save_last)
trainer = Trainer(
max_epochs=2,
limit_val_batches=3,
enable_progress_bar=False,
default_root_dir=tmpdir,
callbacks=[mc],
)
assert trainer.ckpt_path is None
trainer_fn = getattr(trainer, fn)

if fn == "fit":
if ckpt_path is None:
ctxt = nullcontext()
else:
ctxt = pytest.warns(UserWarning, match="No checkpoint will be loaded")

with ctxt:
trainer_fn(model, ckpt_path=ckpt_path)

assert trainer.ckpt_path is None
else:
trainer.fit(model)
if ckpt_path is None:
ctxt = pytest.warns(
UserWarning,
match=r"(?!.*however it is default only when fitting)^"
r".*The best model of the previous `fit` call will be used",
)
final_path = mc.best_model_path
else:
if save_last:
ctxt = nullcontext()
final_path = mc.last_model_path
else:
ctxt = pytest.warns(UserWarning, match="No checkpoint will be loaded")
final_path = None

with ctxt:
trainer_fn(ckpt_path=ckpt_path)
assert trainer.ckpt_path == final_path


@pytest.mark.parametrize("ckpt_path", (None, "best", "specific"))
@pytest.mark.parametrize("save_top_k", (-1, 0, 1, 2))
@pytest.mark.parametrize("fn", ("validate", "test", "predict"))
@@ -693,7 +819,7 @@ def predict_step(self, batch, *_):
trainer.fit(model)

trainer_fn = getattr(trainer, fn)
assert getattr(trainer, "ckpt_path") is None
assert trainer.ckpt_path is None

if ckpt_path == "best":
# ckpt_path is 'best', meaning we load the best weights
@@ -704,20 +830,20 @@ def predict_step(self, batch, *_):
trainer_fn(model, ckpt_path=ckpt_path)
else:
trainer_fn(ckpt_path=ckpt_path)
assert getattr(trainer, "ckpt_path") == trainer.checkpoint_callback.best_model_path
assert trainer.ckpt_path == trainer.checkpoint_callback.best_model_path

trainer_fn(model, ckpt_path=ckpt_path)
assert getattr(trainer, "ckpt_path") == trainer.checkpoint_callback.best_model_path
assert trainer.ckpt_path == trainer.checkpoint_callback.best_model_path
elif ckpt_path is None:
# ckpt_path is None, meaning we don't load any checkpoints and use the provided model
trainer_fn(model, ckpt_path=ckpt_path)
assert getattr(trainer, "ckpt_path") is None
assert trainer.ckpt_path is None

if save_top_k > 0:
# ckpt_path is None with no model provided means load the best weights
with pytest.warns(UserWarning, match="The best model of the previous `fit` call will be used"):
trainer_fn(ckpt_path=ckpt_path)
assert getattr(trainer, "ckpt_path") == trainer.checkpoint_callback.best_model_path
assert trainer.ckpt_path == trainer.checkpoint_callback.best_model_path
else:
# specific checkpoint, pick one from saved ones
if save_top_k == 0:
@@ -730,10 +856,10 @@
].absolute()
)
trainer_fn(ckpt_path=ckpt_path)
assert getattr(trainer, "ckpt_path") == ckpt_path
assert trainer.ckpt_path == ckpt_path

trainer_fn(model, ckpt_path=ckpt_path)
assert getattr(trainer, "ckpt_path") == ckpt_path
assert trainer.ckpt_path == ckpt_path


@pytest.mark.parametrize("enable_checkpointing", (False, True))
@@ -764,14 +890,14 @@ def predict_step(self, batch, *_):
trainer.fit(model)

trainer_fn = getattr(trainer, fn)
assert getattr(trainer, "ckpt_path") is None
assert trainer.ckpt_path is None

if enable_checkpointing:
trainer_fn(ckpt_path="best")
assert getattr(trainer, "ckpt_path") == trainer.checkpoint_callback.best_model_path
assert trainer.ckpt_path == trainer.checkpoint_callback.best_model_path

trainer_fn(model, ckpt_path="best")
assert getattr(trainer, "ckpt_path") == trainer.checkpoint_callback.best_model_path
assert trainer.ckpt_path == trainer.checkpoint_callback.best_model_path
else:
with pytest.raises(MisconfigurationException, match="`ModelCheckpoint` is not configured."):
trainer_fn(ckpt_path="best")
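One pattern in the tests above is worth calling out: the parametrization chooses either `nullcontext()` (no warning expected) or `pytest.warns(...)` (a specific warning expected), and the call under test then runs inside whichever context was picked. A minimal standalone sketch of that pattern, with a made-up test name and warning text for illustration:

```python
import warnings
from contextlib import nullcontext

import pytest


@pytest.mark.parametrize("should_warn", (False, True))
def test_warning_expectation_pattern(should_warn):
    # Choose the expectation up front, then run the code once under it.
    ctxt = pytest.warns(UserWarning, match="boom") if should_warn else nullcontext()
    with ctxt:
        if should_warn:
            warnings.warn("boom", UserWarning)
```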