Set MLFlowLogger status to FAILED when training raises an error #12292

Merged
merged 29 commits into master from bugfix/12291_mlflow on Sep 20, 2022
Changes from 25 commits
Commits (29)
2bf6244
bugfix: update MLFlowLogger's status to be FAILED when training raises…
Mar 10, 2022
bee913f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
dbbfef8
disable finalize method if status == failed
Mar 15, 2022
f57dd5a
Merge branch 'bugfix/12291_mlflow' of github.com:ritsuki1227/pytorch-…
Mar 15, 2022
eaf467c
Merge branch 'master' into bugfix/12291_mlflow
awaelchli Mar 20, 2022
5860efa
Merge branch 'master' into bugfix/12291_mlflow
Borda Jun 21, 2022
bd7768b
Merge branch 'master' into bugfix/12291_mlflow
carmocca Jul 23, 2022
7a73573
close finalizer on tensorboard logger & refactor mlflow logger test
Aug 1, 2022
9f5a31e
Merge branch 'master' into bugfix/12291_mlflow
Aug 1, 2022
10e133a
wip: initialize self._version
Aug 31, 2022
156e12b
Merge branch 'master' into bugfix/12291_mlflow
Aug 31, 2022
846c5bb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 31, 2022
3c39f08
revert the tensorboard logger fix
Sep 11, 2022
48e64dd
bugfix: logger.finalize is called only when the logger is mlflow logg…
Sep 11, 2022
038b87f
Merge branch 'master' into bugfix/12291_mlflow
Sep 11, 2022
40e82a5
revert unnecessary fix
Sep 11, 2022
7510410
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 11, 2022
82935b7
bugfix: add patches
Sep 11, 2022
b3d1931
Merge branch 'bugfix/12291_mlflow' of github.com:ritsuki1227/pytorch-…
Sep 11, 2022
09097ef
Merge branch 'master' into bugfix/12291_mlflow
awaelchli Sep 17, 2022
d6438ed
handle mlflow finalize on main process
awaelchli Sep 17, 2022
e379ac0
improve the testing
awaelchli Sep 17, 2022
11a0cc6
adjust other loggers
awaelchli Sep 17, 2022
d923981
handle special tensorboard hparams file saving logic
awaelchli Sep 17, 2022
e1064c2
Merge branch 'master' into bugfix/12291_mlflow
awaelchli Sep 18, 2022
a93d7a4
finalize checkpoints in wandb only on success
awaelchli Sep 19, 2022
6c6c333
Merge branch 'master' into bugfix/12291_mlflow
awaelchli Sep 19, 2022
0b59285
remove unused import
awaelchli Sep 19, 2022
a6d5fd7
add changelog
awaelchli Sep 19, 2022
4 changes: 4 additions & 0 deletions src/pytorch_lightning/loggers/comet.py
@@ -346,6 +346,10 @@ def finalize(self, status: str) -> None:
This happens automatically in the :meth:`~CometLogger.experiment` property, when
``self._experiment`` is set to ``None``, i.e. ``self.reset_experiment()``.
"""
if self._experiment is None:
# When using multiprocessing, finalize() should be a no-op on the main process, as no experiment has been
# initialized there
return
self.experiment.end()
self.reset_experiment()

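The same early-return guard appears again in the CSV, Neptune, and wandb loggers below. It matters for spawn-based multiprocessing strategies: each logger creates its experiment object lazily on first access, which only ever happens inside the worker processes, so a finalize() on the main process would otherwise create a brand-new experiment just to close it. A minimal standalone sketch of the pattern, with illustrative class names rather than Lightning's actual internals:

from typing import Optional


class FakeExperiment:
    """Stand-in for a third-party experiment handle (illustrative only)."""

    def end(self) -> None:
        print("experiment ended")


class GuardedLogger:
    def __init__(self) -> None:
        self._experiment: Optional[FakeExperiment] = None  # created lazily

    @property
    def experiment(self) -> FakeExperiment:
        # First access creates the experiment; under DDP-spawn this runs only
        # in the worker processes, never on the main process.
        if self._experiment is None:
            self._experiment = FakeExperiment()
        return self._experiment

    def finalize(self, status: str) -> None:
        if self._experiment is None:
            # Main process: nothing was initialized, and going through the
            # `experiment` property here would spawn a fresh run.
            return
        self.experiment.end()
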
4 changes: 4 additions & 0 deletions src/pytorch_lightning/loggers/csv_logs.py
@@ -208,6 +208,10 @@ def save(self) -> None:

@rank_zero_only
def finalize(self, status: str) -> None:
if self._experiment is None:
# When using multiprocessing, finalize() should be a no-op on the main process, as no experiment has been
# initialized there
return
self.save()

@property
10 changes: 7 additions & 3 deletions src/pytorch_lightning/loggers/mlflow.py
@@ -254,9 +254,13 @@ def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None)
self.experiment.log_metric(self.run_id, k, v, timestamp_ms, step)

@rank_zero_only
-    def finalize(self, status: str = "FINISHED") -> None:
-        super().finalize(status)
-        status = "FINISHED" if status == "success" else status
+    def finalize(self, status: str = "success") -> None:
+        if not self._initialized:
+            return
+        if status == "success":
+            status = "FINISHED"
+        elif status == "failed":
+            status = "FAILED"
if self.experiment.get_run(self.run_id):
self.experiment.set_terminated(self.run_id, status)

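MLflow's run-status vocabulary is uppercase (RUNNING, SCHEDULED, FINISHED, FAILED, KILLED) while Lightning passes lowercase "success"/"failed", hence the explicit mapping above. The same termination call, sketched directly against the MLflow client API (the local file-store URI and experiment name are illustrative):

from mlflow.tracking import MlflowClient

client = MlflowClient(tracking_uri="file:./mlruns")
experiment_id = client.create_experiment("status-mapping-demo")
run = client.create_run(experiment_id)

# Lightning hands the logger "failed"; MLflow expects "FAILED"
status = "failed"
mlflow_status = {"success": "FINISHED", "failed": "FAILED"}.get(status, status)

if client.get_run(run.info.run_id):  # mirrors the logger's existence check
    client.set_terminated(run.info.run_id, mlflow_status)

assert client.get_run(run.info.run_id).info.status == "FAILED"
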
4 changes: 4 additions & 0 deletions src/pytorch_lightning/loggers/neptune.py
@@ -430,6 +430,10 @@ def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[i

@rank_zero_only
def finalize(self, status: str) -> None:
if not self._run_instance:
# When using multiprocessing, finalize() should be a no-op on the main process, as no experiment has been
# initialized there
return
if status:
self.run[self._construct_path_with_prefix("status")] = status

5 changes: 4 additions & 1 deletion src/pytorch_lightning/loggers/tensorboard.py
@@ -271,7 +271,10 @@ def finalize(self, status: str) -> None:
if self._experiment is not None:
self.experiment.flush()
self.experiment.close()
-        self.save()
+
+        if status == "success":
+            # saving hparams happens independent of experiment manager
+            self.save()

@property
def name(self) -> str:
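
From the user's side, the change means a failed run still flushes and closes its event files, but hparams.yaml, which save() writes independently of the SummaryWriter, is no longer written unconditionally. A rough usage sketch (the save_dir is illustrative):

from pytorch_lightning.loggers import TensorBoardLogger

logger = TensorBoardLogger(save_dir="tb_logs")
logger.log_hyperparams({"lr": 1e-3})

logger.finalize("failed")   # flushes and closes the event files only
# logger.finalize("success") would additionally call save(), which is what
# writes hparams.yaml next to the event files
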
4 changes: 4 additions & 0 deletions src/pytorch_lightning/loggers/wandb.py
@@ -552,6 +552,10 @@ def use_artifact(self, artifact: str, artifact_type: Optional[str] = None) -> "w

@rank_zero_only
def finalize(self, status: str) -> None:
if self._experiment is None:
# When using multiprocessing, finalize() should be a no-op on the main process, as no experiment has been
# initialized there
return
# log checkpoints as artifacts
if self._checkpoint_callback:
self._scan_and_log_checkpoints(self._checkpoint_callback)
4 changes: 4 additions & 0 deletions src/pytorch_lightning/trainer/trainer.py
@@ -644,12 +644,16 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs:
if not self.interrupted:
self.state.status = TrainerStatus.INTERRUPTED
self._call_callback_hooks("on_exception", exception)
for logger in self.loggers:
logger.finalize("failed")
except BaseException as exception:
self.state.status = TrainerStatus.INTERRUPTED
if distributed_available() and self.world_size > 1:
# try syncing remaining processes, kill otherwise
self.strategy.reconciliate_processes(traceback.format_exc())
self._call_callback_hooks("on_exception", exception)
for logger in self.loggers:
logger.finalize("failed")
self._teardown()
# teardown might access the stage so we reset it after
self.state.stage = None
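
Combined with the logger changes above, an exception escaping fit() now leaves a FAILED run behind instead of one stuck in RUNNING. A reproduction sketch, not taken from the PR: it assumes mlflow is installed and uses Lightning's BoringModel test helper; touching logger.experiment up front just guarantees the run exists before the failure:

import pytest
from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel
from pytorch_lightning.loggers import MLFlowLogger


class FailingModel(BoringModel):
    def training_step(self, batch, batch_idx):
        raise RuntimeError("boom")


logger = MLFlowLogger(experiment_name="failure-demo")
_ = logger.experiment  # start the run now (normally triggered by the first logged params/metrics)

trainer = Trainer(logger=logger, max_epochs=1, enable_progress_bar=False)

with pytest.raises(RuntimeError, match="boom"):
    trainer.fit(FailingModel())

# the exception path calls logger.finalize("failed"), so the run is
# terminated with status FAILED instead of being left RUNNING
assert logger.experiment.get_run(logger.run_id).info.status == "FAILED"
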
18 changes: 18 additions & 0 deletions tests/tests_pytorch/loggers/test_mlflow.py
@@ -259,3 +259,21 @@ def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
logger._mlflow_client.create_experiment.assert_called_once_with(
name="test", artifact_location="my_artifact_location"
)


@mock.patch("pytorch_lightning.loggers.mlflow.mlflow")
@mock.patch("pytorch_lightning.loggers.mlflow.MlflowClient")
def test_mlflow_logger_finalize_when_exception(*_):
logger = MLFlowLogger("test")

# Pretend we are on the main process and failing
assert logger._mlflow_client
assert not logger._initialized
logger.finalize("failed")
logger.experiment.set_terminated.assert_not_called()

# Pretend we are in a worker process and failing
_ = logger.experiment
assert logger._initialized
logger.finalize("failed")
logger.experiment.set_terminated.assert_called_once_with(logger.run_id, "FAILED")
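
The first half of the test hinges on a detail that differs from the other loggers: MLFlowLogger builds its MlflowClient eagerly in __init__, while the run itself is only created on first access to experiment. A plain None check on the client would therefore never fire, which is why the separate _initialized flag exists. A condensed sketch of that distinction (simplified; the real property also looks up or creates the experiment and run):

class EagerClientLogger:
    def __init__(self) -> None:
        self._mlflow_client = object()  # real code: MlflowClient(...), built eagerly
        self._initialized = False       # but no run exists yet, even on the main process

    @property
    def experiment(self):
        if not self._initialized:
            # real code: create the MLflow experiment and run here
            self._initialized = True
        return self._mlflow_client

    def finalize(self, status: str) -> None:
        if not self._initialized:
            return  # a client exists, but there is no run to terminate
        print(f"set_terminated(run_id, {status!r})")
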
19 changes: 18 additions & 1 deletion tests/tests_pytorch/trainer/test_trainer.py
@@ -22,7 +22,7 @@
from pathlib import Path
from re import escape
from unittest import mock
-from unittest.mock import ANY, call, patch
+from unittest.mock import ANY, call, Mock, patch

import cloudpickle
import pytest
@@ -2195,3 +2195,20 @@ def test_trainer_save_checkpoint_no_model_attached():
assert trainer.model is None
with pytest.raises(AttributeError, match="Saving a checkpoint is only possible if a model is attached"):
trainer.save_checkpoint("checkpoint.ckpt")


def test_trainer_calls_logger_finalize_on_exception(tmpdir):
class CustomModel(BoringModel):
def on_fit_start(self):
super().on_fit_start()
raise Exception("logger-finalize")

model = CustomModel()
logger = TensorBoardLogger(save_dir=tmpdir)
logger.finalize = Mock()
trainer = Trainer(logger=logger)

with pytest.raises(Exception, match="logger-finalize"):
trainer.fit(model)

logger.finalize.assert_called_once_with("failed")
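
Note that logger.finalize = Mock() replaces the method outright, so TensorBoard's real cleanup never runs in this test. If the real behavior should still execute while the call is recorded, a wrapping mock does both; a small standalone variant, not part of the PR:

from unittest.mock import Mock

from pytorch_lightning.loggers import TensorBoardLogger

logger = TensorBoardLogger(save_dir="tb_logs")
# wraps= records the call but still delegates to the original bound method
logger.finalize = Mock(wraps=logger.finalize)

logger.finalize("failed")
logger.finalize.assert_called_once_with("failed")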