Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set MLFlowLogger status to FAILED when training raises an error #12292

Merged
merged 29 commits into from Sep 20, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
2bf6244
bugfix: update MLFlowLogger's status to be FAILED when trainig raises…
Mar 10, 2022
bee913f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2022
dbbfef8
disable finalize method if status == failed
Mar 15, 2022
f57dd5a
Merge branch 'bugfix/12291_mlflow' of github.com:ritsuki1227/pytorch-…
Mar 15, 2022
eaf467c
Merge branch 'master' into bugfix/12291_mlflow
awaelchli Mar 20, 2022
5860efa
Merge branch 'master' into bugfix/12291_mlflow
Borda Jun 21, 2022
bd7768b
Merge branch 'master' into bugfix/12291_mlflow
carmocca Jul 23, 2022
7a73573
close finalizer on tensorboard logger & refactor mlflow logger test
Aug 1, 2022
9f5a31e
Merge branch 'master' into bugfix/12291_mlflow
Aug 1, 2022
10e133a
wip: initialize self._version
Aug 31, 2022
156e12b
Merge branch 'master' into bugfix/12291_mlflow
Aug 31, 2022
846c5bb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 31, 2022
3c39f08
revert the tensorboard logger fix
Sep 11, 2022
48e64dd
bugfix: logger.finalize is called only when the logger is mlflow logg…
Sep 11, 2022
038b87f
Merge branch 'master' into bugfix/12291_mlflow
Sep 11, 2022
40e82a5
revert unnecessary fix
Sep 11, 2022
7510410
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 11, 2022
82935b7
bugfix: add patches
Sep 11, 2022
b3d1931
Merge branch 'bugfix/12291_mlflow' of github.com:ritsuki1227/pytorch-…
Sep 11, 2022
09097ef
Merge branch 'master' into bugfix/12291_mlflow
awaelchli Sep 17, 2022
d6438ed
handle mlflow finalize on main process
awaelchli Sep 17, 2022
e379ac0
improve the testing
awaelchli Sep 17, 2022
11a0cc6
adjust other loggers
awaelchli Sep 17, 2022
d923981
handle special tensorboard hparams file saving logic
awaelchli Sep 17, 2022
e1064c2
Merge branch 'master' into bugfix/12291_mlflow
awaelchli Sep 18, 2022
a93d7a4
finalize checkpoints in wandb only on success
awaelchli Sep 19, 2022
6c6c333
Merge branch 'master' into bugfix/12291_mlflow
awaelchli Sep 19, 2022
0b59285
remove unused import
awaelchli Sep 19, 2022
a6d5fd7
add changelog
awaelchli Sep 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/pytorch_lightning/loggers/mlflow.py
Expand Up @@ -245,7 +245,10 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) ->
@rank_zero_only
def finalize(self, status: str = "FINISHED") -> None:
super().finalize(status)
status = "FINISHED" if status == "success" else status
if status == "success":
status = "FINISHED"
elif status == "failed":
status = "FAILED"
if self.experiment.get_run(self.run_id):
self.experiment.set_terminated(self.run_id, status)

Expand Down
7 changes: 4 additions & 3 deletions src/pytorch_lightning/loggers/tensorboard.py
Expand Up @@ -267,9 +267,10 @@ def save(self) -> None:

@rank_zero_only
def finalize(self, status: str) -> None:
self.experiment.flush()
self.experiment.close()
self.save()
if status != "failed":
carmocca marked this conversation as resolved.
Show resolved Hide resolved
self.experiment.flush()
self.experiment.close()
self.save()

@property
def name(self) -> str:
Expand Down
4 changes: 4 additions & 0 deletions src/pytorch_lightning/trainer/trainer.py
Expand Up @@ -651,12 +651,16 @@ def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs:
self.state.status = TrainerStatus.INTERRUPTED
self._call_callback_hooks("on_keyboard_interrupt")
self._call_callback_hooks("on_exception", exception)
for logger in self.loggers:
logger.finalize("failed")
awaelchli marked this conversation as resolved.
Show resolved Hide resolved
except BaseException as exception:
self.state.status = TrainerStatus.INTERRUPTED
if distributed_available() and self.world_size > 1:
# try syncing remaining processes, kill otherwise
self.strategy.reconciliate_processes(traceback.format_exc())
self._call_callback_hooks("on_exception", exception)
for logger in self.loggers:
logger.finalize("failed")
awaelchli marked this conversation as resolved.
Show resolved Hide resolved
self._teardown()
# teardown might access the stage so we reset it after
self.state.stage = None
Expand Down
20 changes: 20 additions & 0 deletions tests/tests_pytorch/loggers/test_mlflow.py
Expand Up @@ -259,3 +259,23 @@ def test_mlflow_logger_experiment_calls(client, mlflow, time, tmpdir):
logger._mlflow_client.create_experiment.assert_called_once_with(
name="test", artifact_location="my_artifact_location"
)


@mock.patch("pytorch_lightning.loggers.mlflow.mlflow")
@mock.patch("pytorch_lightning.loggers.mlflow.MlflowClient")
def test_mlflow_logger_run_status_failed(client, mlflow):
class CustomModel(BoringModel):
def training_step(self, batch, batch_idx):
ritsuki1227 marked this conversation as resolved.
Show resolved Hide resolved
super().training_step(batch, batch_idx)
raise BaseException

model = CustomModel()
logger = MLFlowLogger("test")
run = MagicMock()
run.info.run_id = "run_id"
logger._mlflow_client.create_run = MagicMock(return_value=run)
trainer = Trainer(logger=logger)

with pytest.raises(BaseException):
trainer.fit(model)
client.return_value.set_terminated.assert_called_once_with(logger.run_id, "FAILED")