3 changes: 3 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
@@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed FSDP mixed precision semantics and added user warning ([#21361](https://github.com/Lightning-AI/pytorch-lightning/pull/21361))


+- Fixed `ModelCheckpoint.file_exists` to avoid broadcasting in DDP, reducing memory usage when checking for existing checkpoints ([#19674](https://github.com/Lightning-AI/pytorch-lightning/issues/19674))
+
+
 ---

 ## [2.5.6] - 2025-11-05
8 changes: 5 additions & 3 deletions src/lightning/pytorch/callbacks/model_checkpoint.py
@@ -997,10 +997,12 @@ def to_yaml(self, filepath: Optional[_PATH] = None) -> None:
             yaml.dump(best_k, fp)

     def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool:
-        """Checks if a file exists on rank 0 and broadcasts the result to all other ranks, preventing the internal
-        state from diverging between ranks."""
-        exists = self._fs.exists(filepath)
-        return trainer.strategy.broadcast(exists)
+        """Checks if a file exists on rank 0 and synchronizes the result to all other ranks, preventing the internal
+        state from diverging between ranks."""
+        # In distributed setups, only global rank 0 touches the filesystem
+        local_decision = self._fs.exists(filepath) if trainer.is_global_zero else False
+        # Reduce the decision across ranks with an "any"-style reduction: the file exists if any rank saw it
+        return trainer.strategy.reduce_boolean_decision(local_decision, all=False)

     def _should_remove_checkpoint(self, trainer: "pl.Trainer", previous: str, current: str) -> bool:
         """Checks if the previous checkpoint should be deleted.
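For context, `reduce_boolean_decision(decision, all=False)` performs a logical OR across ranks, so every non-zero rank contributes a cheap `False` instead of receiving a pickled object over a broadcast. A minimal sketch of the equivalent collective in raw `torch.distributed` (the helper name `any_rank_saw_file` is hypothetical; it assumes an initialized process group and, as written, a CPU-capable backend such as gloo — NCCL would need a CUDA tensor):

import torch
import torch.distributed as dist


def any_rank_saw_file(local_decision: bool) -> bool:
    """Logical-OR a per-rank boolean across all ranks ("any"-style reduction)."""
    if not (dist.is_available() and dist.is_initialized()):
        # Single-process fallback: nothing to synchronize
        return local_decision
    # Encode the bool as a tensor so it can take part in a collective
    decision = torch.tensor(int(local_decision))
    # Summing then testing > 0 implements "any": True if any rank saw the file
    dist.all_reduce(decision, op=dist.ReduceOp.SUM)
    return decision.item() > 0

Lightning's strategies route `reduce_boolean_decision` through their own collectives, but the semantics should match this sketch: a single scalar all-reduce rather than an object broadcast.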
@@ -121,3 +121,28 @@ def on_train_epoch_end(self):
     trainer.fit(model)
     if os.getenv("LOCAL_RANK") == "0":
         assert save_mock.call_count == expected
+
+
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_model_checkpoint_ddp_monitor_none(tmp_path):
+    """Ensure that ModelCheckpoint with monitor=None works correctly under DDP and exercises the file_exists path."""
+
+    model = BoringModel()
+    checkpoint = callbacks.ModelCheckpoint(dirpath=tmp_path, monitor=None, save_top_k=1)
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        callbacks=[checkpoint],
+        enable_progress_bar=False,
+        enable_model_summary=False,
+        max_epochs=1,
+        strategy="ddp",
+        accelerator="gpu",
+        devices=2,
+        limit_train_batches=2,
+        limit_val_batches=0,
+    )
+
+    trainer.fit(model)
+    if os.getenv("LOCAL_RANK") == "0":
+        assert checkpoint.best_model_path
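The test above requires two GPUs and the standalone launcher. For a quick local smoke test of the same `monitor=None` checkpointing path without GPUs, a single-process variant along these lines could be used (hypothetical, not part of the PR; it cannot exercise the cross-rank reduction itself, which needs at least two ranks):

from lightning.pytorch import Trainer, callbacks
from lightning.pytorch.demos.boring_classes import BoringModel


def test_model_checkpoint_monitor_none_single_process(tmp_path):
    """Single-process smoke test: monitor=None checkpointing still goes through file_exists."""
    model = BoringModel()
    checkpoint = callbacks.ModelCheckpoint(dirpath=tmp_path, monitor=None, save_top_k=1)

    trainer = Trainer(
        default_root_dir=tmp_path,
        callbacks=[checkpoint],
        enable_progress_bar=False,
        enable_model_summary=False,
        max_epochs=1,
        accelerator="cpu",
        devices=1,
        limit_train_batches=2,
        limit_val_batches=0,
    )

    trainer.fit(model)
    assert checkpoint.best_model_path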