diff --git a/xtuner/v1/model/base.py b/xtuner/v1/model/base.py index cad387a99..3b2bb99b0 100644 --- a/xtuner/v1/model/base.py +++ b/xtuner/v1/model/base.py @@ -705,14 +705,14 @@ def async_save_hf( if isinstance(hf_dir, str): hf_dir = Path(hf_dir) tmp_hf_dir = hf_dir.with_name(f"{hf_dir.name}.incomplete") + status_dir = tmp_hf_dir.parent / f".{tmp_hf_dir.name}.async-hf-writer-status" if rank == 0: if tmp_hf_dir.exists(): rmtree(tmp_hf_dir) tmp_hf_dir.mkdir(parents=True, exist_ok=True) + status_dir.mkdir(parents=True, exist_ok=True) - status_path = ( - tmp_hf_dir.parent / f"{tmp_hf_dir.name}.{self._async_hf_writer_status_filename(rank, world_size)}" - ) + status_path = status_dir / self._async_hf_writer_status_filename(rank, world_size) cleanup_done_path = tmp_hf_dir.parent / f"{tmp_hf_dir.name}.cleanup-done" if rank == 0: cleanup_done_path.unlink(missing_ok=True) @@ -874,11 +874,12 @@ def wait_async_hf(self, handle: AsyncHFSaveHandle | None = None) -> Path | None: if rank == 0: self._write_hf_index_and_config(hf_dir=tmp_hf_dir, weight_map=merged_weight_map) - if dist.is_initialized(): - dist.barrier() status_path.unlink(missing_ok=True) cleanup_done_path.unlink(missing_ok=True) + if dist.is_initialized(): + dist.barrier() if rank == 0: + rmtree(status_path.parent, ignore_errors=True) if hf_dir.exists(): rmtree(hf_dir) tmp_hf_dir.rename(hf_dir) diff --git a/xtuner/v1/model/compose/base.py b/xtuner/v1/model/compose/base.py index 68c9957ba..11d17b4ef 100644 --- a/xtuner/v1/model/compose/base.py +++ b/xtuner/v1/model/compose/base.py @@ -192,14 +192,14 @@ def async_save_hf( if isinstance(hf_dir, str): hf_dir = Path(hf_dir) tmp_hf_dir = hf_dir.with_name(f"{hf_dir.name}.incomplete") + status_dir = tmp_hf_dir.parent / f".{tmp_hf_dir.name}.async-hf-writer-status" if rank == 0: if tmp_hf_dir.exists(): rmtree(tmp_hf_dir) tmp_hf_dir.mkdir(parents=True, exist_ok=True) + status_dir.mkdir(parents=True, exist_ok=True) - status_path = ( - tmp_hf_dir.parent / f"{tmp_hf_dir.name}.{self._async_hf_writer_status_filename(rank, world_size)}" - ) + status_path = status_dir / self._async_hf_writer_status_filename(rank, world_size) cleanup_done_path = tmp_hf_dir.parent / f"{tmp_hf_dir.name}.cleanup-done" if rank == 0: cleanup_done_path.unlink(missing_ok=True)