diff --git a/dali/python/nvidia/dali/pipeline.py b/dali/python/nvidia/dali/pipeline.py index e18005c348..96ae08fa88 100644 --- a/dali/python/nvidia/dali/pipeline.py +++ b/dali/python/nvidia/dali/pipeline.py @@ -121,38 +121,19 @@ class Pipeline(object): If True, DALI will trace states of the operators. In that case, calling the ``checkpoint`` method returns serialized state of the pipeline. The same pipeline can be later rebuilt with the serialized state passed as the `checkpoint` parameter to resume running - from the saved iteration:: + from the saved iteration. - @pipeline_def(..., enable_checkpointing=True) - def pipeline(): - ... - - p = pipeline() - p.build() - for _ in range(iters): - output = p.run() - ... - checkpoint = p.checkpoint() - - ... - - p_restored = pipeline(checkpoint=checkpoint) - p_restored.build() + More details can be found in + `this documentation section <advanced_topics_checkpointing.html>`_. - .. warning:: - This is an experimental feature. The API may change without notice. Checkpoints - created with this DALI version may not be compatible with the future releases. - Currently, some operators do not support checkpointing. `checkpoint`: str, optional, default = None Serialized checkpoint, received from ``checkpoint`` method. When pipeline is built, its state is restored from the `checkpoint` and the pipeline resumes execution from the saved iteration. - .. warning:: - This is an experimental feature. The API may change without notice. Checkpoints - created with this DALI version may not be compatible with the future releases. - Currently, some operators do not support checkpointing. + More details can be found in + `this documentation section <advanced_topics_checkpointing.html>`_. `py_num_workers`: int, optional, default = 1 The number of Python workers that will process ``ExternalSource`` callbacks. @@ -1526,10 +1507,8 @@ def checkpoint(self, filename=None): The same pipeline can be later rebuilt with the saved checkpoint passed as a `checkpoint` parameter to resume execution from the saved iteration. - .. 
warning:: - This is an experimental feature. The API may change without notice. Checkpoints - created with this DALI version may not be compatible with the future releases. - Currently, some operators do not support checkpointing. + More details can be found in + `this documentation section <advanced_topics_checkpointing.html>`_. Parameters ---------- diff --git a/docs/advanced_topics_checkpointing.rst b/docs/advanced_topics_checkpointing.rst new file mode 100644 index 0000000000..4cb2410ffe --- /dev/null +++ b/docs/advanced_topics_checkpointing.rst @@ -0,0 +1,89 @@ +Checkpointing +============= + +.. currentmodule:: nvidia.dali + +Checkpointing is a feature in DALI which allows you to save the current state of the pipeline to a file. +Then, you can restore the pipeline from a saved checkpoint and the new pipeline will produce exactly the same outputs as the old one would. +It is particularly useful for long-running training jobs which are likely to be interrupted. + +A checkpoint of a DALI pipeline contains information about the states of all random number generators used in the pipeline and about the progress of each reader. + +Checkpointing API +----------------- + +Enabling checkpointing +~~~~~~~~~~~~~~~~~~~~~~ + +To enable checkpointing, set ``enable_checkpointing=True`` when creating a pipeline. +With this option enabled, DALI will track the state of each operator, allowing you to save it on demand. +Enabling checkpointing shouldn't have any impact on the performance. + +.. code-block:: python + + @pipeline_def(..., enable_checkpointing=True) + def pipeline(): + ... + + p = pipeline() + p.build() + + +.. note:: + Readers with ``shuffle_after_epoch=True`` might shuffle samples differently if checkpointing is enabled. + + +Saving a checkpoint +~~~~~~~~~~~~~~~~~~~ + +To save a checkpoint, you need to call the :meth:`Pipeline.checkpoint` method, which will return a serialized checkpoint as a string. +Optionally, you can pass a filename as an argument and DALI will save the checkpoint there. + +.. 
code-block:: python + + for _ in range(iters): + output = p.run() + + # Write the checkpoint to file: + checkpoint = p.checkpoint() + open('checkpoint_file.cpt', 'wb').write(checkpoint) + + # Or simply: + checkpoint = p.checkpoint('checkpoint_file.cpt') + +.. note:: + Calling the :meth:`Pipeline.checkpoint` method may introduce observable overhead. + We recommend not calling it too often. + +Restoring from checkpoint +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can later restore the pipeline state from a saved checkpoint. +To do so, pass the `checkpoint` argument to :class:`Pipeline` on construction. +Such a pipeline should then return exactly the same outputs as the original one. + +.. code-block:: python + + checkpoint = open('checkpoint_file.cpt', 'rb').read() + p_restored = pipeline(checkpoint=checkpoint) + p_restored.build() + +.. warning:: + Make sure that the pipeline that you're restoring is the same as the original one, + i.e. contains the same operators with the same arguments. + Restoring from a checkpoint created with a different pipeline will result in undefined behavior. + +External source checkpointing +----------------------------- + +The :meth:`fn.external_source` operator only partially supports checkpointing. + +Checkpointing is supported only if ``source`` is a single-argument callable accepting +a batch index, ``BatchInfo`` or ``SampleInfo``. +For such ``sources``, the queries will continue from the point saved in the checkpoint. + +Other kinds of ``source`` don't support checkpointing. +Their state won't be saved in a checkpoint and +after restoring from a checkpoint, they will start from the beginning. +If you want to use checkpointing, we recommend you rewrite your source +to be a supported callable. 
\ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 52d1f5d4b4..5de1666778 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -59,6 +59,7 @@ NVIDIA DALI Documentation advanced_topics_performance_tuning advanced_topics_sharding advanced_topics_pipe_run + advanced_topics_checkpointing advanced_topics_experimental compilation