Add EMA Docs, fix common collection documentation #5757

Merged · merged 5 commits · Jan 17, 2023
Changes from 1 commit
56 changes: 56 additions & 0 deletions docs/source/common/callbacks.rst
@@ -0,0 +1,56 @@
*********
Callbacks
*********

Exponential Moving Average (EMA)
================================

During training, EMA maintains a moving average of the trained parameters.
EMA parameters can produce significantly better results and faster convergence for a variety of domains and models.

EMA is a simple calculation: the EMA weights are initialized from the model weights at the start of training.

After every training update, the EMA weights are updated from the new model weights:

.. math::

    ema_w = ema_w \times decay + model_w \times (1 - decay)
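For illustration, the update rule can be written as a few lines of PyTorch-style Python. This is only a minimal sketch of the formula above, not the NeMo implementation, and the function and argument names are hypothetical.

.. code-block:: python

    import torch

    @torch.no_grad()
    def ema_update(ema_params, model_params, decay=0.999):
        # ema_w = ema_w * decay + model_w * (1 - decay), applied per parameter tensor
        for ema_w, model_w in zip(ema_params, model_params):
            ema_w.mul_(decay).add_(model_w, alpha=1.0 - decay)

Here ``ema_params`` is a separate, detached copy of the model parameters that is kept alongside the trained weights and updated after every optimizer step.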

Enabling EMA is straightforward. We can pass the additional argument to the experiment manager at runtime.

.. code-block:: bash

    python examples/asr/asr_ctc/speech_to_text_ctc.py \
        model.train_ds.manifest_filepath=/path/to/my/train/manifest.json \
        model.validation_ds.manifest_filepath=/path/to/my/validation/manifest.json \
        trainer.devices=2 \
        trainer.accelerator='gpu' \
        trainer.max_epochs=50 \
        exp_manager.ema.enable=True  # pass this additional argument to enable EMA

To change the decay rate, pass the additional ``exp_manager.ema.decay`` argument.

.. code-block:: bash

    python examples/asr/asr_ctc/speech_to_text_ctc.py \
        model.train_ds.manifest_filepath=/path/to/my/train/manifest.json \
        model.validation_ds.manifest_filepath=/path/to/my/validation/manifest.json \
        trainer.devices=2 \
        trainer.accelerator='gpu' \
        trainer.max_epochs=50 \
        exp_manager.ema.enable=True \
        exp_manager.ema.decay=0.999

We also offer other helpful arguments; a short sketch of how they behave follows the example.

.. code-block:: bash

    # validate_original_weights: validate the original weights instead of the EMA weights.
    # every_n_steps: apply EMA every N steps instead of every step.
    # cpu_offload: offload the EMA weights to CPU. May introduce significant slow-downs.
    python examples/asr/asr_ctc/speech_to_text_ctc.py \
        model.train_ds.manifest_filepath=/path/to/my/train/manifest.json \
        model.validation_ds.manifest_filepath=/path/to/my/validation/manifest.json \
        trainer.devices=2 \
        trainer.accelerator='gpu' \
        trainer.max_epochs=50 \
        exp_manager.ema.enable=True \
        exp_manager.ema.validate_original_weights=True \
        exp_manager.ema.every_n_steps=2 \
        exp_manager.ema.cpu_offload=True
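To make the behaviour of these options concrete, the following is a rough, self-contained sketch of an EMA helper that applies the update every N steps and optionally keeps its copy of the weights on CPU. It is illustrative only and does not mirror the actual NeMo ``EMA`` callback; the class and argument names are hypothetical.

.. code-block:: python

    import torch

    class SimpleEMA:
        """Illustrative sketch: keeps an exponential moving average of a model's parameters."""

        def __init__(self, model, decay=0.999, every_n_steps=1, cpu_offload=False):
            device = torch.device("cpu") if cpu_offload else next(model.parameters()).device
            # A detached copy of the weights; kept on CPU when cpu_offload=True to save GPU memory.
            self.ema_params = [p.detach().clone().to(device) for p in model.parameters()]
            self.decay = decay
            self.every_n_steps = every_n_steps
            self._step = 0

        @torch.no_grad()
        def update(self, model):
            self._step += 1
            if self._step % self.every_n_steps != 0:
                return  # apply EMA every N steps instead of every step
            for ema_w, model_w in zip(self.ema_params, model.parameters()):
                ema_w.mul_(self.decay).add_(model_w.detach().to(ema_w.device), alpha=1.0 - self.decay)

Offloading the EMA copy to CPU saves GPU memory at the cost of a host-device transfer on every update, which is why it may introduce significant slow-downs.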
39 changes: 6 additions & 33 deletions docs/source/common/intro.rst
@@ -3,37 +3,10 @@ Common Collection

The common collection contains components that can be used across all collections.

Tokenizers
----------
.. automodule:: nemo.collections.common.tokenizers.AutoTokenizer
:special-members: __init__
.. automodule:: nemo.collections.common.tokenizers.SentencePieceTokenizer
:special-members: __init__
.. automodule:: nemo.collections.common.tokenizers.TokenizerSpec
:special-members: __init__
.. toctree::
   :maxdepth: 8


Losses
------
.. automodule:: nemo.collections.common.losses.AggregatorLoss
:special-members: __init__

.. automodule:: nemo.collections.common.losses.CrossEntropyLoss
:special-members: __init__

.. automodule:: nemo.collections.common.losses.MSELoss
:special-members: __init__

.. automodule:: nemo.collections.common.losses.SmoothedCrossEntropyLoss
:special-members: __init__
.. automodule:: nemo.collections.common.losses.SpanningLoss
:special-members: __init__


Metrics
-------

.. autoclass:: nemo.collections.common.metrics.Perplexity
:show-inheritance:
:members:
:undoc-members:
   callbacks
   losses
   metrics
   tokenizers
16 changes: 16 additions & 0 deletions docs/source/common/losses.rst
@@ -0,0 +1,16 @@
Losses
------
.. autoclass:: nemo.collections.common.losses.AggregatorLoss
    :special-members: __init__

.. autoclass:: nemo.collections.common.losses.CrossEntropyLoss
    :special-members: __init__

.. autoclass:: nemo.collections.common.losses.MSELoss
    :special-members: __init__

.. autoclass:: nemo.collections.common.losses.SmoothedCrossEntropyLoss
    :special-members: __init__

.. autoclass:: nemo.collections.common.losses.SpanningLoss
    :special-members: __init__
7 changes: 7 additions & 0 deletions docs/source/common/metrics.rst
@@ -0,0 +1,7 @@
Metrics
-------

.. autoclass:: nemo.collections.common.metrics.Perplexity
    :show-inheritance:
    :members:
    :undoc-members:
8 changes: 8 additions & 0 deletions docs/source/common/tokenizers.rst
@@ -0,0 +1,8 @@
Tokenizers
----------
.. autoclass:: nemo.collections.common.tokenizers.AutoTokenizer
    :special-members: __init__

.. autoclass:: nemo.collections.common.tokenizers.SentencePieceTokenizer
    :special-members: __init__

.. autoclass:: nemo.collections.common.tokenizers.TokenizerSpec
    :special-members: __init__
2 changes: 1 addition & 1 deletion nemo/collections/common/callbacks/ema.py
@@ -19,9 +19,9 @@

import pytorch_lightning as pl
import torch
from lightning_utilities.core.rank_zero import rank_zero_info
from pytorch_lightning import Callback
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.rank_zero import rank_zero_info


class EMA(Callback):
1 change: 1 addition & 0 deletions requirements/requirements_docs.txt
@@ -1,3 +1,4 @@
Jinja2<3.1
@SeanNaren (Collaborator, Author) · Jan 9, 2023

When running bash update_docs_docker.sh in the docs folder, I was getting this error: readthedocs/readthedocs.org#9038

To fix this, I pinned the requirement for the docs build.

Collaborator

Wow, it's still an issue after all this time?? I could run it last year without this...

@SeanNaren (Collaborator, Author) · Jan 12, 2023

Tried again, but this requirement is still needed; since it builds in a Docker image, I'll assume it would be the same for everyone else.
latexcodec
numpy
sphinx>=3.0