
Commit

Merge branch 'main' into audio_norm
ekmb committed May 28, 2021
2 parents 1b4b993 + bee43e8 commit b5663d6
Showing 104 changed files with 2,604 additions and 1,977 deletions.
6 changes: 0 additions & 6 deletions README.rst
@@ -32,12 +32,6 @@
Introduction
------------

NeMo is a toolkit for creating `Conversational AI <https://developer.nvidia.com/conversational-ai#started>`_ applications.

`NeMo product page. <https://developer.nvidia.com/nvidia-nemo>`_

`Introductory video. <https://www.youtube.com/embed/wBgpMf_KQVw>`_

The toolkit comes with extendable collections of pre-built modules and ready-to-use models for:

* `Automatic Speech Recognition (ASR) <https://ngc.nvidia.com/catalog/collections/nvidia:nemo_asr>`_
34 changes: 17 additions & 17 deletions docs/source/asr/api.rst
@@ -62,19 +62,19 @@ Modules
Parts
-----

.. autoclass:: nemo.collections.asr.parts.jasper.JasperBlock
.. autoclass:: nemo.collections.asr.parts.submodules.jasper.JasperBlock
:show-inheritance:
:members:


Mixins
------

.. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin
.. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRBPEMixin
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin
.. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin
:show-inheritance:
:members:

@@ -129,39 +129,39 @@ Audio Augmentors
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.SpeedPerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.SpeedPerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.TimeStretchPerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.TimeStretchPerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.GainPerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.GainPerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.ImpulsePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.ImpulsePerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.ShiftPerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.ShiftPerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.NoisePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.NoisePerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.WhiteNoisePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.WhiteNoisePerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.RirAndNoisePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.RirAndNoisePerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.TranscodePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.TranscodePerturbation
:show-inheritance:
:members:

@@ -179,25 +179,25 @@ RNNT Decoding
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.rnnt_greedy_decoding.GreedyRNNTInfer
.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyRNNTInfer
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.rnnt_greedy_decoding.GreedyBatchedRNNTInfer
.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyBatchedRNNTInfer
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.rnnt_beam_decoding.BeamRNNTInfer
.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_beam_decoding.BeamRNNTInfer
:show-inheritance:
:members:

Hypotheses
~~~~~~~~~~

.. autoclass:: nemo.collections.asr.parts.rnnt_utils.Hypothesis
.. autoclass:: nemo.collections.asr.parts.utils.rnnt_utils.Hypothesis
:show-inheritance:
:no-members:

.. autoclass:: nemo.collections.asr.parts.rnnt_utils.NBestHypotheses
.. autoclass:: nemo.collections.asr.parts.utils.rnnt_utils.NBestHypotheses
:show-inheritance:
:no-members:
57 changes: 55 additions & 2 deletions docs/source/asr/configs.rst
@@ -342,7 +342,7 @@ configuration is a shortform notation for Citrinet-21x5xC, such that ``B = 21``
not be changed.

To use Citrinet instead of QuartzNet, refer to the ``citrinet_512.yaml`` configuration found inside the ``examples/asr/conf/citrinet``
directory. Citrinet is primarily comprised of the same :class:`~nemo.collections.asr.parts.jasper.JasperBlock` as ``Jasper`` or
directory. Citrinet is primarily comprised of the same :class:`~nemo.collections.asr.parts.submodules.jasper.JasperBlock` as ``Jasper`` or
``QuartzNet``.
While the configs for Citrinet and QuartzNet are similar, we note the additional flags used for Citrinet below. Refer to the
@@ -442,7 +442,7 @@ changed slightly as Citrinet utilizes sub-word tokenization.
.. note::
The following information is relevant to any of the above models that implements its encoder as an :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder`, and utilizes the ``SqueezeExcite`` mechanism.

The ``SqueezeExcite`` block within a :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder` network can be modified to utilize a different context window after the model has been instantiated (even after the model has been trained) so as to evaluate the model with limited context. This can be achieved using the :meth:`~nemo.collections.asr.parts.mixins.ASRModuleMixin.change_conv_asr_se_context_window`
The ``SqueezeExcite`` block within a :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder` network can be modified to utilize a different context window after the model has been instantiated (even after the model has been trained) so as to evaluate the model with limited context. This can be achieved using the :meth:`~nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin.change_conv_asr_se_context_window`
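
For instance, a minimal sketch of calling this method (the pretrained-model name is a placeholder, and the ``context_window`` keyword argument is an assumption to be checked against the API docs):

.. code-block:: python

    from nemo.collections.asr.models import EncDecCTCModel

    # Restore a ConvASREncoder-based model; the model name is a placeholder.
    asr_model = EncDecCTCModel.from_pretrained(model_name="<pretrained model name>")

    # Limit SqueezeExcite to a 256-timestep context (~2.56 s at a 10 ms hop)
    # before evaluating the model with restricted context.
    asr_model.change_conv_asr_se_context_window(context_window=256)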

.. code-block:: python
@@ -473,3 +473,56 @@ specify the tokenizer if you want to use sub-word encoding instead of character-

The encoder section includes the details about the Conformer-CTC encoder architecture. You may find more information in the
config files and also :doc:`nemo.collections.asr.modules.ConformerEncoder<./api.html#nemo.collections.asr.modules.ConformerEncoder>`.


Fine-tuning Configurations
--------------------------

All ASR scripts support easy fine-tuning by partially or fully loading pretrained weights from a checkpoint into the currently instantiated model. Pretrained weights can be provided in several ways:

1) Providing a path to a NeMo model (via ``init_from_nemo_model``)
2) Providing the name of a pretrained NeMo model, which will be downloaded from the cloud (via ``init_from_pretrained_model``)
3) Providing a path to a PyTorch Lightning checkpoint file (via ``init_from_ptl_ckpt``)

Fine-tuning via a NeMo model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: sh

    python examples/asr/script_to_<script_name>.py \
        --config-path=<path to dir of configs> \
        --config-name=<name of config without .yaml> \
        model.train_ds.manifest_filepath="<path to manifest file>" \
        model.validation_ds.manifest_filepath="<path to manifest file>" \
        trainer.gpus=-1 \
        trainer.max_epochs=50 \
        +init_from_nemo_model="<path to .nemo model file>"

Fine-tuning via a NeMo pretrained model name
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: sh

    python examples/asr/script_to_<script_name>.py \
        --config-path=<path to dir of configs> \
        --config-name=<name of config without .yaml> \
        model.train_ds.manifest_filepath="<path to manifest file>" \
        model.validation_ds.manifest_filepath="<path to manifest file>" \
        trainer.gpus=-1 \
        trainer.max_epochs=50 \
        +init_from_pretrained_model="<name of pretrained checkpoint>"

Fine-tuning via a PyTorch Lightning checkpoint
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: sh

    python examples/asr/script_to_<script_name>.py \
        --config-path=<path to dir of configs> \
        --config-name=<name of config without .yaml> \
        model.train_ds.manifest_filepath="<path to manifest file>" \
        model.validation_ds.manifest_filepath="<path to manifest file>" \
        trainer.gpus=-1 \
        trainer.max_epochs=50 \
        +init_from_ptl_ckpt="<path to PyTorch Lightning checkpoint>"
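
Inside the example training scripts, these keys are consumed right after the model is constructed via ``maybe_init_from_pretrained_checkpoint`` (see the updated ``examples/asr/speech_to_text.py`` later in this commit). A condensed sketch of that flow, with config paths left as placeholders:

.. code-block:: python

    import pytorch_lightning as pl

    from nemo.collections.asr.models import EncDecCTCModel
    from nemo.core.config import hydra_runner
    from nemo.utils.exp_manager import exp_manager


    @hydra_runner(config_path="<path to dir of configs>", config_name="<config name>")
    def main(cfg):
        trainer = pl.Trainer(**cfg.trainer)
        exp_manager(trainer, cfg.get("exp_manager", None))
        asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)

        # Initialize the weights from init_from_nemo_model, init_from_pretrained_model,
        # or init_from_ptl_ckpt, if one of them is provided via the config.
        asr_model.maybe_init_from_pretrained_checkpoint(cfg)

        trainer.fit(asr_model)


    if __name__ == "__main__":
        main()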
2 changes: 1 addition & 1 deletion docs/source/asr/speaker_diarization/api.rst
@@ -12,6 +12,6 @@ Model Classes
Mixins
------

.. autoclass:: nemo.collections.asr.parts.mixins.DiarizationMixin
.. autoclass:: nemo.collections.asr.parts.mixins.mixins.DiarizationMixin
:show-inheritance:
:members:
2 changes: 1 addition & 1 deletion docs/source/asr/speaker_recognition/configs.rst
@@ -80,7 +80,7 @@ minimum and maximum SNR specified with min_snr and max_snr respectively. This se
max_snr_db: 15
See the :class:`nemo.collections.asr.parts.perturb.AudioAugmentor` API section for more details.
See the :class:`nemo.collections.asr.parts.preprocessing.perturb.AudioAugmentor` API section for more details.
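
The same augmentation can also be constructed programmatically. A hedged sketch follows; the constructor keyword names mirror the YAML keys above but are assumptions, so check the ``AudioAugmentor`` API section for the exact signatures:

.. code-block:: python

    from nemo.collections.asr.parts.preprocessing.perturb import (
        AudioAugmentor,
        NoisePerturbation,
    )

    # Keyword names are assumed to match the YAML keys shown above.
    noise = NoisePerturbation(
        manifest_path="<path to noise manifest>",
        min_snr_db=5,
        max_snr_db=15,
    )

    # (probability, perturbation) pairs -- this structure is an assumption;
    # here the noise perturbation is applied to each sample with probability 0.5.
    augmentor = AudioAugmentor(perturbations=[(0.5, noise)])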


Model Architecture Configurations
1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -52,6 +52,7 @@
'nemo_text_processing.inverse_text_normalization', # Not installed automatically
'nemo_text_processing.text_normalization', # Not installed automatically
'attr', # attrdict in requirements, attr in import
'torchmetrics', # inherited from PTL
]

_skipped_autodoc_mock_imports = ['wrapt', 'numpy']
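
The list above presumably feeds Sphinx's ``autodoc_mock_imports``: modules named there are replaced with mock objects at documentation-build time, so importing NeMo code that depends on them (here ``torchmetrics``, pulled in through PyTorch Lightning) does not fail inside Sphinx. A minimal sketch of the mechanism in a ``conf.py``:

.. code-block:: python

    # Sphinx substitutes mock objects for these modules while building the docs,
    # so the packages do not need to be installed in the doc-build environment.
    autodoc_mock_imports = [
        'nemo_text_processing.inverse_text_normalization',  # not installed automatically
        'nemo_text_processing.text_normalization',  # not installed automatically
        'torchmetrics',  # inherited from PTL
    ]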
17 changes: 17 additions & 0 deletions examples/asr/speech_to_label.py
@@ -102,20 +102,37 @@
+trainer.precision=16 \
+trainer.amp_level=O1 # needed if using PyTorch < 1.6
# Fine-tune a model
For documentation on fine-tuning this model, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations
# Pretrained Models
For documentation on existing pretrained models, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_classification/results.html#
"""
import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecClassificationModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="matchboxnet_3x1x64_v1")
def main(cfg):
logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

trainer = pl.Trainer(**cfg.trainer)
exp_manager(trainer, cfg.get("exp_manager", None))
asr_model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer)

# Initialize the weights of the model from another model, if provided via config
asr_model.maybe_init_from_pretrained_checkpoint(cfg)

trainer.fit(asr_model)

if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
45 changes: 30 additions & 15 deletions examples/asr/speech_to_text.py
@@ -12,21 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecCTCModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


"""
# Training the model
Basic run (on CPU for 50 epochs):
python examples/asr/speech_to_text.py \
model.train_ds.manifest_filepath="/Users/okuchaiev/Data/an4_dataset/an4_train.json" \
model.validation_ds.manifest_filepath="/Users/okuchaiev/Data/an4_dataset/an4_val.json" \
hydra.run.dir="." \
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
model.train_ds.manifest_filepath="<path to manifest file>" \
model.validation_ds.manifest_filepath="<path to manifest file>" \
trainer.gpus=0 \
trainer.max_epochs=50
@@ -41,19 +34,19 @@
Override some args of optimizer:
python speech_to_text.py \
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
model.train_ds.manifest_filepath="./an4/train_manifest.json" \
model.validation_ds.manifest_filepath="./an4/test_manifest.json" \
hydra.run.dir="." \
trainer.gpus=2 \
trainer.max_epochs=2 \
model.optim.args.betas=[0.8,0.5] \
model.optim.args.weight_decay=0.0001
Overide optimizer entirely
Override optimizer entirely
python speech_to_text.py \
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
model.train_ds.manifest_filepath="./an4/train_manifest.json" \
model.validation_ds.manifest_filepath="./an4/test_manifest.json" \
hydra.run.dir="." \
trainer.gpus=2 \
trainer.max_epochs=2 \
model.optim.name=adamw \
@@ -62,16 +55,38 @@
+model.optim.args.betas=[0.8,0.5]\
+model.optim.args.weight_decay=0.0005
# Fine-tune a model
For documentation on fine-tuning this model, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations
# Pretrained Models
For documentation on existing pretrained models, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/results.html
"""

import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecCTCModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="config")
def main(cfg):
logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

trainer = pl.Trainer(**cfg.trainer)
exp_manager(trainer, cfg.get("exp_manager", None))
asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)

# Initialize the weights of the model from another model, if provided via config
asr_model.maybe_init_from_pretrained_checkpoint(cfg)

trainer.fit(asr_model)

if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
18 changes: 16 additions & 2 deletions examples/asr/speech_to_text_bpe.py
@@ -50,7 +50,19 @@
exp_manager.wandb_logger_kwargs.name="<Name of experiment>" \
exp_manager.wandb_logger_kwargs.project="<Name of project>"
```
# Fine-tune a model
For documentation on fine-tuning this model, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations
# Pretrained Models
For documentation on existing pretrained models, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/results.html
"""

import pytorch_lightning as pl
from omegaconf import OmegaConf

@@ -63,12 +75,14 @@
@hydra_runner(config_path="experimental/configs/", config_name="config_bpe")
def main(cfg):
logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
print(OmegaConf.to_yaml(cfg))

trainer = pl.Trainer(**cfg.trainer)
exp_manager(trainer, cfg.get("exp_manager", None))

asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

# Initialize the weights of the model from another model, if provided via config
asr_model.maybe_init_from_pretrained_checkpoint(cfg)

trainer.fit(asr_model)

if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: