Merge branch 'main' into megatron_nmt_sample_training

NVIDIA · Jun 21, 2022 · 9131474 · 9131474
2 parents 140000d + e542d7f
commit 9131474
Show file tree

Hide file tree

Showing 40 changed files with 691 additions and 118 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -63,7 +63,7 @@ COPY . .
 
 # start building the final container
 FROM nemo-deps as nemo
-ARG NEMO_VERSION=1.10.0
+ARG NEMO_VERSION=1.11.0
 
 # Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container
 # version information as runtime environment variable for introspection purposes

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -137,18 +137,18 @@ pipeline {
       parallel {
         stage('En TN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22'
           }
         }
         stage('En ITN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22'
           }
         }
         stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22'
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22'
           }
         }
       }
@@ -165,7 +165,7 @@ pipeline {
       parallel {
         stage('L2: Eng TN') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/text_normalization/ &&  python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
             sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
             sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_05-25.txt || exit 1'
@@ -175,7 +175,7 @@ pipeline {
 
         stage('L2: Eng ITN export') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/inverse_text_normalization/ &&  python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
             sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
             sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
@@ -184,23 +184,23 @@ pipeline {
         stage('L2: TN with Audio (audio and raw text)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8 --text "The total amounts to \\$4.76." \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22 --text "The total amounts to \\$4.76." \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
             cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (audio and text file)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
             cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (manifest)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-8'
+            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/6-14-22'
           }
         }
       }

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -119,6 +119,7 @@
     'nlp/text_normalization/tn_itn_all.bib',
     'tools/tools_all.bib',
     'tts_all.bib',
+    'core/adapters/adapter_bib.bib',
 ]
 
 intersphinx_mapping = {
@@ -210,7 +211,7 @@
 
 html_theme_options = {
     'logo_only': True,
-    # 'display_version': True,
+    'display_version': True,
     # 'prev_next_buttons_location': 'bottom',
     # 'style_external_links': False,
     # 'style_nav_header_background': '#000000',

diff --git a/docs/source/core/adapters/adapter_bib.bib b/docs/source/core/adapters/adapter_bib.bib
@@ -0,0 +1,21 @@
+
+
+@inproceedings{houlsby2019adapter,
+  title={Parameter-efficient transfer learning for NLP},
+  author={Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
+  booktitle={International Conference on Machine Learning},
+  pages={2790--2799},
+  year={2019},
+  organization={PMLR}
+}
+
+@misc{Junxian2021unified,
+  doi = {10.48550/ARXIV.2110.04366},
+  url = {https://arxiv.org/abs/2110.04366},
+  author = {He, Junxian and Zhou, Chunting and Ma, Xuezhe and Berg-Kirkpatrick, Taylor and Neubig, Graham},
+  keywords = {Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title = {Towards a Unified View of Parameter-Efficient Transfer Learning},
+  publisher = {arXiv},
+  year = {2021},
+  copyright = {arXiv.org perpetual, non-exclusive license}
+}
diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst
@@ -0,0 +1,60 @@
+Adapters API
+============
+
+Core
+----
+
+.. autoclass:: nemo.core.adapter_mixins.AdapterModuleMixin
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+    :undoc-members: adapter_module_names
+
+-----
+
+.. autoclass:: nemo.core.adapter_mixins.AdapterModelPTMixin
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+    :undoc-members: adapter_module_names
+
+-----
+
+Adapter Networks
+----------------
+
+
+.. autoclass:: nemo.collections.common.parts.adapter_modules.AbstractAdapterModule
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+
+-----
+
+.. autoclass:: nemo.collections.common.parts.adapter_modules.LinearAdapter
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+
+-----
+
+Adapter Strategies
+------------------
+
+
+.. autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+    :undoc-members: adapter_module_names
+
+-----
+
+.. autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+    :undoc-members: adapter_module_names
+
+-----
+
diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst
@@ -0,0 +1,90 @@
+Adapter Components
+==================
+
+Adapters can be considered as any set of parameters that are added to a pre-existing module/model. In our case, we currently support the standard adapter found in the literature; more advanced adapter modules are being researched and can potentially be supported by NeMo.
+
+An adapter module can be any PyTorch module, but it must follow certain straightforward requirements -
+
+1) The module accepts an input of some input dimension, and its output must match this dimension.
+2) Ideally, the module is initialized such that, immediately after initialization, the adapter's output does not modify the original input. This allows the model to produce the same output results, even when additional parameters have been added.
+
+According to He et al. :cite:`adapters-Junxian2021unified`, we can consider an adapter as being represented by three components -
+
+1) Functional form - the trainable parameters that will modify the input
+2) Insertion form - Where the adapter outputs are integrated with the original input. The input to the adapters can be the last output of the layer, the input to some attention layer, or even the original input to the module itself (before even the module's forward pass).
+3) Composition function - How the adapter's outputs are integrated with the inputs. It can be as simple as a residual addition connection, or concatenation, or point-wise multiplication, etc.
+
+Functional Form - Adapter Networks
+----------------------------------
+
+Adapter modules represent the functional form of the adapter. We discuss an example of the most commonly used adapter module found in the literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-houlsby2019adapter`.
+
+.. note::
+
+    All adapter modules must extend :class:`~nemo.collections.common.parts.adapter_modules.AbstractAdapterModule` and should ideally have an equivalent DataClass config for easy instantiation!
+
+
+.. autoclass:: nemo.collections.common.parts.adapter_modules.AbstractAdapterModule
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+
+-----
+
+.. autoclass:: nemo.collections.common.parts.adapter_modules.LinearAdapter
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+
+
+Insertion Form - Module Adapters
+--------------------------------
+
+Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers.
+
+On top of this, while adapters are commonly used only in the layers containing the most parameters (say the Encoder of a network), some models can support adapters in multiple locations (Encoder-Decoder architecture for Language Models, Machine Translation, or even Encoder-Decoder-Joint for ASR with Transducer Loss). As such, NeMo utilizes the concept of ``Module Adapters``.
+
+``Module Adapters`` are very simply defined when adding an adapter - by specifying the module that the adapter should be inserted into.
+
+.. code-block:: python
+
+    # Get the list of supported modules / locations in an adapter-compatible Model
+    print(model.adapter_module_names)  # assume ['', 'encoder', 'decoder']
+
+    # When calling add_adapter, specify the module name to the left of the colon symbol, and the adapter name afterwards.
+    # The adapter is then directed to the decoder module instead of the default / encoder module.
+    model.add_adapter("decoder:first_adapter", cfg=...)
+
+You might note that ``model.adapter_module_names`` can sometimes return ``''`` as one of the supported module names - this refers to the "default module". Generally we try to provide the default as the most commonly used adapter in literature - for example, Encoder adapters in NLP/NMT/ASR.
+
+Composition Function - Adapter Strategies
+-----------------------------------------
+
+Finally, we discuss how to compose the input and output of adapter modules. In order to generalize this step, we construct ``Adapter Strategies``.
+A strategy is any class (not torch.nn.Module!) that extends :class:`~nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy`, and provides a ``forward()`` method that accepts a specific signature of the inputs and produces an output tensor which combines the input and output with some specific method.
+
+We discuss a simple residual addition connection strategy below - one that accepts an input to the adapter and an adapter's output and simply adds them together. It also supports ``stochastic_depth``, which enables adapters to be dynamically switched off during training, making training more robust.
+
+.. autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.AbstractAdapterStrategy
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+    :undoc-members: adapter_module_names
+
+-----
+
+.. autoclass:: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+    :undoc-members: adapter_module_names
+
+-----
+
+
+References
+----------
+
+.. bibliography:: ./adapter_bib.bib
+    :style: plain
+    :keyprefix: adapters-