Multi-lookahead cache-aware streaming models (#6711)
* added methods.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added methods.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added initial code.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added initial code.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added initial code.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added config files.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* fixed bugs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* updated confs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* updated confs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* updated confs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* updated confs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* improved f.conv1d

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* pulled from main.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* pulled from main.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added postpostnorm.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* fixed the target continuous bug.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added dw_striding causal.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added print for debugging.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added print for debugging.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* fixed causal convolutions.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* added _midnorm.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* fixed transcribe.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* cleaned code.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* moved back configs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* moved back configs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* updated fast emit for FC models.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* updated fast emit for FC models.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixed bug.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* fixed bug and addressed comments.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixed configs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* fixed configs.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

* dropped the test.

Signed-off-by: Vahid <vnoroozi@nvidia.com>

---------

Signed-off-by: Vahid <vnoroozi@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
VahidooX and pre-commit-ci[bot] committed Jun 8, 2023
1 parent 9cca92b commit b67d410
Showing 12 changed files with 272 additions and 158 deletions.
@@ -103,10 +103,16 @@ model:
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple layers may make the effective right context too large
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
# for chunked_limited you may calculate the look-ahead or right context by the following formula:
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 27*4*0.01=1.08s (see the sketch below)

# For multi-lookahead models, you may specify a list of context sizes. During training, different context sizes are sampled randomly with the distribution specified by att_context_probs.
# The first item in the list is the default during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[140,27],[140,13],[140,2],[140,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [140, 27] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null

xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
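A short Python sketch of the look-ahead formula above applied to the multi-lookahead example; the subsampling factor and window stride are taken from the 27*4*0.01 example and are assumptions about this particular config:

# Sketch only: subsampling_factor and window_stride are assumed here for
# illustration; read them from the actual model config in practice.
subsampling_factor = 4      # e.g. a 4x subsampling front-end
window_stride = 0.01        # seconds per feature frame (10 ms)
att_context_size = [[140, 27], [140, 13], [140, 2], [140, 0]]
for left, right in att_context_size:
    lookahead_secs = right * subsampling_factor * window_stride
    print(f"right context {right:>2} -> look-ahead {lookahead_secs:.2f}s")
# prints 1.08s, 0.52s, 0.08s and 0.00s for the four candidate contexts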
@@ -113,10 +113,16 @@ model:
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple layers may make the effective right context too large (see the sketch below)
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
# for chunked_limited you may calculate the look-ahead or right context by the following formula:
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 27*4*0.01=1.08s
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s

# For multi-lookahead models, you may specify a list of context sizes. During training, different context sizes are sampled randomly with the distribution specified by att_context_probs.
# The first item in the list is the default during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[140,27],[140,13],[140,2],[140,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [140, 27] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null

xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
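A rough sketch of why the right context should stay small with att_context_style=regular: with limited-context self-attention every layer can look a few frames further to the right, so the effective look-ahead grows roughly with depth (all numbers below are assumptions for illustration):

n_layers = 17               # assumed encoder depth, for illustration only
right_context = 2           # per-layer right context, i.e. att_context_size[1]
subsampling_factor = 8      # assumed subsampling factor
window_stride = 0.01        # seconds per feature frame
effective_right_frames = n_layers * right_context
effective_lookahead_secs = effective_right_frames * subsampling_factor * window_stride
print(effective_right_frames, f"{effective_lookahead_secs:.2f}s")
# 34 frames, ~2.72s: even a right context of 2 per layer stacks up across layers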
@@ -97,10 +97,17 @@ model:
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple layers may make the effective right context too large
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one (see the check below)
# for chunked_limited you may calculate the look-ahead or right context by the following formula:
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s

# For multi-lookahead models, you may specify a list of context sizes. During training, different context sizes are sampled randomly with the distribution specified by att_context_probs.
# The first item in the list is the default during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[70,13],[70,6],[70,1],[70,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [70, 13] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null


xscaling: true # scales up the input embeddings by sqrt(d_model)
pos_emb_max_len: 5000
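A small Python check of the divisibility rule above, using the default context sizes from these configs:

# chunked_limited rule: the left context must be divisible by (right context + 1)
for left, right in ([140, 27], [70, 13]):
    assert left % (right + 1) == 0, (left, right)   # 140 % 28 == 0, 70 % 14 == 0
print("defaults satisfy left % (right + 1) == 0")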
@@ -100,11 +100,19 @@ model:
n_heads: 8 # may need to be lower for smaller d_models

# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 2 as multiple-layers may increase the effective right context too large
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple layers may make the effective right context too large
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s

# For multi-lookahead models, you may specify a list of context sizes. During training, different context sizes are sampled randomly with the distribution specified by att_context_probs (see the sampling sketch below).
# The first item in the list is the default during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[70,13],[70,6],[70,1],[70,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [70, 13] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null


xscaling: true # scales up the input embeddings by sqrt(d_model)
pos_emb_max_len: 5000
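A minimal Python sketch of the multi-lookahead behavior described in the comments above: during training a context size is drawn according to att_context_probs, while evaluation uses the first (default) entry. The per-call sampling granularity here is an assumption for illustration:

import random

att_context_size = [[70, 13], [70, 6], [70, 1], [70, 0]]
att_context_probs = [0.25, 0.25, 0.25, 0.25]    # one probability per context size
assert len(att_context_probs) == len(att_context_size)

def pick_att_context(training: bool):
    # Training: sample a [left, right] context with the configured distribution.
    # Test/validation/inference: always use the first (default) entry.
    if training:
        return random.choices(att_context_size, weights=att_context_probs, k=1)[0]
    return att_context_size[0]

print(pick_att_context(training=True))    # e.g. [70, 1]
print(pick_att_context(training=False))   # [70, 13]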
@@ -102,10 +102,17 @@ model:
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple layers may make the effective right context too large
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
# for chunked_limited you may calculate the look-ahead or right context by the following formula:
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s

# For multi-lookahead models, you may specify a list of context sizes. During training, different context sizes are sampled randomly with the distribution specified by att_context_probs.
# The first item in the list is the default during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[70,13],[70,6],[70,1],[70,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [70, 13] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null


xscaling: true # scales up the input embeddings by sqrt(d_model)
pos_emb_max_len: 5000
@@ -191,9 +198,9 @@ model:
loss_name: "default"
warprnnt_numba_kwargs:
# FastEmit regularization: https://arxiv.org/abs/2010.11148
# You may enable FastEmit to reduce the latency of the model for streaming
# It also helps to improve the accuracy of the model in streaming mode
fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
# You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming
# You may set it to lower values like 1e-3 for models with larger right context (see the override sketch below)
fastemit_lambda: 5e-3 # Recommended values are in the range [1e-4, 1e-2]; 0.001 is a good start.
clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.

optim:
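A hedged sketch of tuning fastemit_lambda from Python with OmegaConf; the file name and the key path model.loss.warprnnt_numba_kwargs.fastemit_lambda are assumptions based on the keys visible in this diff, not a verified layout of these configs:

from omegaconf import OmegaConf

# Assumed file name and config path, for illustration only.
cfg = OmegaConf.load("fastconformer_transducer_bpe_streaming.yaml")
# Models with a larger right context may do well with a smaller value (e.g. 1e-3);
# low-latency settings may keep the 5e-3 default from this diff.
cfg.model.loss.warprnnt_numba_kwargs.fastemit_lambda = 1e-3
print(OmegaConf.to_yaml(cfg.model.loss))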
@@ -106,11 +106,19 @@ model:
n_heads: 8 # may need to be lower for smaller d_models

# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 2 as multiple-layers may increase the effective right context too large
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple layers may make the effective right context too large
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s

# For multi-lookahead models, you may specify a list of context sizes. During training, different context sizes are sampled randomly with the distribution specified by att_context_probs.
# The first item in the list is the default during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[70,13],[70,6],[70,1],[70,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [70, 13] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null


xscaling: true # scales up the input embeddings by sqrt(d_model)
pos_emb_max_len: 5000
@@ -196,9 +204,9 @@ model:
loss_name: "default"
warprnnt_numba_kwargs:
# FastEmit regularization: https://arxiv.org/abs/2010.11148
# You may enable FastEmit to reduce the latency of the model for streaming
# It also helps to improve the accuracy of the model in streaming mode
fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
# You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming
# You may set it to lower values like 1e-3 for models with larger right context
fastemit_lambda: 5e-3 # Recommended values are in the range [1e-4, 1e-2]; 0.001 is a good start.
clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.

optim:
@@ -8,6 +8,8 @@
# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml
# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml

# Note: if training loss does not converge, you may increase warm-up to 20K.

name: "FastConformer-Hybrid-Transducer-CTC-BPE-Streaming"

model:
@@ -106,8 +108,15 @@ model:
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple layers may make the effective right context too large
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s

# For multi-lookahead models, you may specify a list of context sizes. During training, different context sizes are sampled randomly with the distribution specified by att_context_probs.
# The first item in the list is the default during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[70,13],[70,6],[70,1],[70,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [70, 13] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null

xscaling: true # scales up the input embeddings by sqrt(d_model)
pos_emb_max_len: 5000
@@ -206,9 +215,9 @@ model:
loss_name: "default"
warprnnt_numba_kwargs:
# FastEmit regularization: https://arxiv.org/abs/2010.11148
# You may enable FastEmit to reduce the latency of the model for streaming
# It also helps to improve the accuracy of the model in streaming mode
fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
# You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming
# You may set it to lower values like 1e-3 for models with larger right context
fastemit_lambda: 5e-3 # Recommended values are in the range [1e-4, 1e-2]; 0.001 is a good start.
clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.

optim:
@@ -8,6 +8,8 @@
# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml
# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml

# Note: if training loss does not converge, you may increase warm-up to 20K.

name: "FastConformer-Hybrid-Transducer-CTC-Char-Streaming"

model:
@@ -111,8 +113,15 @@ model:
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3 as multiple layers may make the effective right context too large
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s

# For multi-lookahead models, you may specify a list of context sizes. During training, different context sizes are sampled randomly with the distribution specified by att_context_probs.
# The first item in the list is the default during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[70,13],[70,6],[70,1],[70,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [70, 13] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null

xscaling: true # scales up the input embeddings by sqrt(d_model)
pos_emb_max_len: 5000
@@ -211,9 +220,9 @@ model:
loss_name: "default"
warprnnt_numba_kwargs:
# FastEmit regularization: https://arxiv.org/abs/2010.11148
# You may enable FastEmit to reduce the latency of the model for streaming
# It also helps to improve the accuracy of the model in streaming mode
fastemit_lambda: 1e-3 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
# You may enable FastEmit to increase the accuracy and reduce the latency of the model for streaming
# You may set it to lower values like 1e-3 for models with larger right context
fastemit_lambda: 5e-3 # Recommended values are in the range [1e-4, 1e-2]; 0.001 is a good start.
clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.

optim: