Adding Conformer model #1327

Merged · 75 commits · Nov 3, 2020

Commits
543466b
Added initial code of Conformer.
VahidooX Oct 21, 2020
99e9233
Added initial code of Conformer.
VahidooX Oct 21, 2020
428a72d
Added log_every_n_steps.
VahidooX Oct 21, 2020
56770ad
dropped older multi_head_att modules.
VahidooX Oct 21, 2020
7213125
Dropped dropout_in and params.
VahidooX Oct 21, 2020
a4c4c79
Fixed code style.
VahidooX Oct 21, 2020
2c900aa
Updated docs.
VahidooX Oct 22, 2020
ac86cb3
Removed unused import.
VahidooX Oct 22, 2020
75aa1e4
fixed docs.
VahidooX Oct 22, 2020
25fff7f
Fixed license header.
VahidooX Oct 22, 2020
09d217c
Fixed license header.
VahidooX Oct 22, 2020
cda38ae
fixed style.
VahidooX Oct 22, 2020
c24b0f6
Updated tests with _target_.
VahidooX Oct 22, 2020
7a61905
Updated tests with _target_.
VahidooX Oct 22, 2020
9420578
Fixed style.
VahidooX Oct 22, 2020
ef7a236
Merge branch 'main' of https://github.com/NVIDIA/NeMo into add_confor…
VahidooX Oct 22, 2020
3402668
Fixed style.
VahidooX Oct 22, 2020
9c4993d
Merge branch 'main' of https://github.com/NVIDIA/NeMo into add_confor…
VahidooX Oct 22, 2020
fb324d0
Fixed missing params.
VahidooX Oct 22, 2020
93adc1a
Fixed missing params.
VahidooX Oct 23, 2020
2473812
Dropped u and v biases.
VahidooX Oct 23, 2020
1653599
Fixed params.
VahidooX Oct 23, 2020
81dff89
moved back padding in features.py.
VahidooX Oct 23, 2020
ea93557
fixed optimizer.
VahidooX Oct 26, 2020
f6787c0
Merge branch 'main' of https://github.com/NVIDIA/NeMo into add_confor…
VahidooX Oct 26, 2020
7c0fe36
Merge branch 'main' of https://github.com/NVIDIA/NeMo into add_confor…
VahidooX Oct 26, 2020
a7c7207
fixed vocab bug.
VahidooX Oct 26, 2020
3ceeebd
added load_weights_from_checkpoint.
VahidooX Oct 27, 2020
acfb8e6
Added jenkins test for Conformer. Updated names from bpe to subword.
VahidooX Oct 27, 2020
fd8764a
Added jenkins test for Conformer. Updated names from bpe to subword.
VahidooX Oct 27, 2020
5f241d4
enabled ddp.
VahidooX Oct 27, 2020
f0cae87
removed extra prints.
VahidooX Oct 28, 2020
b625c95
reorg the folders.
VahidooX Oct 29, 2020
09ee00f
reorg the folders.
VahidooX Oct 30, 2020
cdfd2ec
reorg the folders.
VahidooX Oct 30, 2020
b0f22cf
reorg the folders.
VahidooX Oct 30, 2020
175cd48
reverted back subword.
VahidooX Oct 30, 2020
0efb275
reverted back subword.
VahidooX Oct 30, 2020
fe40839
Merge branch 'main' of https://github.com/NVIDIA/NeMo into add_confor…
VahidooX Oct 30, 2020
c6b5b3c
updated test evaluation.
VahidooX Oct 30, 2020
e60bc15
Updated the code.
VahidooX Oct 30, 2020
8ea849b
fixed the style.
VahidooX Oct 30, 2020
be08878
Added docstring.
VahidooX Oct 30, 2020
d1ed640
dropped load_weights.
VahidooX Oct 30, 2020
c5922ac
fixed feat_out.
VahidooX Oct 30, 2020
58b85f0
fixed feat_out.
VahidooX Oct 30, 2020
42fea11
dropped the vocab files.
VahidooX Oct 30, 2020
c28b312
fixed the bug.
VahidooX Oct 30, 2020
4283e97
added logging of config.
VahidooX Oct 30, 2020
83ef7e1
moved swish to activations.
VahidooX Oct 30, 2020
2ccbb6a
fixed to_yaml.
VahidooX Oct 30, 2020
4d04686
fixed the feat_in bug.
VahidooX Oct 30, 2020
927f82b
Merge branch 'main' of https://github.com/NVIDIA/NeMo into add_confor…
VahidooX Oct 30, 2020
7d859fe
fixed the feat_in bug.
VahidooX Oct 31, 2020
a164748
fixed the feat_in bug.
VahidooX Oct 31, 2020
8c92c54
fixed the feat_in bug.
VahidooX Oct 31, 2020
d929c1d
fixed the feat_in bug.
VahidooX Oct 31, 2020
b4a28f9
Merge branch 'main' of https://github.com/NVIDIA/NeMo into add_confor…
VahidooX Oct 31, 2020
f336ed9
fixed the feat_in bug.
VahidooX Oct 31, 2020
dbfe71e
fixed the feat_in bug.
VahidooX Oct 31, 2020
5a0cf54
fixed the feat_in bug.
VahidooX Oct 31, 2020
6791136
fixed the feat_in bug.
VahidooX Oct 31, 2020
dfeafd1
fixed the feat_in bug.
VahidooX Nov 2, 2020
a74a327
fixed log_prediction.
VahidooX Nov 2, 2020
6600de9
fixed log bug.
VahidooX Nov 2, 2020
4b314db
fixed style.
VahidooX Nov 2, 2020
58f9588
added pos_emb_max_len.
VahidooX Nov 2, 2020
306ddf6
added pos_emb_max_len.
VahidooX Nov 2, 2020
7dabb78
moved subsampling.
VahidooX Nov 2, 2020
90cb95c
added open_dict()
VahidooX Nov 2, 2020
353e652
moved conformerblock.
VahidooX Nov 3, 2020
5e974b8
fixed import bug.
VahidooX Nov 3, 2020
3eef062
fixed code style.
VahidooX Nov 3, 2020
9c1a652
Merge branch 'main' of https://github.com/NVIDIA/NeMo into add_confor…
VahidooX Nov 3, 2020
302b9f3
added conformerencoder to all.
VahidooX Nov 3, 2020
19 changes: 17 additions & 2 deletions Jenkinsfile
@@ -209,10 +209,10 @@ pipeline {
       }
     }

-    stage('L2: Speech to Text WPE') {
+    stage('L2: Speech to Text WPE - CitriNet') {
       steps {
         sh 'python examples/asr/speech_to_text_bpe.py \
-        --config-path="experimental/configs/" --config-name="config_bpe" \
+        --config-path="experimental/citrinet/" --config-name="config_bpe" \
         model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
         model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
         model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
@@ -223,6 +223,21 @@ pipeline {
         sh 'rm -rf examples/asr/speech_to_text_wpe_results'
       }
     }
+
+    stage('L2: Speech to Text WPE - Conformer') {
+      steps {
+        sh 'python examples/asr/speech_to_text_bpe.py \
+        --config-path="experimental/conformer" --config-name="conformer_bpe" \
+        model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+        model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+        model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
+        model.tokenizer.type="wpe" \
+        trainer.gpus=[1] \
+        +trainer.fast_dev_run=True \
+        exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results'
+        sh 'rm -rf examples/asr/speech_to_text_wpe_conformer_results'
+      }
+    }
   }
 }
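For context, the new Jenkins stage drives examples/asr/speech_to_text_bpe.py through Hydra command-line overrides. The sketch below is a minimal reconstruction of what such a NeMo entry point typically looks like, not the PR's actual script; the class and helper names follow NeMo's public API of this era, but treat the details as an assumption:

import hydra
import pytorch_lightning as pl
from omegaconf import DictConfig

from nemo.collections.asr.models import EncDecCTCModelBPE
from nemo.utils.exp_manager import exp_manager


@hydra.main(config_path="experimental/conformer", config_name="conformer_bpe")
def main(cfg: DictConfig) -> None:
    # Overrides such as model.tokenizer.type="wpe" or +trainer.fast_dev_run=True
    # are merged into cfg by Hydra before this function runs.
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))  # logging/checkpointing
    model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)
    trainer.fit(model)


if __name__ == "__main__":
    main()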
162 changes: 162 additions & 0 deletions examples/asr/experimental/conformer/conformer_bpe.yaml
@@ -0,0 +1,162 @@
name: &name "Conformer-BPE"

model:
  sample_rate: &sample_rate 16000
  log_prediction: true
  load_weights_from_checkpoint: null
  ctc_reduction: 'mean_batch'

  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 16
    trim_silence: false
    max_duration: 16.7
    min_duration: 0.1
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    num_workers: 4
    pin_memory: false
    use_start_end_token: true

  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 4
    pin_memory: false
    use_start_end_token: true

  test_ds:
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 4
    pin_memory: false
    use_start_end_token: true

  tokenizer:
    dir: ???  # path to a directory containing either tokenizer.model (for bpe) or vocab.txt (for wpe)
    type: ???  # can be either bpe or wpe

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: *sample_rate
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    # SpecAug params
    freq_masks: 2  # set to zero to disable the SpecAug augmentation
    time_masks: 2  # set to zero to disable the SpecAug augmentation
    freq_width: 27
    time_width: 100
    # Cut-off params
    rect_masks: 0  # set to zero to disable the cut-off augmentation
    rect_time: 120
    rect_freq: 50

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: *n_mels
    feat_out: -1  # set it if you need an output size different from the default d_model
    n_layers: 16
    d_model: 256

    # Sub-sampling params
    subsampling: vggnet  # vggnet or striding
    subsampling_factor: 4  # must be a power of 2
    subsampling_conv_channels: 64  # set to -1 to make it equal to d_model

    # Feed-forward module's params
    ff_expansion_factor: 4

    # Multi-headed attention module's params
    self_attention_model: rel_pos  # rel_pos or abs_pos
    n_heads: 4
    xscaling: true

    # Convolution module's params
    conv_kernel_size: 31

    ### Regularization
    dropout: 0.1  # the dropout used inside the Conformer modules
    dropout_emb: 0.1  # the dropout used for the embeddings
    dropout_att: 0.0  # the dropout for the multi-headed attention modules

  decoder:
    _target_: nemo.collections.asr.modules.LSTMDecoder
    feat_in: null  # if not provided, the encoder's feat_out is used
    num_classes: -1  # filled with the vocabulary size from the tokenizer at runtime
    vocabulary: []  # filled with the vocabulary from the tokenizer at runtime
    lstm_hidden_size: 640
    bidirectional: false
    num_layers: 1

  optim:
    name: novograd
    lr: 0.01
    # optimizer arguments
    betas: [0.8, 0.5]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing
      # scheduler config override
      warmup_steps: 4000
      warmup_ratio: null
      min_lr: 1e-9
      last_epoch: -1

trainer:
  gpus: 0  # number of gpus
  num_nodes: 1
  max_epochs: 100
  max_steps: null  # computed at runtime if not set
  val_check_interval: 1  # set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  distributed_backend: ddp
  accumulate_grad_batches: 2
  gradient_clip_val: 0.0
  amp_level: O0  # O1/O2 for mixed precision
  precision: 32  # should be set to 16 for O1 and O2 to enable AMP
  log_every_n_steps: 10  # interval of logging
  resume_from_checkpoint: null  # path to a checkpoint file to resume training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0  # number of validation steps run as a sanity check before training starts; 0 disables it
  check_val_every_n_epoch: 10  # run validation every n epochs
  sync_batchnorm: true
  checkpoint_callback: false  # provided by exp_manager
  logger: false  # provided by exp_manager

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null
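Each module section in this config carries a _target_ key, which Hydra resolves to a class and instantiates with the section's remaining keys as constructor arguments. A small illustrative sketch of that mechanism (the file path is an assumption, and any field left as ??? must be overridden before it is accessed):

from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/asr/experimental/conformer/conformer_bpe.yaml")

# The *n_mels anchor makes encoder.feat_in resolve to the preprocessor's 80 mel bins.
assert cfg.model.encoder.feat_in == cfg.model.preprocessor.features == 80

# instantiate() builds nemo.collections.asr.modules.ConformerEncoder from _target_,
# passing n_layers=16, d_model=256, and the rest of the section as kwargs.
encoder = instantiate(cfg.model.encoder)
print(type(encoder).__name__)  # ConformerEncoder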
159 changes: 159 additions & 0 deletions examples/asr/experimental/conformer/conformer_char.yaml
@@ -0,0 +1,159 @@
name: &name "Conformer-char"

model:
  sample_rate: &sample_rate 16000
  labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
                   "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
  log_prediction: true
  load_weights_from_checkpoint: null
  ctc_reduction: 'mean_batch'

  train_ds:
    manifest_filepath: ???
    labels: *labels
    sample_rate: 16000
    batch_size: 16
    trim_silence: false
    max_duration: 16.7
    min_duration: 0.1
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    num_workers: 4
    pin_memory: true

  validation_ds:
    manifest_filepath: ???
    labels: *labels
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 4
    pin_memory: true

  test_ds:
    manifest_filepath: null
    labels: *labels
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 4
    pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: *sample_rate
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    # SpecAug params
    freq_masks: 2  # set to zero to disable the SpecAug augmentation
    time_masks: 2  # set to zero to disable the SpecAug augmentation
    freq_width: 27
    time_width: 100
    # Cut-off params
    rect_masks: 0  # set to zero to disable the cut-off augmentation
    rect_time: 120
    rect_freq: 50

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: *n_mels
    feat_out: -1  # set it if you need an output size different from the default d_model
    n_layers: 16
    d_model: 256

    # Sub-sampling params
    subsampling: vggnet  # vggnet or striding
    subsampling_factor: 4  # must be a power of 2
    subsampling_conv_channels: 64  # set to -1 to make it equal to d_model

    # Feed-forward module's params
    ff_expansion_factor: 4

    # Multi-headed attention module's params
    self_attention_model: rel_pos  # rel_pos or abs_pos
    n_heads: 4
    xscaling: true

    # Convolution module's params
    conv_kernel_size: 31

    ### Regularization
    dropout: 0.1  # the dropout used inside the Conformer modules
    dropout_emb: 0.1  # the dropout used for the embeddings
    dropout_att: 0.0  # the dropout for the multi-headed attention modules

  decoder:
    _target_: nemo.collections.asr.modules.LSTMDecoder
    feat_in: null  # if not provided, the encoder's feat_out is used
    num_classes: 28
    vocabulary: *labels
    lstm_hidden_size: 640
    bidirectional: false
    num_layers: 1

  optim:
    name: novograd
    lr: 0.01
    # optimizer arguments
    betas: [0.8, 0.5]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing
      # scheduler config override
      warmup_steps: 1000
      warmup_ratio: null
      min_lr: 1e-9
      last_epoch: -1

trainer:
  gpus: 0  # number of gpus
  num_nodes: 1
  max_epochs: 100
  max_steps: null  # computed at runtime if not set
  val_check_interval: 1  # set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  distributed_backend: ddp
  accumulate_grad_batches: 2
  gradient_clip_val: 0.0
  amp_level: O0  # O1/O2 for mixed precision
  precision: 32  # should be set to 16 for O1 and O2 to enable AMP
  log_every_n_steps: 10  # interval of logging
  resume_from_checkpoint: null  # path to a checkpoint file to resume training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0  # number of validation steps run as a sanity check before training starts; 0 disables it
  check_val_every_n_epoch: 10  # run validation every n epochs
  sync_batchnorm: true
  checkpoint_callback: false  # provided by exp_manager
  logger: false  # provided by exp_manager

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null
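One detail worth noting in the character config: the decoder's num_classes (28) must equal the size of the labels list (26 letters plus space and apostrophe); the CTC blank symbol is added internally by the loss and is not listed among the labels. A tiny illustrative consistency check (the file path is an assumption):

from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/asr/experimental/conformer/conformer_char.yaml")
labels = cfg.model.labels

# 26 letters + space + apostrophe = 28 output classes; CTC adds its own blank.
assert len(labels) == cfg.model.decoder.num_classes == 28
assert cfg.model.decoder.vocabulary == labels  # the *labels anchor keeps them in sync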