# examples/nlp/token_classification/conf/punctuation_capitalization_lexical_audio_config.yaml

# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Punctuation and capitalization lexical audio model with pretrained BERT-like models and Encoder-Decoder-like models.
# Pretrained Punctuation and Capitalization Lexical Audio model: either a model name from
# `list_available_models()` or a `.nemo` file such as `your_model.nemo`.
pretrained_model: null
# Trainer settings. Checkpointing and logging are switched off here because exp_manager provides them.
trainer:
  devices: -1  # number of devices (GPUs) to use; -1 selects all available devices
  num_nodes: 1
  max_epochs: 5
  max_steps: -1  # -1 disables the step limit; a positive value takes precedence over max_epochs
  accumulate_grad_batches: 1  # accumulates grads every k batches
  gradient_clip_val: 0.0
  precision: 32  # full precision; set to 16 for mixed precision (amp_level O1 and O2)
  accelerator: gpu  # NOTE(review): for CPU runs this presumably has to be `cpu` - confirm for your Lightning version
  strategy: ddp
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  # To continue training from a checkpoint file (restoring the whole state including the epoch, step,
  # LR schedulers, apex, etc.), set `exp_manager.resume_from_checkpoint`.
  log_every_n_steps: 50

# Experiment manager settings: output location, logging and checkpointing.
exp_manager:
  # Directory for your experiment; when null, "./nemo_experiments" is used.
  exp_dir: null
  # The name of your model.
  name: Punctuation_and_Capitalization_Lexical_Audio
  # Whether exp_manager should create a TensorBoard logger.
  create_tensorboard_logger: true
  # Whether exp_manager should create a model checkpoint callback.
  create_checkpoint_callback: true
  checkpoint_callback_params:
    save_top_k: 3
    monitor: val_loss
    mode: min
    save_best_model: true
  # Path to a checkpoint file to continue the training from; restores the whole state including the epoch,
  # step, LR schedulers, apex, etc.
  resume_from_checkpoint: null

model:
  audio_encoder:
    # Any pretrained ASR model from `list_available_models()` of `EncDecCTCModel` can be chosen.
    pretrained_model: stt_en_conformer_ctc_medium
    # NOTE(review): `d_model`, `d_ff` and `num_layers` below appear to describe the additional Conformer
    # layers used together with a frozen encoder - confirm against the model code.
    freeze:
      is_enabled: false  # If true, the weights of the audio encoder are not updated during training.
      d_model: 256  # Input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward
      d_ff: 1024  # Hidden dimension of PositionwiseFeedForward
      num_layers: 4  # Number of additional Conformer layers
    adapter:
      enable: false  # If true, adapters are enabled for the audio encoder.
      config:
        # For more details see the `nemo.collections.common.parts.LinearAdapter` class.
        in_features: -1  # Will be replaced with the size of the audio encoder
        dim: 128  # Hidden dimension of the feed-forward network
        activation: 'swish'  # String name of an activation function
    fusion:
      num_layers: 4  # Number of layers to use in fusion
      num_attention_heads: 4  # Number of attention heads to use in fusion
      inner_size: 2048  # Fusion inner size

  # Names of the files holding the punctuation and capitalization label sets.
  # NOTE(review): inferred from the key names; confirm where these file names are resolved.
  class_labels:
    punct_labels_file: punct_label_ids.txt
    capit_labels_file: capit_label_ids.txt

  # Parameters shared by the datasets (per the section name).
  # NOTE(review): the exact meaning of each key is defined by the dataset code, not visible here - confirm
  # before changing `pad_label` or the `ignore_*` flags.
  common_dataset_parameters:
    pad_label: 'O'
    ignore_extra_tokens: false
    ignore_start_end: true
    punct_label_ids: null
    capit_label_ids: null
    label_vocab_dir: null

  train_ds:
    # A tarred dataset is recommended if the whole dataset cannot be loaded into memory. Use the script
    # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py` to create a
    # tarred dataset.
    use_tarred_dataset: false

    # A path to the directory where `tar_metadata_file` or `text_file`, `labels_file` and `audio_file` are stored.
    ds_item: ???
    text_file: text_train.txt
    labels_file: labels_train.txt
    audio_file: audio_train.txt

    # Has to be set to true to use the dataset for the lexical audio model.
    use_audio: true
    # If set to true, batches are sorted by audio length and packed into batches limited by `tokens_in_batch`.
    # Otherwise, provide the `batch_size` parameter.
    use_bucketing: true
    # If set to true, audios are loaded into memory during the `__init__` call of
    # `BertPunctuationCapitalizationDataset`, which consumes more RAM.
    # Otherwise, audios are loaded during the `collate_fn` call of `BertPunctuationCapitalizationDataset`.
    preload_audios: true

    # The maximum number of source text tokens in a batch. Examples are sorted by the number of tokens in the
    # source text before batching, and examples with a similar number of tokens are put into the same batch.
    # This procedure reduces the number of pad tokens in a batch. The number of examples in a batch varies:
    # longer input sequences -> fewer examples in a batch.
    tokens_in_batch: 2048
    max_seq_length: 512

    # Target sample rate of the audios; can be used for downsampling or upsampling.
    sample_rate: 16000
    num_workers: 0

    # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null,
    # the number of jobs is equal to the number of CPU cores.
    # WARNING: can cause deadlocks with tokenizers which use multiprocessing (e.g. SentencePiece)
    n_jobs: 0

    # Path to the tarred dataset metadata file. Required if a tarred dataset is used. The metadata file is a JSON
    # file which contains the total number of batches in the dataset, a list of paths to tar files and paths to
    # label vocabularies. The metadata file is created by the script
    # `examples/nlp/token_classification/create_punctuation_capitalization_tarred_dataset.py`
    tar_metadata_file: null
    # Controls batch shuffling in a tarred dataset. `tar_shuffle_n` is the size of the shuffled batch buffer.
    # Mind that this shuffling only permutes batches and doesn't exchange samples between batches. Proper
    # shuffling is turned on in a regular dataset.
    tar_shuffle_n: 1

  validation_ds:
    # Specify `ds_item` if the evaluation data is not stored in `model.train_ds.ds_item` together with the
    # training data, or if multiple datasets are used for evaluation; otherwise `model.train_ds.ds_item` is
    # used by default.
    # NOTE(review): `???` normally marks a value that must be provided, which seems at odds with the default
    # described above - confirm.
    # See the `train_ds` section for more details on tarred datasets.
    use_tarred_dataset: false
    # Expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (note: no space between the paths and the
    # square brackets)
    ds_item: ???

    text_file: text_dev.txt
    labels_file: labels_dev.txt
    audio_file: audio_dev.txt

    use_audio: true
    use_bucketing: false
    preload_audios: false

    shuffle: false
    num_samples: -1
    batch_size: 32
    # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null,
    # the number of jobs is equal to the number of CPU cores.
    # WARNING: can cause deadlocks with tokenizers which use multiprocessing (e.g. SentencePiece)
    n_jobs: 0

    # For more details see the `train_ds` section.
    tar_metadata_file: null

    sample_rate: 16000
    num_workers: 0

  test_ds:
    # Specify `ds_item` if the evaluation data is not stored in `model.train_ds.ds_item` together with the
    # training data, or if multiple datasets are used for evaluation; otherwise `model.train_ds.ds_item` is
    # used by default.
    # NOTE(review): `???` normally marks a value that must be provided, which seems at odds with the default
    # described above - confirm.
    # See the `train_ds` section for more details on tarred datasets.
    use_tarred_dataset: false
    # Expected format: `[PATH_TO_DEV1,PATH_TO_DEV2]` OR `PATH_TO_DEV` (note: no space between the paths and the
    # square brackets)
    ds_item: ???

    # NOTE(review): these defaults point at the `*_dev.txt` files, the same names as in `validation_ds`;
    # override them if a separate test set is used.
    text_file: text_dev.txt
    labels_file: labels_dev.txt
    audio_file: audio_dev.txt

    use_audio: true
    use_bucketing: false
    preload_audios: false

    shuffle: false
    num_samples: -1
    batch_size: 32
    # Number of jobs for tokenization and labels encoding. If 0, then multiprocessing is not used. If null,
    # the number of jobs is equal to the number of CPU cores.
    # WARNING: can cause deadlocks with tokenizers which use multiprocessing (e.g. SentencePiece)
    n_jobs: 0

    # For more details see the `train_ds` section.
    tar_metadata_file: null

    sample_rate: 16000
    num_workers: 0

  tokenizer:
    # Interpolated from `model.language_model.pretrained_model_name`; may also be set to `sentencepiece`.
    tokenizer_name: ${model.language_model.pretrained_model_name}
    vocab_file: null  # path to vocab file
    tokenizer_model: null  # only used if tokenizer is sentencepiece
    special_tokens: null

  language_model:
    # Also used as the tokenizer name through the `${model.language_model.pretrained_model_name}` interpolation.
    pretrained_model_name: bert-base-uncased
    lm_checkpoint: null
    config_file: null  # JSON file; takes precedence over `config`
    config: null

  # Settings of the punctuation head (per the key name); mirrors `capit_head`.
  punct_head:
    num_fc_layers: 1
    fc_dropout: 0.1
    activation: 'relu'
    # Canonical lowercase boolean, consistent with `capit_head.use_transformer_init`.
    use_transformer_init: true

  # Settings of the capitalization head (per the key name); mirrors `punct_head`.
  capit_head:
    num_fc_layers: 1
    fc_dropout: 0.1
    activation: 'relu'
    use_transformer_init: true

  # Optimizer and learning-rate scheduler settings.
  optim:
    name: adam
    # Written with an explicit mantissa dot so that YAML 1.1 parsers also read it as a float rather than
    # as the string "1e-4"; the numeric value is unchanged.
    lr: 1.0e-4
    weight_decay: 0.0

    sched:
      name: WarmupAnnealing
      # Scheduler params
      warmup_steps: null
      warmup_ratio: 0.1
      last_epoch: -1

      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

# Hydra settings.
# NOTE(review): `run.dir: .` presumably keeps Hydra's run directory at the current working directory, and
# `handlers: null` presumably removes Hydra's root logging handlers - confirm against the Hydra docs.
hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null