TTS SpeechLLM models #8364

Closed
wants to merge 206 commits

Commits (206)
381f3a5
Init dataloader changes
subhankar-ghosh May 30, 2023
2cf5567
Changes to T5 dataloader to accept text and speech
subhankar-ghosh Jun 13, 2023
e114f75
Expand vocabulary of T5 model to include speech tokens, dataset chang…
subhankar-ghosh Jun 14, 2023
4c0145f
Add todo statement.
subhankar-ghosh Jun 14, 2023
f5b4393
Removing print statements.
subhankar-ghosh Jun 14, 2023
2ca3eca
wip: add pseudocode for speech layers
blisc Jun 14, 2023
e38dcd7
WIP
blisc Jun 27, 2023
e7e5de7
WIP2
blisc Jun 28, 2023
fbec910
working?
blisc Jun 28, 2023
4eb5a19
merge
blisc Jun 28, 2023
f3b9e3c
working fp32
blisc Jun 28, 2023
979baa5
update to bf16; version 1 of decoder-only code
blisc Jul 7, 2023
485a5f8
first working version
blisc Jul 12, 2023
4eb8ae4
WIP decoder+bug fixes
blisc Jul 15, 2023
bc3b9a0
added masking for speech llm pretraining
paarthneekhara Jul 15, 2023
28654b3
bug fix
paarthneekhara Jul 15, 2023
51357dd
added span length to masking procedure
paarthneekhara Jul 17, 2023
47b36ea
WIP decoder+bug fixes
blisc Jul 15, 2023
935ee9d
wip
paarthneekhara Jul 24, 2023
ceaa55a
wip
paarthneekhara Jul 24, 2023
6bdef90
some bug fixes
paarthneekhara Jul 24, 2023
c39a7ad
WIP
blisc Jul 25, 2023
2f53d19
WIP
blisc Jul 25, 2023
0e5e4e2
working v1
blisc Jul 27, 2023
d25d16b
debug
blisc Jul 27, 2023
f48daf9
remove encodec dep for now
blisc Jul 28, 2023
0d35d09
pretraining code setup
paarthneekhara Jul 31, 2023
01792be
hacky speech inference working
blisc Aug 1, 2023
3f71646
Add vocabulary expansion and also changes to model related to this.
subhankar-ghosh Aug 2, 2023
25167a0
bugfix for labels; minor cleanup/refactor
blisc Aug 2, 2023
71be7fd
remove print
blisc Aug 2, 2023
7ef3a10
wip, pretraining seems to be running
paarthneekhara Aug 3, 2023
893da81
wip
paarthneekhara Aug 4, 2023
52651e4
Loss not decreasing during SFT bug fix.
subhankar-ghosh Aug 7, 2023
4b71b1e
Change save and load logic to save entire model, expand positional em…
subhankar-ghosh Aug 8, 2023
0b40aae
pretraining on both text and speech
paarthneekhara Aug 8, 2023
045e263
merging
paarthneekhara Aug 8, 2023
c5778ba
Merge branch 'NVIDIA-subhankarg/speechlm' into speechlm_t5_pretraining
paarthneekhara Aug 8, 2023
8e460b7
merge in progress
paarthneekhara Aug 8, 2023
62cbefa
merging continued
paarthneekhara Aug 9, 2023
6d6fe39
reverting unnecessary changes
paarthneekhara Aug 9, 2023
a703282
pretraining running
paarthneekhara Aug 9, 2023
bf62163
bug fixes
paarthneekhara Aug 9, 2023
4082141
removed hardcoded speech offset
paarthneekhara Aug 9, 2023
a5763de
separated speech token output embeddings
paarthneekhara Aug 10, 2023
193256b
SFT changes on merged branch
shehzeen Aug 10, 2023
2ba8253
converting labels to correct range
shehzeen Aug 10, 2023
7c1c7d8
added comments
shehzeen Aug 10, 2023
2727058
enabled tensorboard audio logging when precision is 16
shehzeen Aug 13, 2023
e17fec3
add option to remove the conv layer and only use the final linear layer
blisc Aug 14, 2023
546080a
Merge branch 'speechllm_tts' into speechlm_merge_main
blisc Aug 14, 2023
49421b3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 14, 2023
aba8f75
add teacher forcing inference
blisc Aug 15, 2023
27feb8f
remove debug
blisc Aug 15, 2023
c72c6c5
merge post style fix
blisc Aug 15, 2023
d6776b5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 15, 2023
ad4c337
update config
blisc Aug 15, 2023
f47764d
Merge working code into main branch (#7226)
blisc Aug 21, 2023
7ce7937
Shehzeen speechlm (#7283)
shehzeen Aug 22, 2023
14d5449
save work
blisc Aug 25, 2023
4a26895
Speechlm delayparallel (#7324)
paarthneekhara Aug 28, 2023
95c6a98
merge
blisc Sep 7, 2023
b531d53
update
blisc Sep 8, 2023
244aba2
Speechlm 3b shehzeen (#7411)
shehzeen Sep 14, 2023
1f194e1
wip
blisc Sep 19, 2023
1423df8
merge
blisc Sep 19, 2023
f188d49
fix
blisc Sep 19, 2023
b89d3f3
more cleanup
blisc Sep 20, 2023
983373c
merge with main
blisc Sep 21, 2023
2df19ce
merge WIP
blisc Sep 21, 2023
0217715
update
blisc Sep 21, 2023
3768775
fix some merge issues
blisc Sep 21, 2023
1076141
[TTS][SpeechLLM] Merge ASR SFT changes to SpeechLLM T5 (#7491)
subhankar-ghosh Sep 26, 2023
34447cc
WIP
blisc Sep 29, 2023
242c230
cleanup
blisc Sep 29, 2023
5e2735b
push working version
blisc Sep 29, 2023
9d64543
remove encodec in dataset
blisc Oct 3, 2023
1f2daf5
merge
blisc Oct 3, 2023
5d68912
WIP
blisc Oct 3, 2023
0966dcb
update reqs
blisc Oct 4, 2023
574cfd7
add back some infer changes
blisc Oct 4, 2023
ff08a9f
update hash
blisc Oct 4, 2023
18bda77
update
blisc Oct 5, 2023
9ba0959
install megatron into less used folder
blisc Oct 6, 2023
4a0715d
add train logging
blisc Oct 11, 2023
756f345
add initial tarred
blisc Oct 11, 2023
3caeb11
update code
blisc Oct 11, 2023
3f80067
update logging for sft
blisc Oct 11, 2023
fab96c2
add missing file
blisc Oct 11, 2023
d50bbca
initial commit of json tokenizer
blisc Oct 13, 2023
db97121
tokenzier bug fix; and improve logging
blisc Oct 13, 2023
94b3b0b
speed up preproc; add debug message
blisc Oct 16, 2023
515a718
speedup
blisc Oct 17, 2023
a9a965e
bugfix for tokenized dataset
blisc Oct 18, 2023
97ac61b
add tar file shuffling
blisc Oct 18, 2023
92cf532
Paarth gpt sft (#7)
paarthneekhara Oct 20, 2023
822aec4
add validation logic
blisc Oct 24, 2023
b3c25ad
first working commit of attention logging
blisc Oct 25, 2023
7bdc5a2
some logging fixes
blisc Oct 25, 2023
608fff1
increase check interval
blisc Oct 25, 2023
7c10445
add second config
blisc Oct 26, 2023
023853f
add new var for train logging; add return logs to attention
blisc Oct 27, 2023
1abfcc8
update logging
blisc Oct 27, 2023
3ad6f6b
update pruning
blisc Oct 27, 2023
18b4fdb
update pruning
blisc Oct 27, 2023
f72ece7
from scratch
blisc Oct 27, 2023
64ca135
update attention logging
blisc Oct 30, 2023
93315b3
finalize
blisc Oct 30, 2023
0182c8f
fix double attention call
blisc Oct 31, 2023
1ca53ab
Add sliced attention logging; add option to set context length
blisc Oct 31, 2023
1bb8ab4
inference eval code (#8)
paarthneekhara Oct 31, 2023
43115d9
update attention logging, start attention prior
blisc Nov 2, 2023
fb59c97
cleanup and add prior annealing; add scratch config
blisc Nov 3, 2023
1f436b2
YA prior change
blisc Nov 6, 2023
8a5780a
add back attention masks, add spec aug options; update prior again
blisc Nov 9, 2023
08f4029
update exp manager
blisc Nov 21, 2023
ece8322
Speechllm 2310 rebased (#9)
paarthneekhara Nov 21, 2023
7d92f33
add comment
blisc Nov 24, 2023
072683a
merge
blisc Nov 24, 2023
43fb29b
break prior for now, update alibi, remove val layer from prog bar, ma…
blisc Nov 28, 2023
49db9ce
typo
blisc Nov 28, 2023
fe8771d
Add debug msg in model; update dataset logic re context
blisc Nov 29, 2023
b256c52
update tarred to match non-tarred setup
blisc Nov 30, 2023
8c30c24
Attention Prior, Tarred MLS Dataset, Top K Sampling, Multitask audio …
shehzeen Dec 5, 2023
68d9c41
inference updates, top k sampling, edit speech task switched to use p…
paarthneekhara Dec 6, 2023
6e7c043
merge with main
blisc Dec 6, 2023
5ddca71
working gpt code; have to update megatron-core in docker
blisc Dec 6, 2023
68bd820
merge with t5 branch; ATTN PRIOR is currently BROKEN
blisc Dec 6, 2023
e90d513
model is running
blisc Dec 7, 2023
b966e65
enable attn prior; update logging and printing
blisc Dec 7, 2023
2381f95
fix validation loop; change logging a bit
blisc Dec 8, 2023
1632400
undo commit of tts tut
blisc Dec 8, 2023
1a85963
unpin apex in Docker; and fix Docker for our usecases
blisc Dec 12, 2023
db0756b
finalized some more logging
blisc Dec 12, 2023
6e0d89f
fix loading issue
blisc Dec 13, 2023
982fc85
Speechllm 2312 paarth (#12)
paarthneekhara Dec 15, 2023
0ee68e2
some final inference fixes for custom number of codebooks
paarthneekhara Dec 15, 2023
212420c
set dataloader seed randomly
paarthneekhara Dec 16, 2023
cde896d
some changes for multilingual model
paarthneekhara Dec 19, 2023
8da385f
bug
paarthneekhara Dec 19, 2023
d90acb9
bug fix
paarthneekhara Dec 19, 2023
d673a27
bring back cfg.seed
paarthneekhara Dec 20, 2023
007e8e4
allow delay pattern in context, allow context to by just text (for sp…
paarthneekhara Dec 21, 2023
0568a71
ctc loss
paarthneekhara Dec 27, 2023
86d748d
remove log softmax since it is being handled
paarthneekhara Dec 27, 2023
f276b9c
always return attention prob (for forward sum loss)
paarthneekhara Dec 27, 2023
899f0fd
added alignment loss scale
paarthneekhara Dec 28, 2023
9bd60dd
phoneme/tts eer logging and continuous eval script
paarthneekhara Dec 29, 2023
21db991
eval script update
paarthneekhara Dec 30, 2023
19f403f
some final inference fixes for custom number of codebooks (#14)
paarthneekhara Jan 2, 2024
e382a1b
merge with latest code
blisc Jan 2, 2024
3316149
nemo audio codec related changes in the new branch
shehzeen Jan 3, 2024
57e7103
Merge pull request #21 from shehzeen/speechllm_2312_nemocodec
paarthneekhara Jan 4, 2024
2c5522b
update for phones
blisc Jan 4, 2024
5786c1a
add ctc loss
blisc Jan 5, 2024
d6c697c
remove print
blisc Jan 5, 2024
2429ca7
remove gradients
blisc Jan 8, 2024
36605c3
run encoder once
blisc Jan 8, 2024
973f611
finish caching kv for both self and cross
blisc Jan 9, 2024
a39931a
fix positional embedding
blisc Jan 9, 2024
ef58ce4
switch to multilingual ASR setup
blisc Jan 9, 2024
8658c4a
working inference
blisc Jan 11, 2024
6a27793
Speechllm 2312 phones paarth (#16)
paarthneekhara Jan 24, 2024
4c854a1
add some changes for new langs
blisc Jan 26, 2024
3295186
remove check in getittem
blisc Jan 26, 2024
4c8ba20
add g2p update to infer config
blisc Jan 30, 2024
a108972
add tokenizer script
blisc Jan 31, 2024
41d1b18
Speechllm 2312 phones paarth feb24 (#17)
paarthneekhara Feb 7, 2024
be0f30a
add saving context and change some statements
blisc Feb 7, 2024
c563dba
Merge branch 'speechllm_2312_phones' of github.com:blisc/NeMo into sp…
blisc Feb 7, 2024
6efe75d
merge with main
blisc Feb 7, 2024
cb81db4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 7, 2024
4b00f67
the easy clean up bits
blisc Feb 7, 2024
dca4a84
move confs
blisc Feb 8, 2024
dc72731
move examples
blisc Feb 8, 2024
17c1d20
move datasets
blisc Feb 8, 2024
ad9978b
remove changes to codec; remove gpt pretraining parts; cleanup asr+co…
blisc Feb 9, 2024
df9ab97
initial cleanup of gpt inside nlp/models
blisc Feb 9, 2024
51af97f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2024
c6426c7
initial cleanup of t5 in nlp/models
blisc Feb 9, 2024
462488d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 9, 2024
13dc877
bubble up vocab_parallel_cross_entropy changes
blisc Feb 13, 2024
9a70acc
bubble up get_ltor_masks_and_position_ids changes
blisc Feb 13, 2024
42d27de
undo changes in build_position_id
blisc Feb 13, 2024
8152e92
undo changes in build_position_id
blisc Feb 13, 2024
a49c02b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 13, 2024
235c65c
general cleanup of nlp sections
blisc Feb 14, 2024
eddec89
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 14, 2024
4680ba3
changes before merge
blisc Feb 14, 2024
f7b07dd
Merge remote-tracking branch 'nvidia/main' into speechllm_tts_2402
blisc Feb 14, 2024
f509996
clean up yaml configs; working t5 inference script
blisc Feb 14, 2024
eb23f17
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 14, 2024
aa751fe
remove gpt model from PR
blisc Feb 14, 2024
f291b58
final cleanup
blisc Feb 15, 2024
119d46a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 15, 2024
fe4c0e9
remove some files
blisc Feb 16, 2024
d24b924
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 16, 2024
37b4831
add examples folder
blisc Mar 7, 2024
9201716
add more missing files
blisc Mar 7, 2024
d20705f
update with latest code
blisc Mar 20, 2024
f8a4f51
add interpolator option and bug fixes for EN models
blisc Mar 21, 2024
9e75654
allow for different attention setups in decoder
blisc Apr 2, 2024
1945810
update data paths
blisc Apr 5, 2024
53027fd
update inference; add from scratch yamls
blisc Apr 19, 2024
c4ce85c
fix inference
blisc May 6, 2024
2d838eb
added autoregressive inference to validation. tested changes. trainin…
shehzeen May 7, 2024
149 changes: 149 additions & 0 deletions examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml
@@ -0,0 +1,149 @@
name: megatron_t5_speechllm_tts_inference
checkpoint_path: ???

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 32
  logger: False
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 10000
  max_steps: -1
  log_every_n_steps: 10
  val_check_interval: null
  check_val_every_n_epoch: 3
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below
    filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 16
  micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
  report_validation_metric: False
  validation_metric: accuracy
  num_speech_tokens: 10112 # Vocabulary size pertaining to speech
  seq_pattern: "parallel" # parallel, delay_parallel, flatten
  temperature: 0.7 # Temperature to be used for inference
  top_k: 80 # Top k to be used for inference
  max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: []
  new_tasks: ["squad"]

  task_templates:
  - taskname: "squad"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
    total_virtual_tokens: 3
    virtual_token_splits: [3]
    truncate_field: context
    answer_field: answer

  p_tuning: # P-tuning specific params
    encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default
    num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp
    dropout: 0.0

  prompt_tuning: # Prompt-tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  data:
    grapheme_prefix: null
    train_ds: null
    validation_ds: null
    test_ds: ???
    max_seq_length: 1536
    sample_rate: 24000
    add_eos: true
    add_bos: false
    decoder_starts_with_pad: False
    add_eos_to_decoder_output: True
    add_sentinel_to_input: True
    ul2_prompt_token: null # <extra_id_s>, <extra_id_r>, <extra_id_x>
    shuffle: true
    num_workers: 4
    pin_memory: true
    speech_offset: 30000
    train_task: asr
    sup_data_path: None
    g2p:
      english:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_prefix: ${model.data.grapheme_prefix}
      spanish:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict"
        phoneme_probability: 0.8
        use_chars: True
        use_stresses: True
        ignore_ambiguous_words: False
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "es-ES"
      mandarin:
        _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
        phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
        word_segmenter: "jieba"
        phoneme_prefix: ""
        phoneme_case: "lower"
        tone_prefix: "#"
        ascii_letter_prefix: ${model.data.grapheme_prefix}
        ascii_letter_case: "upper"
      german:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict"
        heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_case: mixed
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "de-DE"

  optim:
    name: fused_adam
    lr: 5e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
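
Note on the config above: checkpoint_path, model.language_model_path, and model.data.test_ds are set to "???", OmegaConf's marker for mandatory values, so they must be supplied at launch time. The following is a minimal sketch (not part of this PR) of how the file could be loaded and those fields overridden with OmegaConf before being handed to an inference script; all paths and values are placeholders.

# Minimal sketch, not from this PR: fill the mandatory "???" fields of
# megatron_t5_speechllm_inference.yaml with OmegaConf. Paths are placeholders.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml")

overrides = OmegaConf.create(
    {
        "checkpoint_path": "/models/speechllm_t5.ckpt",          # placeholder
        "model": {
            "language_model_path": "/models/megatron_t5.nemo",   # placeholder
            "data": {"test_ds": "/data/test_manifest.json"},     # placeholder
        },
    }
)
cfg = OmegaConf.merge(cfg, overrides)

# Interpolations such as ${model.global_batch_size} stay lazy until resolved,
# so printing without resolution keeps them visible for inspection.
print(OmegaConf.to_yaml(cfg.model, resolve=False))

The same overrides can equivalently be passed on the command line as Hydra-style dotted arguments (for example model.data.test_ds=...), which is the usual way NeMo example scripts are launched.
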
221 changes: 221 additions & 0 deletions examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml
@@ -0,0 +1,221 @@
name: megatron_t5_speechllm_tts_inference
checkpoint_path: ???

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 32
  logger: False
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 10000
  max_steps: -1
  log_every_n_steps: 10
  val_check_interval: null
  check_val_every_n_epoch: 3
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below
    filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 16
  micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
  report_validation_metric: False
  validation_metric: accuracy
  num_speech_tokens: 10112 # Vocabulary size pertaining to speech
  seq_pattern: "parallel" # parallel, delay_parallel, flatten
  speech_head_type: "linear" # token_level, linear
  cross_entropy_type: "vocab_parallel" # regular, vocab_parallel
  temperature: 0.7 # Temperature to be used for inference
  top_k: 80 # Top k to be used for inference
  max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: []
  new_tasks: ["squad"]
  codecmodel_type: nemo_codec
  codecmodel_path: ???
  english_only_model: true
  context_conditioning: decoder
  train_from_scratch: true
  override_tokenizer_vocab_file: ???
  use_flash_attention: true
  lm_vocab_size: 30000

  frozen_model:
    # micro_batch_size: null
    # global_batch_size: null
    # megatron_amp_O2: true
    # seq_length: 512
    # max_position_embeddings: 512
    # precision: bf16
    # Above is overridden in code
    tensor_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    pipeline_model_parallel_split_rank: 0
    make_vocab_size_divisible_by: 128
    pre_process: true
    post_process: true
    gradient_as_bucket_view: true
    native_amp_init_scale: 4294967296
    native_amp_growth_interval: 1000
    fp16_lm_cross_entropy: false
    seed: 1234
    use_cpu_initialization: false
    apex_transformer_log_level: 30
    tokenizer:
      library: megatron
      type: BertWordPieceCase
      model: null
      vocab_file: null
      merge_file: null
      # num_sentinel_tokens: 100
    optim:
      name: null
    data:
      dataset_type: t5
    encoder:
      arch: transformer
      bias_activation_fusion: false
      use_flash_attention: ${model.use_flash_attention}
      num_layers: 12
      hidden_size: 768
      ffn_hidden_size: 2048
      num_attention_heads: 12
      init_method_std: 0.015
      hidden_dropout: 0.1
      attention_dropout: 0.1
      kv_channels: 64
      activation: geglu
    decoder:
      arch: transformer
      bias_activation_fusion: false
      use_flash_attention: ${model.use_flash_attention}
      num_layers: 12
      hidden_size: 768
      ffn_hidden_size: 2048
      num_attention_heads: 12
      init_method_std: 0.015
      hidden_dropout: 0.1
      attention_dropout: 0.1
      kv_channels: 64
      activation: geglu

  task_templates:
  - taskname: "squad"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
    total_virtual_tokens: 3
    virtual_token_splits: [3]
    truncate_field: context
    answer_field: answer

  p_tuning: # P-tuning specific params
    encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default
    num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp
    dropout: 0.0

  prompt_tuning: # Prompt-tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  data:
    grapheme_prefix: null
    train_ds: null
    validation_ds: null
    test_ds: ???
    max_seq_length: 1536
    sample_rate: 24000
    add_eos: true
    add_bos: false
    decoder_starts_with_pad: False
    add_eos_to_decoder_output: True
    add_sentinel_to_input: True
    ul2_prompt_token: null # <extra_id_s>, <extra_id_r>, <extra_id_x>
    shuffle: true
    num_workers: 4
    pin_memory: true
    speech_offset: 30000
    train_task: asr
    sup_data_path: None
    num_speech_codebooks: 8
    codebook_fps: 86
    context_duration_min: 2.9
    context_duration_max: 2.9
    g2p:
      english:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_prefix: ${model.data.grapheme_prefix}
      spanish:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict"
        phoneme_probability: 0.8
        use_chars: True
        use_stresses: True
        ignore_ambiguous_words: False
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "es-ES"
      mandarin:
        _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
        phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
        word_segmenter: "jieba"
        phoneme_prefix: ""
        phoneme_case: "lower"
        tone_prefix: "#"
        ascii_letter_prefix: ${model.data.grapheme_prefix}
        ascii_letter_case: "upper"
      german:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict"
        heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_case: mixed
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "de-DE"

  optim:
    name: fused_adam
    lr: 5e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
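
Note on seq_pattern and num_speech_codebooks in the configs above: the model consumes 8 parallel codec codebook streams, and seq_pattern selects how they are arranged (parallel, delay_parallel, or flatten). The sketch below only illustrates the delay idea, under the assumption that delay_parallel shifts codebook k right by k frames so earlier codebooks are already available when later ones are predicted; it is not the PR's implementation, and pad_id is a hypothetical placeholder token.

# Illustrative sketch of a delay_parallel layout (assumption: codebook k is
# delayed by k frames). Not the PR's implementation; pad_id is a placeholder.
import torch

def to_delay_parallel(codes: torch.Tensor, pad_id: int) -> torch.Tensor:
    """codes: (num_codebooks, T) -> (num_codebooks, T + num_codebooks - 1)."""
    num_codebooks, num_frames = codes.shape
    out = torch.full(
        (num_codebooks, num_frames + num_codebooks - 1), pad_id, dtype=codes.dtype
    )
    for k in range(num_codebooks):
        out[k, k : k + num_frames] = codes[k]  # codebook k shifted right by k frames
    return out

def from_delay_parallel(delayed: torch.Tensor) -> torch.Tensor:
    """Undo the shift so every codebook is re-aligned to the same frame index."""
    num_codebooks, total = delayed.shape
    num_frames = total - (num_codebooks - 1)
    return torch.stack([delayed[k, k : k + num_frames] for k in range(num_codebooks)])

codes = torch.arange(8 * 4).reshape(8, 4)  # toy example: 8 codebooks, 4 frames
delayed = to_delay_parallel(codes, pad_id=-1)
assert torch.equal(from_delay_parallel(delayed), codes)
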