Fix a wrong description in offline_diarization_with_asr.yaml (#4141)

* Updated a line in yaml
  Signed-off-by: Taejin Park <tango4j@gmail.com>
* Fixed typo and grammatical errors
  Signed-off-by: Taejin Park <tango4j@gmail.com>
NVIDIA · May 10, 2022 · 650718f · 650718f
1 parent 1d64497
commit 650718f
Showing 1 changed file with 10 additions and 10 deletions.
diff --git a/examples/speaker_tasks/diarization/conf/offline_diarization_with_asr.yaml b/examples/speaker_tasks/diarization/conf/offline_diarization_with_asr.yaml
@@ -7,7 +7,7 @@ batch_size: 64
 diarizer:
   manifest_filepath: ???
   out_dir: ???
-  oracle_vad: False # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps
+  oracle_vad: False # If True, uses RTTM files provided in the manifest file to get speech activity (VAD) timestamps
   collar: 0.25 # Collar value for scoring
   ignore_overlap: True # Consider or ignore overlap segments while scoring
 
@@ -34,29 +34,29 @@ diarizer:
       window_length_in_sec: 1.5 # Window length(s) in sec (floating-point number). Either a number or a list. Ex) 1.5 or [1.5,1.0,0.5]
       shift_length_in_sec: 0.75 # Shift length(s) in sec (floating-point number). Either a number or a list. Ex) 0.75 or [0.75,0.5,0.25]
       multiscale_weights: null # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. Ex) [0.33,0.33,0.33]
-      save_embeddings: False # Save embeddings as pickle file for each audio input.
+      save_embeddings: False # Save speaker embeddings in pickle format.
 
   clustering:
     parameters:
       oracle_num_speakers: False # If True, use num of speakers value provided in manifest file.
-      max_num_speakers: 20 # Max number of speakers for each recording. If oracle num speakers is passed, this value is ignored.
+      max_num_speakers: 20 # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored.
       enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated.
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. 
       sparse_search_volume: 30 # The higher the number, the more values will be examined with more time. 
 
   asr:
-    model_path: ??? # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purpose.
+    model_path: ??? # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes.
     parameters:
       asr_based_vad: False # if True, speech segmentation for diarization is based on word-timestamps from ASR inference.
-      asr_based_vad_threshold: 50 # threshold (multiple of 10ms) for ignoring the gap between two words when generating VAD timestamps using ASR based VAD.
+      asr_based_vad_threshold: 0.05 # Threshold (in sec) that caps the gap between two words when generating VAD timestamps using ASR-based VAD.
       asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null.
-      lenient_overlap_WDER: True # If true, when a word falls into speaker-ovelapped regions, consider the word as a correctly diarized word.
-      decoder_delay_in_sec: null # Native deocder delay. null is recommended to use the default values for each ASR model.
+      lenient_overlap_WDER: True # If true, when a word falls into speaker-overlapped regions, consider the word as a correctly diarized word.
+      decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model.
       word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05  0.2]. 
       word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'.
-      fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide VAD model to use this feature.
-      colored_text: False # If True, use colored text to distiguish speakers in the output transcript.
-      print_time: True # If True, the start of end time of each speaker turn is printed in the output transcript.
+      fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature.
+      colored_text: False # If True, use colored text to distinguish speakers in the output transcript.
+      print_time: True # If True, the start and end times of each speaker turn are printed in the output transcript.
       break_lines: False # If True, the output transcript breaks the line to fix the line width (default is 90 chars)
 
     ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode)