Adding long-form audio speaker diarization (clustering) class and fun…

…ctions (#7737) * Adding long-form audio clustering for diarization Signed-off-by: Taejin Park <tango4j@gmail.com> * Adding unit test changes Signed-off-by: Taejin Park <tango4j@gmail.com> * Added tests for torch jit script Signed-off-by: Taejin Park <tango4j@gmail.com> * Added variable value checking line Signed-off-by: Taejin Park <tango4j@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Added needed params to all yamls Signed-off-by: Taejin Park <tango4j@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Merged latest main and updated speaker utils Signed-off-by: Taejin Park <tango4j@gmail.com> * Fixed code formatting error in speaker_utils.py Signed-off-by: Taejin Park <tango4j@gmail.com> * Some minor fixes for doc-strings Signed-off-by: Taejin Park <tango4j@gmail.com> * Removed unnecessary comments Signed-off-by: Taejin Park <tango4j@gmail.com> * Reflected comments and made changes Signed-off-by: Taejin Park <tango4j@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Minor changes on typos and comments Signed-off-by: Taejin Park <tango4j@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixes for code QL Signed-off-by: Taejin Park <tango4j@gmail.com> * Fixed docstring errors Signed-off-by: Taejin Park <tango4j@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reflected the second batch of comments Signed-off-by: Taejin Park <tango4j@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updating all yamls for inference Signed-off-by: Taejin Park <tango4j@gmail.com> * Added None-checker to forward to prevent type errors Signed-off-by: Taejin Park <tango4j@gmail.com> --------- 
Signed-off-by: Taejin Park <tango4j@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Nithin Rao <nithinrao.koluguri@gmail.com>
NVIDIA · Nov 7, 2023 · df9f0d1 · df9f0d1
1 parent b1bd2db
commit df9f0d1
Show file tree

Hide file tree

Showing 8 changed files with 707 additions and 103 deletions.
diff --git a/examples/speaker_tasks/diarization/conf/inference/diar_infer_general.yaml b/examples/speaker_tasks/diarization/conf/inference/diar_infer_general.yaml
@@ -52,7 +52,9 @@ diarizer:
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. 
       sparse_search_volume: 10 # The higher the number, the more values will be examined with more time. 
       maj_vote_spk_count: False  # If True, take a majority vote on multiple p-values to estimate the number of speakers.
-
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio) 
+
   msdd_model:
     model_path: null  # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)
     parameters:
@@ -88,5 +90,4 @@ diarizer:
       arpa_language_model: null # Provide a KenLM language model in .arpa format.
       min_number_of_words: 3 # Min number of words for the left context.
       max_number_of_words: 10 # Max number of words for the right context.
-      logprob_diff_threshold: 1.2  # The threshold for the difference between two log probability values from two hypotheses.
-
+      logprob_diff_threshold: 1.2  # The threshold for the difference between two log probability values from two hypotheses.
diff --git a/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml b/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml
@@ -52,6 +52,8 @@ diarizer:
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. 
       sparse_search_volume: 30 # The higher the number, the more values will be examined with more time. 
       maj_vote_spk_count: False  # If True, take a majority vote on multiple p-values to estimate the number of speakers.
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio) 
 
   msdd_model:
     model_path: null # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)
@@ -88,5 +90,4 @@ diarizer:
       arpa_language_model: null # Provide a KenLM language model in .arpa format.
       min_number_of_words: 3 # Min number of words for the left context.
       max_number_of_words: 10 # Max number of words for the right context.
-      logprob_diff_threshold: 1.2  # The threshold for the difference between two log probability values from two hypotheses.
-
+      logprob_diff_threshold: 1.2  # The threshold for the difference between two log probability values from two hypotheses.
diff --git a/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml b/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml
@@ -44,14 +44,16 @@ diarizer:
       multiscale_weights: [1,1,1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33]
       save_embeddings: True # If True, save speaker embeddings in pickle format. This should be True if clustering result is used for other models, such as `msdd_model`.
 
-  clustering:
+  clustering: 
     parameters:
       oracle_num_speakers: False # If True, use num of speakers value provided in manifest file.
       max_num_speakers: 8 # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored.
       enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated.
       max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. 
       sparse_search_volume: 30 # The higher the number, the more values will be examined with more time. 
       maj_vote_spk_count: False  # If True, take a majority vote on multiple p-values to estimate the number of speakers.
+      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
+      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio) 
 
   msdd_model:
     model_path: diar_msdd_telephonic # .nemo local model path or pretrained model name for multiscale diarization decoder (MSDD)
@@ -88,5 +90,4 @@ diarizer:
       arpa_language_model: null # Provide a KenLM language model in .arpa format.
       min_number_of_words: 3 # Min number of words for the left context.
       max_number_of_words: 10 # Max number of words for the right context.
-      logprob_diff_threshold: 1.2  # The threshold for the difference between two log probability values from two hypotheses.
-
+      logprob_diff_threshold: 1.2  # The threshold for the difference between two log probability values from two hypotheses.