From 9d450aaa544ca65e214b78f182c9e99e86f8b89d Mon Sep 17 00:00:00 2001
From: stevehuang52
Date: Mon, 3 Jun 2024 14:17:08 -0400
Subject: [PATCH] move AED chunked infer script

Signed-off-by: stevehuang52
---
 examples/asr/asr_chunked_inference/README.md |  8 ++++++--
 .../aed}/speech_to_text_aed_chunked_infer.py | 11 +++++++++--
 2 files changed, 15 insertions(+), 4 deletions(-)
 rename examples/asr/{speech_multitask => asr_chunked_inference/aed}/speech_to_text_aed_chunked_infer.py (96%)

diff --git a/examples/asr/asr_chunked_inference/README.md b/examples/asr/asr_chunked_inference/README.md
index 5b4c79613ed9..fec2e2901c18 100644
--- a/examples/asr/asr_chunked_inference/README.md
+++ b/examples/asr/asr_chunked_inference/README.md
@@ -1,6 +1,6 @@
-# Streaming / Buffered ASR
+# Streaming / Buffered / Chunked ASR
 
-Contained within this directory are scripts to perform streaming or buffered inference of audio files using CTC / Transducer ASR models.
+This directory contains scripts to perform streaming or buffered inference of audio files using CTC / Transducer ASR models, as well as chunked inference for MultitaskAED models (e.g., "nvidia/canary-1b").
 
 ## Difference between streaming and buffered ASR
 
@@ -9,3 +9,7 @@ While we primarily showcase the defaults of these models in buffering mode, note
 If you reduce your chunk size, the latency for your first prediction is reduced, and the model appears to predict the text with shorter delay. On the other hand, since the amount of information in the chunk is reduced, it causes higher WER.
 
 On the other hand, if you increase your chunk size, then the delay between spoken sentence and the transcription increases (this is buffered ASR). While the latency is increased, you are able to obtain more accurate transcripts since the model has more context to properly transcribe the text.
+
+## Chunked Inference
+
+For MultitaskAED models, we provide a script for chunked inference. The script splits the input audio into non-overlapping chunks, runs inference on each chunk, and concatenates the per-chunk results into the final transcript.
diff --git a/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py b/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py
similarity index 96%
rename from examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py
rename to examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py
index 52d3a86c1018..39b7547923cd 100644
--- a/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py
+++ b/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py
@@ -88,7 +88,9 @@ class TranscriptionConfig:
 
     # Chunked configs
     chunk_len_in_secs: float = 40.0  # Chunk length in seconds
-    model_stride: int = 8  # Model downsampling factor, 8 for Citrinet and FasConformer models and 4 for Conformer models.
+    model_stride: int = (
+        8  # Model downsampling factor, 8 for Citrinet and FastConformer models and 4 for Conformer models.
+    )
 
     # Decoding strategy for MultitaskAED models
     decoding: MultiTaskDecodingConfig = MultiTaskDecodingConfig()
@@ -209,7 +211,12 @@ def autocast(*args, **kwargs):
     with autocast(dtype=amp_dtype):
         with torch.no_grad():
             hyps = get_buffered_pred_feat_multitaskAED(
-                frame_asr, model_cfg.preprocessor, model_stride_in_secs, asr_model.device, manifest, filepaths,
+                frame_asr,
+                model_cfg.preprocessor,
+                model_stride_in_secs,
+                asr_model.device,
+                manifest,
+                filepaths,
            )
 
     output_filename, pred_text_attr_name = write_transcription(
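
For reference, a minimal sketch of how the relocated script might be invoked from the repository root after this patch. `chunk_len_in_secs` and `model_stride` are fields of the `TranscriptionConfig` shown in the diff; `pretrained_name`, `audio_dir`, and `output_filename` are assumed here from NeMo's usual transcription configs and should be verified against the script's docstring.

```bash
# Usage sketch, not part of the patch. chunk_len_in_secs and model_stride
# come from the TranscriptionConfig in the diff above; pretrained_name,
# audio_dir, and output_filename are assumed common NeMo transcription
# overrides -- check the script's docstring for the exact parameters.
python examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py \
    pretrained_name="nvidia/canary-1b" \
    audio_dir=/path/to/audio \
    output_filename=transcripts.json \
    chunk_len_in_secs=40.0 \
    model_stride=8
```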