In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
# NOTE(review): this installs the Python `wget` package, whereas the data-download cell
# later in this notebook calls the `wget` shell command — confirm this install is still needed.
!pip install wget

## Install NeMo
# BRANCH selects the NeMo git ref to install; IPython substitutes $BRANCH into the command below.
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()

In [None]:
import json
import torch
import os
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.collections.asr.parts.utils.vad_utils import stitch_segmented_asr_output

# Offline ASR+VAD

In this tutorial, we will demonstrate how to use offline VAD before ASR to extract speech segments. This helps to exclude some non-speech utterances and can save computation resources by removing unnecessary input to the ASR system.

The pipeline includes the following steps.

0. [Prepare data for demonstration](#Prepare-data-for-demonstration)
1. [Use offline VAD to extract speech segments](#Use-offline-VAD-to-extract-speech-segments)
2. [Transcribe speech segments](#Transcribe-speech-segments)
3. [Stitch the prediction text of speech segments](#Stitch-the-prediction-text-of-speech-segments)
4. [Evaluate offline VAD+ASR performance](#Evaluate-offline-VAD+ASR-performance)

## Prepare data for demonstration


In [None]:
!mkdir -p data
!wget -P data/ https://nemo-public.s3.us-east-2.amazonaws.com/chris-sample01_02.wav
!wget -P data/ https://nemo-public.s3.us-east-2.amazonaws.com/chris-sample03.wav
!wget https://nemo-public.s3.us-east-2.amazonaws.com/chris_demo.json

In [None]:
# Manifest read by the VAD stage, and the manifest the VAD stage writes its segments to.
input_manifest, vad_out_manifest_filepath = "chris_demo.json", "vad_out.json"
# vad_marblenet is only an example; any other VAD model can be substituted here.
vad_model = "vad_marblenet"

## Use offline VAD to extract speech segments

Here we are using very simple parameters to demonstrate the process. 

Please choose or tune your own postprocessing parameters.

You can find more details in 
```python 
<NeMo_git_root>/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb and 
<NeMo_git_root>/scripts/voice_activity_detection/vad_tune_threshold.py
```

The <code>vad_infer.py</code> script generates the speech segments. See the script for more details.

In [None]:
# Run vad_infer.py in the notebook's namespace (-i); the $name tokens are filled in by IPython
# from the variables defined in the configuration cell above.
# The onset/offset/min_duration values are the simple demonstration settings; tune them for real data.
%run -i ../../examples/asr/speech_classification/vad_infer.py --config-path="../conf/VAD" --config-name="vad_inference_postprocessing.yaml" \
dataset=$input_manifest \
vad.model_path=$vad_model \
frame_out_dir="chris_demo" \
vad.parameters.window_length_in_sec=0.63 \
vad.parameters.postprocessing.onset=0.5 \
vad.parameters.postprocessing.offset=0.5 \
vad.parameters.postprocessing.min_duration_on=0.5 \
vad.parameters.postprocessing.min_duration_off=0.5 \
out_manifest_filepath=$vad_out_manifest_filepath

## Transcribe speech segments

In [None]:
# Manifest that receives the per-segment transcriptions.
segmented_output_manifest = "asr_segmented_output_manifest.json"
# Citrinet is used as an example; other ASR models can be swapped in.
asr_model = "stt_en_citrinet_1024_gamma_0_25"

The <code>transcribe_speech.py</code> script transcribes each speech segment. See the script for more details.

In [None]:
# Transcribe the segments listed in the VAD output manifest with the chosen ASR model;
# results are written to the manifest named by segmented_output_manifest.
%run -i ../../examples/asr/transcribe_speech.py \
    pretrained_name=$asr_model \
    dataset_manifest=$vad_out_manifest_filepath \
    batch_size=32 \
    amp=True \
    output_filename=$segmented_output_manifest

Let's have a look at the segmented ASR transcript

In [None]:
# Print the first five lines of the segmented ASR output manifest.
!head -n 5 $segmented_output_manifest

## Stitch the prediction text of speech segments

You can also evaluate the whole ASR output by stitching the segmented outputs together.

Note that there may be better methods to stitch them together. Here, we just demonstrate the simplest one: concatenation.

In [None]:
# Path chosen for the stitched ASR output manifest.
stitched_output_manifest = "stitched_asr_output_manifest.json"

In [None]:
# Stitch the per-segment transcripts together and keep the path of the resulting manifest.
# The output path configured in the previous cell is passed explicitly; without it the
# function writes to its own default filename and the configured value is silently replaced.
stitched_output_manifest = stitch_segmented_asr_output(
    segmented_output_manifest,
    stitched_output_manifest=stitched_output_manifest,
)

In [None]:
# Load the stitched manifest, which holds one JSON object per line.
# The context manager guarantees the file handle is closed once reading finishes,
# and blank lines (e.g. a trailing newline) are skipped rather than fed to json.loads.
with open(stitched_output_manifest, 'r', encoding='utf-8') as manifest_file:
    stitched_output = [json.loads(line) for line in manifest_file if line.strip()]

Let's have a look at the stored speech segments of the first sample.

In [None]:
# Show the first stitched entry, then the speech segments stored at the path it records.
first_entry = stitched_output[0]
print(first_entry)
first_entry_segments = torch.load(first_entry['speech_segments_filepath'])
print(f"\n The speech segments of above file is \n {first_entry_segments}")

## Evaluate offline VAD+ASR performance

If we have ground-truth <code>'text'</code> in input_manifest, we can evaluate the performance of our stitched output.

In [None]:
# Load the input manifest (one JSON object per line) that carries the reference transcripts.
# The context manager guarantees the file handle is closed once reading finishes,
# and blank lines (e.g. a trailing newline) are skipped rather than fed to json.loads.
with open(input_manifest, 'r', encoding='utf-8') as manifest_file:
    asr_input = [json.loads(line) for line in manifest_file if line.strip()]

In [None]:
# Sanity check: stitching must yield exactly one entry per input sample.
# The evaluation was not shuffled, so the entries of input_manifest and
# stitched_output_manifest are expected to be in the same order.
assert len(stitched_output) == len(asr_input)

In [None]:
# Pair each reference transcript with its stitched hypothesis, verifying along the way
# that both entries point at the same audio file.
predicted_text = []
ground_truth_text = []
for idx, reference_entry in enumerate(asr_input):
    assert reference_entry['audio_filepath'] == stitched_output[idx]['audio_filepath']
    predicted_text.append(stitched_output[idx]['pred_text'])
    ground_truth_text.append(reference_entry['text'])

In [None]:
# Score the stitched hypotheses against the references; use_cer=False requests the
# word-level rather than character-level error rate.
metric_value = word_error_rate(
    references=ground_truth_text,
    hypotheses=predicted_text,
    use_cer=False,
)
print(f"WER is {metric_value}")