# Streaming multispeaker ASR and diarization tutorial

In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
# The install commands below are disabled by the `if False:` guard; change it to
# `if True:` to actually execute them.

if False: 
    ## Install dependencies
    # `wget` and `text-unidecode` come from pip; sox, libsndfile1 and ffmpeg are
    # system packages installed through apt-get.
    !pip install wget
    !apt-get install sox libsndfile1 ffmpeg
    !pip install text-unidecode

    # ## Install NeMo
    # BRANCH selects the git ref of the NeMo repository that pip installs from.
    # NOTE(review): a later cell sets BRANCH = 'streaming_mulspk_asr'; confirm
    # which branch this install is supposed to target.
    BRANCH = 'main'
    !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

    ## Install TorchAudio
    # `-f` adds the PyTorch wheel index as an extra place to look for packages.
    !pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
import os
import sys

# Show the interpreter search path as it is before this cell modifies it.
print(sys.path)

# Put a local NeMo checkout at the front of the search path so that the
# `import nemo` below resolves to it rather than to an installed package.
sys.path.insert(0, '/home/taejinp/projects/online_diar/NeMo/')

import nemo

# Report where the imported package was actually loaded from.
print("Nemo PATH:", nemo.__path__)

# Name of the NeMo branch this notebook is associated with.
BRANCH = 'streaming_mulspk_asr'

# Online Speaker Diarization

Speaker diarization is the process of determining "who spoke when" in a given audio clip. Depending on the method of processing, speaker diarization can be categorized into two types:

- **Offline Speaker Diarization**: This method assumes access to the entire audio clip. The transcription, indicating which speaker spoke at which time, is provided after processing the audio from start to end.

- **Online Speaker Diarization**: In this approach, the system only gradually gains access to short segments of the audio, typically a few seconds long. The transcription is generated and displayed in real-time as the segmented audio is being processed.


In [None]:
import socket
import sys

# Choose the NeMo checkout for this machine: the first path is used only on the
# host named "aiapps-06052021"; on any other machine, edit the placeholder.
nemo_root = (
    '/home/taejinp/projects/streaming_mulspk_asr/NeMo'
    if socket.gethostname() == "aiapps-06052021"
    else '/your/path/to/NeMo/'
)
sys.path.insert(0, nemo_root)

import nemo

# Show which NeMo package directory is in use.
print("Using Nemo PATH:", nemo.__path__[0])

# !pip install gradio==2.9.0

# Introduction to Online Speaker Diarization

As covered in the speaker diarization inference tutorial, speaker diarization is the task of segmenting audio recordings by speaker label; it answers the question "Who Speaks When?".

While offline speaker diarization has access to the entire audio file and returns the speaker labels all at once, online speaker diarization is a streaming task that processes audio in small chunks. 
Since we only have access to a small chunk of audio at a time, the online speaker diarization system needs to maintain a memory buffer that stores the history of past speakers. At the same time, the system needs to be able to detect new speakers that are not yet in the memory buffer.

This tutorial will cover the following:

- How to run online speaker diarization with NeMo
- How online speaker clustering and the memory buffer work together


In [None]:
from nemo.collections.asr.parts.utils.speaker_utils import audio_rttm_map
from nemo.core.config import hydra_runner
import gradio as gr
from scipy.io import wavfile
import numpy as np
import os
import torch

Read the YAML file for online diarization. You have to specify the following items:
    
- Input manifest file (if running a simulation)
- VAD model path
- Speaker embedding extractor model path
- Diarization Decoder model path (Coming soon)
- Punctuation model path (automatically downloaded from NGC)
- Language model path (Coming soon)

Download the NeMo models and specify their paths in the config struct.

In [None]:
import socket

import nemo
import omegaconf

# Every path in this cell is machine specific. The values under
# `on_author_host` only exist on the machine named "aiapps-06052021"; on any
# other machine the placeholders in the `else` branches must be edited.
hostname = socket.gethostname()
on_author_host = hostname == "aiapps-06052021"

# Location of the inference config inside a NeMo source checkout.
YAML_RELATIVE_PATH = "examples/speaker_tasks/diarization/conf/inference/online_diar_infer_general.yaml"
if on_author_host:
    YAML_FILE = "/home/taejinp/projects/streaming_mulspk_asr/NeMo/examples/speaker_tasks/diarization/conf/inference/online_diar_infer_general.yaml"
else:
    # When NeMo is imported from a source checkout, the package directory sits
    # directly under the repository root, next to `examples/`. Override
    # YAML_FILE by hand if NeMo was installed some other way.
    YAML_FILE = os.path.join(os.path.dirname(nemo.__path__[0]), YAML_RELATIVE_PATH)

if not os.path.isfile(YAML_FILE):
    raise FileNotFoundError(
        f"Online diarization config not found at {YAML_FILE!r}. "
        "Set YAML_FILE to online_diar_infer_general.yaml in your NeMo checkout."
    )
cfg = omegaconf.OmegaConf.load(YAML_FILE)

# All outputs of the streaming run (including print_script.sh) go here.
cfg.diarizer.out_dir = "./streaming_diar_output"
os.makedirs(cfg.diarizer.out_dir, exist_ok=True)

cfg.diarizer.asr.parameters.colored_text = False
print(f"socket.gethostname() {hostname}")
if on_author_host:
    cfg.diarizer.manifest_filepath = "/home/taejinp/projects/data/diar_manifest_input/online_diar_demo_01.json"
    cfg.diarizer.vad.model_path = "/home/taejinp/gdrive/model/VAD_models/mVAD_lin_marblenet-3x2x64-4N-256bs-50e-0.01lr-0.001wd.nemo"
    cfg.diarizer.speaker_embeddings.model_path = "/home/taejinp/Downloads/titanet_target_fixed/titanet-l.nemo"
    cfg.diarizer.asr.model_path = "/home/taejinp/gdrive/model/ASR_models/Conformer-CTC-BPE_large_Riva_ASR_set_3.0_ep60.nemo"
else:
    # Please download the following models and run the code.
    # Download CH109 dataset at: https://drive.google.com/drive/folders/1ksq10H-NZbKRfMjEP_WWyBF_G0iAJt6b?usp=sharing
    cfg.diarizer.manifest_filepath = "/your/path/to/ch109.json"
    # Download streaming VAD model at: https://drive.google.com/file/d/1ab42CaYeTkuJSMsMsMLbSS9m5e1isJzx/view?usp=sharing
    cfg.diarizer.vad.model_path = "/your/path/to/mVAD_lin_marblenet-3x2x64-4N-256bs-50e-0.01lr-0.001wd.nemo"
    # Download titanet-m model at: https://drive.google.com/file/d/1xAgjm0udKogPrlQF6cdHLobEKHLY9azA/view?usp=sharing
    cfg.diarizer.speaker_embeddings.model_path = "/your/path/to/titanet-m.nemo"
    # Download Conformer-CTC ASR model at: https://drive.google.com/file/d/1Xg075IbiwL0szI4_a8gYmCPaG1UsgR6E/view?usp=sharing
    cfg.diarizer.asr.model_path = "/your/path/to/Conformer-CTC-BPE_large_Riva_ASR_set_3.0_ep60.nemo"



Now, let's set up the parameters for online diarization.

In [None]:
import os
import socket

from nemo.collections.asr.models import OnlineClusteringDiarizer
from nemo.collections.asr.parts.utils.diarization_utils import OnlineDiarWithASR, write_txt

# Session (manifest entry) that the simulated stream will play back.
cfg.diarizer.simulation_uniq_id = 'citadel_ken'

# Load the manifest up front so a wrong session id fails here with a clear
# message instead of surfacing later as a bare KeyError.
AUDIO_RTTM_MAP = audio_rttm_map(cfg.diarizer.manifest_filepath)
if cfg.diarizer.simulation_uniq_id not in AUDIO_RTTM_MAP:
    raise KeyError(
        f"simulation_uniq_id {cfg.diarizer.simulation_uniq_id!r} is not in the manifest "
        f"{cfg.diarizer.manifest_filepath!r}; set cfg.diarizer.simulation_uniq_id to one of its sessions."
    )

# The absolute output directory only exists on the author's machine; elsewhere
# keep whatever out_dir is already set in cfg so the notebook stays runnable.
if socket.gethostname() == "aiapps-06052021":
    cfg.diarizer.out_dir = '/home/taejinp/projects/run_time/streaming_diar_output_univ'
os.makedirs(cfg.diarizer.out_dir, exist_ok=True)

cfg.diarizer.asr.parameters.streaming_simulation = True
cfg.diarizer.asr.parameters.enforce_real_time = True
cfg.diarizer.asr.parameters.colored_text = False

# OnlineDiarWithASR exposes its diarizer as `.diar`; that is the instance the
# rest of the notebook drives, so no separate OnlineClusteringDiarizer is built
# here. Its per-session fields are filled in by the streaming cell.
online_diar_asr = OnlineDiarWithASR(cfg=cfg)
diar = online_diar_asr.diar

diar.device = online_diar_asr.device
online_diar_asr.reset()

# Switch between the simulated file stream (True) and live microphone input (False).
cfg.diarizer.asr.parameters.streaming_simulation = True
# cfg.diarizer.asr.parameters.streaming_simulation=False

# Real-time pacing is only enforced for the simulated stream.
if not cfg.diarizer.asr.parameters.streaming_simulation:
    cfg.diarizer.asr.parameters.enforce_real_time = False

We can run a simulated audio stream to check whether the streaming system is working properly. While the following cell is running, you can watch the transcription being generated in real time. The output is written to ./streaming_diar_output/print_script.sh, and it can be viewed using "streaming_diarization_viewer.ipynb".


In [None]:
import ipywidgets
import time

# Read-only text area that later cells fill with the running transcript.
box_layout = ipywidgets.Layout(width="90%", height="500px")
widget = ipywidgets.Textarea(
    value='',
    disabled=True,
    layout=box_layout,
)

# Render the (initially empty) text area below this cell.
display(widget)

In [None]:
# Point the diarizer at the session selected in the config and reset its
# per-session state before streaming starts.
diar.uniq_id = cfg.diarizer.simulation_uniq_id
online_diar_asr.get_audio_rttm_map(diar.uniq_id)
session_meta = diar.AUDIO_RTTM_MAP[diar.uniq_id]
diar.single_audio_file_path = session_meta['audio_filepath']
online_diar_asr.rttm_file_path = session_meta['rttm_filepath']

diar._init_segment_variables()
diar.device = online_diar_asr.device

# Start from an empty transcript file; it is re-read after every chunk.
transcript_path = f"{diar._out_dir}/print_script.sh"
write_txt(transcript_path, "")

if cfg.diarizer.asr.parameters.streaming_simulation:
    samplerate, sdata = wavfile.read(diar.single_audio_file_path)

    # Restrict playback to the manifest's [offset, offset + duration) window.
    # A missing or zero offset means "start of file"; testing offset for
    # truthiness would wrongly skip the trim when offset == 0.
    offset_sec = session_meta['offset']
    duration_sec = session_meta['duration']
    if duration_sec:
        offset = samplerate * (offset_sec or 0)
        duration = samplerate * duration_sec
        stt, end = int(offset), int(offset + duration)
        sdata = sdata[stt:end]

    # NOTE(review): the chunk count is derived from n_frame_len while slices are
    # cut with CHUNK_SIZE; presumably the two are equal, so confirm.
    n_chunks = int(np.floor(sdata.shape[0] / online_diar_asr.n_frame_len))
    shift = online_diar_asr.CHUNK_SIZE
    for index in range(n_chunks):
        sample_audio = sdata[shift * index:shift * (index + 1)]
        online_diar_asr.buffer_counter = index
        online_diar_asr.streaming_step(sample_audio)

        # Mirror the transcript written so far into the text widget. The file
        # is closed on every iteration so handles do not pile up.
        with open(transcript_path, 'r') as transcript_file:
            widget.value = transcript_file.read()
else:
    # Live mode: stream microphone audio through a Gradio interface.
    isTorch = torch.cuda.is_available()
    iface = gr.Interface(
        fn=online_diar_asr.audio_queue_launcher,
        inputs=[
            gr.Audio(source="microphone", type="numpy", streaming=True),
            "state",
        ],
        outputs=[
            "textbox",
            "state",
        ],
        layout="horizontal",
        theme="huggingface",
        title=f"NeMo Streaming Conformer CTC Large - English, CUDA:{isTorch}",
        description="Demo for English speech recognition using Conformer Transducers",
        allow_flagging='never',
        live=True,
    )
    iface.launch(share=True)




Now, go to streaming_diarization_viewer.ipynb and check the real-time output.

In [None]:
# Keep mirroring the transcript file into the text widget, ten times a second,
# until the cell is interrupted (Kernel -> Interrupt).
try:
    while True:
        # The context manager closes the file on every pass; this loop runs
        # indefinitely, so unclosed handles would otherwise accumulate.
        with open(f'{diar._out_dir}/print_script.sh', 'r') as transcript_file:
            widget.value = transcript_file.read()
        time.sleep(0.1)
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the viewer.
    pass