# Streaming multispeaker ASR and diarization tutorial

Set your NeMo path

In [None]:
import os
import sys

# Location of the NeMo checkout to import from.
# Set the NEMO_PATH environment variable to point at your own clone, e.g.
#   export NEMO_PATH=/your/path/to/NeMo/
# When the variable is unset or empty, the original development path is used.
NEMO_PATH = os.environ.get("NEMO_PATH") or '/home/taejinp/projects/streaming_mulspk_asr/NeMo'

# Insert at position 0 so this entry is searched before the rest of sys.path.
sys.path.insert(0, NEMO_PATH)

import nemo
# Show which NeMo package directory was actually imported.
print("Using Nemo PATH:", nemo.__path__[0])

# !pip install gradio==2.9.0

Import the required modules

In [None]:
from nemo.collections.asr.parts.utils.speaker_utils import audio_rttm_map
from nemo.core.config import hydra_runner
import gradio as gr
from scipy.io import wavfile
import numpy as np
import hydra
import os
import torch
from streaming_asr_and_diar import (
OnlineDiarizer,
ASR_DIAR_ONLINE
)

Read the YAML file for online diarization. You have to specify the following items:
    
- input manifest file (only needed for simulation)
- VAD model path
- Speaker embedding extractor model path
- Diarization Decoder model path (Coming soon)
- Punctuation model path (automatically download from NGC)
- Language model path (Coming soon)

Download nemo models and specify the path to config struct.

In [None]:
import socket

from hydra.core.global_hydra import GlobalHydra

# hydra.initialize() raises if Hydra has already been initialized in this
# kernel. Clearing the global state first lets this cell be re-run safely.
GlobalHydra.instance().clear()
hydra.initialize(config_path="conf")
cfg = hydra.compose(config_name="/online_diarization_with_asr.yaml")

hostname = socket.gethostname()
print(hostname)

# On the host named below the values from the YAML file are used unchanged.
# On any other machine the paths are overridden here and must be edited to
# point at your local copies of the dataset and models.
if hostname != "aiapps-06052021":
    # Please download the following models and run the code. 

    cfg.diarizer.out_dir = "./streaming_diar_output"
    os.makedirs(cfg.diarizer.out_dir, exist_ok=True)
    
    # Download CH109 dataset at: https://drive.google.com/drive/folders/1ksq10H-NZbKRfMjEP_WWyBF_G0iAJt6b?usp=sharing
    cfg.diarizer.manifest_filepath = "/your/path/to/ch109.json"

    # Download streaming VAD model at: https://drive.google.com/file/d/1ab42CaYeTkuJSMsMsMLbSS9m5e1isJzx/view?usp=sharing
    cfg.diarizer.vad.model_path = "/your/path/to/mVAD_lin_marblenet-3x2x64-4N-256bs-50e-0.01lr-0.001wd.nemo"

    # Download titanet-m model at: https://drive.google.com/file/d/1xAgjm0udKogPrlQF6cdHLobEKHLY9azA/view?usp=sharing
    cfg.diarizer.speaker_embeddings.model_path = "/your/path/to/titanet-m.nemo"

    # Download Conformer-CTC ASR model at: https://drive.google.com/file/d/1Xg075IbiwL0szI4_a8gYmCPaG1UsgR6E/view?usp=sharing
    cfg.diarizer.asr.model_path = "/your/path/to/Conformer-CTC-BPE_large_Riva_ASR_set_3.0_ep60.nemo"


Initialize ASR_DIAR_ONLINE and OnlineDiarizer Class.

In [None]:
# Options handed to ASR_DIAR_ONLINE below.
params = {'use_cuda': True}

# Built from the manifest; indexed below by session id to get file paths.
AUDIO_RTTM_MAP = audio_rttm_map(cfg.diarizer.manifest_filepath)

diar = OnlineDiarizer(cfg)

# Select the session to stream and attach its audio / RTTM paths to the diarizer.
diar.uniq_id = "en_0638"
session_entry = AUDIO_RTTM_MAP[diar.uniq_id]
diar.single_audio_file_path = session_entry['audio_filepath']
diar.rttm_file_path = session_entry['rttm_filepath']
# diar.rttm_file_path = None # DER calculation slows down online diarization speed
diar._init_segment_variables()

# The ASR wrapper receives the diarizer; the diarizer then adopts the wrapper's device.
asr_diar = ASR_DIAR_ONLINE(diar, cfg=cfg.diarizer, params=params)
diar.device = asr_diar.device
asr_diar.reset()

# True: stream the audio file selected above. False: launch the Gradio microphone UI.
simulation = True
# simulation = False # Run Gradio server with your microphone.

Let's run a simulated audio stream to check that the streaming system is working properly. While the following cell is running, you can watch the transcription being generated in real time. The output is written to ./streaming_diar_output/print_script.sh, and it can be viewed using "streaming_diarization_viewer.ipynb".


In [None]:
if simulation:
    # Simulated stream: read the whole audio file and feed it to the pipeline
    # one fixed-size chunk at a time.
    samplerate, sdata = wavfile.read(diar.single_audio_file_path)
    chunk_size = asr_diar.CHUNK_SIZE
    # Count whole chunks using the same size that is used for slicing, so the
    # loop bound and the slice width cannot disagree. Trailing samples that do
    # not fill a complete chunk are not streamed.
    num_chunks = sdata.shape[0] // chunk_size
    for index in range(num_chunks):
        asr_diar.buffer_counter = index
        sample_audio = sdata[chunk_size * index:chunk_size * (index + 1)]
        asr_diar.callback_sim(sample_audio)
else:
    # Live mode: serve a Gradio interface that streams microphone audio to
    # asr_diar.audio_queue_launcher.
    # Despite its name, isTorch holds whether CUDA is available.
    isTorch = torch.cuda.is_available()
    # NOTE(review): the reference RTTM path was set on `diar` in the
    # initialization cell, while this clears an attribute on `asr_diar` —
    # confirm this is the object that should be cleared.
    asr_diar.rttm_file_path = None
    iface = gr.Interface(
        fn=asr_diar.audio_queue_launcher,
        inputs=[
            gr.inputs.Audio(source="microphone", type='filepath'), 
            "state",
        ],
        outputs=[
            "textbox",
            "state",
        ],
        layout="horizontal",
        theme="huggingface",
        title=f"NeMo Streaming Conformer CTC Large - English, CUDA:{isTorch}",
        # The description now names Conformer-CTC, matching the title above.
        description="Demo for English speech recognition using Conformer-CTC",
        allow_flagging='never',
        live=True,
    )
    iface.launch(share=False)


Now, go to streaming_diarization_viewer.ipynb and check the realtime output.