# Korean ASR with Riva

Refer to: https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-customizing.html

## Download Korean ASR models from NGC

In [1]:
!mkdir -p models/korean
!cd models/korean

In [None]:
# Acoustic models from NGC.
# Note that 
#     1) NGC API is installed.
#     2) The directory where being downloaded must have "write" authority.
#     3) NGC CLI should be executed at the location where models are going to be downloaded.(i.e., at ./models/korean)
#     4) It's OK to execute the NGC CLI outside the container(i.e., local workstation). However, make sure that the downloaded directory should be mounted to the container so that you can access to the models inside the container.

# Conformer-CTC
!ngc registry model download-version "nvidia/tao/speechtotext_ko_kr_conformer:deployable_v1.0"
# Citrinet-1024
!ngc registry model download-version "nvidia/tao/speechtotext_ko_kr_citrinet:deployable_v1.0"

In [None]:
# N-gram decoder models from NGC
# N-Gram
!ngc registry model download-version "nvidia/tao/speechtotext_ko_kr_lm:deployable_v1.0"

In [None]:
# Punctuation(optional)
!ngc registry model download-version "nvidia/tao/punctuationcapitalization_ko_kr_bert_base:deployable_v1.1"

In [2]:
!cd ../..

## Build and deploy

Let's deploy Korean Citrinet-1024 as an example. The detailed pipeline configurations are specified in https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-pipeline-configuration.html.
1. Launch Riva Servicemaker **at your workstation**.
    - `./scripts/build_deploy/riva_servicemaker.sh`
2. Build rmir file **inside the servicemaker container**.
    - `cd /servicemaker-dev`
    - `./scripts/build_deploy/korean_models/riva_asr_citrinet_kr_build.sh`
    - The shell script runs a command like the following:
        ```sh
        riva-build speech-recognition \
            /servicemaker-dev/<rmir_filename>:<encryption_key> \
            /servicemaker-dev/<riva_filename>:<encryption_key> \
            --name=<pipeline_name> \
            --decoder_type=flashlight \
            --decoding_language_model_binary=<KENLM_binary_filename> \
            --decoding_vocab=<decoder_vocab_file>
            ...
        ```
3. Deploy models **inside the servicemaker container**. 
    - `./scripts/build_deploy/korean_models/riva_asr_citrinet_kr_deploy.sh`
    
You can also deploy Conformer-CTC for Korean with the same procedure using those scripts:
- `./scripts/build_deploy/korean_models/riva_asr_conformer_kr_build.sh`
- `./scripts/build_deploy/korean_models/riva_asr_conformer_kr_deploy.sh`

For reference, **building and deploying each model takes more than 30 minutes (Citrinet-1024 takes considerably longer).** After the deployment is done, restart the Riva server on your local workstation.
```bash 
bash resources/riva_quickstart_v2.8.1/riva_stop.sh
bash resources/riva_quickstart_v2.8.1/riva_start.sh
```

## Check whether your model is successfully deployed using Triton APIs.

In [3]:
# install triton client
!pip install tritonclient

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [4]:
import grpc
from tritonclient.grpc import service_pb2, service_pb2_grpc

# Open an insecure gRPC channel to the server and build the inference-service stub.
# Alternative endpoint: "localhost:8001"
trt_channel = grpc.insecure_channel("riva-speech:8001")
grpc_stub = service_pb2_grpc.GRPCInferenceServiceStub(trt_channel)

# Liveness probe: any failure is printed instead of being raised.
try:
    request = service_pb2.ServerLiveRequest()
    response = grpc_stub.ServerLive(request)
except Exception as ex:
    print(ex)
else:
    print(f"server {response}")

server live: true



In [5]:
# Ask the server for its model repository index.
request = service_pb2.RepositoryIndexRequest()
response = grpc_stub.RepositoryIndex(request)

# Report the total model count, then only the entries whose name contains "woojin".
print(f"num models: {len(response.models)}\n")
woojin_models = [model for model in response.models if "woojin" in model.name]
print(woojin_models)

num models: 35

[name: "riva-trt-woojin-citrinet-1024-ko-KR-asr-streaming-am-streaming"
version: "1"
state: "READY"
, name: "riva-trt-woojin-conformer-ko-KR-asr-streaming-am-streaming"
version: "1"
state: "READY"
, name: "woojin-citrinet-1024-ko-KR-asr-streaming"
version: "1"
state: "READY"
, name: "woojin-citrinet-1024-ko-KR-asr-streaming-ctc-decoder-cpu-streaming"
version: "1"
state: "READY"
, name: "woojin-citrinet-1024-ko-KR-asr-streaming-endpointing-streaming"
version: "1"
state: "READY"
, name: "woojin-citrinet-1024-ko-KR-asr-streaming-feature-extractor-streaming"
version: "1"
state: "READY"
, name: "woojin-conformer-ko-KR-asr-streaming"
version: "1"
state: "READY"
, name: "woojin-conformer-ko-KR-asr-streaming-ctc-decoder-cpu-streaming"
version: "1"
state: "READY"
, name: "woojin-conformer-ko-KR-asr-streaming-endpointing-streaming"
version: "1"
state: "READY"
, name: "woojin-conformer-ko-KR-asr-streaming-feature-extractor-streaming"
version: "1"
state: "READY"
]


## Offline test

In [None]:
# librosa and its dependency
!apt-get install -y libsndfile1-dev
!pip install librosa

In [8]:
import io

import IPython.display as ipd
import librosa
import riva.client

# Sample recording used by the recognition cells in this notebook.
sample = "./samples/korean_sample.wav"

# Connect to the Riva Speech API server and create the ASR client.
auth = riva.client.Auth(uri="riva-speech:50051")
riva_asr = riva.client.ASRService(auth)
# Other clients, not created here:
# riva_nlp = riva.client.NLPService(auth)
# riva_tts = riva.client.SpeechSynthesisService(auth)

# Decode the sample; sr=None keeps the file's own sampling rate.
audio, sr = librosa.core.load(sample, sr=None)

# Also keep the raw file bytes, which are what the recognition calls send.
with io.open(sample, "rb") as wav_file:
    content = wav_file.read()

# Render an inline audio player for the sample.
ipd.Audio(sample)

### Offline synchronous call

In [15]:
from copy import deepcopy

# Recognition settings for the offline calls.
offline_config = riva.client.RecognitionConfig(
    # In the case where multiple models might be able to fulfill the client request, one model is selected at random.
    model="woojin-citrinet-1024-ko-KR-asr-streaming",
    # model="woojin-conformer-ko-KR-asr-streaming",
    encoding=riva.client.AudioEncoding.LINEAR_PCM,  # Supports LINEAR_PCM, FLAC, MULAW and ALAW audio encodings
    sample_rate_hertz=sr,                           # Audio will be resampled if necessary
    audio_channel_count=1,                          # Mono channel
    max_alternatives=1,                             # How many top-N hypotheses to return
    enable_automatic_punctuation=True,              # Add punctuation when end of VAD detected
    verbatim_transcripts=False,
)

response = riva_asr.offline_recognize(content, offline_config)

# Best hypothesis of the first result.
top_alternative = response.results[0].alternatives[0]
asr_best_transcript = top_alternative.transcript
print("ASR Transcript:", asr_best_transcript)

print("\n\nFull Response Message:", response, sep="\n")

ASR Transcript: 안녕하세요 만나뵙게 돼서 반갑습니다. 


Full Response Message:
results {
  alternatives {
    transcript: "안녕하세요 만나뵙게 돼서 반갑습니다. "
    confidence: -0.525097251
  }
  channel_tag: 1
  audio_processed: 3.68000054
}



### Offline asynchronous call
To make the call asynchronous, pass the argument `future=True` to `ASRService.offline_recognize()`.

In [13]:
from time import time

num_repeats = 10

# Synchronous: every call blocks until its transcript comes back.
start_time = time()
sync_transcripts = [
    riva_asr.offline_recognize(content, offline_config).results[0].alternatives[0].transcript
    for _ in range(num_repeats)
]
print(f"Time spent on synchronous recognition: {time() - start_time:.2f}")

# Asynchronous: submit all requests first (future=True), then collect each result.
start_time = time()
futures = [
    riva_asr.offline_recognize(content, offline_config, future=True)
    for _ in range(num_repeats)
]
async_transcripts = [
    pending.result().results[0].alternatives[0].transcript
    for pending in futures
]
print(f"Time spent on async recognition: {time() - start_time:.2f}")

Time spent on synchronous recognition: 1.18
Time spent on async recognition: 0.49


In [None]:
# Punctuation model test (optional)
# Create the NLP client here: the setup cell only creates the ASR client (its
# `riva_nlp` line is commented out), so `riva_nlp` would otherwise be undefined.
riva_nlp = riva.client.NLPService(auth)

model_name = "woojin-punctuation-KR"
response = riva_nlp.transform_text(input_strings=asr_best_transcript, model_name=model_name)

# Print one transformed string per line.
print("Transformed results are:")
print("\n".join(response.text))

## Streaming test

To imitate audio streaming, use `riva.client.AudioChunkFileIterator`. You can imitate realtime audio by providing a delay callback to the iterator.

In [17]:
# The streaming config wraps a copy of the offline config and requests interim results.
streaming_config = riva.client.StreamingRecognitionConfig(
    config=deepcopy(offline_config),
    interim_results=True,
)

# Chunk size equals the file's frame rate, i.e. it corresponds to 1 sec of audio.
wav_parameters = riva.client.get_wav_file_parameters(sample)
chunk_size = wav_parameters['framerate']

# Walk through the file chunk by chunk and print each chunk's index and length.
with riva.client.AudioChunkFileIterator(
    sample,
    chunk_size,
    delay_callback=riva.client.sleep_audio_length,
) as audio_chunk_iterator:
    for index, chunk in enumerate(audio_chunk_iterator):
        print(index, len(chunk))

0 32000
1 32000
2 32000
3 21418


The audio chunks are then passed to `ASRService.streaming_response_generator()`, which creates a response generator (response type: `StreamingRecognizeResponse`). For descriptions of the metrics in the response object, please refer to https://cloud.google.com/speech-to-text/docs/speech-to-text-requests.

In [19]:
# Use the iterator as a context manager so its resources are released once the
# stream has been consumed.
with riva.client.AudioChunkFileIterator(sample, chunk_size) as audio_chunk_iterator:
    response_generator = riva_asr.streaming_response_generator(audio_chunk_iterator, streaming_config)
    # Drain the generator inside the `with` block, keeping the results of every response.
    pbToList = [res.results for res in response_generator]

# Transcript of the first result in the last response.
print(pbToList[-1][0].alternatives[0].transcript)

안녕하세요 만나뵙게 돼서 반갑습니다. 


Riva provides the function `riva.client.print_streaming()` to display protobuf streaming results conveniently.

In [22]:
# Use the iterator as a context manager so its resources are released once the
# stream has been consumed.
with riva.client.AudioChunkFileIterator(sample, chunk_size) as audio_chunk_iterator:
    response_generator = riva_asr.streaming_response_generator(audio_chunk_iterator, streaming_config)
    # Print every streaming response together with timing information.
    riva.client.print_streaming(response_generator, additional_info='time')

>>>Time 1673430265.44s: 안녕
>>>Time 1673430265.44s: 안녕하세요
>>>Time 1673430265.45s: 안녕하세요
>>>Time 1673430265.45s: 안녕하세요
>>>Time 1673430265.45s: 안녕하세요
>>>Time 1673430265.45s: 안녕하세요 만나뵙
>>>Time 1673430265.46s: 안녕하세요 만나뵙게 되
>>>Time 1673430265.46s: 안녕하세요 만나뵙게 돼서
>>>Time 1673430265.46s: 안녕하세요 만나 뵙게 돼서 반갑
>>>Time 1673430265.47s: 안녕하세요 만나뵙게 돼서 반갑습니다
>>>Time 1673430265.47s: 안녕하세요 만나뵙게 돼서 반갑습니다
>>>Time 1673430265.47s: 하세요 만나뵙게 돼서 반갑습니다
>>>Time 1673430265.48s: 세요 만나뵙게 돼서 반갑습니다
>>>Time 1673430265.48s: 만나뵙게 돼서 반갑습니다
>>>Time 1673430265.48s: 많나게 돼서 반갑습니다
Time 0.13s: Transcript 0: 안녕하세요 만나뵙게 돼서 반갑습니다. 


If you set a delay callback in the audio chunk iterator and `show_intermediate=True` in `riva.client.print_streaming()`, you will be able to watch the transcript being formed.

In [24]:
# The delay callback paces the chunks; the context manager releases the
# iterator's resources once the stream has been consumed.
with riva.client.AudioChunkFileIterator(
    sample, chunk_size, delay_callback=riva.client.sleep_audio_length,
) as audio_chunk_iterator:
    response_generator = riva_asr.streaming_response_generator(audio_chunk_iterator, streaming_config)
    riva.client.print_streaming(response_generator, show_intermediate=True)

## 안녕하세요 만나뵙게 돼서 반갑습니다.  


It is also possible to print streaming results in several places, e.g. in STDOUT and a file.

In [25]:
import os
import sys

output_file = "./outputs/kr_asr_streaming_test_results.txt"
# Make sure the target directory exists before the results file is written.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Use the iterator as a context manager so its resources are released once the
# stream has been consumed.
with riva.client.AudioChunkFileIterator(sample, chunk_size) as audio_chunk_iterator:
    response_generator = riva_asr.streaming_response_generator(audio_chunk_iterator, streaming_config)
    # Send the results to both STDOUT and the output file.
    riva.client.print_streaming(response_generator, additional_info='confidence', output_file=[sys.stdout, output_file])

>> 안녕
Stability:    0.1000
----
>> 안녕하세요
Stability:    0.1000
----
>> 안녕하세요
Stability:    0.1000
----
>> 안녕하세요
Stability:    0.1000
----
>> 안녕하세요
Stability:    0.1000
----
>> 안녕하세요 만나뵙
Stability:    0.1000
----
>> 안녕하세요 만나뵙게 되
Stability:    0.1000
----
>> 안녕하세요 만나뵙게 돼서
Stability:    0.1000
----
>> 안녕하세요 만나 뵙게 돼서 반갑
Stability:    0.1000
----
>> 안녕하세요 만나뵙게 돼서 반갑습니다
Stability:    0.1000
----
>> 안녕하세요 만나뵙게 돼서 반갑습니다
Stability:    0.1000
----
>> 하세요 만나뵙게 돼서 반갑습니다
Stability:    0.1000
----
>> 세요 만나뵙게 돼서 반갑습니다
Stability:    0.1000
----
>> 만나뵙게 돼서 반갑습니다
Stability:    0.1000
----
>> 많나게 돼서 반갑습니다
Stability:    0.1000
----
## 안녕하세요 만나뵙게 돼서 반갑습니다. 
Confidence:   -0.5250
----


## Audio input/output test

This section depends on **PyAudio**. Please install it first.

In [None]:
!apt-get install -y libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 python3-pyaudio
!pip install pyaudio
# For some errors: https://stackoverflow.com/questions/59006083/how-to-install-portaudio-on-pi-properly

In [26]:
import riva.client.audio_io

### Playing audio during transcribing

For playing audio simultaneously with transcribing, provide an instance of `riva.client.audio_io.SoundCallBack` as a `delay_callback` to `riva.client.AudioChunkFileIterator`.

In [None]:
# Show the available audio output devices (helps when choosing an explicit
# `output_device` instead of the default one).
riva.client.audio_io.list_output_devices()

In [27]:
output_device = None  # use default device
wav_parameters = riva.client.get_wav_file_parameters(sample)

# The sound callback plays each chunk; it is built from the sample's own
# sample width, channel count and frame rate.
sound_callback = riva.client.audio_io.SoundCallBack(
    output_device, wav_parameters['sampwidth'], wav_parameters['nchannels'], wav_parameters['framerate'],
)
try:
    # Passing the sound callback as the delay callback plays the audio while it is transcribed.
    with riva.client.AudioChunkFileIterator(
        sample, chunk_size, delay_callback=sound_callback,
    ) as audio_chunk_iterator:
        response_generator = riva_asr.streaming_response_generator(audio_chunk_iterator, streaming_config)
        riva.client.print_streaming(response_generator, show_intermediate=True)
finally:
    # Always close the sound callback, even if streaming fails part-way through.
    sound_callback.close()

ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:869:(find_matching_chmap) Found no matching channel map


## 안녕하세요 만나뵙게 돼서 반갑습니다.  


### Streaming from microphone

In [None]:
# Show the available audio input devices (helps when choosing an explicit
# `input_device` instead of the default one).
riva.client.audio_io.list_input_devices()

In [None]:
input_device = None  # default device

# Capture at the sample rate of the streaming config; the chunk size is one
# tenth of that rate.
mic_rate = streaming_config.config.sample_rate_hertz

with riva.client.audio_io.MicrophoneStream(
    rate=mic_rate,
    chunk=mic_rate // 10,
    device=input_device,
) as audio_chunk_iterator:
    # Feed the microphone chunks to the ASR service and print transcripts as they form.
    mic_responses = riva_asr.streaming_response_generator(
        audio_chunks=audio_chunk_iterator,
        streaming_config=streaming_config,
    )
    riva.client.print_streaming(responses=mic_responses, show_intermediate=True)

## Simple Korean ASR Application from microphone

A simple web application for Korean ASR.
1. It receives your voice through the microphone while recording.
2. It stores the recorded audio on the server.
3. It transcribes the audio to text through Riva.

In [None]:
# Install dependencies
!apt-get install -y libsndfile1-dev
!pip install flask librosa

In [None]:
# Launch the sample Korean ASR web application script.
# NOTE(review): presumably this starts a Flask server, in which case the cell keeps
# running until it is interrupted; confirm against app.py.
!python3 ./samples/simple_kr_asr/app.py