In [None]:
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT

# ASR API tutorial

This tutorial demonstrates how to use the Riva Python API.

## <font color="blue">Server</font>

Before running the client part of Riva, please set up a server. The simplest
way to do this is to follow the
[quick start guide](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/quick-start-guide.html#local-deployment-using-quick-start-scripts).


## <font color="blue">Authentication</font>

Before using Riva services, you will need to establish a connection with a server.

In [1]:
import riva.client

# Address of the Riva server, given as "host:port".
# NOTE(review): this was labelled "Default value", but it is a specific private IP
# address -- confirm, and replace it with the address of your own server.
uri = "10.10.162.12:50051"

# `Auth` instance that is passed to service constructors (see `ASRService` below).
auth = riva.client.Auth(uri=uri)

## <font color="blue">Setting up service</font>

To instantiate a service, pass a `riva.client.Auth` instance to its constructor.

In [2]:
# ASR service object; every recognition call in this notebook goes through it.
asr_service = riva.client.ASRService(auth)

For speech recognition you will need to create a recognition config (an instance of `riva.client.RecognitionConfig`).
A detailed description of the config fields is available in the Riva
[documentation](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/reference/protos/riva_asr.proto.html?highlight=max%20alternatives#riva-proto-riva-asr-proto).
If you intend to use streaming recognition, the offline config has to be wrapped into `riva.client.StreamingRecognitionConfig`.


In [3]:
from copy import deepcopy

# Recognition settings used for both offline and streaming recognition.
recognition_options = dict(
    encoding=riva.client.AudioEncoding.LINEAR_PCM,
    max_alternatives=1,
    enable_automatic_punctuation=True,
    verbatim_transcripts=False,
)
offline_config = riva.client.RecognitionConfig(**recognition_options)

# The streaming config wraps its own copy of the offline config, so later
# changes to one object do not leak into the other.
streaming_config = riva.client.StreamingRecognitionConfig(
    config=deepcopy(offline_config),
    interim_results=True,
)

You also need to set the frame rate and the number of channels of the audio which is going to be processed. If you'd like to process a file such as `data/examples/en-US_AntiBERTa_for_word_boosting_testing.wav` (the cell below uses `mono.wav`), then your code will be

In [4]:
my_wav_file = 'mono.wav'

# Fill in the audio parameters read from the WAV file on both configs
# (the printed configs below show sample rate and channel count).
for config in (offline_config, streaming_config):
    riva.client.add_audio_file_specs_to_config(config, my_wav_file)

If you intend to use word boosting, then use the convenience function `riva.client.add_word_boosting_to_config()` to add boosting parameters to a config.

In [5]:
# Words to boost and the score applied to them.
boosted_lm_words = ['AntiBERTa', 'ABlooper']
boosted_lm_score = 20.0

# Apply the same boosting parameters to the offline and the streaming config.
for config in (offline_config, streaming_config):
    riva.client.add_word_boosting_to_config(config, boosted_lm_words, boosted_lm_score)

In [6]:
# Inspect the offline config after the audio specs and word boosting were added.
print(offline_config)

encoding: LINEAR_PCM
sample_rate_hertz: 44100
max_alternatives: 1
speech_contexts {
  phrases: "AntiBERTa"
  phrases: "ABlooper"
  boost: 20
}
audio_channel_count: 1
enable_automatic_punctuation: true



In [7]:
# Inspect the streaming config: the recognition settings are nested under `config`.
print(streaming_config)

config {
  encoding: LINEAR_PCM
  sample_rate_hertz: 44100
  max_alternatives: 1
  speech_contexts {
    phrases: "AntiBERTa"
    phrases: "ABlooper"
    boost: 20
  }
  audio_channel_count: 1
  enable_automatic_punctuation: true
}
interim_results: true



## <font color="blue">Offline</font>

To run offline speech recognition, read the data from a file and pass it to the service.

In [8]:
# Load the entire audio file as raw bytes for the offline recognition request.
with open(my_wav_file, mode='rb') as wav_handle:
    data = wav_handle.read()

In [9]:
# Send the file bytes together with the offline config and keep the response
# (this is the non-future form of the call; see "Asynchronous calls" below).
response = asr_service.offline_recognize(data, offline_config)

In [10]:
# Show the full response: a list of `results`, each holding `alternatives`.
print(response)

results {
  alternatives {
    transcript: "The stale smell of old beer lingers. "
    confidence: -2.30204725
  }
  channel_tag: 1
  audio_processed: 4.8
}
results {
  alternatives {
    transcript: "It takes heat to bring out the odor. "
    confidence: -1.76582098
  }
  channel_tag: 1
  audio_processed: 9.6
}
results {
  alternatives {
    transcript: "A cold dip restores Health and zest a salt pickle tastes fine with ham tacos al pastor are my favorite a zest ful food is the hot cross bun. "
    confidence: -2.43338275
  }
  channel_tag: 1
  audio_processed: 19.2
}



To extract a transcript you may use

In [13]:
# Transcript text of the first alternative of the second result (index 1).
print(response.results[1].alternatives[0].transcript)

It takes heat to bring out the odor. 


In [12]:
# Confidence value of the first alternative of the first result (index 0).
print(response.results[0].alternatives[0].confidence)

-2.3020472526550293


### <font color="green">Asynchronous calls</font>

You can recognize speech asynchronously by setting `future=True` in `ASRService.offline_recognize()`.

In [14]:
from time import time

# How many times the same recognition request is sent in each timing run below.
num_repeats = 10

In [15]:
# Baseline: issue the requests one after another, each call finishing before
# the next one starts, and keep the first transcript of every response.
start_time = time()
sync_transcripts = [
    asr_service.offline_recognize(data, offline_config).results[0].alternatives[0].transcript
    for _ in range(num_repeats)
]
print(f"Time spent on synchronous recognition: {time() - start_time:.2f}")

Time spent on synchronous recognition: 30.67


In [16]:
# Submit every request first (`future=True` returns a future instead of a
# response), then collect the results in submission order.
start_time = time()
futures = [
    asr_service.offline_recognize(data, offline_config, future=True)
    for _ in range(num_repeats)
]
async_transcripts = [
    pending.result().results[0].alternatives[0].transcript
    for pending in futures
]
print(f"Time spent on async recognition: {time() - start_time:.2f}")

Time spent on async recognition: 20.83


In [17]:
# Sanity check: both calling modes must produce the same transcripts, in the same order.
assert sync_transcripts == async_transcripts

## <font color="blue">Streaming</font>

To imitate audio streaming use `riva.client.AudioChunkFileIterator`. You can imitate realtime audio by providing a delay callback to the iterator.

In [18]:
wav_parameters = riva.client.get_wav_file_parameters(my_wav_file)
# A chunk size equal to the frame rate corresponds to 1 second of audio per chunk.
chunk_size = wav_parameters['framerate']

# `sleep_audio_length` is the delay callback that imitates realtime audio.
with riva.client.AudioChunkFileIterator(
    my_wav_file,
    chunk_size,
    delay_callback=riva.client.sleep_audio_length,
) as audio_chunk_iterator:
    # Print the index and the length of every chunk read from the file.
    for chunk_index, audio_chunk in enumerate(audio_chunk_iterator):
        print(chunk_index, len(audio_chunk))

0 88200
1 88200
2 88200
3 88200
4 88200
5 88200
6 88200
7 88200
8 88200
9 88200
10 88200
11 88200
12 88200
13 88200
14 88200
15 88200
16 88200
17 88200
18 31514


Then audio chunks are passed to `ASRService.streaming_response_generator()` and response generator is created.

In [19]:
# Chunk size of 4800 and no delay callback, so the file is read without
# imitating realtime. (Going by the one-second example above, the chunk size
# is presumably counted in frames -- confirm.)
audio_chunk_iterator = riva.client.AudioChunkFileIterator(my_wav_file, 4800)
# The generator is consumed by the next two cells, so it is created without `with` here.
response_generator = asr_service.streaming_response_generator(audio_chunk_iterator, streaming_config)

You may find description of streaming response (`StreamingRecognizeResponse`) fields in Riva [documentation](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/reference/protos/riva_asr.proto.html?highlight=max%20alternatives#riva-proto-riva-asr-proto).

In [20]:
# Take the first streaming response off the generator; the responses that
# follow stay in the generator for the next cell.
streaming_response = next(response_generator)

For showing streaming results it is convenient to use function `riva.client.print_streaming()`.

In [21]:
# Print the remaining responses, each line annotated with time information.
riva.client.print_streaming(response_generator, additional_info='time')

>>>Time 1693326570.47s: this
>>>Time 1693326570.47s: the state
>>>Time 1693326570.47s: the
>>>Time 1693326570.47s: the salesman
>>>Time 1693326570.47s: the stale smell
>>>Time 1693326570.47s: the stale smell of
>>>Time 1693326570.47s: the stale smell of
>>>Time 1693326570.47s: the stale smell of old
>>>Time 1693326570.47s: the stale smell of
>>>Time 1693326570.47s: the stale smell of old beer
>>>Time 1693326570.47s: the stale smell of old beer
>>>Time 1693326570.47s: stale smell of old beer lingered
>>>Time 1693326570.47s: the stale smell of old beer lingers
>>>Time 1693326570.47s: the smell of old beer lingers
>>>Time 1693326570.47s: the smell of old beer lingers
>>>Time 1693326570.47s: the stale mell of old beer lingers
>>>Time 1693326570.47s: the stale of old beer lingers
>>>Time 1693326570.47s: the stale of old beer lingers
Time 0.00s: Transcript 0: The stale smell of old beer lingers. 
>>>Time 1693326570.47s: it takes
>>>Time 1693326570.47s: it takes
>>>Time 1693326570.47s: it tak

If you set a delay callback in the audio chunk iterator and `show_intermediate=True` in `riva.client.print_streaming()`, then you will be able to watch the transcript forming.

In [22]:
# Stream the file in chunks of 4800 while `sleep_audio_length` delays each chunk
# to imitate realtime audio.
# The iterator is used as a context manager (as in the chunk-size example above)
# so the underlying audio file is closed once streaming has finished instead of
# staying open for the rest of the kernel session.
with riva.client.AudioChunkFileIterator(
    my_wav_file, 4800, riva.client.sleep_audio_length
) as audio_chunk_iterator:
    response_generator = asr_service.streaming_response_generator(audio_chunk_iterator, streaming_config)
    # `show_intermediate=True` displays interim transcripts while they are forming.
    riva.client.print_streaming(response_generator, show_intermediate=True)

## The stale smell of old beer lingers.  
## It takes heat to bring out the odor.         
## A cold dip restores Health and zest a salt pickle taste fine with ham tacos al pastor are my favourite a zest ful food is the hot cross bun. 


It is also possible to print streaming results in several places, e.g. in STDOUT and a file.

In [23]:
import sys

# Name of the text file that receives a copy of the streaming results.
output_file = "my_results.txt"

# The iterator is used as a context manager (as in the chunk-size example above)
# so the underlying audio file is closed once streaming has finished instead of
# staying open for the rest of the kernel session.
with riva.client.AudioChunkFileIterator(my_wav_file, 4800) as audio_chunk_iterator:
    response_generator = asr_service.streaming_response_generator(audio_chunk_iterator, streaming_config)
    # Results are written to every destination in the list: STDOUT and the file.
    riva.client.print_streaming(
        response_generator,
        additional_info='confidence',
        output_file=[sys.stdout, output_file],
    )

>> this
Stability:    0.1000
----
>> the state
Stability:    0.1000
----
>> the
Stability:    0.1000
----
>> the salesman
Stability:    0.1000
----
>> the stale smell
Stability:    0.1000
----
>> the stale smell of
Stability:    0.1000
----
>> the stale smell of
Stability:    0.1000
----
>> the stale smell of old
Stability:    0.1000
----
>> the stale smell of
Stability:    0.1000
----
>> the stale smell of old beer
Stability:    0.1000
----
>> the stale smell of old beer
Stability:    0.1000
----
>> stale smell of old beer lingered
Stability:    0.1000
----
>> the 
Stability:    0.9000
>> stale smell of old beer lingers
Stability:    0.1000
----
>> the 
Stability:    0.9000
>> smell of old beer lingers
Stability:    0.1000
----
>> the 
Stability:    0.9000
>> smell of old beer lingers
Stability:    0.1000
----
>> the stale 
Stability:    0.9000
>> mell of old beer lingers
Stability:    0.1000
----
>> the stale 
Stability:    0.9000
>> of old beer lingers
Stability:    0.1000
----
>> t

Showing file and clean up in bash

In [24]:
# Bash command: print the results file. Under Windows cmd.exe it is not
# available; use the `type` cell further down instead.
!cat $output_file

'cat' is not recognized as an internal or external command,
operable program or batch file.


In [25]:
# Bash command: delete the results file. Under Windows cmd.exe it is not
# available; use the `del` cell further down instead.
!rm $output_file

'rm' is not recognized as an internal or external command,
operable program or batch file.


Showing file and clean up in cmd.exe

In [26]:
# cmd.exe command: print the results file (Windows counterpart of `cat`).
!type $output_file

>> this
Stability:    0.1000
----
>> the state
Stability:    0.1000
----
>> the
Stability:    0.1000
----
>> the salesman
Stability:    0.1000
----
>> the stale smell
Stability:    0.1000
----
>> the stale smell of
Stability:    0.1000
----
>> the stale smell of
Stability:    0.1000
----
>> the stale smell of old
Stability:    0.1000
----
>> the stale smell of
Stability:    0.1000
----
>> the stale smell of old beer
Stability:    0.1000
----
>> the stale smell of old beer
Stability:    0.1000
----
>> stale smell of old beer lingered
Stability:    0.1000
----
>> the 
Stability:    0.9000
>> stale smell of old beer lingers
Stability:    0.1000
----
>> the 
Stability:    0.9000
>> smell of old beer lingers
Stability:    0.1000
----
>> the 
Stability:    0.9000
>> smell of old beer lingers
Stability:    0.1000
----
>> the stale 
Stability:    0.9000
>> mell of old beer lingers
Stability:    0.1000
----
>> the stale 
Stability:    0.9000
>> of old beer lingers
Stability:    0.1000
----
>> t

In [27]:
# cmd.exe command: delete the results file (Windows counterpart of `rm`).
!del $output_file

## <font color="blue">Audio input/output</font>

For using audio input and output you need to install PyAudio.

```bash
conda install -c anaconda pyaudio
```

### <font color="green">Playing audio during transcribing</font>

For playing audio simultaneously with transcribing, provide an instance of `riva.client.audio_io.SoundCallBack` as a `delay_callback` to `riva.client.AudioChunkFileIterator`.

In [28]:
import pyaudio
import riva.client.audio_io

### <font color="green">Streaming from microphone</font>

In [36]:
# List the available audio input devices with their indices.
# NOTE(review): presumably these indices are the values accepted by `device=`
# in the microphone cell below -- confirm.
riva.client.audio_io.list_input_devices()

Input audio devices:
0: Microsoft Sound Mapper - Input
1: Microphone Array (Intel® Smart 
5: Microphone (USB Audio)
9: PC Speaker (Realtek HD Audio output with SST)
10: Stereo Mix (Realtek HD Audio Stereo input)
11: Microphone (Realtek HD Audio Mic input)
14: PC Speaker (Realtek HD Audio 2nd output with SST)
15: Microphone Array 1 (Intel® Smart Sound Technology DMIC Microphone)
16: Microphone Array 2 (Intel® Smart Sound Technology DMIC Microphone)
17: Microphone Array 3 (Intel® Smart Sound Technology DMIC Microphone)
18: Microphone Array 4 (Intel® Smart Sound Technology DMIC Microphone)


Run code below and then say something in English

In [38]:
input_device = None  # default device

# Record at the sample rate already stored in the streaming config and hand
# over chunks of one tenth of that rate.
sample_rate = streaming_config.config.sample_rate_hertz
with riva.client.audio_io.MicrophoneStream(
    rate=sample_rate,
    chunk=sample_rate // 10,
    device=input_device,
) as audio_chunk_iterator:
    microphone_responses = asr_service.streaming_response_generator(
        audio_chunks=audio_chunk_iterator,
        streaming_config=streaming_config,
    )
    # Show interim transcripts while speaking.
    riva.client.print_streaming(responses=microphone_responses, show_intermediate=True)

_MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "WSA Error"
	debug_error_string = "UNKNOWN:Error received from peer  {created_time:"2023-08-29T16:46:17.297302656+00:00", grpc_status:14, grpc_message:"WSA Error"}"
>