In [None]:
"""
You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg portaudio19-dev
!pip install unidecode
!pip install pyaudio

# ## Install NeMo
!python -m pip install --upgrade git+https://github.com/NVIDIA/NeMo.git@candidate#egg=nemo_toolkit[asr]

## Install TorchAudio
!pip install torchaudio>=0.6.0 -f https://download.pytorch.org/whl/torch_stable.html

## Grab the config we'll use in this example
!mkdir configs

This notebook demonstrates automatic speech recognition (ASR) from a microphone's stream in NeMo.

This is **not the recommended** way to do inference in production workflows. If you are interested in 
production-level inference using NeMo ASR models, please sign up for the Jarvis early access program: https://developer.nvidia.com/nvidia-jarvis

The notebook requires the PyAudio library to capture the signal from an audio device.
For Ubuntu, please run the following commands to install it:
```
sudo apt-get install -y portaudio19-dev
pip install pyaudio
```

This notebook requires the `torchaudio` library to be installed for MatchboxNet. Please follow the instructions available at the [torchaudio GitHub page](https://github.com/pytorch/audio#installation) to install the appropriate version of torchaudio.

If you would like to install the latest version with conda, run the following command:

```
conda install -c pytorch torchaudio
```

In [1]:
import numpy as np
import pyaudio as pa
import os, time

In [2]:
import nemo
import nemo.collections.asr as nemo_asr

[NeMo W 2020-08-24 13:45:55 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-08-24 13:45:55 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-08-24 13:45:55 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-08-24 13:45:55 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-08-24 13:45:55 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset'> is experimental,

In [3]:
# Sample rate of the audio stream, in Hz (16000 Hz = 16 kHz).
SAMPLE_RATE = 16000

## Restore the model from NGC

In [7]:
# Restore the pretrained 'QuartzNet15x5Base-En' model as an EncDecCTCModel.
# The log output of this cell shows the checkpoint being downloaded into the
# local NeMo cache and the model being instantiated from it.
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained('QuartzNet15x5Base-En')

[NeMo I 2020-08-24 13:47:37 cloud:65] Downloading from: https://nemo-public.s3.us-east-2.amazonaws.com/nemo-1.0.0alpha-tests/QuartzNet15x5Base-En.nemo to /home/smajumdar/.cache/torch/NeMo/NEMO_0.88.1b0/QuartzNet15x5Base-En.nemo
[NeMo I 2020-08-24 13:47:59 common:366] Instantiating model from pre-trained checkpoint


    Config key 'cls' is deprecated since Hydra 1.0 and will be removed in Hydra 1.1.
    Use '_target_' instead of 'cls'.
    See https://hydra.cc/docs/next/upgrades/0.11_to_1.0/object_instantiation_changes
    
    Field 'params' is deprecated since Hydra 1.0 and will be removed in Hydra 1.1.
    Inline the content of params directly at the containing node.
    See https://hydra.cc/docs/next/upgrades/0.11_to_1.0/object_instantiation_changes
    


[NeMo I 2020-08-24 13:48:00 features:186] PADDING: 16
[NeMo I 2020-08-24 13:48:00 features:198] STFT using torch
[NeMo I 2020-08-24 13:48:01 modelPT:214] Model EncDecCTCModel was successfully restored from /home/smajumdar/.cache/torch/NeMo/NEMO_0.88.1b0/QuartzNet15x5Base-En.nemo.


## Observing the config of the model

In [8]:
from omegaconf import OmegaConf
import copy

In [12]:
# Preserve a copy of the full config.
# A deep copy is taken so that the edits made to `cfg` further down do not
# modify the config object still held by `asr_model`.
cfg = copy.deepcopy(asr_model._cfg)
# Dump the whole config as YAML for inspection.
print(OmegaConf.to_yaml(cfg))

preprocessor:
  cls: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
  params:
    normalize: per_feature
    window_size: 0.02
    sample_rate: 16000
    window_stride: 0.01
    window: hann
    features: 64
    n_fft: 512
    frame_splicing: 1
    dither: 1.0e-05
    stft_conv: false
spec_augment:
  cls: nemo.collections.asr.modules.SpectrogramAugmentation
  params:
    rect_freq: 50
    rect_masks: 5
    rect_time: 120
encoder:
  cls: nemo.collections.asr.modules.ConvASREncoder
  params:
    feat_in: 64
    activation: relu
    conv_mask: true
    jasper:
    - filters: 256
      repeat: 1
      kernel:
      - 33
      stride:
      - 2
      dilation:
      - 1
      dropout: 0.0
      residual: false
      separable: true
    - filters: 256
      repeat: 5
      kernel:
      - 33
      stride:
      - 1
      dilation:
      - 1
      dropout: 0.0
      residual: true
      separable: true
    - filters: 256
      repeat: 5
      kernel:
      - 33
      stride:
 

### Modify preprocessor parameters for inference

In [14]:
# Make config overwrite-able.
# With struct mode switched off, keys that are not yet present in the
# preprocessor node (such as `pad_to` below) can be assigned.
OmegaConf.set_struct(cfg.preprocessor, False)

# Some changes for the streaming scenario:
#   - dither is set to 0.0 (the restored config printed above uses 1.0e-05)
#   - pad_to is set to 0 (the preprocessor log reports "PADDING: 16" for the
#     restored model)
cfg.preprocessor.params.dither = 0.0
cfg.preprocessor.params.pad_to = 0

# Spectrogram normalization constants.
# Each list holds 64 values, matching `features: 64` in the preprocessor
# config, i.e. one fixed mean / std per feature.
normalization = {}
normalization['fixed_mean'] = [
     -14.95827016, -12.71798736, -11.76067913, -10.83311182,
     -10.6746914,  -10.15163465, -10.05378331, -9.53918999,
     -9.41858904,  -9.23382904,  -9.46470918,  -9.56037,
     -9.57434245,  -9.47498732,  -9.7635205,   -10.08113074,
     -10.05454561, -9.81112681,  -9.68673603,  -9.83652977,
     -9.90046248,  -9.85404766,  -9.92560366,  -9.95440354,
     -10.17162966, -9.90102482,  -9.47471025,  -9.54416855,
     -10.07109475, -9.98249912,  -9.74359465,  -9.55632283,
     -9.23399915,  -9.36487649,  -9.81791084,  -9.56799225,
     -9.70630899,  -9.85148006,  -9.8594418,   -10.01378735,
     -9.98505315,  -9.62016094,  -10.342285,   -10.41070709,
     -10.10687659, -10.14536695, -10.30828702, -10.23542833,
     -10.88546868, -11.31723646, -11.46087382, -11.54877829,
     -11.62400934, -11.92190509, -12.14063815, -11.65130117,
     -11.58308531, -12.22214663, -12.42927197, -12.58039805,
     -13.10098969, -13.14345864, -13.31835645, -14.47345634]
normalization['fixed_std'] = [
     3.81402054, 4.12647781, 4.05007065, 3.87790987,
     3.74721178, 3.68377423, 3.69344,    3.54001005,
     3.59530412, 3.63752368, 3.62826417, 3.56488469,
     3.53740577, 3.68313898, 3.67138151, 3.55707266,
     3.54919572, 3.55721289, 3.56723346, 3.46029304,
     3.44119672, 3.49030548, 3.39328435, 3.28244406,
     3.28001423, 3.26744937, 3.46692348, 3.35378948,
     2.96330901, 2.97663111, 3.04575148, 2.89717604,
     2.95659301, 2.90181116, 2.7111687,  2.93041291,
     2.86647897, 2.73473181, 2.71495654, 2.75543763,
     2.79174615, 2.96076456, 2.57376336, 2.68789782,
     2.90930817, 2.90412004, 2.76187531, 2.89905006,
     2.65896173, 2.81032176, 2.87769857, 2.84665271,
     2.80863137, 2.80707634, 2.83752184, 3.01914511,
     2.92046439, 2.78461139, 2.90034605, 2.94599508,
     2.99099718, 3.0167554,  3.04649716, 2.94116777]

# Replace the restored 'per_feature' normalization setting with the fixed
# mean / std tables defined above.
cfg.preprocessor.params.normalize = normalization

# Disable config overwriting
OmegaConf.set_struct(cfg.preprocessor, True)

### Setup preprocessor with these settings

In [15]:
# Build a new preprocessor from the modified config and swap it into the model,
# replacing the one created when the checkpoint was restored.
asr_model.preprocessor = asr_model.from_config_dict(cfg.preprocessor)

    Config key 'cls' is deprecated since Hydra 1.0 and will be removed in Hydra 1.1.
    Use '_target_' instead of 'cls'.
    See https://hydra.cc/docs/next/upgrades/0.11_to_1.0/object_instantiation_changes
    
    Field 'params' is deprecated since Hydra 1.0 and will be removed in Hydra 1.1.
    Inline the content of params directly at the containing node.
    See https://hydra.cc/docs/next/upgrades/0.11_to_1.0/object_instantiation_changes
    


[NeMo I 2020-08-24 13:58:53 features:186] PADDING: 0
[NeMo I 2020-08-24 13:58:53 features:198] STFT using torch


# Setting up data for Streaming Inference

In [18]:
from nemo.core import typecheck
from nemo.core.classes import IterableDataset
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType
from nemo.collections.asr.data.audio_to_text import _speech_collate_fn
import torch

In [19]:
# simple data layer to pass audio signal
class AudioDataLayer(IterableDataset):
    """Single-item iterable dataset that hands one chunk of audio to the model.

    Usage: call ``set_signal`` with a chunk of samples, then iterate. The
    iteration yields exactly one ``(audio_signal, length)`` pair and is then
    exhausted until ``set_signal`` is called again. A layer that has not been
    given a signal yet is simply empty.
    """

    @property
    def output_ports(self):
        """Neural types describing the two tensors produced by iteration."""
        return {
            'audio_signal': NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
            'a_sig_length': NeuralType(tuple('B'), LengthsType()),
        }

    def __init__(self, sample_rate):
        """Create an empty layer.

        Args:
            sample_rate: sample rate reported in ``output_ports``.
        """
        super().__init__()
        self._sample_rate = sample_rate
        # Nothing is pending until set_signal() is called. Starting with
        # output=True (and no `signal` attribute) made iterating a fresh
        # layer fail with AttributeError instead of yielding nothing.
        self.signal = None
        self.signal_shape = None
        self.output = False

    def __iter__(self):
        """The layer is its own iterator."""
        return self

    def __next__(self):
        """Return the pending chunk once, then signal exhaustion.

        Returns:
            Tuple of a float32 tensor holding the signal and an int64 tensor
            holding its number of samples.

        Raises:
            StopIteration: if no chunk is pending.
        """
        if not self.output:
            raise StopIteration
        # Mark the chunk as consumed so that it is yielded only once.
        self.output = False
        return torch.as_tensor(self.signal, dtype=torch.float32), \
               torch.as_tensor(self.signal_shape, dtype=torch.int64)

    def set_signal(self, signal):
        """Store a new chunk of samples and make it available for iteration.

        Args:
            signal: numpy array of samples. Values are converted to float32
                and divided by 32768 (presumably 16-bit PCM input, which this
                maps into [-1, 1) -- confirm against the audio source).
        """
        # Shape (1, num_samples): a batch containing a single signal.
        self.signal = np.reshape(signal.astype(np.float32)/32768., [1, -1])
        # One-element array holding the number of samples in the chunk.
        self.signal_shape = np.expand_dims(self.signal.size, 0).astype(np.int64)
        self.output = True

    def __len__(self):
        return 1

In [26]:
# Build the data layer with the sample rate taken from the model's
# preprocessor config.
data_layer = AudioDataLayer(sample_rate=cfg.preprocessor.params.sample_rate)

In [27]:
# inference method for audio signal (single instance)
def infer_signal(model, signal):
    """Run the model on a single chunk of audio and return its log-probabilities.

    Args:
        model: model whose ``forward`` accepts ``input_signal`` and
            ``input_signal_length`` and returns a
            ``(log_probs, encoded_len, predictions)`` triple.
        signal: chunk of audio samples; it is handed to the module-level
            ``data_layer`` through ``set_signal``.

    Returns:
        The ``log_probs`` element of the model's forward output.
    """
    data_layer.set_signal(signal)
    audio_signal, audio_signal_len = next(iter(data_layer))

    # The data layer builds its tensors from numpy arrays, so they live on the
    # CPU. Move them to wherever the model's weights are, otherwise a model
    # placed on a GPU fails with a device-mismatch error.
    parameters = getattr(model, 'parameters', None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        audio_signal = audio_signal.to(first_param.device)
        audio_signal_len = audio_signal_len.to(first_param.device)

    # Inference only: do not record an autograd graph for every audio chunk.
    with torch.no_grad():
        log_probs, _encoded_len, _predictions = model.forward(
            input_signal=audio_signal, input_signal_length=audio_signal_len
        )
    return log_probs

# def validation_step(self, batch, batch_idx, dataloader_idx=0):
#     audio_signal, audio_signal_len, transcript, transcript_len = batch
#     log_probs, encoded_len, predictions = self.forward(
#         input_signal=audio_signal, input_signal_length=audio_signal_len
#     )
#     loss_value = self.loss(
#         log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
#     )
#     wer_num, wer_denom = self._wer(predictions, transcript, transcript_len)
#     return {'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom}