In [9]:
# !pip install onnx
# !pip install onnxruntime

In [1]:
import IPython
import soundfile as sf

import collections
import contextlib
import sys
import wave
import json
import pickle
import torch
import os


from os import listdir
from webrtc_utils import *
    
import torch.nn as nn
import torch.nn.functional as F

from collections import defaultdict
from tqdm import tqdm
import torch.nn as nn

from os import walk
import math
import numpy as np
import librosa

import onnx
import onnxruntime

#display waveform
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
import torchaudio

In [13]:
class LSTMModel(nn.Module):
    """Frame-level binary classifier: stacked LSTM followed by a two-layer head.

    Args:
        inpud_dim: Size of each input feature vector (spelling kept because
            callers pass it by keyword).
        hidden_dim: LSTM hidden size; the head narrows it to ``hidden_dim // 2``.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied inside the head.

    ``layer_norm`` and ``fc`` are registered but never used by ``forward``.
    NOTE(review): presumably they are kept so the parameter names still match
    previously saved checkpoints -- confirm before removing them.
    """

    def __init__(self, inpud_dim=40, hidden_dim=64, n_layers=2, dropout=0.5):
        super().__init__()
        half_dim = hidden_dim // 2

        # Sub-modules are created in the same order as before so that
        # registration order and random initialisation are unchanged.
        self.lstm = nn.LSTM(
            input_size=inpud_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
        )
        self.layer_norm = nn.LayerNorm(hidden_dim)

        self.fc1 = nn.Linear(hidden_dim, half_dim)
        self.fc2 = nn.Linear(half_dim, 1)
        self.dropout1 = nn.Dropout(dropout)

        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a batch-first sequence of features to one sigmoid score per step."""
        sequence_out, _state = self.lstm(x)
        hidden = self.dropout1(F.leaky_relu(self.fc1(sequence_out)))
        return torch.sigmoid(self.fc2(hidden))

# Feature pipeline applied to the raw waveform: 40 MFCCs computed with a
# 400-sample window and a 160-sample hop, followed by sliding-window
# cepstral mean/variance normalisation.
# NOTE(review): torchaudio documents MFCC output as (..., n_mfcc, time) and
# SlidingWindowCmn input as (..., time, freq) -- confirm the axis order here
# matches how the saved model was trained before changing anything.
audio_transforms = nn.Sequential(
    torchaudio.transforms.MFCC(
        sample_rate=16000,
        n_mfcc=40,
        melkwargs={
            'win_length': 400,
            'hop_length': 160,
            "center": True,
            'n_mels': 64,
        },
    ),
    torchaudio.transforms.SlidingWindowCmn(
        cmn_window=300,
        norm_vars=True,
        center=True,
    ),
)
   
    
def to_numpy(tensor):
    """Return ``tensor`` as a NumPy array on the CPU.

    Tensors that track gradients are detached first, because ``.numpy()``
    refuses to convert a tensor that requires grad.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

In [5]:
# Code for recording audio from the browser
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import IPython
import uuid
from google.colab import output


class InvokeButton(object):
  """HTML button that calls a Python callback in the kernel when clicked.

  The rich HTML repr registers ``callback`` with ``google.colab.output``
  under a fresh id and emits a ``<button>`` whose click handler invokes that
  id through ``google.colab.kernel.invokeFunction``.
  """

  def __init__(self, title, callback):
    self._title, self._callback = title, callback

  def _repr_html_(self):
    """Register the callback and return the button markup."""
    from google.colab import output

    # A new id per render keeps repeated displays from sharing a callback.
    button_id = 'button-' + str(uuid.uuid4())
    output.register_callback(button_id, self._callback)

    markup = """<button id="{callback_id}" style="cursor:pointer;background-color:#EEEEEE;border-color:#E0E0E0;padding:5px 15px;font-size:14px">{title}</button>
        <script>
          document.querySelector("#{callback_id}").onclick = (e) => {{
            google.colab.kernel.invokeFunction('{callback_id}', [], {{}})
            e.preventDefault();
          }};
        </script>""".format(callback_id=button_id, title=self._title)
    return markup

# JavaScript source injected into the notebook front end. It defines:
#   sleep(time)  - promise that resolves after `time` milliseconds;
#   b2text(blob) - promise that resolves to the blob encoded as a data URL
#                  (via FileReader.readAsDataURL);
#   record(time) - asks for microphone access, records with MediaRecorder for
#                  `time` milliseconds, and resolves to the recording as a
#                  data URL.
# Note: `stream`, `recorder`, `chunks`, `blob` and `text` are assigned without
# a declaration keyword inside the script.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=3):
    """Record ``sec`` seconds of microphone audio in the browser.

    Injects the ``RECORD`` script, evaluates ``record(<milliseconds>)`` in
    the front end, base64-decodes the payload of the returned data URL and
    writes the bytes to ``audio.wav`` in the working directory.

    Returns:
        The path of the written file, ``'audio.wav'``.
    """
    display(Javascript(RECORD))

    duration_ms = sec*1000
    data_url = output.eval_js('record(%d)' % duration_ms)

    # A data URL looks like "<header>,<base64 payload>"; keep the payload.
    audio_bytes = b64decode(data_url.split(',')[1])

    out_path = 'audio.wav'
    with open(out_path, 'wb+') as audio_file:
        audio_file.write(audio_bytes)
    return out_path

ModuleNotFoundError: No module named 'google.colab'

#### PyTorch model

In [12]:
# Rebuild the architecture, then restore the trained weights from disk.
model = LSTMModel(inpud_dim=40, hidden_dim=64, n_layers=2, dropout=0.5).float()
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA; load_state_dict copies the values into the model's
# own (CPU) parameters either way.
model.load_state_dict(torch.load('../data/vad.pt', map_location='cpu'))
# Inference mode: disables dropout. Left as the last expression so the cell
# still displays the module summary.
model.eval()

LSTMModel(
  (lstm): LSTM(40, 64, num_layers=2, batch_first=True)
  (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

#### ONNX model

In [11]:
# Use the GPU only when one is actually present instead of hard-coding
# 'cuda', so the notebook also runs on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    # Providers are tried in order; listing the CPU provider second gives
    # ONNX Runtime a fallback if its CUDA provider is not available.
    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
else:
    providers = ['CPUExecutionProvider']
ort_session = onnxruntime.InferenceSession('../data/vad.onnx', providers=providers)



In [14]:
def predict_torch(wavfile, thr=0.7):
    """Run the PyTorch VAD model on an audio file.

    Args:
        wavfile: Path (or file object) readable by ``soundfile``. The sample
            rate is not checked here; ``audio_transforms`` is configured for
            16 kHz, so the file is presumably expected at that rate.
        thr: Score threshold; frames scoring strictly above it are marked 1.

    Returns:
        1-D integer NumPy array with one 0/1 decision per feature frame.
    """
    # soundfile's reader is `read` (it has no `load`); request float32 so the
    # samples convert to a float tensor without an extra copy.
    waveform, _ = sf.read(wavfile, dtype='float32')
    if waveform.ndim > 1:
        # Multi-channel files are returned as (frames, channels): down-mix.
        waveform = waveform.mean(axis=1)

    # Drop the last frame and transpose so that dim 0 indexes frames (the
    # model output below is reshaped to features.shape[0] entries).
    features = audio_transforms(torch.from_numpy(waveform).float())[:, :-1].transpose(0, 1)

    # Send the input to the device the model's weights live on, so the two
    # can never end up on different devices.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        scores = model(torch.unsqueeze(features, 0).float().to(model_device))
    probs = to_numpy(scores).reshape(features.shape[0])

    return (probs > thr).astype(int)

def predict_onnx(wavfile, thr=0.7):
    """Run the ONNX VAD model on an audio file.

    Args:
        wavfile: Path (or file object) readable by ``soundfile``. The sample
            rate is not checked here; ``audio_transforms`` is configured for
            16 kHz, so the file is presumably expected at that rate.
        thr: Score threshold; frames scoring strictly above it are marked 1.
            Defaults to 0.7, the same default as ``predict_torch``.

    Returns:
        1-D integer NumPy array with one 0/1 decision per feature frame.
    """
    # soundfile's reader is `read` (it has no `load`); request float32 so the
    # samples convert to a float tensor without an extra copy.
    waveform, _ = sf.read(wavfile, dtype='float32')
    if waveform.ndim > 1:
        # Multi-channel files are returned as (frames, channels): down-mix.
        waveform = waveform.mean(axis=1)

    # Drop the last frame and transpose so that dim 0 indexes frames (the
    # session output below is reshaped to features.shape[0] entries).
    features = audio_transforms(torch.from_numpy(waveform).float())[:, :-1].transpose(0, 1)

    # Feed the features, with a leading batch dimension, to the session's
    # first declared input.
    input_name = ort_session.get_inputs()[0].name
    ort_outs = ort_session.run(None, {input_name: to_numpy(torch.unsqueeze(features, 0))})
    probs = ort_outs[0].reshape(features.shape[0])

    return (probs > thr).astype(int)

In [19]:
def apply_vad(pred, wavfile, hop_length=160):
    """Silence the samples of an audio file that VAD marked as non-speech.

    Args:
        pred: Sequence of per-frame 0/1 decisions (1 keeps the frame).
        wavfile: Path (or file object) readable by ``soundfile``.
        hop_length: Number of samples each decision covers. The default of
            160 equals the ``hop_length`` used by ``audio_transforms``.

    Returns:
        NumPy array shaped like the audio, with samples belonging to frames
        marked 0 set to zero. Samples past the last decision are left
        untouched (the mask is padded with ones).
    """
    # soundfile's reader is `read`; it has no `load`.
    waveform, _ = sf.read(wavfile)
    n_samples = len(waveform)

    # Stretch each frame decision over its samples; cut any excess so the
    # pad width below can never be negative.
    mask = np.repeat(np.asarray(pred), hop_length)[:n_samples]
    mask = np.pad(mask, (0, n_samples - len(mask)), mode='constant', constant_values=1)

    if waveform.ndim > 1:
        # (frames, channels) audio: broadcast the mask across the channels.
        mask = mask[:, None]
    return waveform * mask
    

In [18]:
# Sanity check of element-wise multiplication: a 0/1 mask `a` zeroes the
# entries of `b` wherever the mask holds 0.
a, b = np.array([1, 0, 1]), np.array([1, 2, 3])
a * b

array([1, 0, 3])

In [None]:
def predict():
    """Record from the microphone, run VAD on the recording and play the result.

    Steps: record via ``record()``, resample the recording to 16 kHz mono
    with ffmpeg, classify frames with ``predict_onnx``, mute non-speech
    samples with ``apply_vad`` and display an audio player for the output.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with an error.
        FileNotFoundError: if the ffmpeg executable cannot be found.
    """
    import subprocess

    print("Now recording for 3 seconds, say what you will...")
    recorded_path = record()

    # ffmpeg cannot read and overwrite the same file in one run, so the
    # resampled audio goes to a separate file. '-ac 1' forces mono, giving
    # the feature pipeline a 1-D signal.
    converted_path = 'audio_16k.wav'
    subprocess.run(
        ['ffmpeg', '-i', recorded_path, '-ar', '16000', '-ac', '1', '-y', converted_path],
        check=True,
    )

    pred = predict_onnx(converted_path)
    amplitudes = apply_vad(pred, converted_path)

    # Inside a function a bare expression is discarded, so the player has to
    # be displayed explicitly.
    IPython.display.display(IPython.display.Audio(amplitudes, rate=16000))
    

# Show a button that runs predict() when clicked. The button's HTML repr
# registers the callback through google.colab, so this only works in Colab.
InvokeButton('Start recording', predict)