In [1]:
import numpy as np
import IPython
import pyaudio
import wave
import time

In [2]:

# Recording configuration shared by the capture, playback and export cells.
CHUNK = 16000                  # samples fetched per stream.read() call (one second at RATE)
FORMAT = pyaudio.paFloat32     # samples are captured as 32-bit floats
CHANNELS = 1
RATE = 16000                   # sampling rate in Hz
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output_1.wav"

p = pyaudio.PyAudio()
stream = None
frames = []  # raw byte chunks, one entry per stream.read() call

try:
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("* recording")

    # Append directly instead of binding a global named `data`, which would
    # clobber the `torch.utils.data as data` alias if this cell is re-run later.
    for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
        frames.append(stream.read(CHUNK))

    print("* done recording")
finally:
    # Always release the audio device, even when opening or reading fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

* recording
* done recording


In [174]:
# Record until the kernel is interrupted (Kernel -> Interrupt), keeping every chunk.
p = pyaudio.PyAudio()
stream = None
frames = []  # raw byte chunks, one entry per stream.read() call

try:
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("* recording")

    while True:
        frames.append(stream.read(CHUNK))
except KeyboardInterrupt:
    # The interrupt is the expected way to stop; anything else propagates.
    print("* done recording")
finally:
    # Cleanup used to run only on KeyboardInterrupt, leaking the device on any
    # other error; `finally` releases it on every exit path.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

* recording
* done recording


In [69]:
# Iterator over the recorded chunks; the manual playback cell pulls one chunk per run via next(tmp).
tmp = iter(frames)

In [180]:
# Audio widget for one recorded chunk (index 4), with its raw bytes decoded as float32 samples.
IPython.display.Audio(np.frombuffer(frames[4], np.float32), rate = 16000)

In [3]:
# Audio widget for the whole recording: all chunks concatenated and decoded as float32 samples.
IPython.display.Audio(np.frombuffer(b''.join(frames), np.float32), rate = 16000)

In [101]:
# The capture format is 32-bit float, but the standard-library `wave` module
# writes an integer-PCM header. Dumping the raw float bytes with a 4-byte sample
# width therefore produces a file that players decode as noise. Convert the
# samples to 16-bit signed PCM before writing.
samples = np.frombuffer(b''.join(frames), dtype=np.float32)
pcm16 = (np.clip(samples, -1.0, 1.0) * 32767).astype('<i2')

# The context manager closes the file even if a write fails.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(2)  # bytes per int16 sample
    wf.setframerate(RATE)
    wf.writeframes(pcm16.tobytes())

In [108]:
import array
# 'f' is the 4-byte C float typecode and matches the float32 capture format.
# The previous 'd' (8-byte double) reinterpreted pairs of samples as single
# meaningless values without raising an error.
nums = array.array('f', frames[0])

In [None]:
# Play the whole recording back through the default output device.
play = pyaudio.PyAudio()
stream_play = None
try:
    stream_play = play.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            output=True)
    # `chunk` rather than `data`, so the `torch.utils.data as data` alias is not shadowed.
    for chunk in frames:
        stream_play.write(chunk)
finally:
    # Release the output device even if opening or writing fails.
    if stream_play is not None:
        stream_play.stop_stream()
        stream_play.close()
    play.terminate()

In [71]:
# Open an output stream that stays open across cells, so chunks can be played one at a time.
# It is released by the separate stop/close/terminate cell.
play=pyaudio.PyAudio()
stream_play=play.open(format=FORMAT,
                      channels=CHANNELS,
                      rate=RATE,
                      output=True)

In [75]:
# Play the next chunk; re-run this cell to step through the recording.
# Assumes `tmp` is still the iter(frames) iterator (another cell rebinds `tmp` to an array);
# next() raises StopIteration once the iterator is exhausted.
stream_play.write(next(tmp))

In [76]:
# Release the manually opened playback stream and its PyAudio handle.
stream_play.stop_stream()
stream_play.close()
play.terminate()

In [12]:
import torch
from torch import nn
import torch.utils.data as data

class SpeechResModel(nn.Module):
    """Residual 2-D CNN classifier over a (batch, time, features) input.

    Pipeline: add a channel axis -> ``conv0`` (+ optional average pooling) ->
    ``n_layers`` 3x3 convolutions, each followed by a non-affine batch norm,
    with a residual connection closed after every second convolution ->
    mean over all spatial positions -> linear layer producing ``n_labels`` logits.

    Args:
        n_labels: number of output classes.
        n_feature_maps: channel count of every convolution.
        n_layers: number of convolutions after ``conv0``.
        dilation: when true, layer ``i`` uses dilation (and padding) ``2 ** (i // 3)``;
            otherwise dilation and padding are 1.
        res_pool: kernel size for the average pooling after ``conv0``; falsy disables it.
    """

    def __init__(self, n_labels, n_feature_maps=45, n_layers=26, dilation=True, res_pool=2):
        super().__init__()
        self.n_labels = n_labels
        self.n_maps = n_feature_maps
        self.conv0 = nn.Conv2d(1, self.n_maps, (3, 3), padding=(1, 1), bias=False)
        self.avg_pool = res_pool
        if res_pool:
            self.pool = nn.AvgPool2d(res_pool)

        self.n_layers = n_layers
        # Deliberately a plain list (not nn.ModuleList): the layers are registered
        # below as "conv{i}" / "bn{i}", and those names are the state-dict keys that
        # saved checkpoints use. Registering the list as well would add "convs.*" keys.
        if dilation:
            # Padding equals dilation so a 3x3 kernel keeps the spatial size.
            self.convs = [nn.Conv2d(self.n_maps, self.n_maps, (3, 3), padding=int(2**(i // 3)), dilation=int(2**(i // 3)),
                bias=False) for i in range(n_layers)]
        else:
            self.convs = [nn.Conv2d(self.n_maps, self.n_maps, (3, 3), padding=1, dilation=1,
                bias=False) for _ in range(n_layers)]
        for i, conv in enumerate(self.convs):
            self.add_module("bn{}".format(i + 1), nn.BatchNorm2d(self.n_maps, affine=False))
            self.add_module("conv{}".format(i + 1), conv)
        self.output = nn.Linear(self.n_maps, self.n_labels)

    def forward(self, x):
        """Return logits of shape (batch, n_labels) for an input of shape (batch, H, W)."""
        x = x.unsqueeze(1)  # insert the single input channel expected by conv0
        for i in range(self.n_layers + 1):
            # torch.relu is equivalent to torch.nn.functional.relu and does not depend
            # on the alias `F`, which this cell never imports.
            y = torch.relu(getattr(self, "conv{}".format(i))(x))
            if i == 0:
                if self.avg_pool:
                    y = self.pool(y)
                old_x = y
            if i > 0 and i % 2 == 0:
                # Close a residual block after every second convolution.
                x = y + old_x
                old_x = x
            else:
                x = y
            if i > 0:
                x = getattr(self, "bn{}".format(i))(x)
        x = x.view(x.size(0), x.size(1), -1)  # shape: (batch, feats, positions)
        x = torch.mean(x, 2)                   # global average over positions
        return self.output(x)

class SpeechDataset(data.Dataset):
    """Keyword-spotting dataset that returns (features, label-index) pairs.

    Indices ``0 .. len(audio_files) - 1`` map to the listed audio files; the
    ``n_silence`` indices after that yield synthetic silence examples (label 0).
    Label 1 is used for every file whose directory name is not in ``labels_set``.

    Args:
        data: relative file paths of the form ``"<label>/<file>"``.
        set_type: ``'train'`` enables augmentation and occasional cache bypass.
        noise_path: directory whose ``*wav`` files are loaded as background noise.
        labels_set: keyword labels; they receive indices starting at 2.
        base_path: directory the paths in ``data`` are relative to.
        unknown_prob: stored on the instance (not used by the code in this class).
        silence_prob: ratio of silence examples to known-keyword examples.
        noise_prob: probability of mixing background noise into a non-silence example.
        timeshift_ms: stored on the instance (not used by the code in this class).
        input_length: number of samples every example is padded or truncated to.
        n_mels, n_mfcc, hop_ms: MFCC front-end parameters (hop is ``16 * hop_ms`` samples).
    """

    LABEL_SILENCE = "__silence__"
    LABEL_UNKNOWN = "__unknown__"

    def __init__(self, data, set_type, noise_path, labels_set, base_path, unknown_prob=0.1, silence_prob=0.1, noise_prob=0.8, timeshift_ms=100, input_length=16000, n_mels=40, n_mfcc=40, hop_ms=10):
        super().__init__()
        self.base_path = base_path
        self.noise_path = noise_path
        self.audio_files = data
        self.set_type = set_type

        # The label is the part of the path before the first '/'.
        labels = [path[:path.find('/')] for path in data]
        self.label2ind = {word: i + 2 for i, word in enumerate(labels_set)}
        self.label2ind.update({self.LABEL_SILENCE: 0, self.LABEL_UNKNOWN: 1})
        self.audio_labels = [self.label2ind.get(label, 1) for label in labels]
        self.n_mfcc = n_mfcc

        bg_noise_files = [name for name in listdir(noise_path) if name.endswith("wav")]
        self.bg_noise_audio = [sf.read(os.path.join(noise_path, name))[0] for name in bg_noise_files]
        self.unknown_prob = unknown_prob
        self.silence_prob = silence_prob
        self.noise_prob = noise_prob
        self.input_length = input_length
        self.timeshift_ms = timeshift_ms

        # Unbounded in-memory caches: raw waveforms per file and computed features per example.
        self._file_cache = {}
        self._audio_cache = {}

        n_unk = self.audio_labels.count(1)
        self.n_silence = int(self.silence_prob * (len(self.audio_labels) - n_unk))

        self.augment = Compose([
            TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            Shift(min_fraction=-0.1, max_fraction=0.1, p=0.5, rollover=False),
        ])

        self.audio_transforms = nn.Sequential(
            torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=n_mfcc, melkwargs={'hop_length': 16 * hop_ms,
                                                                                "center": True, 'n_mels': n_mels}),
            torchaudio.transforms.SlidingWindowCmn(cmn_window=600, norm_vars=True, center=True)
        )

    def _sample_noise(self, in_len):
        """Return an ``in_len``-sample background-noise clip (zeros if no noise is loaded)."""
        if not self.bg_noise_audio:
            return np.zeros(in_len)
        noise = random.choice(self.bg_noise_audio)
        max_start = len(noise) - in_len - 1
        if max_start >= 0:
            start = random.randint(0, max_start)
            return noise[start:start + in_len]
        # A recording no longer than the input used to crash randint() with an
        # empty range; use what is available and zero-pad the rest.
        clip = noise[:in_len]
        return np.pad(clip, (0, in_len - len(clip)), "constant")

    def load_audio(self, example, silence=False):
        """Return the feature tensor for ``example`` (or for silence), reshaped to (-1, n_mfcc).

        Cached features are always reused outside training, and with probability
        0.7 during training; otherwise the example is recomputed and the cache
        entry overwritten.
        """
        if silence:
            example = self.LABEL_SILENCE
        if random.random() < 0.7 or not self.set_type == 'train':
            try:
                return self._audio_cache[example]
            except KeyError:
                pass
        in_len = self.input_length
        bg_noise = self._sample_noise(in_len)

        if silence:
            audio = np.zeros(in_len, dtype=np.float32)
        else:
            file_data = self._file_cache.get(example)
            audio = sf.read(os.path.join(self.base_path, example))[0] if file_data is None else file_data
            audio = audio.astype(np.float32)
            self._file_cache[example] = audio
        # Force exactly in_len samples: clips longer than the input were previously
        # left untouched, which broke the noise mix below (shape mismatch).
        audio = audio[:in_len]
        audio = np.pad(audio, (0, max(0, in_len - len(audio))), "constant")
        if self.set_type == 'train':
            audio = self.augment(samples=audio, sample_rate=16000)

        if random.random() < self.noise_prob or silence:
            a = random.random() * 0.1  # noise gain in [0, 0.1)
            audio = np.clip(a * bg_noise + audio, -1, 1)

        torch_audio = torch.from_numpy(audio).float()
        # NOTE(review): this is a reshape, not a transpose, of the transform output.
        # Inference code must reproduce the same layout to match how features were produced here.
        transform_audio = self.audio_transforms(torch_audio).reshape(-1, self.n_mfcc)
        self._audio_cache[example] = transform_audio
        return transform_audio

    def __getitem__(self, index):
        """Return ``(features, label)``; indices past the file list yield silence (label 0)."""
        if index >= len(self.audio_labels):
            return self.load_audio(None, silence=True), torch.tensor(0)
        return self.load_audio(self.audio_files[index]), torch.tensor(self.audio_labels[index])

    def __len__(self):
        """Number of file-backed examples plus the synthetic silence examples."""
        return len(self.audio_labels) + self.n_silence

In [163]:
# Keyword vocabulary. The classifier has two extra outputs because class
# indices 0 and 1 are reserved and the keywords start at index 2.
labels_set = [
    'down', 'go', 'left', 'no', 'off',
    'on', 'right', 'stop', 'up', 'yes',
]
device = torch.device('cpu')

# Build the non-dilated network, then restore the trained weights on the CPU.
# Another checkpoint seen alongside this one: model_checpoint_20211101_125446_20.pt
model = SpeechResModel(n_labels=len(labels_set) + 2, dilation=False).float()
checkpoint = torch.load('../model_checpoint_20211101_141912_37.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Switch to inference mode; as the last expression this also displays the architecture.
model.eval()

SpeechResModel(
  (conv0): Conv2d(1, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (pool): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (bn1): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv1): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn2): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv2): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn3): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv3): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn4): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv4): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn5): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv5): Conv2d(45, 45, kernel_s

In [164]:
from os import listdir
import soundfile as sf
import os
from audiomentations import Compose, TimeStretch, PitchShift, Shift
import torchaudio
import random
import torch.nn.functional as F

all_samples = []
base_dir = "../dataset/"
test_file_path = "../dataset/testing_list.txt"
noise_path = "../dataset/_background_noise_/"

with open(test_file_path) as file:
    list_samples = file.read()

# One relative path per line. splitlines() plus a blank-line filter replaces
# split('\n')[:-1], which silently dropped the final sample whenever the list
# file did not end with a newline.
test_samples = [line for line in list_samples.splitlines() if line.strip()]


test_dataset = SpeechDataset(test_samples, 'test', noise_path, labels_set, base_dir)

In [165]:
import onnx
import onnxruntime

In [166]:
# Dummy batch used to trace the network: one example of 101 x 40 features.
x = torch.randn(1, 101, 40, requires_grad=False)
torch_out = model(x)

# Export settings: embed the trained weights, target opset 10, fold constants,
# and keep the batch dimension of both input and output dynamic.
export_options = dict(
    export_params=True,
    opset_version=10,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={'input': {0: 'batch_size'},
                  'output': {0: 'batch_size'}},
)
torch.onnx.export(model, x, "key-spotting.onnx", **export_options)

# Reload the exported graph and let the ONNX checker validate it.
onnx_model = onnx.load("key-spotting.onnx")
onnx.checker.check_model(onnx_model)

In [187]:
# Audio widget for the whole recording: all chunks concatenated and decoded as float32 samples.
IPython.display.Audio(np.frombuffer(b''.join(frames), np.float32), rate = 16000)

In [181]:
# Stack the recording as one row per captured chunk. Using len(frames) instead of
# a hard-coded 5 keeps this correct for recordings of any length (for example the
# record-until-interrupt cell); every chunk holds the same number of samples.
tmp = np.frombuffer(b''.join(frames), np.float32).reshape(len(frames), -1)

In [182]:
# np.frombuffer returns a read-only view; copy it so torch gets a writable buffer
# and does not warn about non-writable arrays.
torch_audio = torch.from_numpy(tmp.copy()).float()

# Same front-end configuration as SpeechDataset (40 MFCCs, 40 mel bins, hop of 160 samples).
audio_transforms = nn.Sequential(
            torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=40, melkwargs={'hop_length': 16 * 10,
                                                                                    "center": True, 'n_mels': 40}),
            torchaudio.transforms.SlidingWindowCmn(cmn_window=600, norm_vars=True, center=True)
        )

# Use the actual batch size instead of a hard-coded 5 so any number of chunks works.
# NOTE(review): this is a reshape, not a transpose, of the transform output. It mirrors
# SpeechDataset.load_audio, so it is kept as-is to match that feature layout.
transform_audio = audio_transforms(torch_audio).reshape(torch_audio.shape[0], -1, 40)

In [183]:
def to_numpy(tensor):
    """Return ``tensor`` as a NumPy array on the CPU, detaching it from autograd first if it tracks gradients."""
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

In [184]:
# Run the exported model with ONNX Runtime on the CPU.
providers = ['CPUExecutionProvider']
ort_session = onnxruntime.InferenceSession("key-spotting.onnx", providers=providers)

# Feed the features under the graph's first (and only declared) input name.
input_name = ort_session.get_inputs()[0].name
ort_inputs = {input_name: to_numpy(transform_audio)}
ort_outs = ort_session.run(None, ort_inputs)

# First output holds the per-class scores, one row per input example.
pred = ort_outs[0]

In [185]:
# Display the raw per-class scores together with the index of the highest score in each row.
pred, pred.argmax(1)

(array([[-3.5766153,  2.0491514,  1.5030813,  4.407857 , -3.4799247,
          7.6848354, -4.549578 , -4.7306604, -4.616682 , -3.0827565,
         -4.5187564, -1.010803 ],
        [-4.247282 ,  4.824547 , -3.7522433, -4.713198 ,  4.7570972,
         -2.3865607, -3.7694101, -2.5073724,  0.5022329, -3.6036496,
         -3.7709281, -1.4181529],
        [-3.9763083,  1.8189995,  1.0820456,  1.3266548, -1.59573  ,
          8.811736 , -5.5387363, -4.8127456, -1.2852377, -4.2848864,
         -4.9031878, -0.7117934],
        [-3.6445866,  4.3833275,  3.0583394, -2.5316904, -4.527086 ,
         -3.2883294, -1.5272462,  5.1630735, -7.0954447, -2.5663795,
         -2.7225277, -5.7355423],
        [-3.146779 ,  4.255214 , -4.517156 , -6.0649843,  2.4686706,
         -4.0379057, -3.0143223, -2.4878173,  7.038826 , -3.605792 ,
         -4.0020375, -2.2209773]], dtype=float32),
 array([5, 1, 5, 7, 8]))

In [186]:
# Translate each predicted class index to its keyword. Indices 0 and 1 have no
# entry in labels_set (keywords start at index 2), so they are printed as numbers.
for class_idx in pred.argmax(1):
    print(class_idx if class_idx in (0, 1) else labels_set[class_idx - 2])

no
1
no
on
right
