In [1]:
from pathlib import Path
import sys

path_to_import = str(Path().resolve() / "../" )
if path_to_import not in sys.path:
    sys.path.append(path_to_import)

from bcresnet import BCResNets
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F
import torchaudio
from torchvision import transforms
import numpy as np
import wave
import noisereduce as nr
import pyaudio
import os

from tqdm import tqdm
from utils import Padding, Preprocess, SpeechCommand, SplitDataset, LogMel, spec_augment

THRESHOLD = 0.85
gpu = 1
device = torch.device("cuda:%d" % gpu if torch.cuda.is_available() else "cpu")

  '"sox" backend is being deprecated. '


labels:	 {'alyona': 1, 'filler': 0}


  return torch._C._cuda_getDeviceCount() > 0


## Загрузка модели

In [4]:
model = BCResNets(int(1 * 8)).to(device)
#model.load_state_dict(torch.load("../models/go_model.pth"))
model.load_state_dict(torch.load("../models/bed_model.pth"))
#model.load_state_dict(torch.load("../models/model.pth"))
model.eval()

BCResNets(
  (cnn_head): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(2, 1), padding=(2, 2), bias=False)
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (BCBlocks): ModuleList(
    (0): ModuleList(
      (0): BCResBlock(
        (f2): Sequential(
          (0): ConvBNReLU(
            (block): Sequential(
              (0): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
          )
          (1): ConvBNReLU(
            (block): Sequential(
              (0): Conv2d(8, 8, kernel_size=(3, 1), stride=(1, 1), padding=[1, 0], dilation=[1, 1], groups=8, bias=False)
              (1): SubSpectralNorm(
                (ssnorm): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
            )
    

## Прослушивание сигнала и идентификация ключевого слова

In [26]:
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
RECORD_SECONDS = 1.2

audio = pyaudio.PyAudio()

def process_sample(sample):
    sample = torch.clamp(sample, -1.0, 1.0)
    SR = 16000
    hop_length=160
    win_length=480
    n_fft=512
    n_mels=40
    feature = LogMel(
                device,
                sample_rate=SR,
                hop_length=hop_length,
                win_length=win_length,
                n_fft=n_fft,
                n_mels=n_mels,
            )
    sample = feature(sample)  
    sample = spec_augment(sample)
    return sample

stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)

print("Recording...")

try:
    while True:
        
        # Process audio data within the recording loop
        sample = torch.zeros(1, int(RATE * RECORD_SECONDS))
        for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            audio_tensor = torch.from_numpy(np.frombuffer(data, dtype=np.int16) / 32767.0) 
            sample[0, i*CHUNK:(i+1)*CHUNK] = audio_tensor

        #print(sample.shape)
        #print("Finished recording.")

        # Process the audio tensor with feature extraction and classification
        sample = process_sample(sample)
        sample = sample.to(device)

        outputs = model(sample.unsqueeze(0))
        predictions = F.softmax(outputs)

        #print(predictions)

        if predictions[0][1].item() > 0.99:
            print("Keyword")
        else:
            pass
            print("Filler")

except KeyboardInterrupt:
    pass

stream.stop_stream()
stream.close()
audio.terminate()


Recording...




Filler
Filler
Filler
Filler
Filler
Filler
Keyword
Filler
Filler
Keyword
Filler
Filler
Keyword
Filler
Filler
Filler
Filler
Filler
Filler
Filler
Filler
Keyword
Filler
Filler
Keyword
Filler


## Записать сигнал в .wav файл и отдельно его распознать

In [2]:
def record_wav(filepath:str):
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    CHUNK = 1024
    RECORD_SECONDS = 1.0
    WAVE_OUTPUT_FILENAME = filepath

    audio = pyaudio.PyAudio()

    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)

    print("Recording...")

    frames = []

    for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("Finished recording.")

    stream.stop_stream()
    stream.close()
    audio.terminate()

    waveFile = wave.open(WAVE_OUTPUT_FILENAME, "wb")
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b"".join(frames))
    waveFile.close()

def record_wav_nr(filepath:str, stationary:bool):
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    CHUNK = 1024
    RECORD_SECONDS = 1.0
    WAVE_OUTPUT_FILENAME = filepath

    audio = pyaudio.PyAudio()

    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)

    print("Recording...")

    frames = []

    for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("Finished recording.")

    stream.stop_stream()
    stream.close()
    audio.terminate()

    # Combine audio data from frames
    audio_signal = np.frombuffer(b"".join(frames), dtype=np.int16)
    # Perform noise reduction using Noisereduce Library
    
    reduced_noise = nr.reduce_noise(y=audio_signal, sr=RATE, stationary=stationary)

    waveFile = wave.open(WAVE_OUTPUT_FILENAME, "wb")
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(reduced_noise.astype(np.int16).tobytes())
    waveFile.close()

In [58]:
def predict_wav(sample: str):
    sample, _ = torchaudio.load(sample)

    sample = torch.cat([sample, torch.zeros([sample.shape[0], 16000])], dim=-1)

    sample = torch.clamp(sample, -1.0, 1.0)
    SR = 16000
    hop_length=160
    win_length=480
    n_fft=512
    n_mels=40
    feature = LogMel(
                device,
                sample_rate=SR,
                hop_length=hop_length,
                win_length=win_length,
                n_fft=n_fft,
                n_mels=n_mels,
            )
    sample = feature(sample)  
    sample = spec_augment(sample)
    sample = sample.to(device)

    outputs = model(sample.unsqueeze(0))
    predictions = F.softmax(outputs)

    print(predictions)

    if predictions[0][1].item() > THRESHOLD:
        return "Keyword"
    else:
        return "Filler"

## Эксперименты с шумоподавлением

In [99]:
record_wav_nr("output_non_stationary_nr.wav", False)
predict_wav("output_non_stationary_nr.wav")

Recording...
Finished recording.
tensor([[0.9271, 0.0729]], grad_fn=<SoftmaxBackward>)




'Filler'

In [100]:
record_wav_nr("output_stationary_nr.wav", True)
predict_wav("output_stationary_nr.wav")


Recording...
Finished recording.
tensor([[0.2089, 0.7911]], grad_fn=<SoftmaxBackward>)




'Filler'

In [52]:
record_wav_nr("output.wav")
predict_wav("output.wav")

Recording...
Finished recording.
tensor([[0.2531, 0.7469]], grad_fn=<SoftmaxBackward>)




'Filler'

## Записать сигнал и сразу его распознать

In [124]:
import pyaudio
import torch
import torchaudio

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
RECORD_SECONDS = 1.2

audio = pyaudio.PyAudio()

def process_sample(sample):
    sample = torch.clamp(sample, -1.0, 1.0)
    SR = 16000
    hop_length=160
    win_length=480
    n_fft=512
    n_mels=40
    feature = LogMel(
                device,
                sample_rate=SR,
                hop_length=hop_length,
                win_length=win_length,
                n_fft=n_fft,
                n_mels=n_mels,
            )
    sample = feature(sample)  
    sample = spec_augment(sample)
    return sample

stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)

print("Recording...")

# Process audio data within the recording loop
sample = torch.zeros(1, int(RATE * RECORD_SECONDS))
for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    audio_tensor = torch.from_numpy(np.frombuffer(data, dtype=np.int16) / 32767.0) 
    sample[0, i*CHUNK:(i+1)*CHUNK] = audio_tensor

print("Finished recording.")

stream.stop_stream()
stream.close()
audio.terminate()

# Process the audio tensor with feature extraction and classification
sample = process_sample(sample)
sample = sample.to(device)

outputs = model(sample.unsqueeze(0))
predictions = F.softmax(outputs)

print(predictions)

if predictions[0][1].item() > 0.99:
    print("Keyword")
else:
    print("Filler")


Recording...
Finished recording.
tensor([[0.0021, 0.9979]], grad_fn=<SoftmaxBackward>)
Keyword




## Записать 100 сигналов по секунде, например, для записи шума

In [8]:
for i in range(201, 251):
    record_wav(f"records/output_{i}.wav")

Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording...
Finished recording.
Recording.

In [87]:
import os

directory = '../data/bed/final_test/bed/'
files = os.listdir(directory)
print("---Keywords---")
for file in files:
    print(file)
    print(predict_wav(directory + file))

print("---Fillers---")
directory = '../data/bed/final_test/filler/'
files = os.listdir(directory)
for file in files:
    print(file)
    print(predict_wav(directory + file))


---Keywords---
0ea0e2f4_nohash_0.wav
tensor([[0.1045, 0.8955]], grad_fn=<SoftmaxBackward>)
Keyword
0c40e715_nohash_0.wav
tensor([[0.0326, 0.9674]], grad_fn=<SoftmaxBackward>)
Keyword
2d82a556_nohash_0.wav
tensor([[0.0725, 0.9275]], grad_fn=<SoftmaxBackward>)
Keyword
1cb788bc_nohash_1.wav
tensor([[0.0194, 0.9806]], grad_fn=<SoftmaxBackward>)
Keyword
1acc97de_nohash_0.wav
tensor([[0.0264, 0.9736]], grad_fn=<SoftmaxBackward>)
Keyword
0cb74144_nohash_0.wav
tensor([[0.4991, 0.5009]], grad_fn=<SoftmaxBackward>)
Filler
2c6d3924_nohash_0.wav
tensor([[0.0033, 0.9967]], grad_fn=<SoftmaxBackward>)
Keyword
1b4c9b89_nohash_0.wav
tensor([[0.0170, 0.9830]], grad_fn=<SoftmaxBackward>)
Keyword
1cb788bc_nohash_0.wav
tensor([[0.1208, 0.8792]], grad_fn=<SoftmaxBackward>)
Keyword
0ea0e2f4_nohash_1.wav
tensor([[0.0289, 0.9711]], grad_fn=<SoftmaxBackward>)
Keyword
---Fillers---
10587.wav




tensor([[0.9960, 0.0040]], grad_fn=<SoftmaxBackward>)
Filler
0a5636ca_nohash_0.wav
tensor([[0.1131, 0.8869]], grad_fn=<SoftmaxBackward>)
Keyword
10547.wav
tensor([[0.9950, 0.0050]], grad_fn=<SoftmaxBackward>)
Filler
10546.wav
tensor([[0.9950, 0.0050]], grad_fn=<SoftmaxBackward>)
Filler
10745.wav
tensor([[0.9952, 0.0048]], grad_fn=<SoftmaxBackward>)
Filler
10551.wav
tensor([[0.9939, 0.0061]], grad_fn=<SoftmaxBackward>)
Filler
0b09edd3_nohash_0.wav
tensor([[0.3080, 0.6920]], grad_fn=<SoftmaxBackward>)
Filler
10431.wav
tensor([[0.9954, 0.0046]], grad_fn=<SoftmaxBackward>)
Filler
10742.wav
tensor([[0.9939, 0.0061]], grad_fn=<SoftmaxBackward>)
Filler
10394.wav
tensor([[0.9945, 0.0055]], grad_fn=<SoftmaxBackward>)
Filler
10585.wav
tensor([[0.9952, 0.0048]], grad_fn=<SoftmaxBackward>)
Filler
0a2b400e_nohash_3.wav
tensor([[0.4602, 0.5398]], grad_fn=<SoftmaxBackward>)
Filler
10618.wav
tensor([[0.9928, 0.0072]], grad_fn=<SoftmaxBackward>)
Filler


# Записывать сигнал и распознавать ключевое слово в потоковом режиме 

In [103]:
import pyaudio
import torch
import torchaudio
import time

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
NUM_OF_CHUNKS = 10
CHUNK = int(RATE/NUM_OF_CHUNKS)
RECORD_SECONDS = 1

audio = pyaudio.PyAudio()

def process_sample(sample):
    sample = torch.clamp(sample, -1.0, 1.0)
    SR = 16000
    hop_length=160
    win_length=480
    n_fft=512
    n_mels=40
    feature = LogMel(
                device,
                sample_rate=SR,
                hop_length=hop_length,
                win_length=win_length,
                n_fft=n_fft,
                n_mels=n_mels,
            )
    sample = feature(sample)  
    sample = spec_augment(sample)
    return sample

stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)

print("Recording...")

sample_window = int(RATE * RECORD_SECONDS) # How much samples in recorded audio
chunks_num = int(sample_window/CHUNK) # How much chunks in recorder audio

sample = torch.zeros(1, sample_window) #.to(device)
i = 0
try:
    while True:
        i+=1
        start_time = time.time()

        data = stream.read(CHUNK) # Считываем чанк аудиосигнала с микрофона
        audio_tensor = torch.from_numpy(np.frombuffer(data, dtype=np.int16) / 32767.0) # Преобразуем в тензор
        sample[0, 0:(chunks_num - 1)*CHUNK] = sample[0, CHUNK:chunks_num*CHUNK].clone() # Передвигаем значения сэмпла от второго до последнего чанка в диапазон от первого до предпоследнего чанка
        # print(f"audio_tensor shape: {audio_tensor.shape}, sample shape: {sample.shape}, sample[0, chunk] shape: {sample[0, CHUNK:chunks_num*CHUNK].shape}")
        sample[0, (chunks_num - 1)*CHUNK:chunks_num*CHUNK] = audio_tensor

        # Process the audio tensor with feature extraction and classification
        sample_processed = process_sample(sample)
        sample_processed = sample_processed.to(device)

        outputs = model(sample_processed.unsqueeze(0))
        predictions = F.softmax(outputs)
        
        end_time = time.time()
        print(f"Iteration: {i}, sample: {sample}, time to record and predict: {end_time-start_time}")
        print(predictions)

        if i >= NUM_OF_CHUNKS:
            if predictions[0][1].item() > 0.99:
                print("Keyword")
                #break
            else:
                print("Filler")

except KeyboardInterrupt:
    pass

stream.stop_stream()
stream.close()
audio.terminate()


Recording...
Iteration: 1, sample: tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.0462, -0.0603, -0.0490]]), time to record and predict: 0.1319291591644287
tensor([[0.2381, 0.7619]], grad_fn=<SoftmaxBackward>)




Iteration: 2, sample: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1284, 0.1263, 0.1300]]), time to record and predict: 0.12501144409179688
tensor([[0.1135, 0.8865]], grad_fn=<SoftmaxBackward>)
Iteration: 3, sample: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1008, 0.0947, 0.0908]]), time to record and predict: 0.1248788833618164
tensor([[0.0473, 0.9527]], grad_fn=<SoftmaxBackward>)
Iteration: 4, sample: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0033, 0.0056, 0.0147]]), time to record and predict: 0.13573336601257324
tensor([[0.0190, 0.9810]], grad_fn=<SoftmaxBackward>)
Iteration: 5, sample: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0186, 0.0266, 0.0376]]), time to record and predict: 0.020361900329589844
tensor([[0.0125, 0.9875]], grad_fn=<SoftmaxBackward>)
Iteration: 6, sample: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1136, 0.1224, 0.1243]]), time to record and predict: 0.1492922306060791
tensor([[0.0036, 0.9964]], grad_fn=<SoftmaxBackward>)
Iteration: 7, sample: tensor([[0.0000, 0.0000, 0.00

# Запись и распознавание в потоковом режиме с подавлением шумов

In [104]:
import pyaudio
import torch
import torchaudio
import time

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
NUM_OF_CHUNKS = 10
CHUNK = int(RATE/NUM_OF_CHUNKS)
RECORD_SECONDS = 1

audio = pyaudio.PyAudio()

def process_sample(sample):
    sample = torch.clamp(sample, -1.0, 1.0)
    SR = 16000
    hop_length=160
    win_length=480
    n_fft=512
    n_mels=40
    feature = LogMel(
                device,
                sample_rate=SR,
                hop_length=hop_length,
                win_length=win_length,
                n_fft=n_fft,
                n_mels=n_mels,
            )
    sample = feature(sample)  
    sample = spec_augment(sample)
    return sample

stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)

print("Recording...")

sample_window = int(RATE * RECORD_SECONDS) # How much samples in recorded audio
chunks_num = int(sample_window/CHUNK) # How much chunks in recorder audio

sample = torch.zeros(1, sample_window)
i = 0
try:
    while True:
        i+=1
        start_time = time.time()

        data = stream.read(CHUNK) # Считываем чанк аудиосигнала с микрофона
        audio_tensor = torch.from_numpy(np.frombuffer(data, dtype=np.int16) / 32767.0) # Преобразуем в тензор

        sample[0, 0:(chunks_num - 1)*CHUNK] = sample[0, CHUNK:chunks_num*CHUNK].clone() # Передвигаем значения сэмпла от второго до последнего чанка в диапазон от первого до предпоследнего чанка
        sample[0, (chunks_num - 1)*CHUNK:chunks_num*CHUNK] = audio_tensor

        nr_sample = torch.tensor(nr.reduce_noise(y=sample, sr=RATE, stationary=True))

        # Process the audio tensor with feature extraction and classification
        sample_processed = process_sample(nr_sample)
        sample_processed = sample_processed.to(device)

        outputs = model(sample_processed.unsqueeze(0))
        predictions = F.softmax(outputs)
        
        end_time = time.time()
        print(f"Iteration: {i}, sample: {sample}, time to record and predict: {end_time-start_time}")
        print(predictions)

        if i >= NUM_OF_CHUNKS:
            if predictions[0][1].item() > 0.99:
                print("Keyword")
                #break
            else:
                print("Filler")

except KeyboardInterrupt:
    pass

stream.stop_stream()
stream.close()
audio.terminate()


Recording...




Iteration: 1, sample: tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.2519, -0.1958, -0.0604]]), time to record and predict: 0.21543359756469727
tensor([[0.1037, 0.8963]], grad_fn=<SoftmaxBackward>)
Iteration: 2, sample: tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0353,  0.0142, -0.0161]]), time to record and predict: 0.06184816360473633
tensor([[0.0339, 0.9661]], grad_fn=<SoftmaxBackward>)
Iteration: 3, sample: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.2931, 0.2058, 0.0516]]), time to record and predict: 0.19604992866516113
tensor([[0.0197, 0.9803]], grad_fn=<SoftmaxBackward>)
Iteration: 4, sample: tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0670, -0.0308, -0.1085]]), time to record and predict: 0.049411773681640625
tensor([[0.7324, 0.2676]], grad_fn=<SoftmaxBackward>)
Iteration: 5, sample: tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0592,  0.0619, -0.0111]]), time to record and predict: 0.05950045585632324
tensor([[0.9860, 0.0140]], grad_fn=<SoftmaxBackward>)
Iteration: 6, sample: ten

In [3]:
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
RECORD_SECONDS = 1.0
WAVE_OUTPUT_FILENAME = "output.wav"

audio = pyaudio.PyAudio()

stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)

print("Recording...")

frames = []

for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)

print("Finished recording.")

stream.stop_stream()
stream.close()
audio.terminate()

waveFile = wave.open(WAVE_OUTPUT_FILENAME, "wb")
waveFile.setnchannels(CHANNELS)
waveFile.setsampwidth(audio.get_sample_size(FORMAT))
waveFile.setframerate(RATE)
waveFile.writeframes(b"".join(frames))
waveFile.close()


Recording...
Finished recording.


In [7]:
audio.get_sample_size(FORMAT)

2