In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
import numpy as np
import pyaudio
import time
import librosa
import wave
import queue
import IPython.display as ipd
from IPython.display import Audio
import soundfile as sf
import os
import threading
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [2]:
best_model = load_model("GRU_model_17_7.h5", compile=False)

In [3]:
# Hàm trích xuất đặc trưng MFCC
def features_extractor(file_name):
    audio, sample_rate = librosa.load(file_name)

    # Tính toán số frame trong mỗi frame có độ dài 25ms với sampling rate là 16000
    frame_length = int(0.025 * sample_rate)
    # Tính toán số sample overlap trong frame overlap 10ms với sampling rate là 16000
    hop_length = int(0.01 * sample_rate)
    # Tính toán kích thước FFT (Fast Fourier Transform) trong hàm mfcc
    n_fft = 2 ** int(np.ceil(np.log2(frame_length)))

    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mfcc=13)
    mfccs = np.zeros((13,101))
    mfccs[:mfcc.shape[0], :mfcc.shape[1]]=mfcc
    delta_mfccs = librosa.feature.delta(mfccs)
    delta2_mfccs = librosa.feature.delta(mfccs, order=2)
    mfccs_features = np.concatenate((mfccs, delta_mfccs, delta2_mfccs))
    mfccs_features = np.transpose(mfccs_features)

    return mfccs_features

In [4]:
# Hàm dự đoán số 
def mic_check(file_name):
    mfccs_features = features_extractor(file_name)
    data = np.stack(mfccs_features)
    data = np.reshape(data,(1,101,39))
    pre_test = best_model.predict(data, verbose=0)
    s=0
    result = 0
    for i in pre_test[0]:
        if i > 5.0e-1:
            result = i
            break
        s=s+1
    # có số với xác suất lớn hơn 0.5 mới đưa ra màn hình.
    if s != 10:
        if s == 0:
            print("Predict: Eight")
        elif s == 1:
            print("Predict: Five")
        elif s == 2:
            print("Predict: Four")
        elif s == 3:
            print("Predict: Nine")
        elif s == 4:
            print("Predict: One")
        elif s == 5:
            print("Predict: Seven")
        elif s == 6:
            print("Predict: Six")
        elif s == 7:
            print("Predict: Three")
        elif s == 8:
            print("Predict: Two")
        elif s == 9:
            print("Predict: Zero")
        else:
            print("Unrecognizable")
        print('Reliability: %.2f %%' %(result*100.00))
        display(Audio(file_name))

In [5]:
# Định nghĩa các hằng số
RATE = 16000
CHUNK = 1024
FORMAT = pyaudio.paInt16
THRESHOLD_ZCR = 0.1
# TIME = True
# Khởi tạo Queue
audio_queue = queue.Queue()

In [6]:
# Hàm của luồng 1
def record_audio():
    global TIME
    print("Recording audio...")
    i = 0
    frames = []
    frames_to_write = []
    #Khởi tạo Audio
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=1,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    while True:
        # Lấy một khung dữ liệu
        data = stream.read(CHUNK)
        frames.append(data)
        # Tính năng Zero-Crossing Rate
        x = np.frombuffer(data, dtype=np.int16)
        zrca = []
        zrca = np.where(np.diff(np.sign(x)))[0]
        zero_crossing_rate = len(zrca)/len(x)
        # Nếu có giọng nói, thì cắt đoạn âm thanh và lưu vào file wav
        if  zero_crossing_rate > THRESHOLD_ZCR:
            # Lấy thời gian bắt đầu của đoạn âm thanh
            start_time = frames.index(data)
            # Cắt đoạn âm thanh từ lúc nó vượt ngưỡng tới lúc nó dưới ngưỡng lần đầu tiên
            end_time = start_time
            while zero_crossing_rate > THRESHOLD_ZCR:
                end_time += 1
                data = stream.read(CHUNK)
                frames.append(data)
                x = np.frombuffer(data, dtype=np.int16)
                zrca = []
                zrca = np.where(np.diff(np.sign(x)))[0]
                zero_crossing_rate = len(zrca)/len(x)
            #Tính toán và đưa ra khoảng frames bắt đầu đến kết thúc
            end_frame = end_time * CHUNK
            frames_to_write = frames[start_time-5:end_frame]
            # lưu file âm thanh vào file wav
            i= i+1
            filename =  "speech_{}".format(i)+".wav"
#             print(filename)
            # Lưu file âm thanh thu được
            with wave.open(filename, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(p.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames_to_write))

            # đưa file wav vào queue
            audio_queue.put(filename)


    

In [7]:
# Hàm của luồng 2
def process_audio():
    while (True):
        # Kiểm tra xem nếu queue không trống thì thực hiện đưa file ra và dự đoán số
        if not audio_queue.empty():
            # Lấy file âm thanh từ Queues
            filename = audio_queue.get()
            # dự đoán số
            mic_check(filename)


In [None]:
# Khởi tạo luồng 1 và luồng 2
thread1 = threading.Thread(target=record_audio)
thread2 = threading.Thread(target=process_audio)
thread1.start()
thread2.start()
thread1.join()
thread2.join()


Recording audio...
Predict: One
Reliability: 99.44 %


Predict: One
Reliability: 81.63 %


Predict: Three
Reliability: 70.88 %


Predict: Five
Reliability: 99.99 %


Predict: Three
Reliability: 97.22 %


Predict: Three
Reliability: 50.97 %


Predict: Nine
Reliability: 66.13 %


Predict: Five
Reliability: 55.17 %


Predict: Seven
Reliability: 95.82 %


Predict: Eight
Reliability: 98.93 %


Predict: Nine
Reliability: 99.78 %


Predict: Nine
Reliability: 88.63 %


Predict: Nine
Reliability: 94.40 %


Predict: One
Reliability: 90.59 %


Predict: Eight
Reliability: 66.22 %


Predict: One
Reliability: 98.25 %


Predict: One
Reliability: 99.57 %


Predict: Eight
Reliability: 98.44 %


Predict: Nine
Reliability: 92.36 %


Predict: One
Reliability: 99.95 %


Predict: Eight
Reliability: 64.84 %


Predict: Eight
Reliability: 95.07 %


Predict: Two
Reliability: 50.88 %


Predict: One
Reliability: 99.99 %


Predict: Eight
Reliability: 95.01 %


Predict: Two
Reliability: 94.06 %


Predict: Eight
Reliability: 95.82 %


Predict: Three
Reliability: 85.95 %


Predict: Three
Reliability: 69.27 %


Predict: Two
Reliability: 98.73 %


Predict: Nine
Reliability: 68.29 %


Predict: Nine
Reliability: 78.41 %


Predict: Five
Reliability: 55.58 %


Predict: Five
Reliability: 63.61 %


Predict: Four
Reliability: 51.37 %


Predict: Four
Reliability: 80.40 %


Predict: Five
Reliability: 93.02 %


Predict: Four
Reliability: 55.22 %


Predict: Five
Reliability: 54.57 %


Predict: Six
Reliability: 70.68 %


Predict: Five
Reliability: 53.20 %


Predict: Five
Reliability: 99.86 %


Predict: Three
Reliability: 99.55 %


Predict: Five
Reliability: 99.79 %


Predict: Six
Reliability: 77.89 %


Predict: Seven
Reliability: 79.01 %


Predict: Eight
Reliability: 99.97 %


Predict: Nine
Reliability: 99.93 %


Predict: One
Reliability: 99.96 %


Predict: One
Reliability: 99.82 %


Predict: Two
Reliability: 84.60 %


Predict: Zero
Reliability: 52.13 %


Predict: Three
Reliability: 95.71 %


Predict: Two
Reliability: 84.32 %


Predict: Zero
Reliability: 99.99 %


Predict: One
Reliability: 66.95 %


Predict: Nine
Reliability: 96.51 %


Predict: Three
Reliability: 99.11 %


Predict: Two
Reliability: 91.10 %


Predict: Three
Reliability: 74.05 %


Predict: Four
Reliability: 84.16 %


Predict: Five
Reliability: 99.89 %


Predict: Five
Reliability: 99.95 %


Predict: Three
Reliability: 95.83 %


Predict: Six
Reliability: 97.99 %


Predict: Three
Reliability: 69.81 %


Predict: Three
Reliability: 99.97 %


Predict: Six
Reliability: 95.89 %


Predict: Nine
Reliability: 83.39 %


Predict: One
Reliability: 99.47 %


Predict: Two
Reliability: 87.38 %


Predict: Three
Reliability: 60.96 %


Predict: Five
Reliability: 71.64 %


Predict: Two
Reliability: 98.41 %


Predict: Three
Reliability: 99.77 %


Predict: Four
Reliability: 52.85 %


Predict: Five
Reliability: 99.92 %


Predict: Five
Reliability: 99.87 %


Predict: Five
Reliability: 94.87 %


Predict: Zero
Reliability: 100.00 %


Predict: One
Reliability: 99.17 %


Predict: Three
Reliability: 70.90 %


Exception in thread Thread-6 (process_audio):
Traceback (most recent call last):
  File "F:\Anaconda\Setup\envs\tensoflow\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "F:\Anaconda\Setup\envs\tensoflow\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_13308\3536641059.py", line 9, in process_audio
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_13308\279495169.py", line 3, in mic_check
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_13308\3702175278.py", line 14, in features_extractor
ValueError: could not broadcast input array from shape (13,161) into shape (13,101)
