In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
import numpy as np
import pyaudio
import time
import librosa
import wave
import queue
import IPython.display as ipd
from IPython.display import Audio
import soundfile as sf
import os
import threading
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [2]:
# Load the saved Keras model used by mic_check() for prediction.
# compile=False asks Keras not to compile the model after loading; only
# predict() is called on it in this notebook.
best_model = load_model("GRU_model_17_7.h5", compile=False)

In [3]:
def features_extractor(file_name):
    """Turn one audio file into a fixed-size MFCC feature matrix.

    Thirteen MFCCs are computed with a 10 ms hop, zero-padded or truncated to
    exactly 101 frames, and extended with first- and second-order deltas,
    giving 39 features per frame.

    Parameters
    ----------
    file_name : str or path-like
        Audio file readable by ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(101, 39)``; columns are ``[mfcc | delta | delta2]``.
    """
    n_mfcc = 13
    max_frames = 101

    # NOTE(review): no sr= is passed, so librosa resamples to its own default
    # rate rather than keeping the 16 kHz the recorder cell writes. The sizes
    # below follow whatever rate is returned. Left unchanged because the model
    # was presumably trained on features produced this way - confirm.
    audio, sample_rate = librosa.load(file_name)

    # Number of samples in 25 ms at the loaded sample rate
    frame_length = int(0.025 * sample_rate)
    # Hop between successive frames: 10 ms worth of samples
    hop_length = int(0.01 * sample_rate)
    # FFT size: next power of two >= frame_length. win_length is not passed to
    # mfcc(), so n_fft is also what the analysis window length is derived from.
    n_fft = 2 ** int(np.ceil(np.log2(frame_length)))

    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc)

    # Fit to a fixed width: short clips keep trailing zero columns, long clips
    # are cut at max_frames. (Assigning the untruncated array used to raise
    # ValueError for any clip that produced more than max_frames frames.)
    n_frames = min(mfcc.shape[1], max_frames)
    mfccs = np.zeros((n_mfcc, max_frames))
    mfccs[:, :n_frames] = mfcc[:, :n_frames]

    # Deltas are computed on the padded/truncated matrix, as before
    delta_mfccs = librosa.feature.delta(mfccs)
    delta2_mfccs = librosa.feature.delta(mfccs, order=2)

    # Stack to (39, max_frames), then transpose to time-major (max_frames, 39)
    mfccs_features = np.concatenate((mfccs, delta_mfccs, delta2_mfccs))
    return np.transpose(mfccs_features)

In [4]:
def mic_check(file_name):
    """Classify one audio file with ``best_model`` and print the outcome.

    The features from ``features_extractor`` are reshaped into a single-item
    batch of shape (1, 101, 39) and passed to ``best_model.predict``. The first
    output score above 0.5 selects the message to print; when no score passes
    the threshold (for a 10-way output) "Unrecognizable" is printed. A second
    line reports the selected score as a percentage (0 when nothing matched).

    Parameters
    ----------
    file_name : str
        Path of the audio file to classify.
    """
    # Message per output position; presumably the class order the model was
    # trained with - verify against the training notebook.
    messages = (
        "Predict: Eight",
        "Predict: Five",
        "Predict: Four",
        "Predict: Nine",
        "Predict: One",
        "Predict: Seven",
        "Predict: Six",
        "Predict: Three",
        "Predict: Two",
        "Predict: Zero",
    )

    features = features_extractor(file_name)
    batch = np.stack(features).reshape(1, 101, 39)
    scores = best_model.predict(batch, verbose=0)[0]

    # First position whose score clears the threshold, if any
    match = next(
        ((position, score) for position, score in enumerate(scores) if score > 5.0e-1),
        None,
    )
    if match is None:
        # Nothing matched: the position runs off the end of the score vector
        position, confidence = len(scores), 0
    else:
        position, confidence = match

    if position < len(messages):
        print(messages[position])
    else:
        print("Unrecognizable")
    print('Reliability: %.2f %%' %(confidence*100.00))

In [5]:
# Define constants
RATE = 16000  # sample rate used for the PyAudio input stream and the WAV header
CHUNK = 1024  # frames requested per stream.read() call / frames_per_buffer
FORMAT = pyaudio.paInt16  # sample format passed to PyAudio; chunks are decoded as int16
THRESHOLD_ZCR = 0.1  # a chunk whose zero-crossing rate exceeds this starts/continues a segment
TIME = True  # shared flag: record_audio() sets it False when done, process_audio() loops while True
TIME_AUDIO = 30  # seconds the outer recording loop keeps running
# Queue of WAV filenames handed from record_audio() to process_audio()
audio_queue = queue.Queue()

In [6]:
def _zero_crossing_rate(data):
    """Return the fraction of positions in a raw int16 chunk where the sign changes."""
    samples = np.frombuffer(data, dtype=np.int16)
    if len(samples) == 0:
        return 0.0
    return len(np.where(np.diff(np.sign(samples)))[0]) / len(samples)


# Function for recording audio
def record_audio():
    """Record from the microphone and save each detected segment to a WAV file.

    Audio is read in CHUNK-sized pieces for TIME_AUDIO seconds. When a chunk's
    zero-crossing rate exceeds THRESHOLD_ZCR a segment starts; it continues
    until a chunk falls back to or below the threshold (or the time limit is
    reached). Each segment, together with up to 5 preceding chunks of context
    and the chunk that ended it, is written to ``speech_<n>.wav`` in the
    current directory and its filename is put on ``audio_queue``.

    Side effects
    ------------
    Sets the module-level ``TIME`` flag to False when recording ends - also on
    error, so a consumer waiting on the flag is never left hanging - and always
    releases the audio stream and the PyAudio instance.
    """
    global TIME
    print("Recording audio...")
    pre_roll = 5  # chunks of context kept before the first above-threshold chunk
    segment_count = 0
    frames = []
    p = None
    stream = None
    try:
        p = pyaudio.PyAudio()
        stream = p.open(format=FORMAT,
                        channels=1,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        ten_time = time.time()
        while time.time() - ten_time < TIME_AUDIO:
            # Read one chunk of audio
            data = stream.read(CHUNK)
            frames.append(data)

            # Below the threshold: nothing to cut out, keep listening
            if _zero_crossing_rate(data) <= THRESHOLD_ZCR:
                continue

            # The chunk just appended starts the segment. Use its position
            # directly; searching the list by value is O(n) and could match an
            # earlier chunk with identical bytes.
            start = len(frames) - 1

            # Keep reading until the rate drops back to the threshold. The time
            # limit is checked here too so continuous noise cannot keep the
            # recorder running past TIME_AUDIO.
            while time.time() - ten_time < TIME_AUDIO:
                data = stream.read(CHUNK)
                frames.append(data)
                if _zero_crossing_rate(data) <= THRESHOLD_ZCR:
                    break

            # Clamp the pre-roll at 0: a negative start index would count from
            # the END of the list and drop the beginning of the utterance.
            # The slice runs through the last chunk read, i.e. it includes the
            # chunk that ended the segment.
            frames_to_write = frames[max(0, start - pre_roll):]

            # Save the segment to a WAV file
            segment_count += 1
            filename =  "speech_{}".format(segment_count)+".wav"
            print(filename)
            with wave.open(filename, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(p.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames_to_write))

            # Put the filename into the audio queue
            audio_queue.put(filename)
    finally:
        # Signal the consumer first, then release the audio resources
        TIME = False
        print("Stop recording audio...\n")
        if stream is not None:
            stream.stop_stream()
            stream.close()
        if p is not None:
            p.terminate()

    

In [7]:
# Function for processing audio files
def process_audio():
    """Classify and play back every recorded file placed on ``audio_queue``.

    Filenames are taken from the queue one at a time, passed to ``mic_check``
    and shown as an ``Audio`` widget. The loop ends once the recorder has
    cleared the module-level ``TIME`` flag AND the queue has been drained, so
    files recorded just before the end are still processed. ``TIME`` is then
    reset to True and a closing message is printed.
    """
    global TIME
    while True:
        # Sample the flag BEFORE polling the queue: the recorder enqueues its
        # last file before clearing TIME, so an empty poll that began after the
        # flag was cleared means nothing more can arrive.
        still_recording = TIME
        try:
            # Block briefly instead of spinning on empty(); a busy loop burns
            # CPU and competes with the recording thread.
            filename = audio_queue.get(timeout=0.1)
        except queue.Empty:
            if not still_recording:
                break
            continue

        # Run the model on the file and print its prediction
        mic_check(filename)

        display(Audio(filename))
    TIME = True
    print("Ending Multi-tasking")
    

In [10]:

# Thread1 runs record_audio(): captures microphone audio and enqueues WAV filenames
thread1 = threading.Thread(target=record_audio)
# Thread2 runs process_audio(): takes filenames from the queue and classifies them
thread2 = threading.Thread(target=process_audio)
thread1.start()
thread2.start()
# Block this cell until both the recorder and the processor have finished
thread1.join()
thread2.join()

Recording audio...
speech_1.wav
Predict: One
Reliability: 89.35 %


speech_2.wav
Predict: Nine
Reliability: 99.78 %


speech_3.wav
Predict: Eight
Reliability: 99.88 %


speech_4.wav
Predict: Six
Reliability: 63.09 %


speech_5.wav
Unrecognizable
Reliability: 0.00 %


speech_6.wav
Predict: Three
Reliability: 99.94 %


speech_7.wav
Predict: Three
Reliability: 98.75 %


speech_8.wav
Predict: Three
Reliability: 98.10 %


speech_9.wav
Predict: Eight
Reliability: 85.52 %


Stop recording audio...
Ending Multi-tasking

