In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
import numpy as np
import pyaudio
import time
import librosa
import wave
import queue
import IPython.display as ipd
from IPython.display import Audio
import soundfile as sf
import os
import threading
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"




In [2]:
def get_file_path(folder, file_name):
    """Build the path ``<cwd>/<folder>/<file_name>``.

    Args:
        folder: Sub-directory name, taken relative to the current working
            directory of the Jupyter kernel.
        file_name: Name of the file inside ``folder``.

    Returns:
        The joined path as a string. The path is only constructed; nothing
        checks that the file or folder exists.
    """
    # Anchor on the working directory and append both components in one join.
    return os.path.join(os.getcwd(), folder, file_name)

In [3]:
# def features_extractor(file_name):
#     audio,sample_rate  = librosa.load(file_name, sr=16000, mono=True)
#     mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40) ## Apllying mfcc
#     mfcc_processed = np.mean(mfcc.T, axis=0) ## some pre-processing
#     return mfcc_processed

In [4]:
def features_extractor(file_name, n_mfcc=13, desired_size=101):
    """Turn an audio file into a fixed-size MFCC + delta + delta-delta feature map.

    The audio is loaded as 16 kHz mono, MFCCs are computed with a 10 ms hop,
    zero-padded or truncated along the time axis to ``desired_size`` frames,
    and stacked with their first- and second-order deltas.

    Args:
        file_name: Path of the audio file, handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients per frame (default 13).
        desired_size: Number of time frames in the output (default 101).

    Returns:
        ``numpy.ndarray`` of shape ``(desired_size, 3 * n_mfcc, 1)``; with the
        default arguments this is ``(101, 39, 1)``.
    """
    audio, sample_rate = librosa.load(file_name, sr=16000, mono=True)

    # 25 ms frame and 10 ms hop, expressed in samples at the loaded sample rate.
    frame_length = int(0.025 * sample_rate)
    hop_length = int(0.01 * sample_rate)
    # FFT size: the smallest power of two that is >= frame_length.
    # NOTE(review): frame_length is only used to size n_fft; no win_length is
    # passed to librosa, so the effective analysis window is whatever librosa
    # defaults to. Confirm this matches the features the model was trained on.
    n_fft = 2 ** int(np.ceil(np.log2(frame_length)))

    mfcc = librosa.feature.mfcc(
        y=audio,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mfcc=n_mfcc,
    )

    # Copy into a zero matrix so short clips are padded and long clips are
    # truncated to exactly desired_size frames.
    n_frames = min(desired_size, mfcc.shape[1])
    mfccs = np.zeros((n_mfcc, desired_size))
    mfccs[:, :n_frames] = mfcc[:, :n_frames]

    # Deltas are taken over the padded matrix, as in the original implementation.
    delta_mfccs = librosa.feature.delta(mfccs)
    delta2_mfccs = librosa.feature.delta(mfccs, order=2)

    # (3 * n_mfcc, T) -> (T, 3 * n_mfcc) -> (T, 3 * n_mfcc, 1)
    mfccs_features = np.concatenate((mfccs, delta_mfccs, delta2_mfccs))
    mfccs_features = np.transpose(mfccs_features)
    # The shape is derived from the parameters instead of the literal (101, 39, 1).
    return np.reshape(mfccs_features, (desired_size, 3 * n_mfcc, 1))

In [5]:
# Wake-word prediction for a single audio file
def mic_check(file_name):
    """Run the loaded model on one audio file and return its raw prediction.

    Args:
        file_name: Path of the audio file to classify.

    Returns:
        The array returned by ``best_model.predict`` for a batch holding this
        one sample.
    """
    mfccs_features = features_extractor(file_name)
    # Add the leading batch axis. Previously `data` was never defined (the
    # lines that built it were commented out), so the predict call raised
    # NameError.
    data = np.expand_dims(mfccs_features, axis=0)
    # best_model is a module-level global assigned by the model-loading cell.
    pre_test = best_model.predict(data, verbose=0)
    # Hand the prediction back to the caller instead of discarding it.
    return pre_test
    

In [6]:
# Recording constants
RATE = 16000  # sample rate in Hz, used for the input stream and the WAV header
CHUNK = 1024  # frames requested per stream.read() call / frames_per_buffer
FORMAT = pyaudio.paInt16
CHANNELS = 1
n_times = int(RATE / CHUNK * 1)  # reads per recording; int() truncates 15.625 to 15, i.e. 15 * 1024 = 15360 frames (~0.96 s, slightly under 1 second)
# TIME = True
# The original note here said "initialize the queue", but no queue is created in this cell.
# Create the 'Wakeup' folder if it does not exist yet.
# NOTE(review): output_folder is not referenced by the other cells shown here — confirm it is still needed.
output_folder = 'Wakeup'
os.makedirs(output_folder, exist_ok=True)
# Shared PyAudio instance: used to open the input stream and to query the sample width.
p = pyaudio.PyAudio()

In [7]:
# Record one fixed-length clip from an already-open input stream
def record_audio_1s(file_number, stream):
    """Read ``n_times`` chunks from ``stream`` and save them as a WAV file.

    Args:
        file_number: Index used in the log message and in the output name
            ``speech_<file_number>.wav``.
        stream: An open PyAudio input stream to read from.

    Returns:
        The name of the WAV file that was written (a relative path, so it
        lands in the current working directory).
    """
    print(f"Recording audio for file {file_number}...")

    # exception_on_overflow=False: the caller spends time on feature
    # extraction, prediction and playback between recordings, so the input
    # buffer can overflow; dropping the overflowed frames is preferable to
    # aborting the whole detection loop with an exception.
    frames = [
        stream.read(CHUNK, exception_on_overflow=False)
        for _ in range(n_times)
    ]

    # Persist the raw frames with the same channel count, sample width and
    # rate that the stream was opened with.
    filename = f"speech_{file_number}.wav"
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    return filename
        

In [8]:
# Live wake-word detection loop
def record_audio(n_recordings=100):
    """Record short clips from the microphone and run wake-word detection on each.

    For every clip: record it to a temporary WAV file, extract features, run
    ``best_model`` on a batch of one, print the prediction, and, when the
    first output exceeds the second, play the response clip. The recorded clip
    is displayed as an audio widget and its file is then deleted.

    Args:
        n_recordings: Number of clips to record before returning
            (default 100, the previously hard-coded value).

    Returns:
        None.
    """
    print("Recording audio...")
    # Open the microphone with the same parameters used when writing the WAV.
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    try:
        for i in range(n_recordings):
            filename = record_audio_1s(i, stream)
            try:
                mfcc_processed = features_extractor(filename)
                # The model expects a leading batch axis.
                prediction = best_model.predict(np.expand_dims(mfcc_processed, axis=0))
                print(prediction)
                # Column 0 is compared against column 1 of the single-row output;
                # the detection branch fires when column 0 is larger.
                if prediction[0, 0] > prediction[0, 1]:
                    print(f"Wake Word Detected for ({i})")
                    display(Audio(get_file_path('Response','Hello.wav'), rate=16000, autoplay=True))
                    # Presumably gives the response clip time to play before continuing.
                    time.sleep(1.3)
                    print("Confidence:", prediction[:, 0])
                display(Audio(filename))
            finally:
                # Portable replacement for the Windows-only `!del` shell escape;
                # also runs when the loop is interrupted mid-iteration.
                if os.path.exists(filename):
                    os.remove(filename)
    finally:
        # Always release the input stream, including on KeyboardInterrupt.
        stream.stop_stream()
        stream.close()


    

In [9]:
# Alternative model files, currently disabled:
# best_model = load_model("LSTM_model_best.h5", compile=False)
# best_model = load_model("CNN_Class_Wakeup_word.h5", compile=False)
# Load the model from the WuW_H5 folder under the working directory.
# compile=False is passed because the model is only used through predict() in this notebook.
best_model = load_model(get_file_path("WuW_H5","Augu_Train_Noise_145_25_WuW_CNN_Sigmoid.h5"), compile=False)





In [10]:
# Start the live detection loop; it runs until its recording loop finishes or the kernel is interrupted.
record_audio()

Recording audio...
Recording audio for file 0...
[[3.9940645e-07 9.9999630e-01]]


Recording audio for file 1...
[[5.3639393e-08 9.9999940e-01]]


Recording audio for file 2...
[[0.01286853 0.96197665]]


Recording audio for file 3...
[[0.02432024 0.9341501 ]]


Recording audio for file 4...
[[2.8446231e-05 9.9983341e-01]]


Recording audio for file 5...
[[2.7852764e-04 9.9872458e-01]]


Recording audio for file 6...
[[0.4553812  0.38152522]]
Wake Word Detected for (6)


Confidence: [0.4553812]


Recording audio for file 7...
[[1.837383e-04 9.991199e-01]]


Recording audio for file 8...
[[0.7752901  0.14828329]]
Wake Word Detected for (8)


Confidence: [0.7752901]


Recording audio for file 9...
[[0.05544333 0.8685084 ]]


Recording audio for file 10...
[[0.84229374 0.10544518]]
Wake Word Detected for (10)


Confidence: [0.84229374]


Recording audio for file 11...
[[0.02332468 0.93646216]]


Recording audio for file 12...
[[8.9405485e-06 9.9994069e-01]]


Recording audio for file 13...
[[1.7744124e-11 1.0000000e+00]]


Recording audio for file 14...
[[2.3805802e-05 9.9985790e-01]]


Recording audio for file 15...
[[2.2460182e-10 1.0000000e+00]]


Recording audio for file 16...
[[2.930979e-07 9.999972e-01]]


Recording audio for file 17...
[[1.8686929e-06 9.9998534e-01]]


Recording audio for file 18...
[[8.377323e-11 1.000000e+00]]


Recording audio for file 19...
[[5.3599375e-05 9.9970675e-01]]


Recording audio for file 20...
[[1.5198935e-07 9.9999845e-01]]


Recording audio for file 21...
[[1.2325982e-04 9.9938357e-01]]


Recording audio for file 22...
[[0.02896058 0.92358935]]


Recording audio for file 23...
[[5.4325734e-04 9.9768651e-01]]


Recording audio for file 24...
[[4.589208e-04 9.980094e-01]]


Recording audio for file 25...
[[0.9575912  0.03152734]]
Wake Word Detected for (25)


Confidence: [0.9575912]


Recording audio for file 26...
[[1.7706495e-04 9.9914843e-01]]


Recording audio for file 27...
[[0.00822653 0.9742771 ]]


Recording audio for file 28...
[[0.06090672 0.8579998 ]]


Recording audio for file 29...
[[1.5527957e-06 9.9998754e-01]]


Recording audio for file 30...
[[0.80330706 0.13025928]]
Wake Word Detected for (30)


Confidence: [0.80330706]


Recording audio for file 31...
[[2.0596458e-04 9.9902552e-01]]


Recording audio for file 32...
[[0.00180162 0.99327743]]


Recording audio for file 33...
[[7.5393846e-06 9.9994904e-01]]


Recording audio for file 34...
[[1.08910235e-05 9.99929249e-01]]


Recording audio for file 35...
[[8.099051e-08 9.999991e-01]]


Recording audio for file 36...
[[0.03596923 0.9082326 ]]


Recording audio for file 37...
[[0.04148523 0.8965793 ]]


Recording audio for file 38...
[[0.00318006 0.98887265]]


Recording audio for file 39...
[[0.00248148 0.99106824]]


Recording audio for file 40...
[[5.786856e-05 9.996860e-01]]


Recording audio for file 41...
[[4.9165512e-05 9.9972850e-01]]


Recording audio for file 42...
[[3.9270843e-05 9.9977785e-01]]


Recording audio for file 43...
[[0.02103912 0.94183826]]


Recording audio for file 44...
[[0.9460757  0.03917853]]
Wake Word Detected for (44)


KeyboardInterrupt: 