In [1]:
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd
import shutil
import wave
import numpy as np

import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt

import csv
from pydub import AudioSegment as am

In [2]:
def paths_and_labels_to_dataset(audio_paths, labels):
    """Build a `tf.data.Dataset` of (decoded audio, label) pairs.

    Args:
        audio_paths: Sequence of audio file paths; every path is decoded
            with `path_to_audio`.
        labels: Sequence of labels, paired positionally with `audio_paths`.

    Returns:
        The dataset obtained by zipping the decoded-audio dataset with the
        label dataset.
    """
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    decoded_ds = tf.data.Dataset.from_tensor_slices(audio_paths).map(path_to_audio)
    return tf.data.Dataset.zip((decoded_ds, label_ds))


def path_to_audio(path):
    """Read the file at `path` and decode it as WAV audio.

    Decoding requests a single channel and a fixed length of 16000 samples.
    Only the decoded audio tensor is returned; the sample rate reported by
    the decoder is discarded.
    """
    raw_bytes = tf.io.read_file(path)
    waveform, _sample_rate = tf.audio.decode_wav(
        raw_bytes, desired_channels=1, desired_samples=16000
    )
    return waveform
def audio_to_fft(audio):
    """Return the FFT magnitude of a batch of audio signals.

    `audio` must be a rank-3 tensor whose last axis has size 1 (presumably
    laid out as (batch, samples, 1) -- confirm against the caller). The
    result has the same rank: the trailing axis is restored after the
    transform.
    """
    # tf.signal.fft transforms the innermost axis, so drop the trailing
    # size-1 axis first and put it back once the FFT is done.
    signal = tf.squeeze(audio, axis=-1)
    complex_signal = tf.complex(real=signal, imag=tf.zeros_like(signal))
    spectrum = tf.signal.fft(tf.cast(complex_signal, tf.complex64))
    spectrum = tf.expand_dims(spectrum, axis=-1)

    # The slice bound equals the signal length along axis 1, so every FFT
    # bin is kept (not only the positive-frequency half).
    num_bins = signal.shape[1]
    return tf.math.abs(spectrum[:, :num_bins, :])

In [3]:
# Rebuild the network architecture from its JSON description. The context
# manager closes the file even when reading raises.
with open("./model/model.json", 'r') as json_file:
    loaded_model_json = json_file.read()
model = keras.models.model_from_json(loaded_model_json)
# Load the trained weights into the freshly built model.
model.load_weights("./model/model.h5")
print("Loaded model from disk")

# Compile the restored model (no evaluation is run in this cell).
model.compile(
    optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

Loaded model from disk


In [11]:
# Recording to classify and the folder that receives its one-second WAV clips.
path = './recheck model'
sound = am.from_file('./predict speaker/Chi.m4a', format='m4a')
# Resample to 16 kHz so that 16000 frames correspond to one second of audio.
sound = sound.set_frame_rate(16000)

# Start from an empty clip folder so clips from an earlier run cannot linger.
clip_dir = path + '/Han'
if os.path.exists(clip_dir + '/'):
    shutil.rmtree(clip_dir + '/')
# makedirs also creates `path` itself when it is missing; a plain mkdir
# would raise FileNotFoundError in that case.
os.makedirs(clip_dir, exist_ok=True)

# NOTE(review): the "- 1" leaves the last complete second unexported --
# confirm this is intended. The dataset cell builds its path list from the
# same expression, so the two must be changed together.
n_clips = int(sound.frame_count() / 16000) - 1
for j in range(n_clips):
    # pydub slices in milliseconds: take the j-th one-second window.
    clip = sound[1000 * j:1000 * (j + 1)]
    clip.export('{}/{}.wav'.format(clip_dir, j), format="wav")

In [12]:
# Number of exported clips; must use the same expression as the export loop
# so that every listed path exists on disk.
n_clips = int(sound.frame_count() / 16000) - 1
clip_paths = [path + '/Han/{}.wav'.format(i) for i in range(n_clips)]
# One label per clip so both lists have the same length. The prediction
# loop does not read the label component, so the value 4 is not used there.
clip_labels = [4] * n_clips

audio_ds = paths_and_labels_to_dataset(clip_paths, clip_labels)
# NOTE(review): shuffling means predictions are not printed in chronological
# clip order; consider dropping the shuffle for inference.
audio_ds = audio_ds.shuffle(buffer_size=128 * 8, seed=65).batch(128)

In [6]:
# Speaker names, looked up by the index of the largest model output.
# NOTE(review): the order presumably matches the class indices used at
# training time -- confirm before editing this list.
label = [
    "Bao Han",
    "Thanh Chi",
    "Duc Manh",
    "Minh Hieu",
    "Gia Minh",
    "Noise",
]

In [13]:
# Classify every batch of clips (not only the first one) and print, for each
# clip, the most likely speaker together with the model's score for it.
# An empty dataset simply prints nothing instead of hitting an undefined
# `y_pred`.
for batch_audio, _ in audio_ds:
    ffts = audio_to_fft(batch_audio)
    y_pred = model.predict(ffts)
    for scores in y_pred:
        # argmax returns the first index holding the maximum score.
        best = int(np.argmax(scores))
        print(label[best], scores[best])


Thanh Chi 0.99674356
Thanh Chi 0.58220667
Thanh Chi 0.999982
Bao Han 0.76517946
Bao Han 0.50286937
Bao Han 0.9999529
Thanh Chi 0.94225734
Thanh Chi 0.99999034
