# Test model - Audio

In [1]:
import sounddevice as sd
import subprocess

import time
import librosa

import IPython.display as ipd

import numpy as np
import os
from scipy.io import wavfile as wav

from tensorflow import keras

import json

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
import data_preparation

In [3]:
def load_audio_model(models_dir, model_name):
    """Load a saved Keras model together with its JSON metadata.

    The metadata file is expected next to the model file, with the same
    base name and a ``.json`` extension (``digits.h5`` -> ``digits.json``).

    Parameters
    ----------
    models_dir : str
        Directory containing the model file and its metadata file.
    model_name : str
        File name of the saved model inside ``models_dir``.

    Returns
    -------
    dict
        The parsed JSON metadata, with the loaded model added under the
        ``"model"`` key.
    """
    full_name = os.path.join(models_dir, model_name)
    print(full_name)
    model = keras.models.load_model(full_name)

    # Swap only the file extension. A plain str.replace("h5", "json") would
    # also rewrite any "h5" that appears in the directory or base name.
    json_name = os.path.splitext(full_name)[0] + ".json"

    with open(json_name, "r") as jf:
        d = json.load(jf)
    d["model"] = model
    return d

In [4]:
# Directory scanned for saved model files and their JSON metadata
models_dir = "./best_models"

In [5]:
# Load every saved model (file name ending in "h5") found in the models directory
models = [
    load_audio_model(models_dir, file_name)
    for file_name in os.listdir(models_dir)
    if file_name.endswith("h5")
]

./best_models/speakers.h5
./best_models/digits.h5


In [6]:
# Order the loaded models alphabetically by their metadata "name"
models = sorted(models, key=lambda entry: entry["name"])

In [7]:
# Show the model names in their current order
[model["name"] for model in models]

['MFCC-based Digit classifier', 'Spectrogram-based speaker classifier']

In [8]:
def create_recording(duration, rec_rate, name = "test.wav", output_dir = "test/", wait_time = 0.5):
    """Record audio from the microphone until the user accepts a take.

    After a short countdown a mono clip is recorded and played back. The
    user is then asked to confirm it: an empty answer, "y" or "yes"
    (case-insensitive) saves the clip and returns it; any other answer
    discards the take and records again with the same settings.

    Parameters
    ----------
    duration : float
        Length of the recording in seconds.
    rec_rate : int
        Sampling rate used for recording, playback and the saved file.
    name : str
        File name of the saved recording.
    output_dir : str
        Directory the recording is written to; created if it is missing.
    wait_time : float
        Pause, in seconds, between the countdown steps.

    Returns
    -------
    The accepted recording, as returned by ``sd.rec``.
    """
    # Loop instead of recursing: a retake keeps the caller's name/output_dir/
    # wait_time, and the accepted take is always returned to the caller.
    while True:
        print("Ready in 3...", end = "")
        time.sleep(wait_time)
        print("2...", end = "")
        time.sleep(wait_time)
        print("1...")
        time.sleep(wait_time)
        print("Go.")
        rec = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
        print("Playing the recording.")
        sd.play(rec, rec_rate)

        # After hearing the recording, decide whether to keep it.
        # Exact match on purpose: `answer in "yes"` is a substring test and
        # would also accept "e", "s" or "es".
        ok = input("OK? [Y/n]").strip().lower()
        if ok in ("", "y", "yes"):
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            librosa.output.write_wav(os.path.join(output_dir, name), rec, rec_rate)
            return rec
        ipd.clear_output(wait=True)

In [9]:
def trim_audio(file, input_dir="test/", output_dir="test/", db=-48):
    """Strip leading and trailing silence from an audio file using ffmpeg.

    ffmpeg is run four times: trim silence at the start, reverse the
    audio, trim the start again (the original end) and reverse back.
    The result is written to ``output_dir`` under the same file name,
    overwriting any existing file.

    Parameters
    ----------
    file : str
        Name of the audio file inside ``input_dir``.
    input_dir : str
        Directory containing the file to trim; must exist.
    output_dir : str
        Directory for the trimmed file and the intermediate files;
        created if it is missing.
    db : int or float
        Silence threshold in dB passed to ffmpeg's ``silenceremove`` filter.

    Raises
    ------
    FileNotFoundError
        If ``input_dir`` is not an existing directory.
    subprocess.CalledProcessError
        If any ffmpeg invocation exits with a non-zero status.
    """
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f"There should be an input \"{input_dir}\" directory.")

    # create output directory if not there yet
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    source = os.path.join(input_dir, file)
    target = os.path.join(output_dir, file)
    temps = [os.path.join(output_dir, f"temp{i}.wav") for i in (1, 2, 3)]
    temp1, temp2, temp3 = temps

    # (input, audio filter, output) for each ffmpeg pass
    steps = [
        (source, f"silenceremove=1:0:{db}dB", temp1),
        (temp1, "areverse", temp2),
        (temp2, f"silenceremove=1:0.1:{db}dB", temp3),
        (temp3, "areverse", target),
    ]

    try:
        for src, audio_filter, dst in steps:
            # check=True: stop at the first failing pass instead of silently
            # carrying on with a missing or stale intermediate file
            subprocess.run(["ffmpeg", "-y", "-i", src, "-af", audio_filter, dst], check=True)
    finally:
        # Remove whatever intermediates were produced, even after a failure
        for temp in temps:
            if os.path.exists(temp):
                os.remove(temp)

In [10]:
# Taken from Classifiers-SpectrogramBased.
# Upper bound on the length (first dimension) of a loaded recording: test_nets
# keeps trimming until the recording is no longer than this, then passes the
# value to data_preparation.padding.
max_rec_length = 9015

In [11]:
# Record a sample, trim and pad it, then classify it with every loaded model
def test_nets(models, max_rec_length, answer = None,
            duration=1.5, rec_rate=8000, directory = "test/",
            filename = "test.wav", wait_time=0.5, db=-48):
    """Record a clip and print each model's prediction for it.

    The clip is recorded with ``create_recording``, trimmed with
    ``trim_audio`` until it is no longer than ``max_rec_length``, passed
    through ``data_preparation.padding`` and then fed to every entry of
    ``models``.

    Parameters
    ----------
    models : list of dict
        Model descriptions; each must provide the keys ``"model"``,
        ``"name"``, ``"type"`` and ``"class_indices"`` (plus
        ``"paper_data"`` when ``"type"`` is ``"spectrogram"``).
    max_rec_length : int
        Maximum allowed length of the loaded recording.
    answer : sequence, optional
        Expected result, printed after the predictions. It is only
        displayed, never compared with the predictions.
    duration, rec_rate, wait_time
        Forwarded to ``create_recording``; ``rec_rate`` is also used when
        loading the file back.
    directory, filename : str
        Location of the recording on disk.
    db : int
        Initial silence threshold handed to ``trim_audio``.

    Returns
    -------
    tuple
        ``(preds, rec, rec_rate)``: the predicted labels (one per model),
        the padded recording and the sampling rate.
    """
    create_recording(duration, rec_rate, filename, directory, wait_time=wait_time)
    ipd.clear_output()

    # this is not great, but at least we make sure that the audio is trimmed
    # TODO: find better solution
    print("Trimming recording...")
    wav_path = directory + "/" + filename
    while True:
        # Trim (in place) and reload until the recording fits
        trim_audio(filename, directory, directory, db=db)
        rec, _ = librosa.core.load(wav_path, sr = rec_rate)
        db = int(db*0.95) # trim more violently at each step
        if rec.shape[0] <= max_rec_length:
            break

    rec = data_preparation.padding(max_rec_length, rec)

    print("Predicting...\n")
    preds = []
    for entry in models:
        # Pick the feature extraction that matches the model's input type
        if entry["type"] == "spectrogram":
            features = data_preparation.compute_spectrogram(rec, normalize=True, paper_data=entry['paper_data'])
        else:
            features = data_preparation.mfcc(rec, flatten = False)

        # Add a leading and a trailing axis around the 2-D features
        batch = features[np.newaxis,:,:,np.newaxis]
        predicted_class = entry["model"].predict_classes(batch)[0]
        label = entry["class_indices"][predicted_class]
        preds.append(label)

        header = entry["name"]+" prediction: "
        print(f"{header:50s}{label}")

    if answer is not None:
        print("\nCorrect answer: {}, {}".format(*answer))
    return preds, rec, rec_rate

In [16]:
# Record one sample and classify it; `answer` is only printed next to the predictions
preds, rec, rec_rate = test_nets(models, max_rec_length, answer=["khaled", 8])

Trimming recording...
Predicting...

MFCC-based Digit classifier prediction:           8
Spectrogram-based speaker classifier prediction:  yweweler

Correct answer: khaled, 8
