# Test model - Audio

In [1]:
import sounddevice as sd
import subprocess

import time
import librosa

import IPython.display as ipd

import numpy as np
import os
from scipy.io import wavfile as wav

from tensorflow import keras

import json

In [2]:
import data_preparation

In [3]:
# TODO: maybe move this function to a .py module
def load_audio_model(models_dir, model_name):
    """Load a saved Keras model together with its JSON metadata.

    The metadata file is looked up next to the model file: same base name,
    with the extension swapped for ``.json`` (``foo.h5`` -> ``foo.json``).

    Parameters
    ----------
    models_dir : str
        Directory containing the model file and its metadata file.
    model_name : str
        File name of the saved model, e.g. ``"digit_recognition.h5"``.

    Returns
    -------
    dict
        The parsed JSON metadata, with the loaded Keras model added under
        the ``"model"`` key.
    """
    model_path = os.path.join(models_dir, model_name)
    # Swap only the file extension. A plain str.replace("h5", "json") would
    # also rewrite any other "h5" occurring in the directory or file name.
    metadata_path = os.path.splitext(model_path)[0] + ".json"

    model = keras.models.load_model(model_path)

    with open(metadata_path, "r") as metadata_file:
        metadata = json.load(metadata_file)
    metadata["model"] = model
    return metadata

In [4]:
# Directory that is scanned for saved "h5" model files; each model's JSON
# metadata file is read from the same directory by load_audio_model.
models_dir = "../best_models/"

In [5]:
# # Spectrogram-based classifiers
# digits_model = load_audio_model(models_dir, "digit_recognition.h5")
# speakers_model = load_audio_model(models_dir, "speaker_recognition.h5")
# mfcc_speakers_model = load_audio_model(models_dir, "mfcc_speaker_standard.h5")
# models = [digits_model, speakers_model, mfcc_speakers_model]

In [6]:
# Load every saved model found in models_dir. os.listdir returns entries in
# arbitrary order, so the names are sorted to keep the order of `models`
# (and of the printed predictions) reproducible across runs and machines.
# Matching on ".h5" (with the dot) avoids picking up unrelated files whose
# name merely ends in the letters "h5".
models = [
    load_audio_model(models_dir, model_file)
    for model_file in sorted(os.listdir(models_dir))
    if model_file.endswith(".h5")
]



In [7]:
def create_recording(duration, rec_rate, name = "test.wav", output_dir = "test/"):
    """Record audio from the microphone until the user accepts a take.

    Each attempt prints a three-second countdown, records ``duration``
    seconds of mono audio at ``rec_rate`` Hz with ``sounddevice``, plays the
    take back and asks for confirmation. An accepted take is written to
    ``output_dir``/``name``; a rejected one clears the cell output and the
    whole attempt is repeated with the same arguments.

    Parameters
    ----------
    duration : float
        Length of the recording in seconds.
    rec_rate : int
        Sampling rate used for recording, playback and the saved file.
    name : str
        File name of the saved recording.
    output_dir : str
        Directory the recording is saved in (created if missing).

    Returns
    -------
    The array returned by ``sd.rec`` for the accepted take.
    """
    while True:
        print("Ready in 3...", end = "")
        time.sleep(1)
        print("2...", end = "")
        time.sleep(1)
        print("1...")
        time.sleep(1)
        print("Go.")
        rec = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
        print("Playing the recording.")
        sd.play(rec, rec_rate)

        # After hearing the take: Enter, "y" or "yes" keeps it; any other
        # answer discards it and records again.
        ok = input("OK? [Y/n]")
        # Compare against whole answers. A substring test such as
        # `ok in "yes"` would also accept "e", "s" or "es".
        if ok.strip().lower() in ("", "y", "yes"):
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            # NOTE: librosa.output was removed in librosa 0.8, so this call
            # requires an older librosa release.
            librosa.output.write_wav(os.path.join(output_dir, name), rec, rec_rate)
            return rec

        # Looping (rather than recursing) keeps `name`/`output_dir` for the
        # retry and guarantees the accepted take is what gets returned.
        ipd.clear_output(wait=True)

In [8]:
def trim_audio(file, input_dir="test/", output_dir="test/", db=-48):
    """Strip leading and trailing silence from an audio file using ffmpeg.

    ffmpeg's ``silenceremove`` filter is applied to the start of the file,
    the audio is reversed, the filter is applied again (which now removes
    what used to be the trailing silence) and the audio is reversed back.
    The result is written to ``output_dir``/``file``; when ``input_dir`` and
    ``output_dir`` are the same directory the input file is overwritten.

    Parameters
    ----------
    file : str
        Name of the audio file inside ``input_dir``.
    input_dir : str
        Directory containing the file to trim. Must already exist.
    output_dir : str
        Directory for the trimmed file and the intermediate files
        (created if missing).
    db : int or float
        Silence threshold in dB passed to ``silenceremove``.

    Raises
    ------
    FileNotFoundError
        If ``input_dir`` is not an existing directory.
    subprocess.CalledProcessError
        If any ffmpeg invocation exits with a non-zero status.
    """
    if not os.path.isdir(input_dir):
        # Raise instead of calling sys.exit(): `sys` is not imported in this
        # notebook, and exiting the interpreter is not wanted in a kernel.
        raise FileNotFoundError(f"There should be an input \"{input_dir}\" directory.")

    # create output directory if not there yet
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    source = os.path.join(input_dir, file)
    target = os.path.join(output_dir, file)
    temp1 = os.path.join(output_dir, "temp1.wav")
    temp2 = os.path.join(output_dir, "temp2.wav")
    temp3 = os.path.join(output_dir, "temp3.wav")

    # (input, audio filter, output) for each ffmpeg pass, in order.
    passes = [
        (source, f"silenceremove=1:0:{db}dB", temp1),
        (temp1, "areverse", temp2),
        (temp2, f"silenceremove=1:0.1:{db}dB", temp3),
        (temp3, "areverse", target),
    ]

    try:
        for src, audio_filter, dst in passes:
            # check=True stops at the first failing pass instead of carrying
            # on with a missing intermediate file.
            subprocess.run(["ffmpeg", "-y", "-i", src, "-af", audio_filter, dst], check=True)
    finally:
        # Always clean up whatever intermediate files were produced.
        for temp_path in (temp1, temp2, temp3):
            if os.path.exists(temp_path):
                os.remove(temp_path)

In [9]:
# Taken from Classifiers-SpectrogramBased.
# Passed to test_NN, which keeps trimming a recording while it has more
# samples than this and then hands it to data_preparation.padding together
# with this value.
max_rec_length = 9015

In [10]:
def _class_index_from_probabilities(probabilities):
    """Return the predicted class index for the first sample of a batch.

    Applies the rule of the removed ``Sequential.predict_classes``: argmax
    over the last axis for multi-unit outputs, a 0.5 threshold for a
    single-unit output.
    """
    probabilities = np.asarray(probabilities)
    if probabilities.shape[-1] > 1:
        return int(np.argmax(probabilities, axis=-1)[0])
    return int(probabilities.ravel()[0] > 0.5)


def test_NN(models, max_rec_length, answer = None,
            duration=2, rec_rate=8000, directory = "test/",
            filename = "test.wav", db=-48):
    """Record a sample, trim and pad it, and classify it with every model.

    Parameters
    ----------
    models : iterable of dict
        Model descriptions as returned by ``load_audio_model``. The keys
        read here are ``"model"``, ``"type"``, ``"name"`` and
        ``"class_indices"`` (indexed by the integer class index).
    max_rec_length : int
        Maximum number of samples allowed after trimming; also passed to
        ``data_preparation.padding``.
    answer : sequence of two items, optional
        Ground truth, only printed as ``"Correct answer: ..."``.
    duration : float
        Recording length in seconds.
    rec_rate : int
        Sampling rate used for recording and for loading the trimmed file.
    directory : str
        Directory the recording is stored and trimmed in.
    filename : str
        File name of the recording inside ``directory``.
    db : int or float
        Initial silence threshold (dB) for ``trim_audio``.

    Returns
    -------
    tuple
        ``(preds, rec, rec_rate)``: one predicted label per model, the
        padded recording, and the sampling rate.
    """
    create_recording(duration, rec_rate, filename, directory)
    ipd.clear_output()

    recording_path = os.path.join(directory, filename)

    # this is not great, but at least we make sure that the audio is trimmed
    # TODO: find better solution
    print("Trimming recording...")
    # Always trim and load at least once, so `rec` is defined whatever the
    # value of max_rec_length (a hard-coded initial length would skip the
    # loop entirely for large limits).
    while True:
        trim_audio(filename, directory, directory, db=db)
        rec, _ = librosa.core.load(recording_path, sr = rec_rate)
        if rec.shape[0] <= max_rec_length:
            break

        tighter_db = int(db*0.95) # trim more violently at each step
        if tighter_db == db:
            # The threshold cannot be tightened any further (it has reached
            # 0), so cut the recording to size rather than loop forever.
            rec = rec[:max_rec_length]
            break
        db = tighter_db

    # TODO: is the padding the same for mfcc models?
    rec = data_preparation.padding(max_rec_length, rec)

    preds = []
    print("Predicting...\n")
    for model in models:
        if model["type"] == "spectrogram":
            proc_rec = data_preparation.compute_spectrogram(rec, normalize=True)
        else:
            proc_rec = data_preparation.mfcc(rec, flatten = False)
        # Add a leading and a trailing axis of size one around the 2-D features.
        proc_rec = proc_rec[np.newaxis,:,:,np.newaxis]
        # predict_classes() no longer exists in current TensorFlow/Keras;
        # predict() plus the helper gives the same class index.
        probabilities = model["model"].predict(proc_rec, verbose=0)
        pred = model["class_indices"][_class_index_from_probabilities(probabilities)]
        preds.append(pred)
        print("{:50s}{}".format(model["name"]+" prediction: ", pred))

    if answer is not None:
        print("\nCorrect answer: {}, {}".format(*answer))
    return preds, rec, rec_rate

In [11]:
# Record one sample and classify it with every loaded model. `answer` is
# only echoed back as "Correct answer: ..." so it can be compared by eye
# with the printed predictions.
preds, rec, rec_rate = test_NN(models, max_rec_length, answer=["gian", 1])

Trimming recording...
Predicting...

Spectrogram-based digit classifier prediction:    1
MFCC-based speaker classifier prediction:         jackson
Spectrogram-based speaker classifier prediction:  jackson

Correct answer: gian, 1


# TODO

- General cleaning
- Print prediction confidence? Choose not to display prediction below some threshold?