In [None]:
import os
import random

import IPython.display
import numpy as np
from pydub import AudioSegment
from scipy.io import wavfile
from tensorflow.keras.layers import Input, Conv1D, GRU, TimeDistributed, BatchNormalization, Activation, Dropout
from tensorflow.keras.models import Model, model_from_json
from tensorflow.keras.optimizers import Adam

from td_utils import graph_spectrogram, load_raw_audio, match_target_amplitude


In [None]:
# Listen to a sample clip from the "activates" folder of the raw data.
IPython.display.Audio("./raw_data/activates/1.wav")

In [None]:
# Listen to a sample clip from the "negatives" folder of the raw data.
IPython.display.Audio("./raw_data/negatives/4.wav")

In [None]:
# Listen to a sample clip from the "backgrounds" folder of the raw data.
IPython.display.Audio("./raw_data/backgrounds/1.wav")

In [None]:
# Listen to the example training clip that the following cells analyse.
IPython.display.Audio("audio_examples/example_train.wav")

In [None]:
# Compute the spectrogram of the example clip with td_utils.graph_spectrogram.
# The result is kept in `x`; the next cell prints its shape.
x = graph_spectrogram("audio_examples/example_train.wav")

In [None]:
# Compare the raw sample count with the shape of the spectrogram `x`.
# wavfile.read returns (sample_rate, data); the sample rate is discarded here.
# NOTE(review): data[:,0] selects the first column, which assumes the file
# has more than one channel -- confirm for other inputs.
_, data = wavfile.read("audio_examples/example_train.wav")
print("Time steps in audio recording before spectrogram", data[:,0].shape)
print("Time steps in input after spectrogram", x.shape)

In [None]:
# Model input dimensions; modelf(input_shape=(Tx, n_freq)) is built from these later on.
Tx = 5511 # The number of time steps input to the model from the spectrogram
n_freq = 101 # Number of frequencies input to the model at each time step of the spectrogram

In [None]:
Ty = 1375 # The number of time steps in the output of our model
# Chime sound that chime_on_activate overlays on the audio.
# NOTE(review): the other example clips are read from "audio_examples/..."
# without a "data/" prefix -- confirm this path is correct.
chime_file = "data/audio_examples/chime.wav"

In [None]:
# Load audio segments using pydub (via td_utils.load_raw_audio).
# The three results are kept at notebook level and indexed by later cells.
activates, negatives, backgrounds = load_raw_audio('./raw_data/')

# Sanity-check the clip lengths reported by len() on the loaded segments.
print("background len should be 10,000, since it is a 10 sec clip\n" + str(len(backgrounds[0])),"\n")
print("activate[0] len may be around 1000, since an `activate` audio clip is usually around 1 second (but varies a lot) \n" + str(len(activates[0])),"\n")
print("activate[1] len: different `activate` clips can have different lengths\n" + str(len(activates[1])),"\n")

In [None]:
def get_random_time_segment(segment_ms, clip_ms=10000):
    """
    Gets a random time segment of duration segment_ms inside a clip of duration clip_ms.

    Arguments:
    segment_ms -- the duration of the audio clip in ms ("ms" stands for "milliseconds")
    clip_ms -- the duration in ms of the enclosing clip the segment must fit in;
               defaults to 10,000, the value this function previously hard-coded

    Returns:
    segment_time -- a tuple of (segment_start, segment_end) in ms; both ends are
                    inclusive, so segment_end - segment_start == segment_ms - 1

    Raises:
    ValueError -- if segment_ms is not strictly shorter than clip_ms, i.e. there
                  is no valid start position to draw
    """
    # Exclusive upper bound for the start position (numpy's `high` is exclusive).
    start_limit = clip_ms - segment_ms
    if start_limit <= 0:
        # np.random.randint would also raise ValueError here, but with an
        # unhelpful "low >= high" message; fail with an explicit one instead.
        raise ValueError(
            f"segment_ms ({segment_ms}) must be shorter than the clip length ({clip_ms} ms)"
        )

    segment_start = np.random.randint(low=0, high=start_limit)
    segment_end = segment_start + segment_ms - 1

    return (segment_start, segment_end)

In [None]:
# Load pre-trained model
def load_model(json_path='models/model.json', weights_path='models/model.h5'):
    """
    Rebuild a Keras model from a JSON architecture file and load its weights.

    Arguments:
    json_path -- path of the JSON file passed to `model_from_json`;
                 defaults to the path this function previously hard-coded
    weights_path -- path of the weights file passed to `model.load_weights`;
                    defaults to the path this function previously hard-coded

    Returns:
    model -- the model built from the JSON description, with weights loaded
    """
    with open(json_path, 'r') as json_file:
        model = model_from_json(json_file.read())
    model.load_weights(weights_path)
    return model

# Load the pre-trained model into the notebook-level name `model`;
# detect_triggerword reads this global when it calls model.predict.
model = load_model()

In [None]:
# Function to generate spectrogram and predict trigger word
def detect_triggerword(filename):
    """
    Run the notebook-level `model` on the spectrogram of one audio file.

    Arguments:
    filename -- path of the audio file handed to `graph_spectrogram`

    Returns:
    predictions -- the value returned by `model.predict` for a batch that
                   contains this single example
    """
    spectrogram = graph_spectrogram(filename)
    # Exchange the two spectrogram axes, then add a leading batch axis of size 1.
    transposed = spectrogram.swapaxes(0, 1)
    batch = np.expand_dims(transposed, axis=0)
    return model.predict(batch)

In [None]:
# Insert the first "activate" clip into the first background, export the
# result to insert_test.wav and play it back.
# NOTE(review): insert_audio_clip is neither defined nor imported in the
# visible part of this notebook -- confirm it exists before this cell runs.
np.random.seed(5)
audio_clip, segment_time = insert_audio_clip(backgrounds[0], activates[0], [(3790, 4400)])
audio_clip.export("insert_test.wav", format="wav")
print("Segment Time: ", segment_time)
IPython.display.Audio("insert_test.wav")

In [None]:
# Expected audio: reference clip to compare by ear with insert_test.wav.
IPython.display.Audio("audio_examples/insert_reference.wav")

In [None]:
# Overlay chime at detected positions
def chime_on_activate(filename, predictions, threshold=0.5, min_consecutive=20):
    """
    Overlay the chime sound on an audio file wherever the model fires, and
    export the result to "output/chime_output.wav".

    A chime is inserted at output step i when predictions[0, i, 0] is above
    `threshold` and more than `min_consecutive` steps have passed since the
    counter was last reset. The counter resets after each chime and whenever
    the prediction drops below `threshold`.

    Arguments:
    filename -- path of the WAV file to overlay the chime on
    predictions -- array indexed as predictions[0, i, 0]; its second axis is
                   the number of output time steps (presumably the output of
                   detect_triggerword -- confirm against callers)
    threshold -- probability above which a time step counts as a detection
    min_consecutive -- number of steps that must be exceeded before a chime is
                       inserted; defaults to 20, the value previously hard-coded

    Returns:
    output_path -- path of the exported WAV file
    """
    audio_clip = AudioSegment.from_wav(filename)
    chime = AudioSegment.from_wav(chime_file)
    n_steps = predictions.shape[1]
    # overlay() keeps the base segment's length, so the duration is loop-invariant.
    duration_s = audio_clip.duration_seconds

    consecutive_timesteps = 0
    for i in range(n_steps):
        consecutive_timesteps += 1
        score = predictions[0, i, 0]
        if score > threshold and consecutive_timesteps > min_consecutive:
            # Map the output step index to a position in milliseconds.
            audio_clip = audio_clip.overlay(chime, position=(i / n_steps) * duration_s * 1000)
            consecutive_timesteps = 0
        elif score < threshold:
            consecutive_timesteps = 0

    output_path = "output/chime_output.wav"
    # The export fails if the target directory is missing, so create it first.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # export() returns the open output file handle; close it to avoid a leak.
    audio_clip.export(output_path, format='wav').close()
    print(f"Chime output saved as {output_path}")
    return output_path

In [None]:
# Helper function to preprocess custom audio for compatibility
def preprocess_audio(filename):
    """
    Rewrite a WAV file in place as a clip laid over 10,000 ms of silence at a
    frame rate of 44,100.

    Arguments:
    filename -- path of the WAV file; it is read and then overwritten

    Returns:
    None
    """
    # Keep at most the first 10,000 ms of the recording.
    trimmed = AudioSegment.from_wav(filename)[:10000]
    # Lay the recording over a 10,000 ms silent base.
    silent_base = AudioSegment.silent(duration=10000)
    mixed = silent_base.overlay(trimmed)
    resampled = mixed.set_frame_rate(44100)
    resampled.export(filename, format='wav')

In [None]:
# Running the detection and chime
def run_detection(filename, threshold=0.5):
    """
    Preprocess an audio file, run trigger-word detection on it and export the
    chimed result.

    Arguments:
    filename -- path of the WAV file; preprocess_audio overwrites it in place
    threshold -- detection threshold forwarded to chime_on_activate

    Returns:
    None
    """
    # Normalise the file first so detection sees the rewritten audio.
    preprocess_audio(filename)
    chime_on_activate(filename, detect_triggerword(filename), threshold)

In [None]:
# Set the random seed so the generated example is reproducible
np.random.seed(18)
# NOTE(review): create_training_example is neither defined nor imported in the
# visible part of this notebook -- confirm it exists before this cell runs.
# This also rebinds `x`, which previously held the example spectrogram.
x, y = create_training_example(backgrounds[0], activates, negatives, Ty)

In [None]:
# Listen to train.wav (presumably written by create_training_example -- TODO confirm).
IPython.display.Audio("train.wav")

In [None]:
# Reference training example to compare by ear with train.wav.
IPython.display.Audio("audio_examples/train_reference.wav")

In [None]:
# Load preprocessed dev set examples; model.evaluate is run on these arrays later.
X_dev = np.load("./XY_dev/X_dev.npy")
Y_dev = np.load("./XY_dev/Y_dev.npy")

In [None]:

from test_utils import *

def modelf_test(target):
    """
    Build a model with `target` and compare its layer summary with the
    expected architecture.

    Arguments:
    target -- callable accepting `input_shape` and returning the model to check

    Returns:
    None -- the comparison is delegated to `comparator` (presumably provided,
            like `summary`, by the test_utils wildcard import)
    """
    # Input shape used for the check: 5511 time steps by 101 frequencies.
    n_steps, n_bins = 5511, 101
    candidate = target(input_shape=(n_steps, n_bins))

    # One entry per expected layer, in order.
    reference_layers = [
        ['InputLayer', [(None, 5511, 101)], 0],
        ['Conv1D', (None, 1375, 196), 297136, 'valid', 'linear', (4,), (15,), 'GlorotUniform'],
        ['BatchNormalization', (None, 1375, 196), 784],
        ['Activation', (None, 1375, 196), 0],
        ['Dropout', (None, 1375, 196), 0, 0.8],
        ['GRU', (None, 1375, 128), 125184, True],
        ['Dropout', (None, 1375, 128), 0, 0.8],
        ['BatchNormalization', (None, 1375, 128), 512],
        ['GRU', (None, 1375, 128), 99072, True],
        ['Dropout', (None, 1375, 128), 0, 0.8],
        ['BatchNormalization', (None, 1375, 128), 512],
        ['Dropout', (None, 1375, 128), 0, 0.8],
        ['TimeDistributed', (None, 1375, 1), 129, 'sigmoid'],
    ]
    comparator(summary(candidate), reference_layers)
    
    
# Check the architecture produced by modelf.
# NOTE(review): modelf is neither defined nor imported in the visible part of
# this notebook -- confirm it exists before this cell runs.
modelf_test(modelf)

In [None]:
# Build a fresh model from modelf.
# NOTE(review): this rebinds the global `model`, replacing the pre-trained
# model returned by load_model(); detect_triggerword uses whichever `model`
# is bound when it is called -- confirm this is intended.
model = modelf(input_shape = (Tx, n_freq))

In [None]:
# Configure the optimizer and compile the model for binary classification.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
opt = Adam(learning_rate=1e-6, beta_1=0.9, beta_2=0.999)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy"])

In [None]:
# Train for one epoch.
# NOTE(review): X and Y are not defined in the visible part of this notebook
# (only X_dev / Y_dev are loaded) -- confirm the training set is loaded first.
model.fit(X, Y, batch_size = 16, epochs=1)

In [None]:
# Evaluate on the dev set; evaluate() yields the loss followed by the
# "accuracy" metric requested at compile time.
loss, acc = model.evaluate(X_dev, Y_dev)
print("Dev set accuracy = ", acc)

In [None]:
# Listen to the first dev clip before running detection on it.
IPython.display.Audio("./raw_data/dev/1.wav")

In [None]:
# Listen to the second dev clip before running detection on it.
IPython.display.Audio("./raw_data/dev/2.wav")

In [None]:
# Run detection on the first dev clip and play back the chimed result.
filename = "./raw_data/dev/1.wav"
prediction = detect_triggerword(filename)
chime_on_activate(filename, prediction, 0.5)
# chime_on_activate exports to "output/chime_output.wav", so play that file
# rather than "./chime_output.wav", which is never written.
IPython.display.Audio("output/chime_output.wav")

In [None]:
# Run detection on the second dev clip and play back the chimed result.
filename = "./raw_data/dev/2.wav"
prediction = detect_triggerword(filename)
chime_on_activate(filename, prediction, 0.5)
# chime_on_activate exports to "output/chime_output.wav", so play that file
# rather than "./chime_output.wav", which is never written.
IPython.display.Audio("output/chime_output.wav")

In [None]:
# Path of the recording to try the detector on; the next cells preprocess
# this file and run detection on it.
your_filename = "audio_examples/my_audio.wav"

In [None]:
# preprocess_audio exports back to the same path, so the file at
# `your_filename` is overwritten in place by this call.
preprocess_audio(your_filename)
IPython.display.Audio(your_filename) # listen to the audio you uploaded

In [None]:
# Run detection on the uploaded recording and play back the chimed result.
chime_threshold = 0.5
prediction = detect_triggerword(your_filename)
chime_on_activate(your_filename, prediction, chime_threshold)
# chime_on_activate exports to "output/chime_output.wav", so play that file
# rather than "./chime_output.wav", which is never written.
IPython.display.Audio("output/chime_output.wav")