## Trigger Word Detection

In [None]:
#import libraries
import numpy as np
from pydub import AudioSegment
import random
import sys
import io
import os
import glob
import IPython
from td_utils import *
%matplotlib inline

In [None]:
#use positive/negative/background recordings to create a labeled dataset.
#Tx and n_freq together form the model's input shape (Tx, n_freq);
#Ty is the length of the label vector y built for each training example.
Tx, n_freq, Ty = 5511, 101, 1375

#load the raw audio segments (pydub); load_raw_audio is presumably provided by td_utils
activates, negatives, backgrounds = load_raw_audio()

In [None]:
def get_random_time_segment(segment_ms, background_ms=10000):
    """Pick a random time window of length ``segment_ms`` inside the background.

    Args:
        segment_ms: Duration of the clip to place, in milliseconds.
        background_ms: Duration of the background, in milliseconds. Defaults
            to 10000, the value that used to be hard-coded here.

    Returns:
        Tuple ``(segment_start, segment_end)`` in milliseconds. The end is
        inclusive: ``segment_end - segment_start == segment_ms - 1``.

    Raises:
        ValueError: Propagated from ``np.random.randint`` when ``segment_ms``
            is not shorter than ``background_ms`` (no valid start exists).
    """
    # The start is drawn so that the whole clip fits before the background ends.
    segment_start = np.random.randint(low=0, high=background_ms - segment_ms)
    segment_end = segment_start + segment_ms - 1

    return (segment_start, segment_end)

In [None]:
def is_overlapping(segment_time, previous_segments):
    """Tell whether a candidate segment collides with any existing segment.

    Args:
        segment_time: Tuple ``(segment_start, segment_end)`` of the candidate.
        previous_segments: Iterable of ``(start, end)`` tuples already in use.

    Returns:
        True if the candidate shares at least one point with any previous
        segment (both endpoints are treated as inclusive), otherwise False.
    """
    new_start, new_end = segment_time

    # Two closed intervals intersect when each one starts before the other ends.
    return any(
        new_start <= old_end and new_end >= old_start
        for old_start, old_end in previous_segments
    )

In [None]:
def insert_audio_clip(background, audio_clip, previous_segments, max_attempts=10000):
    """Overlay ``audio_clip`` on ``background`` at a random, non-overlapping position.

    Args:
        background: Audio to overlay onto; must provide ``overlay(clip, position=...)``.
        audio_clip: Clip to insert; ``len(audio_clip)`` is used as its length in ms.
        previous_segments: List of ``(start, end)`` segments already placed.
            The chosen segment is appended to this list in place.
        max_attempts: Upper bound on random placement attempts. Previously the
            search retried forever, which hangs when no free slot exists.

    Returns:
        Tuple ``(new_background, segment_time)`` where ``segment_time`` is the
        ``(start, end)`` position chosen for the clip.

    Raises:
        RuntimeError: If no non-overlapping position is found within
            ``max_attempts`` attempts.
    """
    segment_ms = len(audio_clip)

    # Draw candidate positions until one is free; the draw sequence matches a
    # plain "draw, then redraw while overlapping" loop, so seeded runs agree.
    for _ in range(max_attempts):
        segment_time = get_random_time_segment(segment_ms)
        if not is_overlapping(segment_time, previous_segments):
            break
    else:
        raise RuntimeError(
            "Could not find a non-overlapping position for a clip of "
            f"{segment_ms} ms after {max_attempts} attempts."
        )

    # Record the slot so later insertions avoid it.
    previous_segments.append(segment_time)

    new_background = background.overlay(audio_clip, position=segment_time[0])

    return new_background, segment_time

In [None]:
def insert_ones(y, segment_end_ms):
    """Mark the label steps that immediately follow the end of a segment.

    Args:
        y: Label array indexed as ``y[0, step]``; modified in place.
        segment_end_ms: End of the segment in milliseconds.

    Returns:
        The same array ``y``, with up to 50 entries after the segment end set
        to 1 (entries at index ``Ty`` or beyond are never written).
    """
    # Convert milliseconds to a label index: 10000 ms are spread over Ty steps.
    end_step = int(segment_end_ms * Ty / 10000.0)
    first_step = end_step + 1

    # Label the 50 steps after the segment end, stopping at the end of y.
    for step in range(first_step, min(first_step + 50, Ty)):
        y[0, step] = 1

    return y

In [None]:
#create training examples

def create_training_example(background, activates, negatives, seed=18):
    """Synthesize one training example by mixing clips into a background.

    Between 0 and 4 "activate" clips and between 0 and 2 "negative" clips are
    overlaid on the background at random non-overlapping positions. Labels are
    set to 1 for the steps right after each "activate" clip. The mix is
    written to ``train.wav`` and its spectrogram is returned.

    Args:
        background: Background audio (pydub ``AudioSegment``).
        activates: Sequence of "activate" clips.
        negatives: Sequence of "negative" clips.
        seed: Seed passed to ``np.random.seed`` before sampling. Defaults to
            18, the value that used to be applied unconditionally. Pass
            ``None`` to keep the current random state, which is required to
            obtain different examples across calls.

    Returns:
        Tuple ``(x, y)``: ``x`` is the result of ``graph_spectrogram`` on the
        exported file and ``y`` is the ``(1, Ty)`` label array.
    """
    if seed is not None:
        np.random.seed(seed)

    # Make the background quieter (pydub: subtracting an int lowers the gain in dB).
    background = background - 20

    y = np.zeros((1, Ty))

    # Segments already occupied by inserted clips; updated by insert_audio_clip.
    previous_segments = []

    # Positive clips: insert each one and label the steps that follow it.
    number_of_activates = np.random.randint(0, 5)
    random_indices = np.random.randint(len(activates), size=number_of_activates)
    for index in random_indices:
        background, (_, segment_end) = insert_audio_clip(
            background, activates[index], previous_segments
        )
        y = insert_ones(y, segment_end_ms=segment_end)

    # Negative clips: inserted the same way but leave the labels untouched.
    number_of_negatives = np.random.randint(0, 3)
    random_indices = np.random.randint(len(negatives), size=number_of_negatives)
    for index in random_indices:
        background, _ = insert_audio_clip(
            background, negatives[index], previous_segments
        )

    # Presumably normalizes the overall loudness of the mix to -20 dBFS
    # (helper not visible here; confirm in td_utils).
    background = match_target_amplitude(background, -20.0)

    # export() returns the open output file object; close it so the handle is
    # released before the file is read back below.
    background.export("train.wav", format="wav").close()
    print("File (train.wav) was saved in your directory.")

    x = graph_spectrogram("train.wav")

    return x, y

In [None]:
#load the preprocessed training set (inputs X, labels Y)
X, Y = np.load("./XY_train/X.npy"), np.load("./XY_train/Y.npy")

#load the preprocessed dev set (inputs X_dev, labels Y_dev)
X_dev, Y_dev = np.load("./XY_dev/X_dev.npy"), np.load("./XY_dev/Y_dev.npy")

In [None]:
#load libraries for the model
from keras.callbacks import ModelCheckpoint
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
from keras.optimizers import Adam

In [None]:
#build the model

def model(input_shape):
    """Build the trigger-word detection network.

    Architecture: Conv1D -> BatchNorm -> ReLU -> Dropout, two GRU layers that
    return full sequences (each followed by Dropout and BatchNorm), and a
    time-distributed sigmoid unit that emits one probability per output step.

    Args:
        input_shape: Shape of one input example, e.g. ``(Tx, n_freq)``.

    Returns:
        An uncompiled Keras ``Model`` mapping the input to a
        ``(steps, 1)`` sequence of probabilities.
    """
    X_input = Input(shape=input_shape)

    #step 1: CONV layer
    # With the default 'valid' padding, kernel 15 and stride 4 reduce 5511
    # input steps to (5511 - 15) // 4 + 1 = 1375 output steps.
    X = Conv1D(196, kernel_size=15, strides=4)(X_input)
    X = BatchNormalization()(X)
    X = Activation('relu')(X)
    X = Dropout(0.8)(X)

    #step 2: First GRU Layer
    X = GRU(units=128, return_sequences=True)(X)
    X = Dropout(0.8)(X)
    X = BatchNormalization()(X)

    #step 3: Second GRU Layer
    X = GRU(units=128, return_sequences=True)(X)
    X = Dropout(0.8)(X)
    X = BatchNormalization()(X)
    X = Dropout(0.8)(X)

    #step 4: Time-distributed dense layer
    # This was left as `X = None`, which made Model(...) fail to build. One
    # sigmoid unit per time step matches the binary_crossentropy training loss.
    X = TimeDistributed(Dense(1, activation="sigmoid"))(X)

    # Named `network` so the local does not shadow this function's own name.
    network = Model(inputs=X_input, outputs=X)

    return network

In [None]:
#implement the model: build the network for inputs of shape (Tx, n_freq) and print its summary
#NOTE(review): this rebinds the global name `model` from the builder function to
#the built network, so the builder is no longer reachable under that name afterwards.
model = model(input_shape = (Tx, n_freq))
model.summary()

In [None]:
#load the pre-trained model
#this replaces whatever `model` was bound to before with the network stored on disk
model = load_model('./models/tr_model.h5')

#fit the model
#NOTE(review): `lr` and `decay` are legacy Keras argument names - confirm they are
#accepted by the installed Keras version
opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy"])
#one additional epoch on the training set, in mini-batches of 5 examples
model.fit(X, Y, batch_size = 5, epochs=1)

#test the model
#evaluate() returns the loss followed by the metrics given to compile() (accuracy here)
loss, acc = model.evaluate(X_dev, Y_dev)
print("Dev set accuracy = ", acc)

In [None]:
#define the prediction function
def detect_triggerword(filename):
    """Run the global ``model`` on an audio file and plot its per-step output.

    Args:
        filename: Path of the audio file handed to ``graph_spectrogram``.

    Returns:
        The array returned by ``model.predict`` for this single example.
    """
    # Upper panel: graph_spectrogram runs while subplot 1 is active
    # (presumably it draws the spectrogram there; helper not visible here).
    plt.subplot(2, 1, 1)
    spectrogram = graph_spectrogram(filename)

    # Swap the first two axes, then add a leading batch axis of size 1.
    batch = np.expand_dims(spectrogram.swapaxes(0, 1), axis=0)
    predictions = model.predict(batch)

    # Lower panel: output value at each step for the single example in the batch.
    plt.subplot(2, 1, 2)
    plt.plot(predictions[0, :, 0])
    plt.ylabel('probability')
    plt.show()

    return predictions

In [None]:
#insert a chime to acknowledge the "activate" trigger
chime_file = "audio_examples/chime.wav"
def chime_on_activate(filename, predictions, threshold):
    """Overlay a chime on the audio wherever the prediction exceeds a threshold.

    A chime is added at most once per 75 output steps, and never within the
    first 75 steps. The result is written to ``chime_output.wav``.

    Args:
        filename: Path of the WAV file to annotate.
        predictions: Array indexed as ``predictions[0, step, 0]``.
        threshold: A chime is triggered when a prediction is strictly above it.
    """
    audio_clip = AudioSegment.from_wav(filename)
    chime = AudioSegment.from_wav(chime_file)
    n_steps = predictions.shape[1]

    # Number of output steps seen since the last chime (or since the start).
    steps_since_chime = 0

    for step in range(n_steps):
        steps_since_chime += 1

        # Skip unless the prediction is above the threshold and more than 75
        # steps have passed since the previous chime.
        if not (predictions[0, step, 0] > threshold) or steps_since_chime <= 75:
            continue

        # Convert the step index to a position in milliseconds within the clip.
        position_ms = ((step / n_steps) * audio_clip.duration_seconds) * 1000
        audio_clip = audio_clip.overlay(chime, position=position_ms)

        # Restart the count so nearby detections do not stack chimes.
        steps_since_chime = 0

    audio_clip.export("chime_output.wav", format='wav')