In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import math
import scipy
from ikrlib import *

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, LSTM
from tensorflow.keras.utils import to_categorical

In [None]:
# paths to data directories
# Each path is handed to wav16khz2mfcc in the loading cell; the "train"
# directories feed the training set, the "dev" directories feed the data
# that is used as the test/validation set in this notebook.
# The paths are relative, so they resolve against the current working directory.
TRAIN_TARGET = '../data/target_train/'
TRAIN_NTARGET = '../data/non_target_train/'
TEST_TARGET = '../data/target_dev/'
TEST_NTARGET = '../data/non_target_dev/'

In [None]:
# Load the features of the target and non-target recordings.
# The training sets are unpacked into plain lists, one entry per recording.
train_t = [*wav16khz2mfcc(TRAIN_TARGET).values()]  # target train data
train_n = [*wav16khz2mfcc(TRAIN_NTARGET).values()]  # non-target train data

print('TEST DATA')
# The dev sets are kept exactly as wav16khz2mfcc returns them; the
# silence-removal cell calls .values() on them later.
test_t = wav16khz2mfcc(TEST_TARGET)  # target test data
test_n = wav16khz2mfcc(TEST_NTARGET)  # non-target test data

# Quick sanity check: shape of the first target training recording
print(train_t[0].shape)



In [None]:
# Some parameters for us to play with....
# All four are read by remove_silence.

# Number of consecutive frames per segment; each segment is kept or dropped as a whole.
MEAN_SEGMENT_LEN = 20
# Number of leading frames removed from every recording before anything else happens.
INITIAL_CUTOFF = 190
# Factor applied to the (capped) mean of column 0 to obtain the keep/drop threshold.
MEAN_MULTIPLIER = 1
# Upper bound for the per-recording mean of column 0 that serves as the threshold base.
DEFAULT_MEAN = 40.0

# this function cuts up the array to chunks and lets us process these
def divide_chunks(l, n):
    """Lazily yield consecutive slices of ``l`` that are ``n`` items long.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.
    """
    yield from (l[begin:begin + n] for begin in range(0, len(l), n))
        
def remove_silence(record, initial_cutoff=None, segment_len=None,
                   multiplier=None, max_threshold=None):
    """Drop the leading frames and the low-energy segments of one recording.

    Column 0 of ``record`` is treated as the per-frame energy. After the first
    ``initial_cutoff`` frames are discarded, the remaining frames are split
    into consecutive segments of ``segment_len`` frames. A segment is kept only
    when the mean of its column 0 is strictly greater than
    ``min(mean of column 0 over the recording, max_threshold) * multiplier``.

    Parameters
    ----------
    record : numpy.ndarray
        2-D array indexed as ``[frame, feature]``.
    initial_cutoff, segment_len, multiplier, max_threshold : optional
        Overrides for the notebook constants INITIAL_CUTOFF, MEAN_SEGMENT_LEN,
        MEAN_MULTIPLIER and DEFAULT_MEAN. ``None`` (the default) means "use
        the notebook constant", which is looked up when the function is called.

    Returns
    -------
    numpy.ndarray
        The kept segments stacked in their original order. When the recording
        is not longer than the cutoff, or no segment exceeds the threshold,
        an empty array of shape ``(0, n_features)`` is returned instead of
        raising, so the result can still be stacked with other recordings.
    """
    # Resolve defaults at call time so that editing the constants in the
    # config cell takes effect without re-running this definition.
    if initial_cutoff is None:
        initial_cutoff = INITIAL_CUTOFF
    if segment_len is None:
        segment_len = MEAN_SEGMENT_LEN
    if multiplier is None:
        multiplier = MEAN_MULTIPLIER
    if max_threshold is None:
        max_threshold = DEFAULT_MEAN

    # First cut off the leading frames of the recording
    record = record[initial_cutoff:]
    if len(record) == 0:
        # Nothing left: skip the mean (it would be NaN with a warning).
        return record

    # The recording's own mean energy is the threshold base, capped so that
    # very loud recordings do not lose their quieter speech.
    mean_energy = np.mean(record[:, 0])
    if mean_energy > max_threshold:
        mean_energy = max_threshold
    threshold = mean_energy * multiplier

    # Keep every segment whose mean energy is above the threshold
    kept = []
    for start in range(0, len(record), segment_len):
        segment = record[start:start + segment_len]
        if np.mean(segment[:, 0]) > threshold:
            kept.append(segment)

    if not kept:
        # np.vstack raises on an empty list; return an empty (0, n_features) array.
        return record[:0]
    return np.vstack(kept)

In [None]:
# Remove the silence from every recording of each data set.
# train_* are lists of recordings; test_* still need .values() to get at theirs.
target = [remove_silence(rec) for rec in train_t]
ntarget = [remove_silence(rec) for rec in train_n]
test_target = [remove_silence(rec) for rec in test_t.values()]
test_ntarget = [remove_silence(rec) for rec in test_n.values()]

# Concatenate the cleaned recordings of each set into one long frame matrix
X_train_t = np.vstack(target)
X_train_n = np.vstack(ntarget)
X_test_t = np.vstack(test_target)
X_test_n = np.vstack(test_ntarget)

# After cutting the silence: column 0 of the first 1000 target training frames
plt.figure()
plt.plot(X_train_t[:1000, 0])

In [None]:
# Disabled earlier variant of the silence removal, kept inside a string literal
# so that none of it executes. Unlike remove_silence, it compares segments
# against a single mean computed over all target and non-target training
# frames together. Its inner comment mentions segments of length 10, but the
# code it contains uses MEAN_SEGMENT_LEN.
# NOTE(review): this string is the only statement in its cell — consider
# deleting the cell once the per-recording approach is settled.
"""
# First cut off the first 190 frames of each recording
target = []
for rec in train_t:
    target.append(rec[INITIAL_CUTOFF:])

ntarget = []
for rec in train_n:
    ntarget.append(rec[INITIAL_CUTOFF:])
    
X_train_t = np.vstack(target)
X_train_n = np.vstack(ntarget)

# Before cutting the silence..
plt.figure()
plt.plot(X_train_t[:1000][:,0])

# calculate the mean energy in order to remove silence
mean_energy = np.mean(np.hstack((X_train_t[:][:,0], X_train_n[:][:,0])))
print(mean_energy)


# now split the arrays into segments of length 10 and compare the mean of these chunks to the overall mean
target = []
for seg in divide_chunks(X_train_t, MEAN_SEGMENT_LEN):
    if np.mean(seg[:][:,0]) > mean_energy*MEAN_MULTIPLIER:
        target.append(seg)
ntarget = []
for seg in divide_chunks(X_train_n, MEAN_SEGMENT_LEN):
    if np.mean(seg[:][:,0]) > mean_energy*MEAN_MULTIPLIER:
        ntarget.append(seg)
        
X_train_t = np.vstack(target)
X_train_n = np.vstack(ntarget)

# After cutting the silence
plt.figure()
plt.plot(X_train_t[:1000][:,0])
"""

In [None]:
# Number of consecutive frames grouped into one batch by create_frame_batches;
# also used to size the first Dense layer of the model (13 * BATCH_LEN).
BATCH_LEN = 13
# Offset, in frames, between the starts of two consecutive batches.
# With STEP smaller than BATCH_LEN, neighbouring batches overlap.
STEP = 3

# this function creates "pictures" from our features by grouping them up
def create_frame_batches(data, batch_len=None, step=None):
    """Slide a window over the frames and return one "picture" per position.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array indexed as ``[frame, feature]``.
    batch_len : int, optional
        Number of consecutive frames per picture. ``None`` (the default) uses
        the notebook constant BATCH_LEN, looked up at call time.
    step : int, optional
        Offset between the starts of consecutive windows. ``None`` (the
        default) uses the notebook constant STEP, looked up at call time.

    Returns
    -------
    list of numpy.ndarray
        One independent array of shape ``(batch_len, n_features, 1)`` per
        window, in order. The list is empty when ``data`` has fewer than
        ``batch_len`` frames.

    Raises
    ------
    ValueError
        If ``batch_len`` is smaller than 1, or ``step`` is 0.
    """
    if batch_len is None:
        batch_len = BATCH_LEN
    if step is None:
        step = STEP
    if batch_len < 1:
        raise ValueError("batch_len must be a positive integer")

    grouped = []
    # "+ 1" so that the window ending exactly on the last frame is included;
    # without it an input of exactly batch_len frames would yield nothing.
    for start in range(0, data.shape[0] - batch_len + 1, step):
        # np.array copies, so the pictures do not alias `data`
        window = np.array(data[start:start + batch_len])
        # Append a trailing axis of length 1: (batch_len, n_features, 1)
        grouped.append(window[:, :, np.newaxis])
    return grouped

In [None]:
# Create 13x13 batches from the data
# create_frame_batches returns a list of pictures; np.array stacks them.
# NOTE: the X_*_t / X_*_n names are overwritten here, so this cell cannot be
# re-run on its own; re-run the silence-removal cell first.
X_train_t = np.array(create_frame_batches(X_train_t))
X_train_n = np.array(create_frame_batches(X_train_n))
X_test_t = np.array(create_frame_batches(X_test_t))
X_test_n = np.array(create_frame_batches(X_test_n))

# Two classes: label 0 = target, label 1 = non-target
num_classes = 2

# Get all the data to one place (targets first, then non-targets)
X_train = np.vstack((X_train_t, X_train_n))
y_train = np.hstack((np.zeros(X_train_t.shape[0]), np.ones(X_train_n.shape[0])))

X_test = np.vstack((X_test_t, X_test_n))
y_test = np.hstack((np.zeros(X_test_t.shape[0]), np.ones(X_test_n.shape[0])))

# Pass num_classes explicitly: otherwise to_categorical infers the width from
# the largest label present, and a split without non-target samples would get
# a one-hot width that does not match the model's output layer.
y_train_hot = to_categorical(y_train, num_classes=num_classes)
y_test_hot = to_categorical(y_test, num_classes=num_classes)

In [None]:
# Let's build our model
# Fully connected classifier: the input pictures are flattened, passed through
# three ReLU layers (with dropout after the second and third) and a softmax output.
model = Sequential([
    Flatten(data_format='channels_last'),
    Dense(13 * BATCH_LEN, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.25),
    Dense(16, activation='relu'),
    Dropout(0.25),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Train on the one-hot labelled training pictures; the dev data is passed as
# validation data.
model.fit(
    x=X_train,
    y=y_train_hot,
    batch_size=32,
    epochs=40,
    validation_data=(X_test, y_test_hot),
)

In [None]:
# convolution
# NOTE(review): despite the heading, no convolutional layers are added in the
# visible code; the model consists of a single softmax Dense layer. The
# assignment also rebinds `model`, replacing the network built earlier in the
# notebook. This looks like an unfinished stub — confirm the intended
# architecture before training with it.
model = Sequential()
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=['accuracy'])