In [1]:
### Load necessary libraries ###
import glob
import os
import librosa
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras

In [2]:
### Define helper functions ###
def extract_features(parent_dir,sub_dirs,file_ext="*.wav",
                     bands=60,frames=41):
    def _windows(data, window_size):
        start = 0
        while start < len(data):
            yield int(start), int(start + window_size)
            start += (window_size // 2)
            
    window_size = 512 * (frames - 1)
    features, labels = [], []
    for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
        segment_log_specgrams, segment_labels = [], []
        sound_clip,sr = librosa.load(fn)
        label = int(fn.split('.')[0][-1])-1
        for (start,end) in _windows(sound_clip,window_size):
            if(len(sound_clip[start:end]) == window_size):
                signal = sound_clip[start:end]
                melspec = librosa.feature.melspectrogram(signal,n_mels=bands)
                logspec = librosa.amplitude_to_db(melspec)
                logspec = logspec.T.flatten()[:, np.newaxis].T
                segment_log_specgrams.append(logspec)
                segment_labels.append(label)
            
        segment_log_specgrams = np.asarray(segment_log_specgrams).reshape(
            len(segment_log_specgrams),bands,frames,1)
        segment_features = np.concatenate((segment_log_specgrams, np.zeros(
            np.shape(segment_log_specgrams))), axis=3)
        for i in range(len(segment_features)): 
            segment_features[i, :, :, 1] = librosa.feature.delta(
                segment_features[i, :, :, 0])
        
        if len(segment_features) > 0: # check for empty segments 
            features.append(segment_features)
            labels.append(segment_labels)
    return features, labels

In [3]:
# Pre-process and extract feature from the data
parent_dir = 'data/'
save_dir = "processed/"
folds = sub_dirs = np.array(['fold1','fold2'])

for sub_dir in sub_dirs:
    features, labels = extract_features(parent_dir,sub_dir)
    np.savez("{0}{1}".format(save_dir, sub_dir), 
             features=features, 
             labels=labels)

 -0.03935369] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  melspec = librosa.feature.melspectrogram(signal,n_mels=bands)
  0.08535548] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  melspec = librosa.feature.melspectrogram(signal,n_mels=bands)
 -5.4734166e-05  3.5395187e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  melspec = librosa.feature.melspectrogram(signal,n_mels=bands)
  1.9961734e-04 -3.1661851e-04] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  melspec = librosa.feature.melspectrogram(signal,n_mels=bands)
 -0.11631303] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  melspec = librosa.feature.melspectrogram(signal,n_mels=bands)
 -0.14282735] as keyword args. From version 0.10 passing these as positional arguments wi

In [4]:
def get_network():
    num_filters = [24,32,64,128] 
    pool_size = (2, 2) 
    kernel_size = (3, 3)  
    input_shape = (60, 41, 2)
    keras.backend.clear_session()
    
    model = keras.models.Sequential()
    model.add(keras.layers.Conv2D(24, kernel_size,
                padding="same", input_shape=input_shape))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))

    model.add(keras.layers.Conv2D(32, kernel_size,
                                  padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))
    
    model.add(keras.layers.Conv2D(64, kernel_size,
                                  padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))
    
    model.add(keras.layers.Conv2D(128, kernel_size,
                                  padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  

    model.add(keras.layers.GlobalMaxPooling2D())
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(1, activation="sigmoid"))

    model.compile(optimizer=keras.optimizers.Adam(1e-4), 
        loss=keras.losses.BinaryCrossentropy(), 
        metrics=["accuracy"])
    return model

In [5]:
temp_data1 = np.load("{0}/{1}.npz".format('processed/','fold1'),allow_pickle=True)
temp_data2 = np.load("{0}/{1}.npz".format('processed/','fold2'),allow_pickle=True)

In [6]:
from sklearn.model_selection import train_test_split

x = []
y = []

features = np.concatenate(temp_data1["features"], axis=0) 
features2 = np.concatenate(temp_data2["features"], axis=0) 
labels = np.concatenate(temp_data1["labels"], axis=0)
labels2 = np.concatenate(temp_data2["labels"], axis=0)

x.append(np.concatenate((features,features2),axis=0))
# x.extend(features2)

y.append(np.concatenate((labels,labels2),axis=0))
# y.extend(labels2)

x_train, x_test, y_train, y_test = train_test_split(x[0], y[0], random_state=77, test_size=0.2)

In [7]:
### Train and evaluate via 10-Folds cross-validation ###
accuracies = []


model = get_network()
hist = model.fit(x_train, y_train, epochs = 50, batch_size = 24, verbose = 0)
print(hist.history)
    
    # evaluate on test set/fold
y_true, y_pred = [], []
for x_t, y_t in zip(x_test, y_test):
        # average predictions over segments of a sound clip
    avg_p = np.round(model.predict(x_t.reshape(-1,60,41,2)))
    y_pred.append(int(avg_p[0][0])) 
        # pick single label via np.unique for a sound clip
    y_true.append(np.unique(y_t)[0])
    
print("Accuracy: ",accuracy_score(y_true, y_pred))

{'loss': [0.11083903163671494, 0.06428693234920502, 0.0581945963203907, 0.05243341624736786, 0.04422549903392792, 0.03782418742775917, 0.030067861080169678, 0.026133304461836815, 0.018788190558552742, 0.014580221846699715, 0.010162577964365482, 0.006343492306768894, 0.005052700638771057, 0.0025153099559247494, 0.006192424800246954, 0.004223634488880634, 0.0017207657219842076, 0.0009398385882377625, 0.0004958961508236825, 0.0004807714431080967, 0.000301114225294441, 0.007852372713387012, 0.0017208089120686054, 0.0008376396726816893, 0.0002521332644391805, 0.0001155572390416637, 0.00026838184567168355, 0.0002997987321577966, 0.014869462698698044, 0.0006885359762236476, 0.00027681884239427745, 0.00015076363342814147, 0.00032707571517676115, 0.00012989032256882638, 8.33028243505396e-05, 6.56879783491604e-05, 3.8646587199764326e-05, 0.010909529402852058, 0.000520843081176281, 0.00020098012464586645, 8.658949809614569e-05, 8.6937980086077e-05, 0.00016981462249532342, 6.509814556920901e-05, 4

In [8]:
model.save('scream.h5')
# new_model = tf.keras.models.load_model('scream.h5')

In [9]:
# ### Train and evaluate via 10-Folds cross-validation ###
# accuracies = []
# folds = np.array(['fold1','fold2'])
# load_dir = "processed/"
# kf = KFold(n_splits=2)
# for train_index, test_index in kf.split(folds):
#     x_train, y_train = [], []
#     for ind in train_index:
#         # read features or segments of an audio file
#         train_data = np.load("{0}/{1}.npz".format(load_dir,folds[ind]), 
#                        allow_pickle=True)
#         # for training stack all the segments so that they are treated as an example/instance
#         features = np.concatenate(train_data["features"], axis=0) 
#         labels = np.concatenate(train_data["labels"], axis=0)
#         x_train.append(features)
#         y_train.append(labels)
#     # stack x,y pairs of all training folds 
#     x_train = np.concatenate(x_train, axis = 0).astype(np.float32)
#     y_train = np.concatenate(y_train, axis = 0).astype(np.float32)
    
#     # for testing we will make predictions on each segment and average them to 
#     # produce signle label for an entire sound clip.
#     test_data = np.load("{0}/{1}.npz".format(load_dir,
#                    folds[test_index][0]), allow_pickle=True)
#     x_test = test_data["features"]
#     y_test = test_data["labels"]

#     model = get_network()
#     model.fit(x_train, y_train, epochs = 50, batch_size = 24, verbose = 0)
    
#     # evaluate on test set/fold
#     y_true, y_pred = [], []
#     for x, y in zip(x_test, y_test):
#         # average predictions over segments of a sound clip
#         avg_p = np.argmax(np.mean(model.predict(x), axis = 0))
#         y_pred.append(avg_p) 
#         # pick single label via np.unique for a sound clip
#         y_true.append(np.unique(y)[0]) 
#     accuracies.append(accuracy_score(y_true, y_pred))    
# print("Average 2 Folds Accuracy: {0}".format(np.mean(accuracies)))