<a href="https://colab.research.google.com/github/QColeman97/AudioTagger/blob/master/AudioTag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This cell contains not-preferred data pre-processing methods
# This cell is left up for example, and to visualize spectrograms

from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import librosa
from librosa import display
import os
import glob


# Root folder of the dataset on the mounted Drive; every other location
# below is derived from it by plain string concatenation.
input_path = 'drive/My Drive/AudioTaggerData/'

# Folders holding the training and test .wav clips
train_files_path = ''.join([input_path, 'FSDKaggle2018.audio_train'])
test_files_path = ''.join([input_path, 'FSDKaggle2018.audio_test'])

# Metadata CSVs for the training and test clips
train_csv_path = ''.join([input_path,
                          'FSDKaggle2018.meta/train_post_competition.csv'])
test_csv_path = ''.join([input_path,
                         'FSDKaggle2018.meta/',
                         'test_post_competition_scoring_clips.csv'])

# Data preprocessing part

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# Class names in order of first appearance in the training metadata
unique_labels = df_train.label.unique()
num_class = len(unique_labels)

# Class name -> integer class index
label2index = {label: index for index, label in enumerate(unique_labels)}

# Lookup from the first CSV column to the second CSV column (the code below
# uses it as file name -> label). pd.Series.from_csv was deprecated in
# pandas 0.21 and removed in 1.0, so the same mapping is built from the
# DataFrames that were already loaded above: index on the first column and
# take the first remaining column, exactly what from_csv(header=0) returned.
train_dict = df_train.set_index(df_train.columns[0]).iloc[:, 0].to_dict()
test_dict = df_test.set_index(df_test.columns[0]).iloc[:, 0].to_dict()

#array of labels in number form (0 = hi-hat, 1 = saxophone, etc)
label_emb_indices = np.array([label2index[label] for label in df_train.label])

label_emb_test_indices = np.array([label2index[label] for label in df_test.label])


def pre_process(pathname):
    """Load one audio clip and convert it to a dB-scaled mel spectrogram.

    Steps: resample to 32 kHz on load, trim leading/trailing quiet sections,
    take the STFT magnitude, apply perceptual (A-)weighting to the power
    spectrogram, project onto 128 mel bands and convert to dB.

    Args:
        pathname: path of the audio file to load.

    Returns:
        The result of ``librosa.core.power_to_db`` as a numpy array. The
        caller in this notebook treats axis 0 as mel bins and axis 1 as
        time frames.
    """
    sampling_rate = 32000
    hop_length = 192
    fmax = None
    n_mels = 128
    n_fft = 1024

    # The sample rate returned by load() equals `sampling_rate`, so it is
    # not kept.
    y, _ = librosa.load(pathname, sr = sampling_rate)
    # "Trim quiet noise away" is somewhat effective; the trim indices are
    # not needed afterwards.
    y, _ = librosa.effects.trim(y)

#     y = librosa.effects.time_stretch(y, 2.0)

    # Amplitudes of STFT
    stft = np.abs(librosa.stft(y, n_fft = n_fft, hop_length = hop_length,
                               window = 'hann', center = True,
                               pad_mode = 'reflect'))

    freqs = librosa.core.fft_frequencies(sr = sampling_rate, n_fft = n_fft)
    # perceptual_weighting expects a *power* spectrogram, i.e. the squared
    # magnitude (`** 2`), not the doubled magnitude (`* 2`).
    stft = librosa.perceptual_weighting(stft ** 2, freqs, ref = 1.0,
                                        amin = 1e-10, top_db = 99.0)

    # Apply mel filterbank
    # NOTE(review): `stft` holds dB values at this point while
    # melspectrogram(S=...) is documented for power input -- confirm this
    # ordering is intended.
    mel_spect = librosa.feature.melspectrogram(S = stft, sr = sampling_rate,
                                               n_mels = n_mels, fmax = fmax)

    log_mel_spect = librosa.core.power_to_db(mel_spect)

    return np.asarray(log_mel_spect)


# Get data as a numpy array from .wav files (not preferred, but left in for example)
def get_data(pathname, training = True):
    """Build a (clips, 256, 128) array of spectrograms from a folder of .wav files.

    Every clip found by ``glob`` in ``pathname`` is run through
    ``pre_process``, padded (with its own minimum value) or truncated to 256
    frames along the time axis, transposed to (frames, bins) and stored.
    Every 12th spectrogram is also plotted.

    Args:
        pathname: folder that contains the ``*.wav`` files.
        training: selects which file name ('Audio.train' or 'Audio.test')
            is created/truncated in the working directory.

    Returns:
        Array of shape ``(number of .wav files, 256, 128)``.
    """
    file_list = glob.glob(os.path.join(pathname, '*.wav'))

    # The file is still created/truncated as before, but nothing is written
    # to it; the context manager closes the handle instead of leaking it.
    out_name = 'Audio.train' if training else 'Audio.test'
    with open(out_name, 'w'):
        pass

    time_restriction = 256
    n_bins = 128
    # One zero-initialised slot per file actually found, so no row is left
    # with uninitialised memory and larger folders do not overflow.
    spectrograms = np.zeros((len(file_list), time_restriction, n_bins))

    for i, file in enumerate(file_list):
        print("%04d / %d | %s" % (i + 1, len(file_list), file))

        spectrogram = pre_process(file)

        if time_restriction >= spectrogram.shape[1]:
            # Too short: extend the time axis using the clip's minimum value
            pad_amount = time_restriction - spectrogram.shape[1]
            spectrogram = np.pad(spectrogram, ((0, 0), (0, pad_amount)),
                                 'minimum')
        else:
            # Too long: keep only the first `time_restriction` frames
            spectrogram = spectrogram[:, :time_restriction]

        # (bins, frames) -> (frames, bins)
        spectrogram = spectrogram.transpose()

        # Single slice assignment instead of copying element by element
        spectrograms[i] = spectrogram.astype(np.float32)

        # Plot every 12th spectrogram
        if i % 12 == 0:

            plt.figure("General-Purpose ")
            plt.clf()
            plt.subplots_adjust(right = 0.98, left = 0.1, bottom = 0.1,
                                top = 0.99)
            plt.imshow(spectrogram, origin = "lower",
                       interpolation = "nearest", cmap = "viridis")
            plt.xlabel("%d bins" % spectrogram.shape[1])
            plt.ylabel("%d frames" % spectrogram.shape[0])
            plt.colorbar()
            plt.show()

    return spectrograms


# Get labels as a numpy array from .csv (not preferred, but left in for example)
def get_labels(pathname, training = True):
    """Build a one-hot label matrix for the .wav files in a folder.

    Rows follow the order in which ``glob`` returns the files. Each file's
    base name is looked up in ``train_dict`` or ``test_dict`` and the
    resulting class name is mapped to a column through ``label2index``.

    Args:
        pathname: folder that contains the ``*.wav`` files.
        training: use ``train_dict`` when True, ``test_dict`` otherwise;
            also selects which file name ('Labels.train' or 'Labels.test')
            is created/truncated in the working directory.

    Returns:
        Array of shape ``(number of .wav files, number of classes)`` with a
        single 1 per row and 0 elsewhere.

    Raises:
        KeyError: if a file name or its label is missing from the lookups.
    """
    file_list = glob.glob(os.path.join(pathname, '*.wav'))

    # The file is still created/truncated as before, but nothing is written
    # to it; the context manager closes the handle instead of leaking it.
    out_name = 'Labels.train' if training else 'Labels.test'
    with open(out_name, 'w'):
        pass

    lookup = train_dict if training else test_dict

    # Zero-initialised and sized from the inputs: a fixed (1570, 41) buffer
    # overflowed for the training set and left garbage in non-hot cells.
    labels = np.zeros((len(file_list), len(label2index)))
    for i, file in enumerate(file_list):
        categ = lookup[os.path.basename(file)]
        hot_index = label2index[categ]
        labels[i][hot_index] = 1

    return labels

# Uncomment for example plotted spectrograms of training data
# CAUTION: Will error if used with models
# get_data(train_files_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  infer_datetime_format=infer_datetime_format)


In [2]:
from keras import models, layers
from keras_preprocessing.image import ImageDataGenerator

import numpy as np


# Model structure after best methods ~ 6 conv units (conv *2 + maxpool)
def make_2D_CNN_model(input_shape):
    """Assemble the 2D separable-convolution classifier.

    All convolutions are 3x3 with 'same' padding, so only the three 2x2
    max-pooling layers reduce the spatial size (each halves both axes).
    Filter counts grow 64 -> 128 -> 256 -> 512, followed by global average
    pooling and a 41-way softmax layer.

    Args:
        input_shape: shape of one sample passed to the first layer, e.g.
            (timesteps, frequencies, 1).

    Returns:
        The uncompiled ``Sequential`` model.
    """
    def sep_conv(filters, **extra):
        # 3x3 separable convolution with ReLU that keeps the spatial size
        return layers.SeparableConv2D(filters, (3, 3), padding = 'same',
                                      activation = 'relu', **extra)

    stack = [
        # Block 1: two 64-filter convolutions, then pooling
        sep_conv(64, input_shape = input_shape),
        layers.BatchNormalization(),
        sep_conv(64),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.3),

        # Block 2: two 128-filter convolutions, then pooling
        sep_conv(128),
        layers.BatchNormalization(),
        sep_conv(128),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.3),

        # Block 3: three 256-filter convolutions with dropout in between
        sep_conv(256),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        sep_conv(256),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        sep_conv(256),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.3),

        # Block 4: two 512-filter convolutions, averaged over space
        sep_conv(512),
        layers.BatchNormalization(),
        sep_conv(512),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling2D(),

        # Classifier head
        layers.Dense(41, activation = 'softmax'),
    ]

    nn = models.Sequential()
    for layer in stack:
        nn.add(layer)
    return nn


# Get data
# Dataset root on the mounted Drive; the paths below are built from it by
# plain string concatenation.
input_path = 'drive/My Drive/AudioTaggerData/'

# Folders with the training / test .wav clips
train_files_path = ''.join([input_path, 'FSDKaggle2018.audio_train'])
test_files_path = ''.join([input_path, 'FSDKaggle2018.audio_test'])

# Metadata CSVs for the training / test clips
train_csv_path = ''.join([input_path,
                          'FSDKaggle2018.meta/train_post_competition.csv'])
test_csv_path = ''.join([input_path,
                         'FSDKaggle2018.meta/',
                         'test_post_competition_scoring_clips.csv'])


# Get train data as numpy array from training data file
def get_train_data():
    """Read the training features from ``input_path + 'Audio.train'``.

    The file is consumed line by line: each line holds the whitespace
    separated values of one time step, 64 lines make up one clip, and one
    further line is read and discarded after every clip (presumably a blank
    separator). Lines with fewer than 32 values are right-padded with
    -100.0; values beyond the 32nd are ignored.

    Returns:
        Array of shape (9474, 64, 32).
    """
    data_len, time_len, freq = 9474, 64, 32

    # Possible future better shape: (9474, 256, 128)
    data = np.ndarray((data_len, time_len, freq))

    with open(input_path + 'Audio.train', 'r') as data_f:
        for clip in range(data_len):
            for step in range(time_len):
                row = [float(token) for token in data_f.readline().split()]

                # Fill short rows up to `freq` values
                missing = freq - len(row)
                if missing > 0:
                    row.extend([-100.0] * missing)

                data[clip, step, :] = row[:freq]

            # Consume the line that follows each clip
            data_f.readline()

    return data


# Get test data as numpy array from test data file
def get_test_data():
    """Read the test features from ``input_path + 'Audio.test'``.

    The file is consumed line by line: each line holds the whitespace
    separated values of one time step, 64 lines make up one clip, and one
    further line is read and discarded after every clip (presumably a blank
    separator). Lines with fewer than 32 values are right-padded with
    -100.0; values beyond the 32nd are ignored.

    Returns:
        Array of shape (1570, 64, 32).
    """
    data_len, time_len, freq = 1570, 64, 32

    # Possible future better shape: (9474, 256, 128)
    data = np.ndarray((data_len, time_len, freq))

    with open(input_path + 'Audio.test', 'r') as data_f:
        for clip in range(data_len):
            for step in range(time_len):
                row = [float(token) for token in data_f.readline().split()]

                # Fill short rows up to `freq` values
                missing = freq - len(row)
                if missing > 0:
                    row.extend([-100.0] * missing)

                data[clip, step, :] = row[:freq]

            # Consume the line that follows each clip
            data_f.readline()

    return data


# Get train labels as numpy array from training labels file
def get_train_labels():
    """Read the training label rows from ``input_path + 'Labels.train'``.

    One line is read per sample; its whitespace separated values are
    parsed as floats, right-padded with 0.0 up to 41 entries when shorter,
    and cut to the first 41 entries when longer.

    Returns:
        Array of shape (9474, 41).
    """
    labels_len, width = 9474, 41
    labels = np.ndarray((labels_len, width))

    with open(input_path + 'Labels.train', 'r') as label_f:
        for row in range(labels_len):
            values = [float(token) for token in label_f.readline().split()]

            # Fill short rows up to `width` values
            shortfall = width - len(values)
            if shortfall > 0:
                values.extend([0.0] * shortfall)

            labels[row, :] = values[:width]

    return labels


# Get test labels as numpy array from test labels file
def get_test_labels():
    """Read the test label rows from ``input_path + 'Labels.test'``.

    One line is read per sample; its whitespace separated values are
    parsed as floats, right-padded with 0.0 up to 41 entries when shorter,
    and cut to the first 41 entries when longer.

    Returns:
        Array of shape (1570, 41).
    """
    labels_len, width = 1570, 41
    labels = np.ndarray((labels_len, width))

    with open(input_path + 'Labels.test', 'r') as label_f:
        for row in range(labels_len):
            values = [float(token) for token in label_f.readline().split()]

            # Fill short rows up to `width` values
            shortfall = width - len(values)
            if shortfall > 0:
                values.extend([0.0] * shortfall)

            labels[row, :] = values[:width]

    return labels


# Training set: feature array and label rows
train_data = get_train_data()
train_labels = get_train_labels()
# Test set: feature array and label rows
test_data = get_test_data()
test_labels = get_test_labels()


# Number of clips in each split
train_samples = 9474
test_samples = 1570
# Axis 1 of the feature array is used as time, axis 2 as frequency
max_timesteps, num_freq = train_data.shape[1:]


Using TensorFlow backend.


In [3]:
# Standardise each (time step, frequency bin) position with statistics
# computed on the training set only; the test set reuses the same values.
# Both arrays are modified in place.
mean = train_data.mean(axis = 0)
train_data -= mean
std = train_data.std(axis = 0)
train_data /= std

test_data -= mean
test_data /= std


model = make_2D_CNN_model((max_timesteps, num_freq, 1))
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# k-fold cross-validation over contiguous slices of the training set.
# NOTE(review): `model` is not re-created between folds, so later folds
# validate on samples the weights were already fitted to -- confirm this is
# intended before trusting the averaged validation numbers.
k = 4
num_val = len(train_data) // k
num_train = len(train_labels) - num_val
all_val_acc_histories, all_val_loss_histories = [], []
for x in range(k):
    val_data = train_data[x * num_val: (x + 1) * num_val]
    val_labels = train_labels[x * num_val: (x + 1) * num_val]

    partial_train_data = np.concatenate(
        [train_data[: x * num_val], train_data[(x + 1) * num_val:]],
        axis = 0)
    partial_train_labels = np.concatenate(
        [train_labels[: x * num_val],
         train_labels[(x + 1) * num_val:]],
        axis = 0)

    # Append the single channel axis the 2D convolutions expect
    partial_train_data = partial_train_data.reshape(
            partial_train_data.shape + (1,))
    val_data = val_data.reshape(val_data.shape + (1,))

    hst = model.fit(partial_train_data, partial_train_labels, batch_size = 1024,
                    epochs = 10, validation_data = (val_data, val_labels),)

    hst = hst.history
    all_val_loss_histories.append(hst['val_loss'])
    all_val_acc_histories.append(hst['val_acc'])

# Per-epoch validation metrics averaged over the folds
avg_val_loss_hst = np.mean(all_val_loss_histories, axis = 0)
avg_val_acc_hst = np.mean(all_val_acc_histories, axis = 0)

best_loss, best_acc, prev_acc, best_epoch = None, None, None, 0

# Select the best averaged validation accuracy reached before the accuracy
# first drops from one epoch to the next.
acc_increased = True
for i in range(len(avg_val_acc_hst)):
    print(avg_val_acc_hst[i], '/', avg_val_loss_hst[i])

    if prev_acc is not None and avg_val_acc_hst[i] < prev_acc:
        acc_increased = False
    # Must hold the accuracy (not the loss) so that the comparison above
    # is accuracy against accuracy.
    prev_acc = avg_val_acc_hst[i]

    if (best_acc is None or
            (avg_val_acc_hst[i] > best_acc and acc_increased)):
        best_acc = avg_val_acc_hst[i]
        best_loss = avg_val_loss_hst[i]
        best_epoch = i + 1

print('Best val loss:', best_loss, '& with acc:', best_acc, 'at epoch:',
      str(best_epoch))

model.save('2DCNN.h5')


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.33372043858508804 / 2.558421367729032
0.3627533796872642 / 2.3826772143711916
0.36951013594060333 / 2.4239236261393575
0.389358

In [4]:

def make_combined_CNN_RNN_model(input_shape):
    """Assemble the 1D-convolution front end followed by stacked BiLSTMs.

    Four width-5 separable 1D convolutions (64, 64, 128, 128 filters, each
    followed by batch normalisation, with pooling and dropout after the
    second) feed four bidirectional 128-unit LSTM layers; the last LSTM
    returns only its final output, which goes into a 41-way softmax layer.

    Args:
        input_shape: only the last entry (features per time step) is used;
            the time dimension is left variable (``None``).

    Returns:
        The uncompiled ``Sequential`` model.
    """
    def conv(filters, **extra):
        # Width-5 separable 1D convolution with ReLU
        return layers.SeparableConv1D(filters, 5, activation = 'relu',
                                      **extra)

    def bi_lstm(**extra):
        # Bidirectional 128-unit LSTM with input and recurrent dropout
        return layers.Bidirectional(
            layers.LSTM(128, dropout = 0.3, recurrent_dropout = 0.3,
                        **extra))

    nn = models.Sequential()

    # Convolutional front end
    nn.add(conv(64, input_shape = (None, input_shape[-1])))
    nn.add(layers.BatchNormalization())
    nn.add(conv(64))
    nn.add(layers.BatchNormalization())
    nn.add(layers.MaxPooling1D(3))
    nn.add(layers.Dropout(0.3))

    for _ in range(2):
        nn.add(conv(128))
        nn.add(layers.BatchNormalization())

    # Recurrent stack: the first three layers pass full sequences onwards
    for _ in range(3):
        nn.add(bi_lstm(return_sequences = True))
    nn.add(bi_lstm())

    nn.add(layers.Dense(41, activation = 'softmax'))

    return nn

model = make_combined_CNN_RNN_model((max_timesteps, num_freq))
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# k-fold cross-validation over contiguous slices of the training set.
# NOTE(review): `model` is not re-created between folds, so later folds
# validate on samples the weights were already fitted to -- confirm this is
# intended before trusting the averaged validation numbers.
k = 4
num_val = len(train_data) // k
num_train = len(train_labels) - num_val
all_val_acc_histories, all_val_loss_histories = [], []
for x in range(k):
    val_data = train_data[x * num_val: (x + 1) * num_val]
    val_labels = train_labels[x * num_val: (x + 1) * num_val]

    partial_train_data = np.concatenate(
        [train_data[: x * num_val], train_data[(x + 1) * num_val:]],
        axis = 0)
    partial_train_labels = np.concatenate(
        [train_labels[: x * num_val],
         train_labels[(x + 1) * num_val:]],
        axis = 0)

    hst = model.fit(partial_train_data, partial_train_labels, batch_size = 1024,
                    epochs = 10, validation_data = (val_data, val_labels),)

    hst = hst.history
    all_val_loss_histories.append(hst['val_loss'])
    all_val_acc_histories.append(hst['val_acc'])

# Per-epoch validation metrics averaged over the folds
avg_val_loss_hst = np.mean(all_val_loss_histories, axis = 0)
avg_val_acc_hst = np.mean(all_val_acc_histories, axis = 0)

best_loss, best_acc, prev_acc, best_epoch = None, None, None, 0

# Select the best averaged validation accuracy reached before the accuracy
# first drops from one epoch to the next.
acc_increased = True
for i in range(len(avg_val_acc_hst)):
    print(avg_val_acc_hst[i], '/', avg_val_loss_hst[i])

    if prev_acc is not None and avg_val_acc_hst[i] < prev_acc:
        acc_increased = False
    # Must hold the accuracy (not the loss) so that the comparison above
    # is accuracy against accuracy.
    prev_acc = avg_val_acc_hst[i]

    if (best_acc is None or
            (avg_val_acc_hst[i] > best_acc and acc_increased)):
        best_acc = avg_val_acc_hst[i]
        best_loss = avg_val_loss_hst[i]
        best_epoch = i + 1

print('Best val loss:', best_loss, '& with acc:', best_acc, 'at epoch:',
      str(best_epoch))

model.save('1DCNN_RNN.h5')

Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.2757601355378692 / 2.6034641845806226
0.28367821036561114 / 2.5775513471783817
0.28948479820344897 / 2.543949747407759
0.29307432442500786 / 2.5618020312206164
0.3047930740222738 / 2.5096061519674353
0.30658783803920486 / 2.499055097231994
0.3111275339851508 / 2.4742559259002275
0.3107052359830689 / 2.4941106921917684
0.31577280375200345 / 2.4698457186286515
0

In [5]:
from keras.applications import DenseNet121

def make_dense_net_model(conv_base):
    """Stack a 41-way softmax classifier on top of a convolutional base.

    Args:
        conv_base: model/layer used as the first element of the stack.

    Returns:
        The uncompiled ``Sequential`` model ``conv_base -> Dense(41)``.
    """
    nn = models.Sequential()
    for part in (conv_base, layers.Dense(41, activation = 'softmax')):
        nn.add(part)
    return nn

# DenseNet121 without its classification top, globally average-pooled
dn_base = DenseNet121(include_top = False,
                      input_shape = (max_timesteps, num_freq, 3),
                      pooling = 'avg')
dn_base.trainable = True
# print(dn_base.summary())
# Fine-tuning: freeze every layer that comes before 'conv5_block13_0_bn'
# and leave that layer and all later ones trainable.
set_trainable = False
for layer in dn_base.layers:
    if layer.name == 'conv5_block13_0_bn':
        set_trainable = True
    layer.trainable = set_trainable

model = make_dense_net_model(dn_base)
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# k-fold cross-validation over contiguous slices of the training set.
# NOTE(review): `model` is not re-created between folds, so later folds
# validate on samples the weights were already fitted to -- confirm this is
# intended before trusting the averaged validation numbers.
k = 4
num_val = len(train_data) // k
num_train = len(train_labels) - num_val
all_val_acc_histories, all_val_loss_histories = [], []
for x in range(k):
    val_data = train_data[x * num_val: (x + 1) * num_val]
    val_labels = train_labels[x * num_val: (x + 1) * num_val]

    partial_train_data = np.concatenate(
        [train_data[: x * num_val], train_data[(x + 1) * num_val:]],
        axis = 0)
    partial_train_labels = np.concatenate(
        [train_labels[: x * num_val],
         train_labels[(x + 1) * num_val:]],
        axis = 0)

    # Append a channel axis and copy it three times, because the base was
    # built for 3-channel input.
    partial_train_data = partial_train_data.reshape(
            partial_train_data.shape + (1,))
    partial_train_data = np.repeat(partial_train_data, 3, axis=3)

    val_data = val_data.reshape(val_data.shape + (1,))
    val_data = np.repeat(val_data, 3, axis=3)

    hst = model.fit(partial_train_data, partial_train_labels, batch_size = 1024,
                    epochs = 10, validation_data = (val_data, val_labels),)

    hst = hst.history
    all_val_loss_histories.append(hst['val_loss'])
    all_val_acc_histories.append(hst['val_acc'])

# Per-epoch validation metrics averaged over the folds
avg_val_loss_hst = np.mean(all_val_loss_histories, axis = 0)
avg_val_acc_hst = np.mean(all_val_acc_histories, axis = 0)

best_loss, best_acc, prev_acc, best_epoch = None, None, None, 0

# Select the best averaged validation accuracy reached before the accuracy
# first drops from one epoch to the next.
acc_increased = True
for i in range(len(avg_val_acc_hst)):
    print(avg_val_acc_hst[i], '/', avg_val_loss_hst[i])

    if prev_acc is not None and avg_val_acc_hst[i] < prev_acc:
        acc_increased = False
    # Must hold the accuracy (not the loss) so that the comparison above
    # is accuracy against accuracy.
    prev_acc = avg_val_acc_hst[i]

    if (best_acc is None or
            (avg_val_acc_hst[i] > best_acc and acc_increased)):
        best_acc = avg_val_acc_hst[i]
        best_loss = avg_val_loss_hst[i]
        best_epoch = i + 1

print('Best val loss:', best_loss, '& with acc:', best_acc, 'at epoch:',
      str(best_epoch))

model.save('DenseNetModel.h5')

Downloading data from https://github.com/keras-team/keras-applications/releases/download/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.04983108095522668 / 12.04189309558353
0.051203547448322576 / 11.70717437202866
0.05373733098039756 / 11.802424456622148
0.05447635140169311 / 11.705132426442326
0.054898648749332174 / 11.911289253750361
0.054

In [6]:


def make_1DCNN_model(input_shape):
    """Assemble the purely convolutional 1D classifier.

    Four width-5 separable 1D convolutions (64, 64, 128, 128 filters), each
    followed by batch normalisation, with size-5 max-pooling and dropout
    after the second one, then global average pooling over time and a
    41-way softmax layer.

    Args:
        input_shape: only the last entry (features per time step) is used;
            the time dimension is left variable (``None``).

    Returns:
        The uncompiled ``Sequential`` model.
    """
    def conv(filters, **extra):
        # Width-5 separable 1D convolution with ReLU
        return layers.SeparableConv1D(filters, 5, activation = 'relu',
                                      **extra)

    nn = models.Sequential()

    # First pair of convolutions, then down-sampling
    nn.add(conv(64, input_shape = (None, input_shape[-1])))
    nn.add(layers.BatchNormalization())
    nn.add(conv(64))
    nn.add(layers.BatchNormalization())
    nn.add(layers.MaxPooling1D(5))
    nn.add(layers.Dropout(0.3))

    # Second pair of convolutions, averaged over the time axis
    for _ in range(2):
        nn.add(conv(128))
        nn.add(layers.BatchNormalization())
    nn.add(layers.GlobalAveragePooling1D())

    nn.add(layers.Dense(41, activation = 'softmax'))

    return nn

model = make_1DCNN_model((max_timesteps, num_freq))
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# k-fold cross-validation over contiguous slices of the training set.
# NOTE(review): `model` is not re-created between folds, so later folds
# validate on samples the weights were already fitted to -- confirm this is
# intended before trusting the averaged validation numbers.
k = 4
num_val = len(train_data) // k
num_train = len(train_labels) - num_val
all_val_acc_histories, all_val_loss_histories = [], []
for x in range(k):
    val_data = train_data[x * num_val: (x + 1) * num_val]
    val_labels = train_labels[x * num_val: (x + 1) * num_val]

    partial_train_data = np.concatenate(
        [train_data[: x * num_val], train_data[(x + 1) * num_val:]],
        axis = 0)
    partial_train_labels = np.concatenate(
        [train_labels[: x * num_val],
         train_labels[(x + 1) * num_val:]],
        axis = 0)

    hst = model.fit(partial_train_data, partial_train_labels, batch_size = 1024,
                    epochs = 10, validation_data = (val_data, val_labels),)

    hst = hst.history
    all_val_loss_histories.append(hst['val_loss'])
    all_val_acc_histories.append(hst['val_acc'])

# Per-epoch validation metrics averaged over the folds
avg_val_loss_hst = np.mean(all_val_loss_histories, axis = 0)
avg_val_acc_hst = np.mean(all_val_acc_histories, axis = 0)

best_loss, best_acc, prev_acc, best_epoch = None, None, None, 0

# Select the best averaged validation accuracy reached before the accuracy
# first drops from one epoch to the next.
acc_increased = True
for i in range(len(avg_val_acc_hst)):
    print(avg_val_acc_hst[i], '/', avg_val_loss_hst[i])

    if prev_acc is not None and avg_val_acc_hst[i] < prev_acc:
        acc_increased = False
    # Must hold the accuracy (not the loss) so that the comparison above
    # is accuracy against accuracy.
    prev_acc = avg_val_acc_hst[i]

    if (best_acc is None or
            (avg_val_acc_hst[i] > best_acc and acc_increased)):
        best_acc = avg_val_acc_hst[i]
        best_loss = avg_val_loss_hst[i]
        best_epoch = i + 1

print('Best val loss:', best_loss, '& with acc:', best_acc, 'at epoch:',
      str(best_epoch))

model.save('1DCNN.h5')


Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.2956081070509311 / 2.647304233667013
0.30796030405405406 / 2.589402749731734
0.3092271951404778 / 2.576870792620891
0.31503378368310025 / 2.5675474050882703
0.31249999919453186 / 2.552316548051061
0.3126055744250078 / 2.5447949863768913
0.31640624959726593 / 2.530710315382158
0.32020692577635923 / 2.514878749847412
0.32411317567567566 / 2.504503085806563
0.324

In [7]:
def make_RNN_model(input_shape):
    """Assemble the recurrent classifier: four stacked bidirectional LSTMs.

    Every LSTM has 64 units with 0.3 input and recurrent dropout. The first
    three return full sequences; the last returns only its final output,
    which feeds a 41-way softmax layer.

    Args:
        input_shape: only the last entry (features per time step) is used;
            the time dimension is left variable (``None``).

    Returns:
        The uncompiled ``Sequential`` model.
    """
    def bi_lstm(**extra):
        # Bidirectional 64-unit LSTM with input and recurrent dropout
        return layers.Bidirectional(
            layers.LSTM(64, dropout = 0.3, recurrent_dropout = 0.3,
                        **extra))

    nn = models.Sequential()
    # The input shape is declared on the wrapped LSTM of the first layer
    nn.add(bi_lstm(return_sequences = True,
                   input_shape = (None, input_shape[-1])))
    for _ in range(2):
        nn.add(bi_lstm(return_sequences = True))
    nn.add(bi_lstm())
    nn.add(layers.Dense(41, activation = 'softmax'))

    return nn


model = make_RNN_model((max_timesteps, num_freq))
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# k-fold cross-validation over contiguous slices of the training set.
# NOTE(review): `model` is not re-created between folds, so later folds
# validate on samples the weights were already fitted to -- confirm this is
# intended before trusting the averaged validation numbers.
k = 4
num_val = len(train_data) // k
num_train = len(train_labels) - num_val
all_val_acc_histories, all_val_loss_histories = [], []
for x in range(k):
    val_data = train_data[x * num_val: (x + 1) * num_val]
    val_labels = train_labels[x * num_val: (x + 1) * num_val]

    partial_train_data = np.concatenate(
        [train_data[: x * num_val], train_data[(x + 1) * num_val:]],
        axis = 0)
    partial_train_labels = np.concatenate(
        [train_labels[: x * num_val],
         train_labels[(x + 1) * num_val:]],
        axis = 0)

    hst = model.fit(partial_train_data, partial_train_labels, batch_size = 1024,
                    epochs = 10, validation_data = (val_data, val_labels),)

    hst = hst.history
    all_val_loss_histories.append(hst['val_loss'])
    all_val_acc_histories.append(hst['val_acc'])

# Per-epoch validation metrics averaged over the folds
avg_val_loss_hst = np.mean(all_val_loss_histories, axis = 0)
avg_val_acc_hst = np.mean(all_val_acc_histories, axis = 0)

best_loss, best_acc, prev_acc, best_epoch = None, None, None, 0

# Select the best averaged validation accuracy reached before the accuracy
# first drops from one epoch to the next.
acc_increased = True
for i in range(len(avg_val_acc_hst)):
    print(avg_val_acc_hst[i], '/', avg_val_loss_hst[i])

    if prev_acc is not None and avg_val_acc_hst[i] < prev_acc:
        acc_increased = False
    # Must hold the accuracy (not the loss) so that the comparison above
    # is accuracy against accuracy.
    prev_acc = avg_val_acc_hst[i]

    if (best_acc is None or
            (avg_val_acc_hst[i] > best_acc and acc_increased)):
        best_acc = avg_val_acc_hst[i]
        best_loss = avg_val_loss_hst[i]
        best_epoch = i + 1

print('Best val loss:', best_loss, '& with acc:', best_acc, 'at epoch:',
      str(best_epoch))

model.save('RNN.h5')


Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 7106 samples, validate on 2368 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.2414484797800715 / 2.830292846705462
0.2417652027027027 / 2.789620895643492
0.2504222977000314 / 2.7642753237002604
0.25464526976685264 / 2.7456509454830273
0.25696790490198784 / 2.730817353403246
0.2586570945945946 / 2.7194588877059314
0.26383023578170184 / 2.7012164528305465
0.2652027022999686 / 2.685756960430661
0.27111486536828244 / 2.674971627222525
0.271

In [0]:
# Test features with a trailing single-channel axis, for the 2D CNN
test_data_2D = test_data.reshape(test_data.shape + (1,))

# Same data with the channel copied three times, for the DenseNet model
test_data_2D_3chan = np.repeat(test_data_2D, 3, axis=3)


# Reload every network from the file its training cell saved
cnn2D = models.load_model('2DCNN.h5')
rnn = models.load_model('RNN.h5')
cnn1D = models.load_model('1DCNN.h5')
cnn_rnn = models.load_model('1DCNN_RNN.h5')
dense_net = models.load_model('DenseNetModel.h5')

In [0]:
# Class probabilities from every model on the test set
cnn2D_preds = cnn2D.predict(test_data_2D)
rnn_preds = rnn.predict(test_data)
cnn1D_preds = cnn1D.predict(test_data)
cnn_rnn_preds = cnn_rnn.predict(test_data)
dense_net_preds = dense_net.predict(test_data_2D_3chan)

# Weighted blend of four of the five models; the DenseNet predictions are
# computed above but do not enter the blend.
final_preds = 0.5 * cnn2D_preds
final_preds = final_preds + 0.08 * rnn_preds
final_preds = final_preds + 0.08 * cnn1D_preds
final_preds = final_preds + 0.08 * cnn_rnn_preds

In [12]:

# Write one "<index>) <file name> <predicted label>" line per test clip.
# NOTE(review): rows of `final_preds` are paired with files in glob order --
# confirm that this matches the order used when the test data was written.
with open('TestTags.out', 'w') as ttf:
    test_file_list = glob.glob(os.path.join(test_files_path, '*.wav'))
    for i, filename in enumerate(test_file_list[:1570]):
        clip_name = filename.split('/')[-1]
        predicted = unique_labels[np.argmax(final_preds[i])]
        ttf.write(''.join([str(i), ') ', clip_name, ' ', predicted, '\n']))

[[1.79205555e-02 6.77923963e-04 2.38153320e-02 ... 1.28061080e-03
  1.20461974e-02 3.13588306e-02]
 [1.79205555e-02 6.77923963e-04 2.38153320e-02 ... 1.28061080e-03
  1.20461974e-02 3.13588306e-02]
 [5.01193354e-05 3.25891771e-03 2.97569972e-03 ... 9.69696674e-04
  6.53710589e-03 8.01199712e-05]
 ...
 [9.35527217e-03 1.05663610e-03 1.25318849e-02 ... 1.23593386e-03
  1.35002118e-02 2.34928131e-02]
 [1.13905146e-04 2.67323405e-02 1.28292982e-02 ... 1.32288353e-03
  8.24602097e-02 6.66971493e-04]
 [1.05594366e-03 4.05793637e-02 1.12186028e-02 ... 1.05384178e-02
  7.65757337e-02 8.70423450e-04]]


In [11]:
# Score every reloaded model on the test set; each evaluate() call is
# unpacked into (loss, accuracy).
cnn2D_loss, cnn2D_acc = cnn2D.evaluate(test_data_2D, test_labels)
rnn_loss, rnn_acc = rnn.evaluate(test_data, test_labels)
cnn1D_loss, cnn1D_acc = cnn1D.evaluate(test_data, test_labels)
cnn_rnn_loss, cnn_rnn_acc = cnn_rnn.evaluate(test_data, test_labels)
dense_net_loss, dense_net_acc = dense_net.evaluate(test_data_2D_3chan, test_labels)

# Report the accuracies as percentages
for title, acc in [
        ('2DCNN Test Accuracy:', cnn2D_acc),
        ('1DCNN Test Accuracy:', cnn1D_acc),
        ('Combined 1D CNN & RNN Test Accuracy:', cnn_rnn_acc),
        ('Dense Net Conv. Base + Classifier Test Accuracy:', dense_net_acc),
        ('RNN Test Accuracy:', rnn_acc)]:
    print(title, str(acc*100) + '%')

2DCNN Test Loss: 45.6687898089172%
1DCNN Test Loss: 31.27388535031847%
Combined 1D CNN & RNN Test Loss: 33.88535031847134%
Dense Net Conv. Base + Classifier Test Loss: 5.031847133757962%
RNN Test Loss: 31.719745222929934%
