In [1]:
from scipy.io import wavfile
from scipy import signal
import re
from glob import glob
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Import dataset

In [2]:
DATADIR = './dataset'  # unzipped train and test data
ALL_LABELS = ('bed bird cat dog down eight five four go ' +
                   'happy house left marvin name.txt nine no ' +
                   'off on one right seven sheila six stop three ' +
                   'tree two up wow yes zero silence').split()
num_labels = len(ALL_LABELS)
id2name = {i: name for i, name in enumerate(ALL_LABELS)}
name2id = {name: i for i, name in id2name.items()}

def load_data(data_dir):
    pattern = re.compile("(.+\/)?(\w+)\/([^_]+)_.+wav")
    all_files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))

    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        validation_files = fin.readlines()
    valset = set()
    for entry in validation_files:
        r = re.match(pattern, entry)
        if r:
            valset.add(r.group(3))

    possible = set(ALL_LABELS)
    train, val = [], []
    for entry in all_files:
        r = re.match(pattern, entry)
        if r:
            label, uid = r.group(2), r.group(3)
            if label == '_background_noise_':
                label = 'silence'
            if label not in possible:
                label = 'unknown'

            label_id = name2id[label]

            sample = (label_id, uid, entry)
            if uid in valset:
                val.append(sample)
            else:
                train.append(sample)

    print('There are {} train and {} val samples'.format(len(train), len(val)))
    return train, val

train, val = load_data(DATADIR)

There are 57929 train and 6798 val samples


# MFCC

In [3]:
import numpy as np
from scipy.io import wavfile
import python_speech_features

def mfcc(audio, sample_rate, **kwargs):
    """从读出的音频数据中算出mfcc,具体可以看python_speech_features的文档

    Parameters:
        audio (np.ndarray): - 指明音频的振幅序列
        sample_rate (int): - 指明抽样率
        numcep (int): - 指明返回的倒数数量,默认为13

    Returns:
        np.ndarray: - mfcc强度(二维)组成的元组,shape为(times.shape,numcep)
    """
    return python_speech_features.mfcc(audio, sample_rate, **kwargs)


def mfcc_from_path(record_path, **kwargs):
    """从音频文件读取出mfcc

    Parameters:
        record_path (Union[pathlib.Path,str]): - 指明音频的路径

    Returns:
        np.ndarray: - mfcc强度(二维)组成的元组,shape为(times.shape,numcep)
    """
    sample_rate, samples = wavfile.read(str(record_path))
    return mfcc(samples, sample_rate,**kwargs)

# Load Transformed Train Data

In [4]:
from keras.utils import to_categorical

train = np.array(train)
np.random.shuffle(train)
train = train.tolist()

size = len(train)

trainX = np.zeros( (len(train),99,13) )
trainY = []

for i, (label_id,uid,fname) in enumerate(train):
    try:
        x = mfcc_from_path(fname).tolist()
        # ---------------------------------------
        # Preprocess the train input sequence
        # - Padding 0.0 at the end if length < 99
        # - If longer than 99, choose only the first 99 items
        # ---------------------------------------
        while len(x) < 99:
            x.append([0.0]*13)
        if len(x) > 99:
            x = x[0:99]
        trainX[i] = x
        trainY.append(label_id)
        if i % 2000 == 0:
            print("Loading {}/{}.".format(i, size))
    except Exception as err:
        print(err)
        
print("Finish loading train data")
trainY = to_categorical(trainY, num_classes=num_labels)

print("Input Shape: ",  trainX.shape)
print("Output Shape: ", trainY.shape)

Using TensorFlow backend.


Loading 0/57929.
Loading 2000/57929.
Loading 4000/57929.
Loading 6000/57929.
Loading 8000/57929.
Loading 10000/57929.
Loading 12000/57929.




Loading 14000/57929.
Loading 16000/57929.
Loading 18000/57929.
Loading 20000/57929.
Loading 22000/57929.
Loading 24000/57929.
Loading 26000/57929.
Loading 28000/57929.
Loading 30000/57929.
Loading 32000/57929.
Loading 34000/57929.
Loading 36000/57929.
Loading 38000/57929.
Loading 40000/57929.
Loading 42000/57929.
Loading 44000/57929.
Loading 46000/57929.
Loading 48000/57929.
Loading 50000/57929.
Loading 52000/57929.
Loading 54000/57929.
Loading 56000/57929.
Finish loading train data
Input Shape:  (57929, 99, 13)
Output Shape:  (57929, 32)


# Load Validation Data

In [5]:
valX = np.zeros( (len(val),99,13) )
valY = []

size = len(val)

for i, (label_id,uid,fname) in enumerate(val):
    try:
        x = mfcc_from_path(fname).tolist()
        # ---------------------------------------
        # Preprocess the train input sequence
        # - Padding 0.0 at the end if length < 99
        # - If longer than 99, choose only the first 99 items
        # ---------------------------------------
        while len(x) < 99:
            x.append([0.0]*13)
        if len(x) > 99:
            x = x[0:99]
        valX[i] = x
        valY.append(label_id)
        if i % 2000 == 0:
            print("Loading {}/{}.".format(i, size))
    except Exception as err:
        print(err)
        
print("Finish loading val data.")
valY = to_categorical(valY, num_classes=num_labels)

print("Input Shape: ",  valX.shape)
print("Output Shape: ", valY.shape)

Loading 0/6798.
Loading 2000/6798.
Loading 4000/6798.
Loading 6000/6798.
Finish loading val data.
Input Shape:  (6798, 99, 13)
Output Shape:  (6798, 32)


# Build the model

In [6]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, SimpleRNN, GlobalAveragePooling1D, AveragePooling1D, Activation
from keras.utils import to_categorical

from keras.layers import merge
from keras.layers.core import *
from keras.models import *

In [7]:
max_len = 99
embed_dim = 13

MODEL_TYPE = "lstm"
ATTENTION = True
LEARN_RATE = 0.001
BATCH_SIZE = 20
INPUT_SHAPE = [max_len, embed_dim]
EPOCHE = 15
NUM_HIDDEN = 100
TIME_STEPS = max_len
SINGLE_ATTENTION_VECTOR = False

def attention_3d_block(inputs):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, TIME_STEPS))(a)
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    return output_attention_mul

inputs = Input(shape=(INPUT_SHAPE))
# RNN Layer
if MODEL_TYPE == 'rnn':
    rnn_out = SimpleRNN(NUM_HIDDEN, return_sequences=True)(inputs)
elif MODEL_TYPE == 'gru':
    rnn_out = GRU(NUM_HIDDEN, return_sequences=True)(inputs)
elif MODEL_TYPE == 'lstm':
    rnn_out = LSTM(NUM_HIDDEN, return_sequences=True)(inputs)
else:
    raise NameError("Unsupported model type")
# Attention Layer
attention_mul = attention_3d_block(rnn_out)
attention_mul = Flatten()(attention_mul)
output = Dense(num_labels, activation='softmax')(attention_mul)
model = Model(input=[inputs], output=output)



In [8]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 99, 13)       0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 99, 100)      45600       input_1[0][0]                    
__________________________________________________________________________________________________
permute_1 (Permute)             (None, 100, 99)      0           lstm_1[0][0]                     
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 100, 99)      0           permute_1[0][0]                  
__________________________________________________________________________________________________
dense_1 (D

In [16]:
import keras

# Early Stopping
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=2, mode='auto')

# Reduce lr
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=1, min_lr=0.001)

callbacks = [early_stop, reduce_lr]

# Train
model_info = model.fit(trainX, trainY, 
          epochs=EPOCHE, batch_size=BATCH_SIZE, 
          verbose=2, validation_data=(valX, valY),
          callbacks=callbacks)

In [None]:
import matplotlib.pyplot as plt

def plot_model_history(model_history):
    fig, axs = plt.subplots(1,2,figsize=(15,5))
    # summarize history for accuracy
    axs[0].plot(range(1,len(model_history.history['acc'])+1),model_history.history['acc'])
    axs[0].plot(range(1,len(model_history.history['val_acc'])+1),model_history.history['val_acc'])
    axs[0].set_title('Model Accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xlabel('Epoch')
    axs[0].set_xticks(np.arange(1,len(model_history.history['acc'])+1),len(model_history.history['acc'])/10)
    axs[0].legend(['train', 'val'], loc='best')
    # summarize history for loss
    axs[1].plot(range(1,len(model_history.history['loss'])+1),model_history.history['loss'])
    axs[1].plot(range(1,len(model_history.history['val_loss'])+1),model_history.history['val_loss'])
    axs[1].set_title('Model Loss')
    axs[1].set_ylabel('Loss')
    axs[1].set_xlabel('Epoch')
    axs[1].set_xticks(np.arange(1,len(model_history.history['loss'])+1),len(model_history.history['loss'])/10)
    axs[1].legend(['train', 'val'], loc='best')
    plt.show()
    
plot_model_history(model_info)

# Load Test Data

In [10]:
import glob    
import pandas as pd

test_files = glob.glob("./dataset/test/audio/*.wav")
test_filenames = [f.split("/")[-1] for f in test_files]

# Generate Submission

In [11]:
submission = []

possible_labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence']

num_test = len(test_files)

for i in range(num_test):
    test_wav = mfcc_from_path(test_files[i])[np.newaxis,:,:]
    label = id2name[np.argmax( model.predict(test_wav) )]
    if label not in possible_labels:
        label = 'unknown' 
    submission.append([test_filenames[i], label])
    if i % 3000 == 0:
        print("Predicting {}/{}".format(i, num_test))

print("Finish predicting.....")

submissionDF = pd.DataFrame(submission, columns=['fname','label'])

print("Writing to file.....")
submissionDF.to_csv("submission-lstm-attention.csv", index=False, header=True)

Predicting 0/158538
Predicting 3000/158538
Predicting 6000/158538
Predicting 9000/158538
Predicting 12000/158538
Predicting 15000/158538
Predicting 18000/158538
Predicting 21000/158538
Predicting 24000/158538
Predicting 27000/158538
Predicting 30000/158538
Predicting 33000/158538
Predicting 36000/158538
Predicting 39000/158538
Predicting 42000/158538
Predicting 45000/158538
Predicting 48000/158538
Predicting 51000/158538
Predicting 54000/158538
Predicting 57000/158538
Predicting 60000/158538
Predicting 63000/158538
Predicting 66000/158538
Predicting 69000/158538
Predicting 72000/158538
Predicting 75000/158538
Predicting 78000/158538
Predicting 81000/158538
Predicting 84000/158538
Predicting 87000/158538
Predicting 90000/158538
Predicting 93000/158538
Predicting 96000/158538
Predicting 99000/158538
Predicting 102000/158538
Predicting 105000/158538
Predicting 108000/158538
Predicting 111000/158538
Predicting 114000/158538
Predicting 117000/158538
Predicting 120000/158538
Predicting 12300