# Log Spectrum + CNN

In [1]:
wanted_words = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
possible_labels = ['silence', 'unknown'] + wanted_words

print("{} possible labels : {}".format(len(possible_labels), possible_labels))

12 possible labels : ['silence', 'unknown', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']


In [4]:
################ Train & Val & Test Split ################
validation_percentage = 10
testing_percentage    = 10

################ Silence and Unknown Ratio ################
silence_percentage = 10.0
unknown_percentage = 10.0

##################### Labels #############################
label_count = len(possible_labels)

###################### Audio #############################
sample_rate = 16000
clip_duration_ms = 1000
desired_samples = 16000

################ Background Noise ########################
background_volume_range = 0.1
background_frequency = 0.1

################### Wav Shifting #########################
time_shift = int(desired_samples/8)

In [5]:
import tensorflow as tf

from scipy.io import wavfile
import numpy as np

def load_wav_file(filename):
    """Loads an audio file and returns a float PCM-encoded array of samples.
  
    Args:
        filename: Path to the .wav file to load.
    Returns:
        Numpy array holding the sample data as floats between -1.0 and 1.0.
    """
    
    _, wav = wavfile.read(str(filename))
    wav = wav.astype(np.float32) / np.iinfo(np.int16).max
    return wav

In [6]:
import os.path
import re
import hashlib

from tensorflow.python.util import compat

MAX_NUM_WAVS_PER_CLASS = 2 ** 27 - 1  # ~134M

def which_set(filename, validation_percentage=validation_percentage, testing_percentage=testing_percentage):
    """Determines which data partition the file should belong to.
      
    Args:
        filename: File path of the data sample.
        validation_percentage: How much of the data set to use for validation.
        testing_percentage: How much of the data set to use for testing.
    Returns:
        String, one of 'training', 'validation', or 'testing'.
    """

    # Get hash value based on filename
    base_name = os.path.basename(filename)
    hash_name = re.sub(r'_nohash_.*$', '', base_name)
    hash_name_hashed = hashlib.sha1(compat.as_bytes(hash_name)).hexdigest()
    
    # Turn hash value -> percentage
    percentage_hash = int(hash_name_hashed, 16) \
        % (MAX_NUM_WAVS_PER_CLASS + 1) * (100.0 / MAX_NUM_WAVS_PER_CLASS)
    
    # Assign which data set it belongs to by hash percentage
    if percentage_hash < validation_percentage:
        result = 'validation'
    elif percentage_hash < testing_percentage + validation_percentage:
        result = 'testing'
    else:
        result = 'training'
    return result

In [7]:
random_seed = 1234
data_dir = os.path.join(os.getcwd(),'dataset')

In [8]:
import random
import math
import glob

def prepare_data_index():

    random.seed(random_seed)
    wanted_words_index = {}
    for index, word in enumerate(wanted_words):
        wanted_words_index[word] = index + 2

    data_index = {'validation': [], 'testing': [],
                           'training': []}
    unknown_index = {'validation': [], 'testing': [],
                         'training': []}
    all_words = {}

    # Look through all the subfolders to find audio samples
    search_path = os.path.join(data_dir,'train','audio','*','*.wav')

    for wav_path in glob.glob(search_path):
        (_, word) = os.path.split(os.path.dirname(wav_path))
        word = word.lower()
        if word == '_background_noise_':
            continue
        all_words[word] = True
        set_index = which_set(wav_path)
        if word in wanted_words_index:
            data_index[set_index].append({'label': word, 'file': wav_path})
        else:
            unknown_index[set_index].append({'label': word, 'file': wav_path})

    # We need an arbitrary file to load as the input for the silence samples.
    # It's multiplied by zero later, so the content doesn't matter.

    silence_wav_path = data_index['training'][0]['file']
    for set_index in ['validation', 'testing', 'training']:
        set_size = len(data_index[set_index])
        silence_size = int(math.ceil(set_size * silence_percentage / 100))
        for _ in range(silence_size):
            data_index[set_index].append({'label': 'silence',
                    'file': silence_wav_path})

        # Pick some unknowns to add to each partition of the data set.
        random.shuffle(unknown_index[set_index])
        unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
        data_index[set_index].extend((unknown_index[set_index])[:unknown_size])
        
        
    # Make sure the ordering is random.
    for set_index in ['validation', 'testing', 'training']:
        random.shuffle(data_index[set_index])

    # Prepare the rest of the result data structure.
    words_list = possible_labels
    word_to_index = {}
    for word in all_words:
        if word in wanted_words_index:
            word_to_index[word] = wanted_words_index[word]
        else:
            word_to_index[word] = 1
    word_to_index['silence'] = 0
    
    return data_index, word_to_index

In [9]:
data_index, word_to_index = prepare_data_index()
print(data_index['training'][0])
print(data_index['testing'][0])
print(data_index['validation'][0])

{'label': 'silence', 'file': '/Users/yufan/Documents/workspace/kaggle-project/dataset/train/audio/right/988e2f9a_nohash_0.wav'}
{'label': 'right', 'file': '/Users/yufan/Documents/workspace/kaggle-project/dataset/train/audio/right/68dd409e_nohash_0.wav'}
{'label': 'right', 'file': '/Users/yufan/Documents/workspace/kaggle-project/dataset/train/audio/right/ab7b5acd_nohash_1.wav'}


In [10]:
def prepare_background_data():

    background_data = []
    background_wavfiles = glob.glob(os.path.join(data_dir,'train','audio','_background_noise_','*.wav'))

    for wavfile in background_wavfiles:
        wav = load_wav_file(wavfile)
        background_data.append(wav)
    
    return background_data

In [11]:
background_data = prepare_background_data()



In [88]:
from scipy import signal

def desired_samples_wav(wav, desired_samples=desired_samples):
    wav_length = wav.shape[0]
    
    if wav_length < desired_samples:
        # Pad 0 at the end
        desired_wav = np.lib.pad(wav, (0, desired_samples-wav_length), mode='constant')
    elif wav_length > desired_samples:
        # Random choose a range from the data
        start = np.random.randint(0, wav_length-desired_samples)
        desired_wav = wav[start:start+desired_samples]
    else:
        desired_wav = wav
        
    return desired_wav

def silence_as_zero(wav, label):
    
    if label == 'silence':
        # If silence, set all volumn as 0
        volume_scale = 0
    else:
        volume_scale = 1
    
    return np.multiply(wav, volume_scale)

def shift_and_pad_zeros(wav, time_shift=time_shift):
    
    wav_length = wav.shape[0]
    
    if time_shift > 0:
        time_shift_amount = np.random.randint(-time_shift, time_shift)
    else:
        time_shift_amount = 0
    if time_shift_amount > 0:
        shifted_wav = np.lib.pad(wav, (0, time_shift_amount), mode='constant')
    else:
        shifted_wav = np.lib.pad(wav, (-time_shift_amount, 0), mode='constant')
    
    return shifted_wav[:wav_length]

def mix_background_noise(wav, 
                         background_volume_range=background_volume_range, 
                         background_frequency=background_frequency,
                         background_data=background_data):
    
    wav_length = wav.shape[0]
    wav = wav.reshape(wav_length, 1)
    
    # Random choose a background data
    background_index = np.random.randint(len(background_data))
    background_samples = background_data[background_index]
    
    # Random shift the background data
    background_offset = np.random.randint(
        0, len(background_samples) - desired_samples)
    background_clipped = background_samples[background_offset:(background_offset + desired_samples)]
    background_reshaped = background_clipped.reshape([desired_samples, 1])
    
    # Random choose add background noise or not
    if np.random.uniform(0, 1) < background_frequency:
        background_volume = np.random.uniform(0, background_volume_range)
    else:
        background_volume = 0
    background_noise = np.multiply(background_reshaped, background_volume)
    wav_with_noise = background_noise + wav   
    
    # Clip by -1, 1
    background_clamp = np.clip(wav_with_noise, -1.0, 1.0)
    
    return wav_with_noise.reshape(wav_length)

def log_specgram(audio, sample_rate, *, window='hann', window_size=20,
                 step_size=10, eps=1e-10):
    """从读出的音频数据中算出对数频谱数据
    Parameters:
        audio (np.ndarray): - 指明音频的振幅序列
        sample_rate (int): - 指明抽样率
        window (str): - 指明分窗的算法,可选的详情可以看scipy.signal.get_window的文档
        window_size (Union[pathlib.Path,str]): - 指明音频的分窗大小
        step_size (Union[pathlib.Path,str]): - 指明步进长度
        eps (float): - 指明频谱强度取对数时的最小值,防止输入为0后得到负无穷
    Returns:
        tuple[np.ndarray,np.ndarray,np.ndarray]: - 由频率(一维),分段时间(一维)和频谱强度(二维)\
        组成的元组,shape(times.shape,freqs.shape)
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window=window,
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [89]:
def preprocess(fname, label, mode='train'):
    wav = load_wav_file(fname)
    wav = desired_samples_wav(wav)
    if mode=='train':
        wav = silence_as_zero(wav, label)
        wav = shift_and_pad_zeros(wav)
        wav = mix_background_noise(wav)
    _, _, log_spec = log_specgram(wav, desired_samples)
    return log_spec, word_to_index[label]

In [90]:
preprocess(data_index['training'][0]['file'], data_index['training'][0]['label'])[0].shape

(99, 161)

## Keras Resnet

In [91]:
from __future__ import division

import six
from keras.models import Model
from keras.layers import (
    Input,
    Activation,
    Dense,
    Flatten
)
from keras.layers.convolutional import (
    Conv2D,
    MaxPooling2D,
    AveragePooling2D
)
from keras.layers.merge import add
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2
from keras import backend as K


def _bn_relu(input):
    """Helper to build a BN -> relu block
    """
    norm = BatchNormalization(axis=CHANNEL_AXIS)(input)
    return Activation("relu")(norm)


def _conv_bn_relu(**conv_params):
    """Helper to build a conv -> BN -> relu block
    """
    filters = conv_params["filters"]
    kernel_size = conv_params["kernel_size"]
    strides = conv_params.setdefault("strides", (1, 1))
    kernel_initializer = conv_params.setdefault("kernel_initializer", "he_normal")
    padding = conv_params.setdefault("padding", "same")
    kernel_regularizer = conv_params.setdefault("kernel_regularizer", l2(1.e-4))

    def f(input):
        conv = Conv2D(filters=filters, kernel_size=kernel_size,
                      strides=strides, padding=padding,
                      kernel_initializer=kernel_initializer,
                      kernel_regularizer=kernel_regularizer)(input)
        return _bn_relu(conv)

    return f


def _bn_relu_conv(**conv_params):
    """Helper to build a BN -> relu -> conv block.
    This is an improved scheme proposed in http://arxiv.org/pdf/1603.05027v2.pdf
    """
    filters = conv_params["filters"]
    kernel_size = conv_params["kernel_size"]
    strides = conv_params.setdefault("strides", (1, 1))
    kernel_initializer = conv_params.setdefault("kernel_initializer", "he_normal")
    padding = conv_params.setdefault("padding", "same")
    kernel_regularizer = conv_params.setdefault("kernel_regularizer", l2(1.e-4))

    def f(input):
        activation = _bn_relu(input)
        return Conv2D(filters=filters, kernel_size=kernel_size,
                      strides=strides, padding=padding,
                      kernel_initializer=kernel_initializer,
                      kernel_regularizer=kernel_regularizer)(activation)

    return f


def _shortcut(input, residual):
    """Adds a shortcut between input and residual block and merges them with "sum"
    """
    # Expand channels of shortcut to match residual.
    # Stride appropriately to match residual (width, height)
    # Should be int if network architecture is correctly configured.
    input_shape = K.int_shape(input)
    residual_shape = K.int_shape(residual)
    stride_width = int(round(input_shape[ROW_AXIS] / residual_shape[ROW_AXIS]))
    stride_height = int(round(input_shape[COL_AXIS] / residual_shape[COL_AXIS]))
    equal_channels = input_shape[CHANNEL_AXIS] == residual_shape[CHANNEL_AXIS]

    shortcut = input
    # 1 X 1 conv if shape is different. Else identity.
    if stride_width > 1 or stride_height > 1 or not equal_channels:
        shortcut = Conv2D(filters=residual_shape[CHANNEL_AXIS],
                          kernel_size=(1, 1),
                          strides=(stride_width, stride_height),
                          padding="valid",
                          kernel_initializer="he_normal",
                          kernel_regularizer=l2(0.0001))(input)

    return add([shortcut, residual])


def _residual_block(block_function, filters, repetitions, is_first_layer=False):
    """Builds a residual block with repeating bottleneck blocks.
    """
    def f(input):
        for i in range(repetitions):
            init_strides = (1, 1)
            if i == 0 and not is_first_layer:
                init_strides = (2, 2)
            input = block_function(filters=filters, init_strides=init_strides,
                                   is_first_block_of_first_layer=(is_first_layer and i == 0))(input)
        return input

    return f


def basic_block(filters, init_strides=(1, 1), is_first_block_of_first_layer=False):
    """Basic 3 X 3 convolution blocks for use on resnets with layers <= 34.
    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
    """
    def f(input):

        if is_first_block_of_first_layer:
            # don't repeat bn->relu since we just did bn->relu->maxpool
            conv1 = Conv2D(filters=filters, kernel_size=(3, 3),
                           strides=init_strides,
                           padding="same",
                           kernel_initializer="he_normal",
                           kernel_regularizer=l2(1e-4))(input)
        else:
            conv1 = _bn_relu_conv(filters=filters, kernel_size=(3, 3),
                                  strides=init_strides)(input)

        residual = _bn_relu_conv(filters=filters, kernel_size=(3, 3))(conv1)
        return _shortcut(input, residual)

    return f


def bottleneck(filters, init_strides=(1, 1), is_first_block_of_first_layer=False):
    """Bottleneck architecture for > 34 layer resnet.
    Follows improved proposed scheme in http://arxiv.org/pdf/1603.05027v2.pdf
    Returns:
        A final conv layer of filters * 4
    """
    def f(input):

        if is_first_block_of_first_layer:
            # don't repeat bn->relu since we just did bn->relu->maxpool
            conv_1_1 = Conv2D(filters=filters, kernel_size=(1, 1),
                              strides=init_strides,
                              padding="same",
                              kernel_initializer="he_normal",
                              kernel_regularizer=l2(1e-4))(input)
        else:
            conv_1_1 = _bn_relu_conv(filters=filters, kernel_size=(1, 1),
                                     strides=init_strides)(input)

        conv_3_3 = _bn_relu_conv(filters=filters, kernel_size=(3, 3))(conv_1_1)
        residual = _bn_relu_conv(filters=filters * 4, kernel_size=(1, 1))(conv_3_3)
        return _shortcut(input, residual)

    return f


def _handle_dim_ordering():
    global ROW_AXIS
    global COL_AXIS
    global CHANNEL_AXIS
    if K.image_dim_ordering() == 'tf':
        ROW_AXIS = 1
        COL_AXIS = 2
        CHANNEL_AXIS = 3
    else:
        CHANNEL_AXIS = 1
        ROW_AXIS = 2
        COL_AXIS = 3


def _get_block(identifier):
    if isinstance(identifier, six.string_types):
        res = globals().get(identifier)
        if not res:
            raise ValueError('Invalid {}'.format(identifier))
        return res
    return identifier


class ResnetBuilder(object):
    @staticmethod
    def build(input_shape, num_outputs, block_fn, repetitions):
        """Builds a custom ResNet like architecture.
        Args:
            input_shape: The input shape in the form (nb_channels, nb_rows, nb_cols)
            num_outputs: The number of outputs at final softmax layer
            block_fn: The block function to use. This is either `basic_block` or `bottleneck`.
                The original paper used basic_block for layers < 50
            repetitions: Number of repetitions of various block units.
                At each block unit, the number of filters are doubled and the input size is halved
        Returns:
            The keras `Model`.
        """
        _handle_dim_ordering()
        if len(input_shape) != 3:
            raise Exception("Input shape should be a tuple (nb_channels, nb_rows, nb_cols)")

        # Permute dimension order if necessary
        if K.image_dim_ordering() == 'tf':
            input_shape = (input_shape[1], input_shape[2], input_shape[0])

        # Load function from str if needed.
        block_fn = _get_block(block_fn)

        input = Input(shape=input_shape)
        conv1 = _conv_bn_relu(filters=64, kernel_size=(7, 7), strides=(2, 2))(input)
        pool1 = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding="same")(conv1)

        block = pool1
        filters = 64
        for i, r in enumerate(repetitions):
            block = _residual_block(block_fn, filters=filters, repetitions=r, is_first_layer=(i == 0))(block)
            filters *= 2

        # Last activation
        block = _bn_relu(block)

        # Classifier block
        block_shape = K.int_shape(block)
        pool2 = AveragePooling2D(pool_size=(block_shape[ROW_AXIS], block_shape[COL_AXIS]),
                                 strides=(1, 1))(block)
        flatten1 = Flatten()(pool2)
        dense = Dense(units=num_outputs, kernel_initializer="he_normal",
                      activation="softmax")(flatten1)

        model = Model(inputs=input, outputs=dense)
        return model

    @staticmethod
    def build_resnet_18(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, basic_block, [2, 2, 2, 2])

    @staticmethod
    def build_resnet_34(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, basic_block, [3, 4, 6, 3])

    @staticmethod
    def build_resnet_50(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 4, 6, 3])

    @staticmethod
    def build_resnet_101(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 4, 23, 3])

    @staticmethod
    def build_resnet_152(input_shape, num_outputs):
        return ResnetBuilder.build(input_shape, num_outputs, bottleneck, [3, 8, 36, 3])

In [92]:
model = ResnetBuilder.build_resnet_18((1,99,161), (12))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [93]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 99, 161, 1)   0                                            
__________________________________________________________________________________________________
conv2d_61 (Conv2D)              (None, 50, 81, 64)   3200        input_4[0][0]                    
__________________________________________________________________________________________________
batch_normalization_52 (BatchNo (None, 50, 81, 64)   256         conv2d_61[0][0]                  
__________________________________________________________________________________________________
activation_52 (Activation)      (None, 50, 81, 64)   0           batch_normalization_52[0][0]     
__________________________________________________________________________________________________
max_poolin

In [94]:
from keras.utils import to_categorical

train_data_index = data_index['training']
batch_size=32

def train_generator(batch_size=batch_size):
    
    while 1:

        X_batches = []
        Y_batches = []

        for _ in range(batch_size):
            i = np.random.randint(0, len(train_data_index))
            fname = train_data_index[i]['file']
            label = train_data_index[i]['label']
            X_input, Y_input = preprocess(fname, label)
            X_input = X_input[:,:,np.newaxis]
            X_batches.append(X_input)
            Y_batches.append(Y_input)

        yield np.array(X_batches), to_categorical(np.array(Y_batches).reshape(batch_size, 1), num_classes=label_count)

In [95]:
val_itr = 0
val_data_index = data_index['validation']

def val_generator(batch_size=batch_size):
    
    while 1:
        global val_itr
        if val_itr + batch_size >= len(val_data_index):
            val_itr = 0

        X_batches = []
        Y_batches = []

        for i in range(val_itr, val_itr+batch_size):
            fname = val_data_index[i]['file']
            label = val_data_index[i]['label']
            X_input, Y_input = preprocess(fname, label, mode='val')
            X_input = X_input[:,:,np.newaxis]
            X_batches.append(X_input)
            Y_batches.append(Y_input)
            
        val_itr = val_itr + batch_size

        yield np.array(X_batches), to_categorical(np.array(Y_batches).reshape(batch_size, 1), num_classes=label_count)

In [None]:
model.fit_generator(generator = train_generator(), 
                    steps_per_epoch = 200,
                    validation_data = val_generator(),
                    validation_steps = len(val_data_index)//batch_size,
                    epochs=15)

Epoch 1/15