In [1]:
import os
import re
from glob import glob
import pandas as pd
import numpy as np
from IPython import get_ipython
import matplotlib.pyplot as plt

In [2]:
# Target classes; a label's position in this list is its integer class id.
POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
# Forward (id -> name) and reverse (name -> id) lookup tables.
id2name = dict(enumerate(POSSIBLE_LABELS))
name2id = {label: idx for idx, label in enumerate(POSSIBLE_LABELS)}
len(id2name)

12

In [3]:
def load_data(data_dir):
    """Index the audio files under ``data_dir`` and split them into train/validation.

    Every file matching ``<data_dir>/train/audio/*/*wav`` is parsed as
    ``<label folder>/<user id>_<rest>wav``. The ids listed (in the same
    ``folder/file`` form) in ``<data_dir>/validation_list.txt`` define the
    validation set: a file goes to validation when its user id appears there.

    Label handling:
      * files in a ``_silence_`` folder are skipped;
      * files in ``_background_noise_`` are labelled ``'silence'``;
      * any other folder name not in ``POSSIBLE_LABELS`` becomes ``'unknown'``.

    Parameters
    ----------
    data_dir : str
        Directory containing ``train/audio/`` and ``validation_list.txt``.

    Returns
    -------
    (train_df, valid_df) : tuple of pandas.DataFrame
        Both frames have the columns ``label``, ``label_id``, ``user_id`` and
        ``wav_file`` (the path exactly as returned by ``glob``).
    """
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences ("\/", "\w"), which newer Python versions warn about.
    pattern = re.compile(r"(.+/)?(\w+)/([^_]+)_.+wav")
    all_files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))

    # Collect the user ids that belong to the validation split.
    valset = set()
    with open(os.path.join(data_dir, 'validation_list.txt'), 'r') as fin:
        for line in fin:
            found = pattern.match(line)
            if found:
                valset.add(found.group(3))

    possible = set(POSSIBLE_LABELS)
    train, val = [], []
    for entry in all_files:
        # The pattern only understands '/' separators; normalise the platform
        # separator for matching so paths from glob also match on Windows.
        # The stored path stays untouched.
        found = pattern.match(entry.replace(os.sep, '/'))
        if not found:
            continue

        label, uid = found.group(2), found.group(3)
        if label == '_silence_':
            continue
        if label == '_background_noise_':
            label = 'silence'
        if label not in possible:
            label = 'unknown'

        sample = (label, name2id[label], uid, entry)
        if uid in valset:
            val.append(sample)
        else:
            train.append(sample)

    columns_list = ['label', 'label_id', 'user_id', 'wav_file']
    train_df = pd.DataFrame(train, columns = columns_list)
    valid_df = pd.DataFrame(val, columns = columns_list)
    return train_df, valid_df

In [4]:
# Build the file index, then move the background-noise ('silence') rows out of
# the training frame into their own frame.
train_df, valid_df = load_data('./data/')
# NOTE: the next two expressions are evaluated but not displayed, because only
# the last expression of a notebook cell is rendered.
train_df.head()
train_df.label.value_counts()
silence_files = train_df.loc[train_df['label'].eq('silence')]
train_df = train_df.loc[train_df['label'].ne('silence')]

from scipy.io import wavfile

def read_wav_file(fname):
    """Read a WAV file and return its samples as float32 scaled by the int16 maximum.

    The sample rate reported by the file is discarded. The scaling divides by
    ``np.iinfo(np.int16).max`` regardless of the file's actual sample dtype.
    """
    int16_peak = np.iinfo(np.int16).max
    _rate, samples = wavfile.read(fname)
    return samples.astype(np.float32) / int16_peak

# Join every background-noise recording into one long 1-D pool of samples;
# process_wav_file draws its padding from this pool.
silence_data = np.concatenate(list(map(read_wav_file, silence_files['wav_file'].values)))



In [10]:
from scipy.signal import stft

def process_wav_file(fname):
    """Load a clip, force it to 16000 samples and return its STFT features.

    Clips longer than 16000 samples are cropped at a random offset. Shorter
    clips are padded on both sides with a random stretch of the module-level
    ``silence_data``, with the split between front and back padding also random.

    Returns an array that stacks, along a new last axis, the STFT phase
    divided by pi and ``log1p`` of the STFT magnitude.
    """
    target_len = 16000
    samples = read_wav_file(fname)
    n_samples = len(samples)

    if n_samples > target_len:
        # The upper bound subtracts the target length (not the clip length).
        offset = np.random.randint(0, n_samples - target_len)
        samples = samples[offset: offset + target_len]
    elif n_samples < target_len:
        missing = target_len - n_samples
        # First draw where the padding noise starts, then how much of it is
        # placed in front of the clip; the remainder goes behind it.
        noise_start = np.random.randint(0, len(silence_data) - missing)
        n_front = np.random.randint(0, missing)
        front = silence_data[noise_start: noise_start + n_front]
        back = silence_data[noise_start + n_front: noise_start + missing]
        samples = np.concatenate([front, samples, back])

    # stft returns (frequencies, times, Zxx); only the complex spectrum is used.
    _, _, spectrum = stft(samples, 16000, nperseg = 400, noverlap = 240, nfft = 512, padded = False, boundary = None)
    phase = np.angle(spectrum) / np.pi
    amp = np.log1p(np.abs(spectrum))

    return np.stack([phase, amp], axis = 2)

In [11]:
import random
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D, GlobalMaxPool2D, concatenate, Dense, Dropout
from keras.optimizers import RMSprop
from keras.utils import np_utils

In [12]:
def train_generator(train_batch_size):
    """Yield ``(features, one_hot_labels)`` training batches forever.

    Each pass draws a fresh class-balanced subset of the module-level
    ``train_df`` (100 random rows per ``label_id``), visits those rows in a
    random order, and emits them in chunks of at most ``train_batch_size``.
    Features come from ``process_wav_file``; labels are one-hot encoded over
    ``len(POSSIBLE_LABELS)`` classes.
    """
    while True:
        balanced = train_df.groupby('label_id').apply(lambda grp: grp.sample(n = 100))
        n_rows = balanced.shape[0]
        order = random.sample(range(n_rows), n_rows)
        paths = balanced.wav_file.values
        labels = balanced.label_id.values
        for lo in range(0, n_rows, train_batch_size):
            # Slicing past the end simply yields a shorter final batch.
            picked = order[lo: lo + train_batch_size]
            features = [process_wav_file(paths[k]) for k in picked]
            targets = [labels[k] for k in picked]
            yield (np.array(features),
                   np_utils.to_categorical(targets, num_classes = len(POSSIBLE_LABELS)))
            
def valid_generator(val_batch_size):
    """Yield ``(features, one_hot_labels)`` validation batches forever.

    Walks the module-level ``valid_df`` in row order, in chunks of at most
    ``val_batch_size``, and starts over once the last row has been emitted.
    Features come from ``process_wav_file``; labels are one-hot encoded over
    ``len(POSSIBLE_LABELS)`` classes.
    """
    while True:
        n_rows = valid_df.shape[0]
        for lo in range(0, n_rows, val_batch_size):
            hi = min(lo + val_batch_size, n_rows)
            paths = valid_df.wav_file.values[lo: hi]
            labels = valid_df.label_id.values[lo: hi]
            features = [process_wav_file(path) for path in paths]
            targets = list(labels)
            yield (np.array(features),
                   np_utils.to_categorical(targets, num_classes = len(POSSIBLE_LABELS)))

In [13]:
# CNN classifier over the (257, 98, 2) feature tensor: the two channels are the
# STFT phase and log-amplitude stacked by process_wav_file.
x_in = Input(shape = (257, 98, 2))
x = BatchNormalization()(x_in)
# Five conv stages; the filter count doubles each stage (16, 32, 64, 128, 256)
# and each 2x2 max-pool halves both spatial dimensions.
for i in range(5):
    x = Conv2D(16*(2 ** i), (3,3))(x)
    x = Activation('elu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2,2))(x)
# 1x1 convolution reduces the channels to 64 before global pooling.
x = Conv2D(64, (1,1))(x)
# Summarise the feature maps with both average and max pooling, then join them.
x_branch_1 = GlobalAveragePooling2D()(x)
x_branch_2 = GlobalMaxPool2D()(x)
x = concatenate([x_branch_1, x_branch_2])
x = Dense(64, activation = 'relu')(x)
x = Dropout(0.5)(x)
# Softmax rather than sigmoid: each clip carries exactly one of the labels and
# the loss is categorical cross-entropy, which expects the outputs to form a
# probability distribution over the classes.
x = Dense(len(POSSIBLE_LABELS), activation = 'softmax')(x)
model = Model(inputs = x_in, outputs = x)
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 257, 98, 2)   0                                            
__________________________________________________________________________________________________
batch_normalization_7 (BatchNor (None, 257, 98, 2)   8           input_2[0][0]                    
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 255, 96, 16)  304         batch_normalization_7[0][0]      
__________________________________________________________________________________________________
activation_6 (Activation)       (None, 255, 96, 16)  0           conv2d_7[0][0]                   
__________________________________________________________________________________________________
batch_norm

In [15]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
import h5py as h5py
# The checkpoint callback below writes to weights/starter.hdf5. Create the
# directory up front so a missing folder cannot abort training at the first
# checkpoint.
os.makedirs('weights', exist_ok = True)

# Training callbacks, all driven by the validation loss:
#   * EarlyStopping     - stop after 5 epochs without a 0.01 improvement.
#   * ReduceLROnPlateau - scale the learning rate by 0.1 after 3 stalled epochs.
#   * ModelCheckpoint   - keep only the best weights seen so far.
#   * TensorBoard       - write training logs to ./logs.
callbacks = [EarlyStopping(monitor = 'val_loss', 
                          patience = 5, 
                          verbose = 1,
                          min_delta = 0.01, 
                          mode = 'min'), 
            ReduceLROnPlateau(monitor = 'val_loss', 
                          factor = 0.1, 
                          patience = 3, 
                          verbose = 1, 
                          epsilon = 0.01,  
                          mode = 'min'), 
            ModelCheckpoint(monitor = 'val_loss', 
                          filepath = 'weights/starter.hdf5', 
                          save_best_only = True, 
                          save_weights_only = True, 
                          mode = 'min'),
            TensorBoard(log_dir='./logs', # suggested by a teammate as a way to record training logs
                          histogram_freq=0, 
                          batch_size=32, 
                          write_graph=True, 
                          write_grads=False, 
                          write_images=False, 
                          embeddings_freq=0, 
                          embeddings_layer_names=None, 
                          embeddings_metadata=None)]
# TQDMNotebookCallback()

# Training schedule: upper bound on epochs and generator batches per epoch.
epochSize = 20 #50
steps_per_epoch = 800 #344
# Both generators use batches of 64. validation_steps is the ceiling of
# (validation rows / 64), so one validation pass covers every row once.
history = model.fit_generator(
    train_generator(64),
    steps_per_epoch = steps_per_epoch,
    epochs = epochSize,
    validation_data = valid_generator(64),
    validation_steps = -(-valid_df.shape[0] // 64),
    callbacks = callbacks,
    verbose = 1,
)

Epoch 1/20



Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: reducing learning rate to 0.00010000000474974513.
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 00015: reducing learning rate to 1.0000000474974514e-05.
Epoch 16/20
Epoch 00016: early stopping


In [16]:
import datetime
#model.load_weights('./weights/starter.hdf5')
# Timestamp of the save; it is not used in the lines shown here.
now = datetime.datetime.now()
# File name records the training schedule: <epochs>_<steps per epoch>.h5
saveFileName = str(epochSize)+'_'+str(steps_per_epoch)+'.h5' 
# Make sure the output directory exists so the save cannot fail on a missing
# folder after the whole training run has finished.
os.makedirs('model', exist_ok = True)
model.save('model/'+saveFileName)