# Tensorflow Speech Recognition

In [3]:
import os
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
# import seaborn as sns
%matplotlib inline

from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa

## 0. import trainning sets

In [4]:
filepath = '/media/share/data/kaggle/tensorflow-speech/'
train_path = filepath + 'train/audio/'
test_path = filepath + 'test/audio/'

target_labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']

In [5]:
# make training list
import fnmatch

train_df = pd.DataFrame([], columns=['fname', 'label'])
labels = os.listdir(train_path)

for i, label in enumerate(labels):
    labelpath = train_path + label
    filelist = fnmatch.filter(os.listdir(labelpath), '*.wav')
    filelist = pd.DataFrame(filelist, columns=['fname'])
    if label == '_background_noise_':
        filelist['label'] = 'silence'
    elif label not in target_labels:
        filelist['label'] = 'unknown'
    else:
        filelist['label'] = label
    filelist['path'] = label
    
    train_df = pd.concat([train_df, filelist], 0)
    
# random order
train_df = train_df.sample(frac=1).reset_index(drop='index')

## 1. preprocess

In [6]:
def log_specgram(audio, sample_rate, window_size=25,
                 step_size=15, eps=1e-8):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)


def bg_generator():
    bg_path = train_path + '_background_noise_'
    bg_list = fnmatch.filter(os.listdir(bg_path), '*.wav')
    # major bg sound
    fname1 = bg_list[np.random.randint(0, 6)]
    sample_rate1, samples1 = wavfile.read(os.path.join(bg_path, fname1))
    s1 = np.random.randint(0, len(samples1) - 16000)
    bg_clip1 = samples1[s1 : (s1 + 16000)]
    # miner bg sound
    fname2 = bg_list[np.random.randint(0, 6)]
    sample_rate2, samples2 = wavfile.read(os.path.join(bg_path, fname2))
    s2 = np.random.randint(0, len(samples2) - 16000)
    bg_clip2 = samples2[s2 : (s2 + 16000)]
    
    prop = 0.01*(np.random.randint(10, 50))
    new_bg_clip = bg_clip1*(1-prop) + bg_clip2*prop
    
    return new_bg_clip

def time_stretch_aug(audio):
    time_aug = librosa.effects.time_stretch(np.float64(audio), 0.1*(np.random.randint(5, 16)))
    if len(time_aug) > 16000:
        rand_tpoint = np.random.randint(0, len(time_aug) - 16000)
        time_aug = time_aug[rand_tpoint : (rand_tpoint + 16000)]
    else:
        randdd = np.random.randint(0, 2)
        if randdd != 0:
            time_aug = np.pad(time_aug, (0, 16000-len(time_aug)), 'constant')
        else:
            time_aug = np.pad(time_aug, (16000-len(time_aug), 0), 'constant')
    return time_aug

def audio_augment(audio):
    import librosa
#     npdata = np.array(audio).reshape(len(audio))
    seq_type = np.random.randint(0, 2)
    
    if seq_type != 0:
        augment = time_stretch_aug(audio)
        augment = librosa.effects.pitch_shift(np.float64(augment), 16000, np.random.randint(5, 15))
    else:
        augment = librosa.effects.pitch_shift(np.float64(audio), 16000, np.random.randint(5, 15))
        augment = time_stretch_aug(augment)
        
    return augment


def bg_noise(audio):
    npdata = np.array(audio).reshape(len(audio))

    prop = 0.01*(np.random.randint(1, 21))
    augtype = np.random.randint(0, 3)
    
    if augtype != 0:
        new_clip = audio_augment(npdata) + bg_generator()*prop
    else:
        new_clip = npdata + bg_generator()*prop
    
    return new_clip


## preparing trainning data

In [7]:
import re
import hashlib
from tensorflow.python.util import compat
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M

def which_set(filename, validation_percentage):
    """Determines which data partition the file should belong to.

    We want to keep files in the same training, validation, or testing sets even
    if new ones are added over time. This makes it less likely that testing
    samples will accidentally be reused in training when long runs are restarted
    for example. To keep this stability, a hash of the filename is taken and used
    to determine which set it should belong to. This determination only depends on
    the name and the set proportions, so it won't change as other files are added.

    It's also useful to associate particular files as related (for example words
    spoken by the same person), so anything after '_nohash_' in a filename is
    ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
    'bobby_nohash_1.wav' are always in the same set, for example.

    Args:
    filename: File path of the data sample.
    validation_percentage: How much of the data set to use for validation.
    testing_percentage: How much of the data set to use for testing.

    Returns:
    String, one of 'training', 'validation', or 'testing'.
    """
    base_name = os.path.basename(filename)
    # We want to ignore anything after '_nohash_' in the file name when
    # deciding which set to put a wav in, so the data set creator has a way of
    # grouping wavs that are close variations of each other.
    hash_name = re.sub(r'_nohash_.*$', '', base_name)
    # This looks a bit magical, but we need to decide whether this file should
    # go into the training, testing, or validation sets, and we want to keep
    # existing files in the same set even if more files are subsequently
    # added.
    # To do that, we need a stable way of deciding based on just the file name
    # itself, so we do a hash of that and then use that to generate a
    # probability value that we use to assign it.
    hash_name_hashed = hashlib.sha1(compat.as_bytes(hash_name)).hexdigest()
    percentage_hash = ((int(hash_name_hashed, 16) %
                      (MAX_NUM_WAVS_PER_CLASS + 1)) *
                     (100.0 / MAX_NUM_WAVS_PER_CLASS))
    if percentage_hash < validation_percentage:
        result = 'validation'
    else:
        result = 'training'
    return result

In [21]:
train_idx = []
val_idx = []

for ii in range(6):
    for i in range(len(train_df)):
        x = which_set(train_df['fname'].iloc[i],10)
        if x == 'training':
            train_idx.append(i)
        elif x == 'validation':
            val_idx.append(i)

train_size = len(train_idx)
val_size = len(val_idx)

print('Train size: {}'.format(train_size))
print('Validation size: {}'.format(val_size))

Train size: 347574
Validation size: 40788


In [9]:
new_sample_rate = 8000

X1 = []
y1 = []

for size in range(5):
    for j, fname in enumerate(train_df['fname']):
        sample_rate, samples = wavfile.read(os.path.join(train_path+train_df['path'][j], fname))

        # augmentation
        if len(samples) > 16000:
            new_clip = np.random.randint(0, len(samples) - 16000)
            samples = samples[new_clip : (new_clip + 16000)]
        else:
            randdd = np.random.randint(0, 2)
            if randdd != 0:
                samples = np.pad(samples, (0, 16000-len(samples)), 'constant')
            else:
                samples = np.pad(samples, (16000-len(samples), 0), 'constant')
                
        n_type = np.random.randint(0, 6)
        if n_type != 0:
            n_samples = bg_noise(samples)
        else:
            n_samples = samples

        norm_samples = (n_samples - np.mean(n_samples)) / np.std(n_samples)
        resampled = signal.resample(norm_samples, int(new_sample_rate/sample_rate * norm_samples.shape[0]))
        freqs, times, spectrogram = log_specgram(resampled, new_sample_rate)

        X1.append(spectrogram)
        y1.append(train_df['label'][j])



In [44]:
# add slience condition
sX1 = []
sy1 = []

new_sample_rate = 8000
sample_rate = 16000

from sklearn.preprocessing import StandardScaler

for i in range(64727):
    
    samples = bg_generator()
    
    norm_samples = (samples - np.mean(samples)) / np.std(samples)
    resampled = signal.resample(norm_samples, int(new_sample_rate/sample_rate * norm_samples.shape[0]))
    freqs, times, spectrogram = log_specgram(resampled, new_sample_rate)
#     norm_spect = StandardScaler().fit_transform(spectrogram)
    
    sX1.append(spectrogram)
    sy1.append('silence')



In [45]:
X4 = np.concatenate((X1, sX1), 0)
X4 = X4.reshape(tuple(list(X4.shape) + [1]))

y4 = np.concatenate((y1, sy1), 0)
y4 = pd.DataFrame(y4)
y4 = pd.get_dummies(y4)

In [46]:
y4

Unnamed: 0,0_down,0_go,0_left,0_no,0_off,0_on,0_right,0_silence,0_stop,0_unknown,0_up,0_yes
0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1,0,0
7,0,0,1,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,1


# 1.Model

In [12]:
from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Flatten, Dense, GlobalAveragePooling2D, GlobalMaxPooling2D, Activation
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adadelta
from keras.layers.merge import Concatenate, Add, concatenate
from keras.utils import multi_gpu_model
import keras.backend as K
K.clear_session()

input_shape = (98, 101, 1)
nclass = 12

inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)
img_1 = Conv2D(8, kernel_size=(2, 2), activation='relu', padding='same')(norm_inp)
img_1 = Conv2D(16, kernel_size=(2, 2), activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

dense_1 = Dense(128, activation='relu')(BatchNormalization()(img_1))
dense_1 = Dense(128, activation='relu')(BatchNormalization()(dense_1))
dense_1 = Dense(nclass, activation='softmax')(dense_1)

model = Model(inputs=inp, outputs=dense_1)

# multi_gpu = multi_gpu_model(model, gpus=8)

model.summary()

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 98, 101, 1)        0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 98, 101, 1)        4         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 101, 8)        40        
_________________________________________________________________
batch_normalization_2 (Batch (None, 98, 101, 8)        32        
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 98, 101, 16)       528       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 49, 50, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 49, 50, 16)        0         
__________

In [13]:
model.compile(optimizer=Adadelta(lr=0.02), loss='categorical_crossentropy', metrics=['accuracy'])
os.chdir('/media/share/jiaxin_cmu/kaggle/TF_speech/')
model.load_weights('TF_speech_v3-13-0.2878.hdf5')

In [14]:
from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau
import datetime

model_checkpoint = ModelCheckpoint('TF_speech_v5-{epoch:02d}-{val_loss:.4f}.hdf5',
                                   monitor='val_loss', save_best_only=True, save_weights_only=True)
adlr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=0, 
                         mode='auto', epsilon=0.0001, cooldown=0, min_lr=1e-6)

# x_train, x_valid, y_train, y_valid = train_test_split(X2, y2.as_matrix(), test_size=0.1, random_state=np.random)

In [47]:
x_train = X4[train_idx]
y_train = y4.iloc[train_idx]
x_valid = X4[val_idx]
y_valid = y4.iloc[val_idx]

In [48]:
x_train.shape

(347574, 98, 101, 1)

In [49]:
# model.load_weights('TF_speech_v3-01-0.2875.hdf5')
train_history = model.fit(x_train, y_train.as_matrix(), 
                          batch_size=1024, 
                          validation_data=(x_valid, y_valid.as_matrix()), 
                          epochs=64, 
                          shuffle=True, 
                          verbose=2, 
                          callbacks=[model_checkpoint, adlr])

Train on 347574 samples, validate on 40788 samples
Epoch 1/64
 - 92s - loss: 0.3605 - acc: 0.8845 - val_loss: 0.6404 - val_acc: 0.8033
Epoch 2/64
 - 87s - loss: 0.3594 - acc: 0.8847 - val_loss: 0.6404 - val_acc: 0.8030
Epoch 3/64
 - 87s - loss: 0.3592 - acc: 0.8852 - val_loss: 0.6405 - val_acc: 0.8035
Epoch 4/64
 - 87s - loss: 0.3596 - acc: 0.8853 - val_loss: 0.6406 - val_acc: 0.8024
Epoch 5/64
 - 87s - loss: 0.3587 - acc: 0.8851 - val_loss: 0.6404 - val_acc: 0.8039
Epoch 6/64
 - 87s - loss: 0.3588 - acc: 0.8854 - val_loss: 0.6400 - val_acc: 0.8032
Epoch 7/64
 - 87s - loss: 0.3596 - acc: 0.8851 - val_loss: 0.6406 - val_acc: 0.8035
Epoch 8/64
 - 87s - loss: 0.3594 - acc: 0.8853 - val_loss: 0.6404 - val_acc: 0.8032
Epoch 9/64
 - 87s - loss: 0.3598 - acc: 0.8851 - val_loss: 0.6404 - val_acc: 0.8027
Epoch 10/64
 - 87s - loss: 0.3589 - acc: 0.8849 - val_loss: 0.6405 - val_acc: 0.8029
Epoch 11/64
 - 87s - loss: 0.3593 - acc: 0.8851 - val_loss: 0.6406 - val_acc: 0.8032
Epoch 12/64
 - 87s - lo

In [50]:
# model.load_weights('statoilv5-52-0.1685.hdf5')
model.evaluate(x_valid, y_valid.as_matrix())



[0.64037921673715892, 0.80288320095314403]

# Testing

In [51]:
submpath = '/media/share/jiaxin_cmu/kaggle/TF_speech/'
subm_df = pd.read_csv(filepath + 'sample_submission.csv')

new_sample_rate = 8000
test_X = []

for j, fname in enumerate(subm_df['fname']):
    sample_rate, samples = wavfile.read(os.path.join(test_path, fname))

    if len(samples) > 16000:
        new_clip = np.random.randint(0, len(samples) - 16000)
        samples = samples[new_clip : (new_clip + 16000)]
    else:
        samples = np.pad(samples, (0, 16000-len(samples)), 'constant')
    
    norm_samples = (samples - np.mean(samples)) / np.std(samples)
    resampled = signal.resample(norm_samples, int(new_sample_rate/sample_rate * norm_samples.shape[0]))
    freqs, times, spectrogram = log_specgram(resampled, new_sample_rate)
#     norm_spect = StandardScaler().fit_transform(spectrogram)

    test_X.append(spectrogram)

  app.launch_new_instance()


In [31]:
len(subm_df)

158538

In [32]:
test_X = np.array(test_X)
test_X = test_X.reshape(tuple(list(test_X.shape) + [1]))

In [33]:
pred_y = model.predict(test_X)

In [34]:
labels_1 = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'silence', 'stop', 'unknown', 'up', 'yes']
# 0_down 	0_go 	0_left 	0_no 	0_off 	0_on 	0_right 	0_silence 	0_stop 	0_unknown 	0_up 	0_yes

In [35]:
predicts = np.argmax(pred_y, axis=1)
predicts = [labels_1[p] for p in predicts]
predicts

['down',
 'down',
 'stop',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'stop',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 'down',
 

# submission

In [36]:
subm_df['label'] = predicts

In [39]:
subm_df.head()

Unnamed: 0,fname,label
0,clip_000044442.wav,down
1,clip_0000adecb.wav,down
2,clip_0000d4322.wav,stop
3,clip_0000fb6fe.wav,down
4,clip_0001d1559.wav,down


In [None]:
subm_df.head()

In [40]:
subm_df.to_csv(submpath + 'submission_08.csv', index=False)