# Tensorflow Speech Recognition

In [3]:
import os
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile

## 0. import trainning sets

In [1]:
filepath = '/media/share/data/kaggle/tensorflow-speech/'
train_path = filepath + 'train/audio/'
test_path = filepath + 'test/audio/'

target_labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']

In [4]:
import fnmatch

train_df = pd.DataFrame([], columns=['fname', 'label'])
labels = os.listdir(train_path)

for i, label in enumerate(labels):
    labelpath = train_path + label
    filelist = fnmatch.filter(os.listdir(labelpath), '*.wav')
    filelist = pd.DataFrame(filelist, columns=['fname'])
    if label == '_background_noise_':
        filelist['label'] = 'silence'
    elif label not in target_labels:
        filelist['label'] = 'unknown'
    else:
        filelist['label'] = label
    filelist['path'] = label
    
    train_df = pd.concat([train_df, filelist], 0)
    
# random order
train_df = train_df.sample(frac=1).reset_index(drop='index')

In [4]:
def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    # FFT is simmetrical, so we take just the first half
    # FFT is also complex, to we take just the real part (abs)
    vals = 2.0/N * np.abs(yf[0:N//2])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

### plot audio stimulus and spectrogram

In [5]:
def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

In [6]:
from sklearn.preprocessing import StandardScaler

train_cati = os.listdir(train_path)
new_sample_rate = 8000

train_df = pd.DataFrame([], columns=['fname', 'label'])

for i, comlabel in enumerate(train_cati):
    audiopath = train_path + train_cati[i] + '/'
    os.chdir(audiopath)
    filelist = !(ls -1 *.wav)
    
    for j in filelist:
        sample_rate, samples = wavfile.read(os.path.join(audiopath, j))
        
        if len(samples) > 16000:
            n_samples = chop_audio(samples)
        else:
            samples = np.pad(samples, (0, 16000-len(samples)), 'constant')
            n_samples = [samples]
            
        for samples in n_samples:
            resampled = signal.resample(samples, int(new_sample_rate/sample_rate * samples.shape[0]))
            freqs, times, spectrogram = log_specgram(resampled, new_sample_rate)
            norm_spect = StandardScaler().fit_transform(spectrogram)

            X1.append(norm_spect)
        
            if train_cati[i] == '_background_noise_':
                y1.append('silence')
            elif train_cati[i] not in target_labels:
                y1.append('unknown')
            else:
                y1.append(train_cati[i])



In [7]:
X1 = np.array(X1)
X1 = X1.reshape(tuple(list(X1.shape) + [1]))
y1 = pd.get_dummies(y1)

In [None]:
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
# ax1.set_title('Raw wave of ' + file01)
ax1.set_ylabel('Amplitude')
ax1.plot(np.linspace(0, sample_rate/len(samples), sample_rate), samples)

ax2 = fig.add_subplot(212)
ax2.imshow(spectrogram.T, aspect='auto', origin='lower', 
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
ax2.set_yticks(freqs[::16])
ax2.set_xticks(times[::16])
# ax2.set_title('Spectrogram of ' + file01)
ax2.set_ylabel('Freqs in Hz')
ax2.set_xlabel('Seconds')

# 1.Model

In [8]:
from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Flatten, Dense, GlobalAveragePooling2D, GlobalMaxPooling2D, Activation
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.layers.merge import Concatenate, Add, concatenate
from keras.callbacks import ModelCheckpoint
from keras.utils import multi_gpu_model


input_shape = (99, 81, 1)
nclass = 12
inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)
img_1 = Conv2D(8, kernel_size=2, activation='relu')(norm_inp)
img_1 = Conv2D(16, kernel_size=2, activation='relu')(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Conv2D(16, kernel_size=3, activation='relu')(img_1)
img_1 = Conv2D(32, kernel_size=3, activation='relu')(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Conv2D(32, kernel_size=3, activation='relu')(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

dense_1 = BatchNormalization()(Dense(128, activation='relu')(img_1))
dense_1 = BatchNormalization()(Dense(128, activation='relu')(dense_1))
dense_1 = BatchNormalization()(Dense(128, activation='relu')(dense_1))
dense_1 = Dense(nclass, activation='softmax')(dense_1)

model = Model(inputs=inp, outputs=dense_1)

multi_gpu = multi_gpu_model(model, gpus=8)

multi_gpu.summary()

Using TensorFlow backend.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 99, 81, 1)    0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 99, 81, 1)    0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 99, 81, 1)    0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None, 99, 81, 1)    0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_4 (

In [11]:
multi_gpu.compile(optimizer=Adam(lr=0.002), loss='binary_crossentropy', metrics=['accuracy'])
os.chdir('/media/share/jiaxin_cmu/kaggle/TF_speech/')

In [12]:
from keras.callbacks import Callback, ReduceLROnPlateau

model_checkpoint = ModelCheckpoint('TF_speech_v2-{epoch:02d}-{val_loss:.4f}.hdf5',
                                   monitor='val_loss', save_best_only=True, save_weights_only=True)
adlr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=0, 
                         mode='auto', epsilon=0.0001, cooldown=0, min_lr=1e-6)

x_train, x_valid, y_train, y_valid = train_test_split(X1, y1, test_size=0.1, random_state=np.random)

multi_gpu.fit(x_train, y_train, 
              batch_size=512, 
              validation_data=(x_valid, y_valid), 
              epochs=64, 
              shuffle=True, 
              verbose=1, 
              callbacks=[model_checkpoint, adlr])

Train on 58356 samples, validate on 6485 samples
Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<keras.callbacks.History at 0x7f141676ad30>

In [13]:
# model.load_weights('statoilv5-52-0.1685.hdf5')
multi_gpu.evaluate(x_valid, y_valid)



[0.0082402599811569749, 0.99728860494809235]

# Testing

In [45]:
submpath = '/media/share/jiaxin_cmu/kaggle/TF_speech/'
subm_df = pd.read_csv(filepath + 'sample_submission.csv')

new_sample_rate = 8000
test_X = []

for j, fname in enumerate(subm_df['fname']):
    sample_rate, samples = wavfile.read(os.path.join(test_path, fname))

    if len(samples) > 16000:
        samples = samples[100:16100]
    else:
        samples = np.pad(samples, (0, 16000-len(samples)), 'constant')

    resampled = signal.resample(samples, int(new_sample_rate/sample_rate * samples.shape[0]))
    freqs, times, spectrogram = log_specgram(resampled, new_sample_rate)
    norm_spect = StandardScaler().fit_transform(spectrogram)

    test_X.append(norm_spect)

In [46]:
test_X = np.array(test_X)
test_X = test_X.reshape(tuple(list(test_X.shape) + [1]))

In [47]:
pred_y = multi_gpu.predict(test_X)

In [49]:
predicts = np.argmax(pred_y, axis=1)
predicts = [target_labels[p] for p in predicts]
predicts

['down',
 'go',
 'go',
 'go',
 'go',
 'go',
 'right',
 'go',
 'no',
 'go',
 'go',
 'go',
 'off',
 'unknown',
 'no',
 'stop',
 'go',
 'yes',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'right',
 'right',
 'yes',
 'go',
 'go',
 'go',
 'stop',
 'yes',
 'go',
 'no',
 'unknown',
 'go',
 'go',
 'right',
 'go',
 'go',
 'go',
 'go',
 'go',
 'unknown',
 'down',
 'down',
 'go',
 'go',
 'go',
 'silence',
 'no',
 'go',
 'down',
 'go',
 'go',
 'go',
 'off',
 'go',
 'stop',
 'go',
 'silence',
 'go',
 'right',
 'unknown',
 'up',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'off',
 'go',
 'down',
 'right',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'yes',
 'unknown',
 'left',
 'off',
 'right',
 'up',
 'go',
 'go',
 'left',
 'go',
 'no',
 'go',
 'go',
 'go',
 'left',
 'go',
 'go',
 'yes',
 'no',
 'off',
 'go',
 'go',
 'go',
 'go',
 'go',
 'yes',
 'off',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'go',
 'off',
 'go',
 'silence',
 'go',
 

# submission

In [50]:
subm_df['label'] = predicts

In [51]:
subm_df.head()

Unnamed: 0,fname,label
0,clip_000044442.wav,down
1,clip_0000adecb.wav,go
2,clip_0000d4322.wav,go
3,clip_0000fb6fe.wav,go
4,clip_0001d1559.wav,go


In [36]:
subm_df.head()

Unnamed: 0,fname,label
0,clip_0a0a9fa8e.wav,go
1,clip_0a0a60a16.wav,on
2,clip_0a0a99fbe.wav,no
3,clip_0a0aa5a41.wav,go
4,clip_0a0aa67a9.wav,go


In [52]:
subm_df.to_csv(submpath + 'submission_02.csv', index=False)