# Tensorflow Speech Recognition

In [1]:
import os
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile

## 0. import trainning sets

In [2]:
filepath = '/media/share/data/kaggle/tensorflow-speech/'
train_path = filepath + 'train/audio/'
test_path = filepath + 'test/audio/'

target_labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']

In [3]:
# make training list
import fnmatch

train_df = pd.DataFrame([], columns=['fname', 'label'])
labels = os.listdir(train_path)

for i, label in enumerate(labels):
    labelpath = train_path + label
    filelist = fnmatch.filter(os.listdir(labelpath), '*.wav')
    filelist = pd.DataFrame(filelist, columns=['fname'])
    if label == '_background_noise_':
        filelist['label'] = 'silence'
    elif label not in target_labels:
        filelist['label'] = 'unknown'
    else:
        filelist['label'] = label
    filelist['path'] = label
    
    train_df = pd.concat([train_df, filelist], 0)
    
# random order
train_df = train_df.sample(frac=1).reset_index(drop='index')

## 1. preprocess

In [4]:
def log_specgram(audio, sample_rate, window_size=25,
                 step_size=15, eps=1e-8):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [5]:
from sklearn.preprocessing import StandardScaler

new_sample_rate = 8000

X1 = []
y1 = []

for j, fname in enumerate(train_df['fname']):
    sample_rate, samples = wavfile.read(os.path.join(train_path+train_df['path'][j], fname))

    if len(samples) > 16000:
        new_clip = np.random.randint(100, len(samples) - 16000)
        samples = samples[new_clip : (new_clip + 16000)]
    else:
        samples = np.pad(samples, (0, 16000-len(samples)), 'constant')

    resampled = signal.resample(samples, int(new_sample_rate/sample_rate * samples.shape[0]))
    freqs, times, spectrogram = log_specgram(resampled, new_sample_rate)
    norm_spect = StandardScaler().fit_transform(spectrogram)

    X1.append(norm_spect)
    y1.append(train_df['label'][j])



In [6]:
X1 = np.array(X1)
X1 = X1.reshape(tuple(list(X1.shape) + [1]))
y1 = pd.get_dummies(y1)

In [7]:
X1[1].shape

(98, 101, 1)

# 1.Model

In [9]:
from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Flatten, Dense, GlobalAveragePooling2D, GlobalMaxPooling2D, Activation
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.layers.merge import Concatenate, Add, concatenate
from keras.callbacks import ModelCheckpoint
from keras.utils import multi_gpu_model


input_shape = (98, 101, 1)
nclass = 12

kernel_size = (3, 3)

inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)
img_1 = Conv2D(16, kernel_size, activation='relu', padding='same')(norm_inp)
img_1 = Conv2D(32, kernel_size, activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.1)(img_1)
img_1 = Conv2D(32, kernel_size, activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = Conv2D(64, kernel_size, activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.15)(img_1)
img_1 = Conv2D(64, kernel_size, activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = Conv2D(128, kernel_size, activation='relu', padding='same')(BatchNormalization()(img_1))
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

dense_1 = Dense(256, activation='relu')(BatchNormalization()(img_1))
dense_1 = Dense(128, activation='relu')(BatchNormalization()(dense_1))
dense_1 = Dense(64, activation='relu')(BatchNormalization()(dense_1))
dense_1 = Dense(nclass, activation='softmax')(dense_1)

model = Model(inputs=inp, outputs=dense_1)

multi_gpu = multi_gpu_model(model, gpus=8)

multi_gpu.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 98, 101, 1)   0                                            
__________________________________________________________________________________________________
lambda_9 (Lambda)               (None, 98, 101, 1)   0           input_2[0][0]                    
__________________________________________________________________________________________________
lambda_10 (Lambda)              (None, 98, 101, 1)   0           input_2[0][0]                    
__________________________________________________________________________________________________
lambda_11 (Lambda)              (None, 98, 101, 1)   0           input_2[0][0]                    
__________________________________________________________________________________________________
lambda_12 

In [10]:
multi_gpu.compile(optimizer=Adam(lr=0.002), loss='binary_crossentropy', metrics=['accuracy'])
os.chdir('/media/share/jiaxin_cmu/kaggle/TF_speech/')

In [12]:
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, ReduceLROnPlateau
import datetime

model_checkpoint = ModelCheckpoint('TF_speech_v2-{epoch:02d}-{val_loss:.4f}.hdf5',
                                   monitor='val_loss', save_best_only=True, save_weights_only=True)

adlr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=0, 
                         mode='auto', epsilon=0.0001, cooldown=0, min_lr=1e-6)

train_datagen = ImageDataGenerator(width_shift_range = 0.30, 
                                   height_shift_range = 0.30, 
                                   zoom_range = 0.15)

test_datagen = ImageDataGenerator(width_shift_range = 0.05)

x_train, x_valid, y_train, y_valid = train_test_split(X1, y1, test_size=0.1, random_state=np.random)

train_datagen.fit(x_train)
test_datagen.fit(x_valid)

In [16]:
batch_size = 1024

train_history = multi_gpu.fit_generator(train_datagen.flow(x_train, y_train, batch_size), 
                                        epochs=2**6, steps_per_epoch=(len(y_train)//batch_size), 
                                        validation_data=test_datagen.flow(x_valid, y_valid, batch_size), 
                                        validation_steps=(len(x_valid)//batch_size),
                                        verbose=1, callbacks=[model_checkpoint, adlr])

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


In [18]:
# model.load_weights('statoilv5-52-0.1685.hdf5')
multi_gpu.evaluate(x_valid, y_valid)



[0.0047744662503417584, 0.99893145279079176]

# Testing

In [5]:
submpath = '/media/share/jiaxin_cmu/kaggle/TF_speech/'
subm_df = pd.read_csv(filepath + 'sample_submission.csv')

# new_sample_rate = 8000
# test_X = []

# for j, fname in enumerate(subm_df['fname']):
#     sample_rate, samples = wavfile.read(os.path.join(test_path, fname))

#     if len(samples) > 16000:
#         new_clip = np.random.randint(0, len(samples) - 16000)
#         samples = samples[new_clip : (new_clip + 16000)]
#     else:
#         samples = np.pad(samples, (0, 16000-len(samples)), 'constant')

#     resampled = signal.resample(samples, int(new_sample_rate/sample_rate * samples.shape[0]))
#     freqs, times, spectrogram = log_specgram(resampled, new_sample_rate)
#     norm_spect = StandardScaler().fit_transform(spectrogram)

#     test_X.append(norm_spect)

In [20]:
test_X = np.array(test_X)
test_X = test_X.reshape(tuple(list(test_X.shape) + [1]))

In [21]:
pred_y = multi_gpu.predict(test_X)

In [22]:
predicts = np.argmax(pred_y, axis=1)
predicts = [target_labels[p] for p in predicts]

# submission

In [6]:
subm_df['label'] = 'unknown'

In [7]:
subm_df.head()

Unnamed: 0,fname,label
0,clip_000044442.wav,unknown
1,clip_0000adecb.wav,unknown
2,clip_0000d4322.wav,unknown
3,clip_0000fb6fe.wav,unknown
4,clip_0001d1559.wav,unknown


In [24]:
subm_df.head()

Unnamed: 0,fname,label
0,clip_000044442.wav,down
1,clip_0000adecb.wav,go
2,clip_0000d4322.wav,go
3,clip_0000fb6fe.wav,go
4,clip_0001d1559.wav,go


In [8]:
subm_df.to_csv(submpath + 'submission_unknow.csv', index=False)