In [24]:
# Toggle for Google Colab: when True, the Colab-only setup below is executed.
useColab=True
if useColab:
    # Ask Colab for TensorFlow 2.x (Colab-specific line magic).
    %tensorflow_version 2.x
    # Fetch the helper modules and the requirements file from the
    # SpeechCmdRecognition repository into the working directory.
    !wget -q https://raw.githubusercontent.com/douglas125/SpeechCmdRecognition/master/SpeechDownloader.py
    !wget -q https://raw.githubusercontent.com/douglas125/SpeechCmdRecognition/master/SpeechGenerator.py
    !wget -q https://raw.githubusercontent.com/douglas125/SpeechCmdRecognition/master/requirements.txt
    # Install the dependencies listed in the downloaded requirements file.
    !pip install -r requirements.txt



### **Libraries**

In [25]:
import tensorflow as tf
from tensorflow.python.client import device_lib
# Show the devices TensorFlow can see; the recorded output lists only CPU
# devices (CPU and XLA_CPU).
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17439495806137125493
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 16052332161935604673
physical_device_desc: "device: XLA_CPU device"
]


In [0]:
import librosa
import keras
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  

#import SpeechDownloader
#import SpeechGenerator

from tqdm import tqdm
import requests
import math
import os
import tarfile
import librosa
import pandas as pd


#Libraries for the model 

from tensorflow.keras.models import Model, load_model

from tensorflow.keras import layers as L
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras import backend as K
from tensorflow.keras import optimizers

from kapre.time_frequency import Melspectrogram, Spectrogram
from kapre.utils import Normalization2D

## **Functions**

In [0]:
def WAV2Numpy(folder, sr=None):
    """
    Recursively converts WAV files to numpy arrays.

    Every ``<name>.wav`` found under ``folder`` is loaded with librosa,
    saved next to it as ``<name>.wav.npy`` and the WAV file is then deleted.

    folder - root folder to convert.
    sr - target sampling rate handed to ``librosa.load``. ``None`` (the
         default) keeps each file's native sampling rate.
    """
    wavFiles = []
    for root, _dirs, files in os.walk(folder):
        wavFiles += [os.path.join(root, f) for f in files
                     if f.endswith('.wav')]

    for wavFile in tqdm(wavFiles):
        # Honour the requested sampling rate (it used to be ignored); the
        # rate returned by the loader is not needed afterwards.
        y, _ = librosa.load(wavFile, sr=sr)

        # if we want to write the file later
        # librosa.output.write_wav('file.wav', y, sr, norm=False)
        np.save(wavFile + '.npy', y)
        os.remove(wavFile)

In [0]:
def PrepareGoogleSpeechCmd(version=2, forceDownload=False, task='12cmd'):
    """
    Prepares the Google Speech Commands dataset (version 1 or 2) for use.

    Downloads and extracts the dataset through the version-specific helper,
    converts every WAV file to a .npy file with WAV2Numpy and builds the
    train / validation / test file lists together with their integer labels.

    version - dataset version, 1 or 2.
    forceDownload - forwarded to the download helper.
    task - labelling scheme. Only '12cmd' is currently accepted, which is why
           it is the default (the previous default was rejected by the
           validation below).

    Returns (gscInfo, numCategories). gscInfo has the keys 'train', 'val',
    'test' and 'testREAL'; each value is a dict with 'files' (list of .npy
    paths) and 'labels' (dict mapping each path to its integer category).

    Raises Exception if task or version is not supported.
    """
    allowedTasks = ['12cmd']  # TODO: to be removed at the end
    if task not in allowedTasks:
        raise Exception('Task must be one of: {}'.format(allowedTasks))

    basePath = None
    if version == 2:
        _DownloadGoogleSpeechCmdV2(forceDownload)
        basePath = 'sd_GSCmdV2'
    elif version == 1:
        # NOTE(review): _DownloadGoogleSpeechCmdV1 is not defined in the
        # visible part of this notebook - confirm it exists before relying
        # on version=1.
        _DownloadGoogleSpeechCmdV1(forceDownload)
        basePath = 'sd_GSCmdV1'
    else:
        raise Exception('Version must be 1 or 2')
    # TODO: all of the version handling above is to be removed at the end

    if task == '12cmd':
        # Folder name -> integer category. Folders that are not listed here
        # end up in category 0 ('unknown') via _getFileCategory.
        GSCmdV2Categs = {
            'unknown': 0,
            'silence': 1,
            '_unknown_': 0,
            '_silence_': 1,
            '_background_noise_': 1,
            'yes': 2,
            'no': 3,
            'up': 4,
            'down': 5,
            'left': 6,
            'right': 7,
            'on': 8,
            'off': 9,
            'stop': 10,
            'go': 11}
        numGSCmdV2Categs = 12

    trainDir = basePath + '/train/'
    testDir = basePath + '/test/'

    print('Converting test set WAVs to numpy files')
    WAV2Numpy(testDir)
    print('Converting training set WAVs to numpy files')
    WAV2Numpy(trainDir)

    # read split from files and all files in folders
    testWAVs = pd.read_csv(trainDir + 'testing_list.txt',
                           sep=" ", header=None)[0].tolist()
    valWAVs = pd.read_csv(trainDir + 'validation_list.txt',
                          sep=" ", header=None)[0].tolist()

    # the split lists name the original .wav files; point them at the
    # converted .npy files instead
    testWAVs = [os.path.join(trainDir, f + '.npy')
                for f in testWAVs if f.endswith('.wav')]
    valWAVs = [os.path.join(trainDir, f + '.npy')
               for f in valWAVs if f.endswith('.wav')]

    # Paths are built with '/' on purpose so that they compare equal to the
    # entries derived from the split lists above.
    allWAVs = []
    for root, _dirs, files in os.walk(trainDir):
        allWAVs += [root + '/' + f for f in files if f.endswith('.wav.npy')]
    # Everything that is neither validation nor test is training data.
    # Sorted so that the file order is reproducible between runs.
    trainWAVs = sorted(set(allWAVs) - set(valWAVs) - set(testWAVs))

    testWAVsREAL = []
    for root, _dirs, files in os.walk(testDir):
        testWAVsREAL += [root + '/' + f
                         for f in files if f.endswith('.wav.npy')]

    # get categories
    testWAVlabels = [_getFileCategory(f, GSCmdV2Categs) for f in testWAVs]
    valWAVlabels = [_getFileCategory(f, GSCmdV2Categs) for f in valWAVs]
    trainWAVlabels = [_getFileCategory(f, GSCmdV2Categs) for f in trainWAVs]
    testWAVREALlabels = [_getFileCategory(f, GSCmdV2Categs)
                         for f in testWAVsREAL]

    # background noise should be used for validation as well
    silenceLabel = GSCmdV2Categs['silence']
    backNoiseFiles = [f for f, label in zip(trainWAVs, trainWAVlabels)
                      if label == silenceLabel]
    backNoiseCats = [silenceLabel] * len(backNoiseFiles)
    if numGSCmdV2Categs == 12:
        valWAVs += backNoiseFiles
        valWAVlabels += backNoiseCats

    # build dictionaries
    testWAVlabelsDict = dict(zip(testWAVs, testWAVlabels))
    valWAVlabelsDict = dict(zip(valWAVs, valWAVlabels))
    trainWAVlabelsDict = dict(zip(trainWAVs, trainWAVlabels))
    testWAVREALlabelsDict = dict(zip(testWAVsREAL, testWAVREALlabels))

    # a tweak here: we will heavily underuse silence samples because there are few files.
    # we can add them to the training list to reuse them multiple times
    # note that since we already added the files to the label dicts we don't
    # need to do it again

    # for i in range(200):
    #     trainWAVs = trainWAVs + backNoiseFiles

    # info dictionary
    trainInfo = {'files': trainWAVs, 'labels': trainWAVlabelsDict}
    valInfo = {'files': valWAVs, 'labels': valWAVlabelsDict}
    testInfo = {'files': testWAVs, 'labels': testWAVlabelsDict}
    testREALInfo = {'files': testWAVsREAL, 'labels': testWAVREALlabelsDict}
    gscInfo = {'train': trainInfo,
               'test': testInfo,
               'val': valInfo,
               'testREAL': testREALInfo}

    print('Done preparing Google Speech commands dataset version {}'.format(version))

    return gscInfo, numGSCmdV2Categs

In [0]:
 def _getFileCategory(file, catDict):
    """
    Receives a file with name sd_GSCmdV2/train/<cat>/<filename> and returns an integer that is catDict[cat]
    """
    categ = os.path.basename(os.path.dirname(file))
    return catDict.get(categ, 0)

In [0]:
def _DownloadGoogleSpeechCmdV2(forceDownload=False):
    """
    Downloads Google Speech commands dataset version 2
    """
    if os.path.isdir("sd_GSCmdV2/") and not forceDownload:
        print('Google Speech commands dataset version 2 already exists. Skipping download.')
    else:
        if not os.path.exists("sd_GSCmdV2/"):
            os.makedirs("sd_GSCmdV2/")
        trainFiles = 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
        testFiles = 'http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz'
        _downloadFile(testFiles, 'sd_GSCmdV2/test.tar.gz')
        _downloadFile(trainFiles, 'sd_GSCmdV2/train.tar.gz')

    # extract files
    if not os.path.isdir("sd_GSCmdV2/test/"):
        _extractTar('sd_GSCmdV2/test.tar.gz', 'sd_GSCmdV2/test/')

    if not os.path.isdir("sd_GSCmdV2/train/"):
        _extractTar('sd_GSCmdV2/train.tar.gz', 'sd_GSCmdV2/train/')



In [0]:
def _downloadFile(url, fName):
    # Streaming, so we can iterate over the response.
    r = requests.get(url, stream=True)

    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    print('Downloading {} into {}'.format(url, fName))
    with open(fName, 'wb') as f:
        for data in tqdm(r.iter_content(block_size),
                         total=math.ceil(total_size // block_size),
                         unit='KB',
                         unit_scale=True):
            wrote = wrote + len(data)
            f.write(data)
    if total_size != 0 and wrote != total_size:
        print("ERROR, something went wrong")

In [0]:
def _extractTar(fname, folder):
    print('Extracting {} into {}'.format(fname, folder))
    if (fname.endswith("tar.gz")):
        tar = tarfile.open(fname, "r:gz")
        tar.extractall(path=folder)
        tar.close()
    elif (fname.endswith("tar")):
        tar = tarfile.open(fname, "r:")
        tar.extractall(path=folder)
        tar.close()

In [0]:
# Reference (generating data on the fly with Keras): https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html

In [0]:
# NOTE(review): this class derives from standalone `keras`, while the model
# code in this notebook is built from `tensorflow.keras`. Depending on the
# installed versions the two are different class hierarchies - confirm that
# the training loop accepts this Sequence, or switch the base class to
# tf.keras.utils.Sequence.
class DataGenerator(keras.utils.Sequence):
    """
    Generates batches of data for Keras from .npy files on disk.

    list_IDs - sample identifiers; each sample is loaded from
               data_dir + ID + file_suffix.
    labels - mapping from sample ID to its integer class.
    batch_size - samples per batch; a trailing partial batch is dropped.
    dim - shape of one sample without the channel axis.
    n_channels - size of the trailing channel axis.
    n_classes - number of classes used for the one-hot encoding.
    shuffle - reshuffle the sample order at the end of every epoch.
    data_dir, file_suffix - prefix and suffix put around an ID to build its
        file path. The defaults keep the original 'data/<ID>.npy' layout;
        pass data_dir='' and file_suffix='' when the IDs are already
        complete paths (e.g. the file lists built by PrepareGoogleSpeechCmd).
    """
    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=10, shuffle=True, data_dir='data/', file_suffix='.npy'):
        """Stores the configuration and builds the first index order."""
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.data_dir = data_dir
        self.file_suffix = file_suffix
        self.on_epoch_end()

    def __len__(self):
        """Denotes the number of (full) batches per epoch."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Generates the batch number ``index`` as a tuple (X, y)."""
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        """Rebuilds the index order after each epoch, shuffled if requested."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """
        Loads the samples named in list_IDs_temp.

        Returns X with shape (batch_size, *dim, n_channels) and the one-hot
        encoded labels.
        """
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample (assumes the stored array fits (*dim, n_channels)
            # - TODO confirm against the saved files)
            X[i,] = np.load(self.data_dir + ID + self.file_suffix)

            # Store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [0]:
def ConvSpeechModel(nCategories, samplingrate=16000, inputLength=16000):
    """
    Builds the baseline convolutional speech recognition model.

    The raw waveform is converted to a decibel mel-spectrogram inside the
    model, normalised, passed through three Conv2D / BatchNormalization /
    MaxPooling2D stages and two dense layers, and classified by a softmax
    layer.

    nCategories - number of units of the softmax output layer.
    samplingrate - sampling rate given to the mel-spectrogram layer; the
                   upper mel frequency is samplingrate / 2.
    inputLength - number of samples of one input waveform.

    Returns the (uncompiled) Model named 'ConvSpeechModel'.
    """
    waveform = L.Input((inputLength,))

    # add a leading channel axis in front of the samples
    net = L.Reshape((1, -1))(waveform)

    melLayer = Melspectrogram(
        n_dft=1024,
        n_hop=128,
        input_shape=(1, inputLength),
        padding='same',
        sr=samplingrate,
        n_mels=80,
        fmin=40.0,
        fmax=samplingrate / 2,
        power_melgram=1.0,
        return_decibel_melgram=True,
        trainable_fb=False,
        trainable_kernel=False,
        name='mel_stft')
    net = melLayer(net)

    net = Normalization2D(int_axis=0)(net)

    # Per the original author's note, Melspectrogram yields
    # (batch_size, melDim, timeSteps, 1); swap the first two non-batch axes
    # so that the time axis comes first.
    net = L.Permute((2, 1, 3))(net)

    # one row per convolutional stage:
    # (filters, kernel size, pool size, dropout rate or None for no dropout)
    convStages = [
        (20, (5, 1), (2, 1), 0.03),
        (40, (3, 3), (2, 2), 0.01),
        (80, (3, 3), (2, 2), None),
    ]
    for nFilters, kernelSize, poolSize, dropRate in convStages:
        net = L.Conv2D(nFilters, kernelSize, activation='relu', padding='same')(net)
        net = L.BatchNormalization()(net)
        net = L.MaxPooling2D(poolSize)(net)
        if dropRate is not None:
            net = L.Dropout(dropRate)(net)

    # dense classification head
    net = L.Flatten()(net)
    for units in (64, 32):
        net = L.Dense(units, activation='relu')(net)

    probabilities = L.Dense(nCategories, activation='softmax')(net)

    return Model(inputs=[waveform], outputs=[probabilities], name='ConvSpeechModel')

In [36]:
# Download (if needed) and prepare the Google Speech Commands v2 data for the
# 12-command task. Returns the file/label lists per split and the number of
# categories.
# NOTE: in the recorded run below the WAV -> npy conversion of the training
# set was interrupted (KeyboardInterrupt), so this call did not return there.
gscInfo, nCategs = PrepareGoogleSpeechCmd(version=2, task = '12cmd')

  4%|▎         | 4.10k/110k [00:00<00:03, 33.0kKB/s]

Downloading http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz into sd_GSCmdV2/test.tar.gz


110kKB [00:01, 86.4kKB/s]                          
  0%|          | 4.10k/2.37M [00:00<01:29, 26.6kKB/s]

Downloading http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz into sd_GSCmdV2/train.tar.gz


2.37MKB [00:41, 57.7kKB/s]                           


Extracting sd_GSCmdV2/test.tar.gz into sd_GSCmdV2/test/
Extracting sd_GSCmdV2/train.tar.gz into sd_GSCmdV2/train/


  0%|          | 0/4890 [00:00<?, ?it/s]

Converting test set WAVs to numpy files


100%|██████████| 4890/4890 [09:58<00:00,  8.17it/s]


Converting training set WAVs to numpy files


 57%|█████▋    | 60358/105835 [2:05:27<1:29:11,  8.50it/s]

KeyboardInterrupt: ignored