In [None]:
!wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2

!tar -xvf LJSpeech-1.1.tar.bz2

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
LJSpeech-1.1/wavs/LJ022-0089.wav
LJSpeech-1.1/wavs/LJ030-0192.wav
LJSpeech-1.1/wavs/LJ041-0078.wav
LJSpeech-1.1/wavs/LJ045-0249.wav
LJSpeech-1.1/wavs/LJ034-0035.wav
LJSpeech-1.1/wavs/LJ010-0152.wav
LJSpeech-1.1/wavs/LJ036-0174.wav
LJSpeech-1.1/wavs/LJ035-0076.wav
LJSpeech-1.1/wavs/LJ032-0176.wav
LJSpeech-1.1/wavs/LJ046-0113.wav
LJSpeech-1.1/wavs/LJ017-0096.wav
LJSpeech-1.1/wavs/LJ004-0098.wav
LJSpeech-1.1/wavs/LJ010-0147.wav
LJSpeech-1.1/wavs/LJ042-0230.wav
LJSpeech-1.1/wavs/LJ041-0033.wav
LJSpeech-1.1/wavs/LJ045-0229.wav
LJSpeech-1.1/wavs/LJ014-0199.wav
LJSpeech-1.1/wavs/LJ002-0082.wav
LJSpeech-1.1/wavs/LJ006-0055.wav
LJSpeech-1.1/wavs/LJ045-0120.wav
LJSpeech-1.1/wavs/LJ050-0028.wav
LJSpeech-1.1/wavs/LJ045-0215.wav
LJSpeech-1.1/wavs/LJ013-0121.wav
LJSpeech-1.1/wavs/LJ008-0025.wav
LJSpeech-1.1/wavs/LJ005-0240.wav
LJSpeech-1.1/wavs/LJ044-0026.wav
LJSpeech-1.1/wavs/LJ048-0127.wav
LJSpeech-1.1/wavs/LJ006-0195.wav
LJSpeech-1.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import librosa
import os

In [None]:
class LJSpeechPreprocessor():
  """Reads the LJSpeech metadata and builds model inputs (mel spectrograms) and targets.

  Args:
    data_dir: dataset root; must contain "metadata.csv" and a "wavs" sub-directory.
    num_samples: optional cap on the number of (shuffled) rows to keep.
    seed: optional random_state for the shuffle; None keeps the shuffle non-deterministic.
  """

  def __init__(self, data_dir, num_samples=None, seed=None):
    self.data_dir = data_dir
    self.metadata = self.read_metadata(num_samples, seed=seed)

  def read_metadata(self, num_samples, seed=None):
    """Load, shuffle and optionally truncate the metadata table.

    Args:
      num_samples: keep at most this many rows; a falsy value keeps every row.
      seed: forwarded to DataFrame.sample as random_state for a reproducible order.

    Returns:
      DataFrame with the columns "ID" and "Normalized Transcription".
    """
    fpath = os.path.join(self.data_dir, "metadata.csv")
    # quoting=3 is csv.QUOTE_NONE: quote characters inside a transcription stay literal.
    metadata = pd.read_csv(fpath, sep='|', header=None, quoting=3)
    metadata.columns = ["ID", "Transcription", "Normalized Transcription"]
    metadata = metadata[["ID", "Normalized Transcription"]]
    metadata = metadata.sample(frac=1.0, random_state=seed).reset_index(drop=True)

    if num_samples:
      # iloc clamps to the table length, so an oversized num_samples keeps all rows.
      metadata = metadata.iloc[:num_samples]

    return metadata

  def get_wavs_list(self):
    """Return the .wav path of every metadata row, in metadata order."""
    wav_dir = os.path.join(self.data_dir, "wavs")
    return [os.path.join(wav_dir, fname + ".wav") for fname in self.metadata["ID"]]

  def get_original_text(self):
    """Return the normalized transcriptions as a list of strings, in metadata order."""
    return self.metadata["Normalized Transcription"].tolist()

  def get_target_sequence(self, SOS="", EOS=""):
    """Encode every transcription as a zero-padded sequence of character indices.

    Args:
      SOS: string prepended to every transcription before tokenizing.
      EOS: string appended to every transcription before tokenizing.

    Returns:
      target_seq: int array (samples, max_len), post-padded with 0.
      vocab: dict char -> index; "<UNK>" is registered at index 0, the padding value.
      vocab_rev: dict index -> char, the inverse of vocab.
    """
    target_text = [SOS + txt + EOS for txt in self.metadata["Normalized Transcription"]]

    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(target_text)

    target_seq = tokenizer.texts_to_sequences(target_text)
    target_seq = pad_sequences(target_seq, padding="post")

    vocab = tokenizer.word_index
    vocab["<UNK>"] = 0

    vocab_rev = {idx: char for char, idx in vocab.items()}

    return target_seq, vocab, vocab_rev

  @staticmethod
  def get_spectograms(wavs_list, n_mels, norm=True):
    """Compute a zero-padded batch of mel spectrograms.

    Args:
      wavs_list: paths of the audio files to load (each at its native sample rate).
      n_mels: number of mel bands per frame.
      norm: when True, standardize every frame across its mel bands.

    Returns:
      float32 array (samples, max_frames, n_mels), post-padded with zeros.
    """
    spectograms = []
    for fpath in wavs_list:
      wav, sr = librosa.load(fpath, sr=None)
      # Keyword arguments: librosa >= 0.10 rejects positional y / sr.
      spect = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=1024, n_mels=n_mels)
      spect = np.transpose(spect)  # (n_mels, frames) -> (frames, n_mels)

      if norm:
        mean = np.mean(spect, axis=1, keepdims=True)
        std = np.std(spect, axis=1, keepdims=True)
        # The epsilon keeps frames with zero variance from producing NaN/inf.
        spect = (spect - mean) / (std + 1e-10)
      spectograms.append(spect)

    # pad_sequences defaults to dtype="int32", which would truncate the float features.
    spectograms = pad_sequences(spectograms, padding="post", dtype="float32")
    return spectograms


In [None]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Input, Bidirectional, GRU, BatchNormalization, Conv2D, Activation, Reshape 
from tensorflow.keras.utils import Sequence
import keras.backend as K
import numpy as np
import math

In [None]:
class Dataloader(Sequence):
  """Keras Sequence serving (spectrogram batch, label batch) pairs for the CTC model."""

  def __init__(self, wavs_list, target_sequence, n_mels, batch_size=64):
    """Store the audio paths, their encoded labels and the batching parameters."""
    self.wavs_list, self.targets = wavs_list, target_sequence
    self.n_mels, self.batch_size = n_mels, batch_size
    self.fnum = len(wavs_list)

  def __len__(self):
    """Number of batches per epoch; the last batch may be smaller than batch_size."""
    n_batches = math.ceil(self.fnum / self.batch_size)
    return n_batches

  def __getitem__(self, idx):
    """Build batch number idx.

    Returns:
      (inputs, targets): spectrograms computed on the fly for the batch's files,
      and the matching rows of the label matrix.
    """
    start = idx * self.batch_size
    stop = min(start + self.batch_size, self.fnum)
    rows = slice(start, stop)

    labels = self.targets[rows, :]  # (samples, length)
    # (samples, max frames in this batch, n_mels)
    features = LJSpeechPreprocessor.get_spectograms(self.wavs_list[rows], self.n_mels)
    return features, labels

In [None]:
class DeepSpeech2:
  """Conv + bidirectional-GRU speech recognizer trained on LJSpeech with a CTC loss.

  Args:
    tran_dataset_dir: root directory of the LJSpeech dataset used for training.
  """

  def __init__(self, tran_dataset_dir="/content/LJSpeech-1.1"):
    preprocessor = LJSpeechPreprocessor(tran_dataset_dir, num_samples=None)

    self.wavs_list = preprocessor.get_wavs_list()
    self.orginal_text = preprocessor.get_original_text()
    self.target_seq, self.vocab, self.vocab_rev = preprocessor.get_target_sequence()
    self.vocab_size = len(self.vocab.keys())
    # Doubles as the number of mel bands requested from the preprocessor.
    self.latent_dim = 128
    self.model = self.build_model()

  def CTCLoss(self, y_true, y_pred):
    """CTC loss for a batch of zero-padded label sequences.

    Args:
      y_true: (batch, max_label_len) character indices, post-padded with 0.
      y_pred: (batch, time, classes) softmax output of the model.

    Returns:
      Tensor of shape (batch, 1) with the CTC loss of every sample.
    """
    batch_size = tf.shape(y_true)[0]
    pred_length = tf.shape(y_pred)[1]

    # Every sample is scored over the full (padded) time axis of the batch.
    pred_length = pred_length * tf.ones(shape=(batch_size, 1), dtype="int32")
    # Real characters have indices >= 1 and padding is trailing zeros, so the count
    # of non-zero entries is the true label length. Using the padded width instead
    # would score the padding as labels and can exceed the number of frames.
    label_length = tf.math.count_nonzero(y_true, axis=1, keepdims=True, dtype=tf.int32)

    loss = K.ctc_batch_cost(y_true, y_pred, pred_length, label_length)

    return loss

  def build_model(self):
    """Build and compile the network: 2 conv blocks, 3 bidirectional GRUs, 2 dense layers.

    Returns:
      Compiled Model mapping (batch, time, latent_dim) spectrograms to
      (batch, time, vocab_size + 1) class probabilities.
    """
    inputs = Input(shape=(None, self.latent_dim))
    # Add a channel axis so the spectrogram can be convolved as a 2-D image.
    x = Reshape((-1, self.latent_dim, 1))(inputs)

    x = Conv2D(filters=32, kernel_size=[7,11], strides=[1,1], padding="same")(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)

    # Stride 2 on the feature axis only; the time axis keeps its length.
    x = Conv2D(filters=32, kernel_size=[7,11], strides=[1,2], padding="same")(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)

    # Fold feature axis and channels together: one vector per time step for the GRUs.
    x = Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    x = Bidirectional(GRU(units=512, return_sequences=True), merge_mode="sum")(x)
    x = Bidirectional(GRU(units=512, return_sequences=True), merge_mode="sum")(x)
    x = Bidirectional(GRU(units=512, return_sequences=True), merge_mode="sum")(x)
    x = BatchNormalization()(x)

    x = Dense(256, activation="relu")(x)
    # One extra output class: the CTC blank, which the Keras CTC ops place last.
    prob = Dense(self.vocab_size+1, activation="softmax")(x)

    model = Model(inputs, prob)

    model.compile(optimizer="adam", loss=self.CTCLoss)

    return model

  def recognize(self, spect):
    """Greedy-decode the first spectrogram of a batch into text.

    Args:
      spect: array (batch, time, latent_dim); only the first item is decoded to text.

    Returns:
      The decoded string.
    """
    pred = self.model.predict(spect)
    # ctc_decode wants one length per batch item: the number of time steps.
    input_len = np.full((pred.shape[0],), pred.shape[1], dtype="int32")
    decode = K.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    output = K.get_value(decode)

    # -1 is the decoder's padding and 0 is the label padding / "<UNK>" slot.
    chars = [self.vocab_rev[x] for x in output[0] if x != -1 and x != 0]
    return "".join(chars)

  def test(self):
    """Decode up to five training clips and print them next to their transcriptions."""
    for i in range(min(5, len(self.wavs_list))):
      inputs = LJSpeechPreprocessor.get_spectograms(self.wavs_list[i: i+1], self.latent_dim)

      res = self.recognize(inputs[0:1])
      print('-')
      print("Decoded Sentence: ", res)
      print("Ground Truth: ", self.orginal_text[i])

  def train_model(self, epochs, batch_size=64):
    """Fit the model on the whole dataset, then print a few decoded samples.

    Args:
      epochs: number of passes over the dataset.
      batch_size: number of clips per training batch.
    """
    dataloader = Dataloader(self.wavs_list, self.target_seq, self.latent_dim,
                            batch_size=batch_size)

    self.model.fit(dataloader, epochs=epochs, verbose=1)
    self.test()


In [None]:
# Build the recognizer: the constructor reads the LJSpeech metadata, encodes the
# transcriptions and compiles the network.
speech_recognizer = DeepSpeech2(tran_dataset_dir="/content/LJSpeech-1.1")

# Print the layer-by-layer summary of the compiled model.
speech_recognizer.model.summary()

# Last expression of the cell, so the rendered architecture diagram becomes the cell output.
tf.keras.utils.plot_model(speech_recognizer.model, show_shapes=True)

In [None]:

# Train for 20 epochs with 8 clips per batch; train_model() then calls test(),
# which prints a few decoded samples next to their ground-truth transcriptions.
speech_recognizer.train_model(epochs=20, batch_size=8)