In [40]:
import os

import torch
import torch.nn as nn
import torchaudio

import librispeech
import my_functions
import torch_functions

# Data

In [41]:
# Root directory handed to both LibriSpeech splits. Set the LIBRISPEECH_ROOT
# environment variable to point somewhere else; when it is unset the original
# location is used, so the default behaviour of this cell is unchanged.
DATA_DIR = os.environ.get(
    "LIBRISPEECH_ROOT",
    "/Users/stephen/code/projects/Speech_Recognition/Data/",
)

# Both splits share the same root; download=True is forwarded unchanged to the
# project-local librispeech loader.
train_lib = librispeech.LIBRISPEECH(DATA_DIR, url="train-clean-100", download=True)
test_lib = librispeech.LIBRISPEECH(DATA_DIR, url="test-clean", download=True)

In [42]:
# Sanity check: report the number of items in each split via the datasets' len().
print(f"Training length: {len(train_lib)}")
print(f"Test length: {len(test_lib)}")

Training length: 28539
Test length: 2620


Each dataset item is a tuple: `(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)`

In [43]:
class TextTransform:
    """Two-way lookup between transcript characters and integer labels.

    The vocabulary is the apostrophe, the space (stored under the key
    ``<SPACE>`` in ``char_map``) and the lowercase letters ``a``-``z``,
    numbered 0 through 27.
    """
    def __init__(self):
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        # Each non-empty row of the table is "<symbol> <index>".
        rows = [row.split() for row in char_map_str.strip().split('\n')]
        self.char_map = {symbol: int(number) for symbol, number in rows}
        self.index_map = {int(number): symbol for symbol, number in rows}
        # Decoding should emit a real blank rather than the "<SPACE>" token.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Encode ``text`` as a list of integer labels, one per character.

        A literal space is looked up under ``<SPACE>``; any character missing
        from ``char_map`` raises ``KeyError``.
        """
        return [
            self.char_map['<SPACE>'] if symbol == ' ' else self.char_map[symbol]
            for symbol in text
        ]

    def int_to_text(self, labels):
        """Decode an iterable of integer labels back into a string."""
        decoded = ''.join(self.index_map[label] for label in labels)
        return decoded.replace('<SPACE>', ' ')

In [44]:
# Shared character <-> index codec; data_processing and GreedyDecoder read this
# module-level name directly.
text_transform = TextTransform()


In [45]:
# Data transform and augmentation: feature constants plus the spectrogram
# pipelines used for the training and validation splits.
# NOTE(review): of the constants below only N_FFT is passed to the transforms
# in this cell. SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH, N_MELS, DURATION and
# N_SAMPLES are not referenced by any other code visible in this notebook, so
# they currently have no effect on the features — confirm whether they were
# meant to be wired into the Spectrogram (win_length / hop_length) or removed.
SAMPLE_RATE = 8000  # NOTE(review): LibriSpeech is distributed at 16 kHz and no resampling is visible here — confirm
WINDOW_LENGTH = int(0.020 * SAMPLE_RATE)  # 20 ms window, expressed in samples
HOP_LENGTH = int(0.01 * SAMPLE_RATE) # 10 ms hop between consecutive windows, expressed in samples
N_MELS = 80
DURATION = 16.7  # duration in seconds 
N_SAMPLES = int(SAMPLE_RATE * DURATION)
N_FFT = 400 



# Training pipeline: spectrogram followed by random frequency masking and
# random time masking as augmentation.
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.Spectrogram(n_fft=N_FFT),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=80),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)

# Validation pipeline: the same spectrogram with no masking applied.
valid_audio_transforms = torchaudio.transforms.Spectrogram(n_fft=N_FFT)

In [46]:
def data_processing(data, data_type="train"):
    """Collate dataset items into padded spectrogram and label batches.

    Args:
        data: Iterable of 6-tuples whose first element is the waveform tensor
            and whose third element is the transcript string; the remaining
            four fields are ignored.
        data_type: ``"train"`` to use ``train_audio_transforms`` (with
            masking) or ``"valid"`` to use ``valid_audio_transforms``.

    Returns:
        Tuple ``(spectrograms, labels, input_lengths, label_lengths)``:
        ``spectrograms`` is the zero-padded batch with a singleton dimension
        inserted at position 1 and its last two dimensions swapped; ``labels``
        is the zero-padded batch of encoded transcripts; ``input_lengths`` and
        ``label_lengths`` are plain Python lists with one entry per item.

    Raises:
        ValueError: If ``data_type`` is neither ``"train"`` nor ``"valid"``.
    """
    # The transform does not depend on the item, so pick (and validate) it once.
    if data_type == 'train':
        audio_transform = train_audio_transforms
    elif data_type == 'valid':
        audio_transform = valid_audio_transforms
    else:
        raise ValueError('data_type should be train or valid')

    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, _, utterance, _, _, _) in data:
        # Assumes the transform returns (channel=1, freq, time); dropping the
        # channel and transposing gives (time, freq) so pad_sequence pads time.
        spec = audio_transform(waveform).squeeze(0).transpose(0, 1)
        spectrograms.append(spec)
        # The character map only contains lowercase letters, hence .lower().
        label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
        labels.append(label)
        # NOTE(review): the frame count is halved here, yet none of the model
        # layers visible in this notebook shorten the time axis — confirm that
        # // 2 is still wanted.
        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    # (batch, time, freq) -> (batch, 1, time, freq) -> (batch, 1, freq, time).
    # NOTE(review): DeepSpeech.forward in this notebook documents its input as
    # (batch, channel, time, feature) — confirm which layout the training loop
    # expects before feeding this tensor to the model.
    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths
            

In [47]:
def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
	arg_maxes = torch.argmax(output, dim=2)
	decodes = []
	targets = []
	for i, args in enumerate(arg_maxes):
		decode = []
		targets.append(text_transform.int_to_text(labels[i][:label_lengths[i]].tolist()))
		for j, index in enumerate(args):
			if index != blank_label:
				if collapse_repeated and j != 0 and index == args[j -1]:
					continue
				decode.append(index.item())
		decodes.append(text_transform.int_to_text(decode))
	return decodes, targets

# Model
Based on torchaudio's DeepSpeech implementation:
https://pytorch.org/audio/stable/_modules/torchaudio/models/deepspeech.html#DeepSpeech

In [48]:
# Names exported by a star-import if this cell is moved into a module; inside
# the notebook it does not change what later cells can see.
__all__ = ["DeepSpeech"]


class FullyConnected(torch.nn.Module):
    """Linear layer followed by a clipped ReLU and optional dropout.

    Args:
        n_feature: Number of input features
        n_hidden: Internal hidden unit size.
        dropout: Dropout probability; a falsy value disables dropout.
        relu_max_clip: Upper bound applied to the activation.
    """

    def __init__(
        self,
        n_feature: int,
        n_hidden: int,
        dropout: float,
        relu_max_clip: int = 20,
    ) -> None:
        super().__init__()
        self.fc = torch.nn.Linear(n_feature, n_hidden, bias=True)
        self.relu_max_clip = relu_max_clip
        self.dropout = dropout

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        functional = torch.nn.functional
        projected = self.fc(x)
        # ReLU, then clamp into [0, relu_max_clip].
        activated = functional.hardtanh(functional.relu(projected), 0, self.relu_max_clip)
        if not self.dropout:
            return activated
        # Dropout is only active while the module is in training mode.
        return functional.dropout(activated, self.dropout, self.training)

In [50]:
class DeepSpeech(nn.Module):
    """
    DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition*
    (Hannun et al., 2014).

    Three stacked ``FullyConnected`` layers feed a single-layer bidirectional
    ReLU RNN; the two RNN directions are summed, passed through a fourth
    ``FullyConnected`` layer and projected to ``n_class`` log-probabilities.

    Args:
        n_feature: Number of input features
        n_hidden: Internal hidden unit size.
        n_class: Number of output classes
        dropout: Dropout probability handed to every ``FullyConnected`` layer.
    """

    def __init__(
        self,
        n_feature: int,
        n_hidden: int = 2048,
        n_class: int = 40,
        dropout: float = 0.0
    ) -> None:
        super().__init__()
        self.n_hidden = n_hidden
        # Earlier revisions of this class stored the size under this name only;
        # kept as an alias so existing attribute reads keep working.
        self.N_hidden = n_hidden
        self.fc1 = FullyConnected(n_feature, n_hidden, dropout)
        # fc2 and fc3 consume the n_hidden-wide output of the previous layer.
        self.fc2 = FullyConnected(n_hidden, n_hidden, dropout)
        self.fc3 = FullyConnected(n_hidden, n_hidden, dropout)
        self.bi_rnn = nn.RNN(
            n_hidden, n_hidden, num_layers=1, nonlinearity="relu", bidirectional=True
        )
        self.fc4 = FullyConnected(n_hidden, n_hidden, dropout)
        self.out = nn.Linear(n_hidden, n_class)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): Tensor of dimension (batch, channel, time, feature).
        Returns:
            Tensor: Predictor tensor of dimension (batch, time, class).
        """
        # N x C x T x F
        x = self.fc1(x)
        # N x C x T x H
        x = self.fc2(x)
        # N x C x T x H
        x = self.fc3(x)
        # N x C x T x H
        x = x.squeeze(1)
        # N x T x H
        x = x.transpose(0, 1)
        # T x N x H
        x, _ = self.bi_rnn(x)
        # T x N x 2H: the fifth (non-recurrent) layer takes both the forward
        # and backward units as inputs, so the two halves are summed.
        x = x[:, :, :self.n_hidden] + x[:, :, self.n_hidden:]
        # T x N x H
        x = self.fc4(x)
        # T x N x H
        x = self.out(x)
        # T x N x n_class
        x = x.permute(1, 0, 2)
        # N x T x n_class
        x = nn.functional.log_softmax(x, dim=2)
        # N x T x n_class
        return x
