In [40]:
import torch
import torchaudio 
import librispeech 
import my_functions
import torch_functions 
import torch.nn as nn 

# Data

In [41]:
# Both splits are stored under the same root directory; the split itself is
# selected with `url`. `librispeech` is a project-local module — presumably it
# mirrors torchaudio.datasets.LIBRISPEECH (TODO confirm).
LIBRISPEECH_ROOT = '/Users/stephen/code/projects/Speech_Recognition/Data/'

train_lib = librispeech.LIBRISPEECH(LIBRISPEECH_ROOT, url="train-clean-100", download=True)
test_lib = librispeech.LIBRISPEECH(LIBRISPEECH_ROOT, url="test-clean", download=True)

In [42]:
# Report how many utterances each split contains (one line per split).
print(f"Training length: {len(train_lib)}", f"Test length: {len(test_lib)}", sep="\n")

Training length: 28539
Test length: 2620


Each dataset item is a tuple: `(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)`

In [43]:
class TextTransform:
    """Two-way lookup between transcript characters and integer labels.

    Attributes:
        char_map: symbol -> integer label. The blank character is stored under
            the key ``'<SPACE>'``.
        index_map: integer label -> character. Label 1 maps to a literal
            ``' '`` rather than the ``'<SPACE>'`` token.
    """
    def __init__(self):
        # Symbol table: one "<symbol> <label>" pair per line.
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        rows = [row.split() for row in char_map_str.strip().split('\n')]
        self.char_map = {symbol: int(label) for symbol, label in rows}
        self.index_map = {int(label): symbol for symbol, label in rows}
        # Decode label 1 directly to a real space instead of the table token.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Encode ``text`` as a list of integer labels, one per character.

        Spaces are looked up under the ``'<SPACE>'`` key. Any character that
        is not in the table raises ``KeyError``.
        """
        return [self.char_map['<SPACE>' if c == ' ' else c] for c in text]

    def int_to_text(self, labels):
        """Decode an iterable of integer labels back into a string.

        A label that is not in the table raises ``KeyError``.
        """
        decoded = ''.join(self.index_map[i] for i in labels)
        return decoded.replace('<SPACE>', ' ')

In [44]:
# Module-level TextTransform instance; data_processing and GreedyDecoder look
# it up by this global name.
text_transform = TextTransform()


In [54]:
# Data transform and augmentation.
#
# STFT settings shared by the training and validation pipelines so that both
# yield spectrograms with the same number of frequency bins (n_fft // 2 + 1)
# and the same frame hop.
n_fft = 1024
win_length = None  # None -> torchaudio uses n_fft as the window length
hop_length = 512


class _PowerSpectrogram(nn.Module):
    """Turn a complex STFT into a real power spectrogram, i.e. |X| ** 2."""

    def forward(self, complex_spec):
        return complex_spec.abs().pow(2.0)


train_audio_transforms = nn.Sequential(
    # power=None keeps the complex STFT: TimeStretch runs a phase vocoder and
    # needs phase information, which a power spectrogram has already lost.
    torchaudio.transforms.Spectrogram(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        center=True,
        pad_mode="reflect",
        power=None,
    ),
    # n_freq / hop_length must describe the STFT above; TimeStretch defaults to
    # n_freq=201, which does not match the n_fft // 2 + 1 bins produced here.
    torchaudio.transforms.TimeStretch(
        hop_length=hop_length,
        n_freq=n_fft // 2 + 1,
        fixed_rate=1.2,
    ),
    # Back to a real power spectrogram (same scale as power=2.0) before masking.
    _PowerSpectrogram(),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=80),
    torchaudio.transforms.TimeMasking(time_mask_param=100),
)

# Validation: plain power spectrogram with the same STFT settings, no augmentation.
valid_audio_transforms = torchaudio.transforms.Spectrogram(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    power=2.0,
)

In [55]:
def data_processing(data, data_type="train"):
    """Collate a batch of dataset items into padded spectrogram/label tensors.

    Args:
        data: Iterable of 6-tuples. Only the first element (waveform) and the
            third element (utterance text) are used.
        data_type: ``"train"`` to apply ``train_audio_transforms`` or
            ``"valid"`` to apply ``valid_audio_transforms``.

    Returns:
        A tuple ``(spectrograms, labels, input_lengths, label_lengths)``:

        * spectrograms: zero-padded batch. Assuming each waveform is mono with
          shape ``(1, num_samples)`` (TODO confirm), the result has shape
          ``(batch, 1, n_freq, max_frames)``.
        * labels: zero-padded float tensor of character labels,
          ``(batch, max_label_len)``.
        * input_lengths: list with each item's frame count divided by 2
          (presumably to match a 2x time reduction in the model — confirm
          against the network definition).
        * label_lengths: list with each item's unpadded label length.

    Raises:
        ValueError: if ``data_type`` is neither ``"train"`` nor ``"valid"``.
    """
    # The transform does not depend on the item, so pick it once up front.
    if data_type == 'train':
        audio_transform = train_audio_transforms
    elif data_type == 'valid':
        audio_transform = valid_audio_transforms
    else:
        raise ValueError('data_type should be train or valid')

    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, _, utterance, _, _, _) in data:
        # Drop the leading dim and put time first so pad_sequence pads along time.
        spec = audio_transform(waveform).squeeze(0).transpose(0, 1)
        spectrograms.append(spec)
        # The character table only contains lowercase letters, space and apostrophe.
        label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
        labels.append(label)
        input_lengths.append(spec.shape[0] // 2)
        label_lengths.append(len(label))

    # (batch, time, freq) -> (batch, 1, time, freq) -> (batch, 1, freq, time)
    spectrograms = (
        nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
        .unsqueeze(1)
        .transpose(2, 3)
    )
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths
            

In [53]:
def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
    """Greedy decoding: take the arg-max class at every time step.

    Args:
        output: Tensor of per-class scores; the arg-max is taken over dim 2 and
            the first dim is iterated as the batch, i.e. ``(batch, time, classes)``.
        labels: Padded reference labels, indexed per batch item.
        label_lengths: Unpadded length of each reference label sequence.
        blank_label (int, optional): Class index treated as the blank symbol and
            dropped from the decoded output. Defaults to 28.
        collapse_repeated (bool, optional): If True, skip a non-blank class that
            equals the class at the immediately preceding time step.
            Defaults to True.

    Returns:
        tuple[list[str], list[str]]: ``(decodes, targets)`` — the decoded text
        and the reference text for every batch item, both produced with
        ``text_transform.int_to_text``.
    """
    best_path = torch.argmax(output, dim=2)
    decodes, targets = [], []
    for row, frame_classes in enumerate(best_path):
        # Reference text: strip padding using the true label length.
        reference_ids = labels[row][:label_lengths[row]].tolist()
        targets.append(text_transform.int_to_text(reference_ids))

        kept = []
        for step, cls in enumerate(frame_classes):
            if cls == blank_label:
                continue
            if collapse_repeated and step != 0 and cls == frame_classes[step - 1]:
                continue
            kept.append(cls.item())
        decodes.append(text_transform.int_to_text(kept))
    return decodes, targets

# Model
Based on DeepSpeech2.
link: https://nvidia.github.io/OpenSeq2Seq/html/speech-recognition/deepspeech2.html