# Практика №4

Теперь мы построим и обучим простую end-to-end модель. Будем работать с пропатченной версией уже готового [пайплайна](https://www.assemblyai.com/blog/end-to-end-speech-recognition-pytorch). Также нам пригодится [ESPnet](https://github.com/espnet/espnet) для использования модели [Transformer](http://jalammar.github.io/illustrated-transformer/) в качестве энкодера.

### Bootstrap

In [None]:
!pip install -q sentencepiece torchaudio

In [None]:
!gdown --id '1skrVbNyrhBLeceGS9CV9uIw_gvo1JiA6'

!rm -rf lab4
!unzip -q lab4.zip
!rm -rf lab4.zip sample_data
%cd lab4

In [None]:
import math
import os

import numpy as np
import sentencepiece as spm
import torch
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.transformer.repeat import repeat
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,
)
from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask

from utils import TextTransform
from utils import cer
from utils import wer

In [None]:
# Mel-spectrogram settings shared by the training and validation front-ends.
mel_spectrogram_kwargs = dict(sample_rate=16000, n_fft=400, hop_length=160, n_mels=80)

# Training features: mel-spectrogram followed by frequency and time masking.
train_audio_transforms = torch.nn.Sequential(
    torchaudio.transforms.MelSpectrogram(**mel_spectrogram_kwargs),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100),
)

# Validation features: the same mel-spectrogram without any masking.
valid_audio_transforms = torchaudio.transforms.MelSpectrogram(**mel_spectrogram_kwargs)

# Module-level tokenizer read by data_processing and GreedyDecoder.
text_transform = TextTransform()

# -----------------------------TODO #2-----------------------------------
# Replace the grapheme tokenizer with the subword TextTransformBPE
# -----------------------------------------------------------------------


def data_processing(data, data_type="train"):
    """Collate a list of dataset items into padded batch tensors.

    Args:
        data: iterable of 6-tuples; only the first element (waveform) and the
            third element (utterance text) are used.
        data_type: "train" selects ``train_audio_transforms``, "valid" selects
            ``valid_audio_transforms``.

    Returns:
        A tuple ``(spectrograms, labels, input_lengths, label_lengths)``:
        zero-padded features with a singleton axis inserted at dim 1 and the
        last two axes swapped, zero-padded label ids, and two Python lists
        holding the unpadded frame count and label count of every item.

    Raises:
        Exception: if ``data_type`` is neither "train" nor "valid" (only
            detected once the first item is processed).
    """
    features = []
    targets = []
    for waveform, _sample_rate, utterance, _speaker, _chapter, _utt_id in data:
        if data_type == "train":
            transform = train_audio_transforms
        elif data_type == "valid":
            transform = valid_audio_transforms
        else:
            raise Exception("data_type should be train or valid")
        # Drop the leading axis and put time first so pad_sequence pads along time.
        # NOTE(review): squeeze(0) assumes a single-channel waveform - confirm.
        features.append(transform(waveform).squeeze(0).transpose(0, 1))
        # The text is lower-cased before being mapped to token ids.
        targets.append(torch.Tensor(text_transform.text_to_int(utterance.lower())))

    input_lengths = [feature.shape[0] for feature in features]
    label_lengths = [len(target) for target in targets]

    padded_features = torch.nn.utils.rnn.pad_sequence(features, batch_first=True)
    # (batch, time, feat) -> (batch, 1, time, feat) -> (batch, 1, feat, time)
    spectrograms = padded_features.unsqueeze(1).transpose(2, 3)
    labels = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths


def GreedyDecoder(
    output, labels, label_lengths, blank_label=28, collapse_repeated=True
):
    """Greedy (best-path) CTC decoding of a batch of network outputs.

    Args:
        output: tensor of per-frame class scores; the argmax is taken over
            dim 2, so the layout is (batch, time, n_classes).
        labels: padded reference label ids, one row per utterance.
        label_lengths: number of valid entries in each row of ``labels``.
        blank_label: class index of the CTC blank symbol.
        collapse_repeated: if true, a frame whose label equals the label of
            the immediately preceding frame is skipped.

    Returns:
        ``(decodes, targets)``: two lists with one entry per utterance, each
        produced by ``text_transform.int_to_text``.
    """
    # Convert the whole argmax to Python ints once; the inner loop then does
    # plain integer comparisons instead of a tensor op per frame.
    best_paths = torch.argmax(output, dim=2).tolist()
    decodes = []
    targets = []
    for i, path in enumerate(best_paths):
        targets.append(
            text_transform.int_to_text(labels[i][: label_lengths[i]].tolist())
        )
        decode = []
        previous = None  # no predecessor for the first frame
        # NOTE(review): every frame of the row is decoded, including frames
        # that lie in the padded part of shorter utterances.
        for index in path:
            is_repeat = collapse_repeated and index == previous
            if index != blank_label and not is_repeat:
                decode.append(index)
            # The predecessor is the raw previous frame, blank or not.
            previous = index
        decodes.append(text_transform.int_to_text(decode))
    return decodes, targets

In [None]:
class TransformerModel(torch.nn.Module):
    """Acoustic encoder: 2-layer conv subsampling + Transformer encoder stack.

    The input features are subsampled by two stride-2 convolutions (4x in
    both time and frequency), projected to ``attention_dim``, given
    positional encodings, passed through ``num_layers`` ESPnet encoder
    layers and finally projected to ``output_size`` per-frame scores.

    Args:
        input_size: number of feature bins per frame.
        output_size: number of output classes per frame.
        conv2d_filters: channel count of both convolutions.
        attention_dim: model width of the encoder layers.
        attention_heads: number of attention heads per layer.
        feedforward_dim: hidden width of the position-wise feed-forward block.
        num_layers: number of encoder layers.
        dropout: dropout rate passed to attention, feed-forward and encoder
            layers (the positional encoding uses a fixed rate of 0.1).
    """

    def __init__(
        self,
        input_size=80,
        output_size=29,
        conv2d_filters=32,
        attention_dim=360,
        attention_heads=8,
        feedforward_dim=1024,
        num_layers=10,
        dropout=0.1,
    ):
        super().__init__()

        self.conv_in = torch.nn.Sequential(
            torch.nn.Conv2d(
                1, conv2d_filters, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)
            ),
            torch.nn.ReLU(),
            torch.nn.Conv2d(
                conv2d_filters,
                conv2d_filters,
                kernel_size=(3, 3),
                stride=(2, 2),
                padding=(1, 1),
            ),
            torch.nn.ReLU(),
        )
        # Width of the feature axis after both convolutions. Using the exact
        # conv output size keeps the Linear layer valid for any input_size,
        # not only multiples of 4.
        subsampled_size = self._subsampled_size(input_size)
        self.conv_out = torch.nn.Sequential(
            torch.nn.Linear(conv2d_filters * subsampled_size, attention_dim),
            PositionalEncoding(attention_dim, 0.1),
        )
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, feedforward_dim, dropout)
        self.encoder_layer = repeat(
            num_layers,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim, dropout),
                positionwise_layer(*positionwise_layer_args),
                dropout,
                normalize_before=True,
                concat_after=False,
            ),
        )
        self.after_norm = LayerNorm(attention_dim)
        self.final_layer = torch.nn.Linear(attention_dim, output_size)

    @staticmethod
    def _subsampled_size(size):
        """Return the length of an axis after the two ``conv_in`` convolutions.

        Each convolution (kernel 3, stride 2, padding 1) maps a length ``n``
        to ``floor((n - 1) / 2) + 1``, i.e. ``ceil(n / 2)``.
        """
        for _ in range(2):
            size = (size + 1) // 2
        return size

    def forward(self, x, ilens):
        """Compute per-frame class scores.

        Args:
            x: feature tensor of shape (batch, time, input_size).
            ilens: per-utterance frame counts before subsampling, passed to
                ESPnet's ``make_pad_mask``.
                NOTE(review): the mask only lines up with the subsampled
                features if ``max(ilens) == x.size(1)`` - confirm at callers.

        Returns:
            Tensor of shape (batch, subsampled_time, output_size) with
            unnormalised scores (no softmax is applied here).
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv_in(x)
        b, c, t, f = x.size()
        # Fold channels and frequency into one feature axis per frame.
        x = self.conv_out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        # Invert the padding mask and keep every 4th position to match the
        # 4x time subsampling of conv_in.
        masks = (~make_pad_mask(ilens)[:, None, :])[:, :, ::4].to(x.device)
        x, _ = self.encoder_layer(x, masks)
        x = self.after_norm(x)
        x = self.final_layer(x)
        return x

In [None]:
def train(model, device, train_loader, criterion, optimizer, scheduler, epoch):
    """Run one training epoch with a CTC-style criterion.

    Args:
        model: callable as ``model(spectrograms, input_lengths)`` returning
            scores of shape (batch, time, n_classes).
        device: device the batch tensors are moved to.
        train_loader: iterable with ``len()`` and a ``dataset`` attribute;
            yields ``(spectrograms, labels, input_lengths, label_lengths)``
            with spectrograms shaped (batch, 1, feat_dim, time).
        criterion: loss called as
            ``criterion(log_probs, labels, input_lengths, label_lengths)``
            with log_probs shaped (time, batch, n_classes).
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        epoch: epoch number, used only in the progress message.

    Progress is printed every 100 batches and for the final batch.
    """
    model.train()
    data_len = len(train_loader.dataset)
    last_batch_idx = len(train_loader) - 1

    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data
        spectrograms, labels = spectrograms[:, :, :, : max(input_lengths)].to(
            device
        ), labels.to(
            device
        )  # (batch, 1, feat_dim, time)
        spectrograms = spectrograms.squeeze(1).transpose(
            1, 2
        )  # (batch, time, feat_dim,)
        optimizer.zero_grad()

        output = model(spectrograms, input_lengths)  # (batch, time, n_classes)
        output = F.log_softmax(output, dim=2)
        output = output.transpose(0, 1)  # (time, batch, n_class)
        # The lengths handed to the loss are the input frame counts divided
        # by 4 (floor), matching the model's 4x time subsampling.
        input_lengths = [x // 4 for x in input_lengths]

        loss = criterion(output, labels, input_lengths, label_lengths)
        loss.backward()

        # Clip the global gradient norm to 5.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        scheduler.step()
        # The original compared batch_idx with the number of samples, which
        # can never match; compare with the index of the last batch instead.
        if batch_idx % 100 == 0 or batch_idx == last_batch_idx:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {:.6f}".format(
                    epoch,
                    batch_idx * len(spectrograms),
                    data_len,
                    100.0 * batch_idx / len(train_loader),
                    loss.item(),
                    scheduler.get_last_lr()[0],
                )
            )


def test(model, device, test_loader, criterion, epoch, blank_label):
    """Evaluate the model on ``test_loader`` and print loss, CER and WER.

    Args:
        model: callable as ``model(spectrograms, input_lengths)``.
        device: device the batch tensors are moved to.
        test_loader: iterable with ``len()``; yields
            ``(spectrograms, labels, input_lengths, label_lengths)``.
        criterion: loss called with (time, batch, n_class) log-probabilities.
        epoch: accepted for symmetry with ``train``; not used here.
        blank_label: blank class index forwarded to ``GreedyDecoder``.

    The reported loss is the mean over batches; CER and WER are means over
    individual utterances. Nothing is returned.
    """
    print("\nevaluating...")
    model.eval()
    total_loss = 0
    cer_scores = []
    wer_scores = []
    with torch.no_grad():
        for batch in test_loader:
            spectrograms, labels, input_lengths, label_lengths = batch
            spectrograms = spectrograms.to(device)
            labels = labels.to(device)
            # (batch, 1, feat_dim, time) -> (batch, time, feat_dim)
            spectrograms = spectrograms.squeeze(1).transpose(1, 2)

            scores = model(spectrograms, input_lengths)  # (batch, time, n_class)
            log_probs = F.log_softmax(scores, dim=2).transpose(0, 1)  # (time, batch, n_class)
            # Frame counts divided by 4 (floor) to match the model's subsampling.
            input_lengths = [length // 4 for length in input_lengths]

            batch_loss = criterion(log_probs, labels, input_lengths, label_lengths)
            total_loss += batch_loss.item() / len(test_loader)

            predictions, references = GreedyDecoder(
                log_probs.transpose(0, 1), labels, label_lengths, blank_label=blank_label,
            )
            for reference, prediction in zip(references, predictions):
                cer_scores.append(cer(reference, prediction))
                wer_scores.append(wer(reference, prediction))

    mean_cer = sum(cer_scores) / len(cer_scores)
    mean_wer = sum(wer_scores) / len(wer_scores)

    print(
        "Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n".format(
            total_loss, mean_cer, mean_wer
        )
    )

In [None]:
def main(output_size, learning_rate=1e-5, batch_size=10, test_batch_size=7, epochs=10,
        train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10):
    """Build the model, train it on LibriSpeech and evaluate after every epoch.

    Args:
        output_size: number of output classes; the last index
            (``output_size - 1``) is used as the CTC blank.
        learning_rate: peak learning rate of the one-cycle schedule.
        batch_size: training batch size.
        test_batch_size: evaluation batch size.
        epochs: number of training epochs.
        train_url: LibriSpeech subset used for training.
        test_url: LibriSpeech subset used for evaluation.
        attention_dim: model width of the Transformer encoder.
        num_layers: number of encoder layers.

    Both subsets are downloaded into ``./data`` if necessary. A wall-clock
    timestamp is printed at the start of every epoch.
    """
    # Local import keeps this block self-contained; it replaces the
    # IPython-only ``!date`` shell escape with portable Python.
    import time

    hparams = {
        "input_size": 80,
        "output_size": output_size,
        "conv2d_filters": 32,
        "attention_dim": attention_dim,
        "attention_heads": 8,
        "feedforward_dim": 1024,
        "num_layers": num_layers,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)  # fixed seed for reproducible initialisation/shuffling
    device = torch.device("cuda" if use_cuda else "cpu")

    os.makedirs("./data", exist_ok=True)

    train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=hparams['batch_size'],
                                   shuffle=True,
                                   collate_fn=lambda x: data_processing(x, 'train'),
                                   **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=test_batch_size,
                                  shuffle=False,
                                  collate_fn=lambda x: data_processing(x, 'valid'),
                                  **kwargs)

    model = TransformerModel(
        hparams['input_size'],
        hparams['output_size'],
        hparams['conv2d_filters'],
        hparams['attention_dim'],
        hparams['attention_heads'],
        hparams['feedforward_dim'],
        hparams['num_layers'],
        hparams['dropout']
    ).to(device)

    print(model)
    print('Num Model Parameters', sum(param.nelement() for param in model.parameters()))

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    # The blank symbol is the last class index; the same index is passed to
    # the decoder in test() below.
    criterion = torch.nn.CTCLoss(blank=hparams['output_size'] - 1, zero_infinity=False).to(device)
    # One scheduler step per batch: total steps = batches per epoch * epochs.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'],
                                              steps_per_epoch=len(train_loader),
                                              epochs=hparams['epochs'],
                                              anneal_strategy='linear')

    for epoch in range(1, epochs + 1):
        print(time.strftime("%a %b %d %H:%M:%S %Z %Y"))
        train(model, device, train_loader, criterion, optimizer, scheduler, epoch)
        test(model, device, test_loader, criterion, epoch, blank_label=hparams['output_size'] - 1)

In [None]:
# main(output_size=29, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
#      train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)

В силу того, что обучение занимает много времени и ресурсов, здесь и далее приводится выдержка из журнала обучения:

```python
main(output_size=29, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
     train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)
```

```
Num Model Parameters 12_850_957

Epoch  1: Average loss: 2.1915, Average CER: 0.563139 Average WER: 1.1611
Epoch  2: Average loss: 1.9001, Average CER: 0.487321 Average WER: 1.0551
Epoch  3: Average loss: 1.5279, Average CER: 0.384309 Average WER: 0.9229
Epoch  4: Average loss: 1.2169, Average CER: 0.309339 Average WER: 0.8344
Epoch  5: Average loss: 1.0170, Average CER: 0.261957 Average WER: 0.7316
Epoch  6: Average loss: 0.8912, Average CER: 0.232378 Average WER: 0.6713
Epoch  7: Average loss: 0.8002, Average CER: 0.212101 Average WER: 0.6262
Epoch  8: Average loss: 0.7317, Average CER: 0.192456 Average WER: 0.5777
Epoch  9: Average loss: 0.6811, Average CER: 0.181670 Average WER: 0.5540
Epoch 10: Average loss: 0.6559, Average CER: 0.173048 Average WER: 0.5307

Each epoch takes ~11 minutes to run.
```

### <b>Задание №1</b> (5 баллов):
На данный момент практически все E2E SOTA решения используют [сабворды](https://dyakonov.org/2019/11/29/%D1%82%D0%BE%D0%BA%D0%B5%D0%BD%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F-%D0%BD%D0%B0-%D0%BF%D0%BE%D0%B4%D1%81%D0%BB%D0%BE%D0%B2%D0%B0-subword-tokenization/) (subwords/wordpieces) в качестве таргетов нейронки для распознавания. Нам бы тоже не мешало перейти от графем к сабвордам. Теперь вместо букв (графем) будем распознавать кусочки слов. В качестве такого токенайзера предлагается использовать [Sentencepiece](https://github.com/google/sentencepiece). Главное — правильно обернуть его в наш класс TextTransform. Текстовый файл (`train_clean_100_text_clean.txt`) для обучения токенайзера уже подготовлен и лежит в корневой папке проекта.

In [None]:
class TextTransformBPE:
    """Text <-> subword-id mapping backed by a SentencePiece BPE model."""

    def __init__(self, train_text, vocab_size, model_prefix='m'):
        """Train a BPE model with ``vocab_size`` units and load it.

        Args:
            train_text: path to the text file used to train the tokenizer.
            vocab_size: number of subword units in the trained vocabulary.
            model_prefix: prefix of the files written by the trainer; the
                model is loaded from ``<model_prefix>.model``. Use distinct
                prefixes to keep tokenizers of different sizes side by side
                instead of overwriting each other.
        """
        # Train from file; 'nfkc_cf' is the normalization rule requested
        # from SentencePiece.
        spm.SentencePieceTrainer.train(
            input=train_text, model_prefix=model_prefix, vocab_size=vocab_size,
            normalization_rule_name='nfkc_cf', model_type='bpe',
        )
        # Load the model that was just trained.
        self.model = spm.SentencePieceProcessor(
            model_file='{}.model'.format(model_prefix)
        )

    def text_to_int(self, text):
        """Encode ``text`` into the sequence of subword ids of the BPE model."""
        return self.model.encode(text)

    def int_to_text(self, labels):
        """Decode a sequence of subword ids back into text.

        The ids are cast to ``int`` first, so float-valued ids (for example
        the output of ``tolist()`` on a float tensor) are accepted.
        """
        return self.model.decode([int(label) for label in labels])

In [None]:
# text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=4000)

# main(output_size=4001, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
#      train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)

```python
text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=4000)

main(output_size=4001, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
     train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)
```

```
Num Model Parameters 14_284_849

Epoch  1: Average loss: 6.6286, Average CER: 0.991988 Average WER: 0.9926
Epoch  2: Average loss: 4.8681, Average CER: 0.745415 Average WER: 0.8591
Epoch  3: Average loss: 4.3174, Average CER: 0.651380 Average WER: 0.7959
Epoch  4: Average loss: 3.7446, Average CER: 0.567355 Average WER: 0.7276
Epoch  5: Average loss: 3.1421, Average CER: 0.465592 Average WER: 0.6465
Epoch  6: Average loss: 2.7075, Average CER: 0.398419 Average WER: 0.5818
Epoch  7: Average loss: 2.4468, Average CER: 0.359280 Average WER: 0.5419
Epoch  8: Average loss: 2.2605, Average CER: 0.324294 Average WER: 0.5165
Epoch  9: Average loss: 2.1211, Average CER: 0.303219 Average WER: 0.4925
Epoch 10: Average loss: 2.0560, Average CER: 0.293074 Average WER: 0.4815

Each epoch takes ~11 minutes to run.
```

CER для этой модели выше, чем для модели с обычным `TextTransform`: это связано с тем, что число возможных выходов намного больше (4001 класс против 29), и модели сложнее выбирать правильный. Но итоговая ошибка WER стала меньше. Ниже мы попробуем добиться ещё лучших результатов.

### <b>Задание №2</b> (5 баллов):
Импровизация по улучшению качества распознавания.

Увеличим размер словаря до 10000, размерность вектора внимания `attention_dim` до 512, число слоёв трансформера до 16, увеличим максимальное значение `learning_rate` до 0.01. Размер батча при этом уменьшен до 6 (до 4 на тесте):

In [None]:
# text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=10000)

# main(output_size=10001, learning_rate=1e-2, batch_size=6, test_batch_size=4, epochs=10,
#      train_url="train-clean-100", test_url="test-clean", attention_dim=512, num_layers=16)

```python
text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=10000)

main(output_size=10001, learning_rate=1e-2, batch_size=6, test_batch_size=4, epochs=10,
     train_url="train-clean-100", test_url="test-clean", attention_dim=512, num_layers=16)
```

```
Num Model Parameters 39_113_841

Epoch  1: Average loss: 6.2865, Average CER: 0.986621 Average WER: 0.9878
Epoch  2: Average loss: 6.1903, Average CER: 0.883747 Average WER: 0.9467
Epoch  3: Average loss: 5.9818, Average CER: 0.851486 Average WER: 0.9364
Epoch  4: Average loss: 5.8568, Average CER: 0.796728 Average WER: 0.9136
Epoch  5: Average loss: 5.8445, Average CER: 0.843930 Average WER: 0.9451
Epoch  6: Average loss: 5.5203, Average CER: 0.809913 Average WER: 0.9250
Epoch  7: Average loss: 5.3197, Average CER: 0.737294 Average WER: 0.8970
Epoch  8: Average loss: 5.0723, Average CER: 0.749833 Average WER: 0.8892
Epoch  9: Average loss: 4.8846, Average CER: 0.717338 Average WER: 0.8749
Epoch 10: Average loss: 4.7430, Average CER: 0.706446 Average WER: 0.8638

Each epoch takes ~15 minutes to run.
```

Как можно заметить, результаты ухудшились. Во-первых, выросло количество параметров модели, но число эпох осталось тем же, в связи с чем модель могла не успеть обучиться так же хорошо, как при меньшем количестве параметров. Во-вторых, ещё сильнее выросло число возможных выходов, из-за чего модели стало сложнее делать верные предсказания (по крайней мере, на начальных эпохах).

Попробуем вернуться к первому варианту `TextTransformBPE`, но уменьшим размер словаря вдвое.

In [None]:
# text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=2000)

# main(output_size=2001, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
#      train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)

```python
text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=2000)

main(output_size=2001, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
     train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)
```

```
Num Model Parameters 13_562_849

Epoch  1: Average loss: 5.9217, Average CER: 0.933404 Average WER: 0.9800
Epoch  2: Average loss: 4.4754, Average CER: 0.761548 Average WER: 0.8589
Epoch  3: Average loss: 4.0477, Average CER: 0.657061 Average WER: 0.8101
Epoch  4: Average loss: 3.2298, Average CER: 0.486938 Average WER: 0.6984
Epoch  5: Average loss: 2.7068, Average CER: 0.400269 Average WER: 0.6330
Epoch  6: Average loss: 2.4311, Average CER: 0.358651 Average WER: 0.6001
Epoch  7: Average loss: 2.1797, Average CER: 0.329015 Average WER: 0.5538
Epoch  8: Average loss: 1.9977, Average CER: 0.296656 Average WER: 0.5237
Epoch  9: Average loss: 1.8694, Average CER: 0.275591 Average WER: 0.5008
Epoch 10: Average loss: 1.8079, Average CER: 0.266416 Average WER: 0.4870

Each epoch takes ~11 minutes to run.
```

При уменьшении размера словаря модели стало проще правильно предсказывать отдельные символы (CER уменьшилась). Ошибка WER осталась практически без изменений.

Посмотрим, что произойдёт с качеством распознавания при дальнейшем уменьшении размера словаря.

In [None]:
# text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=1000)

# main(output_size=1001, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
#      train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)

```python
text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=1000)

main(output_size=1001, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
     train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)
```

```
Num Model Parameters 13_201_849

Epoch  1: Average loss: 5.0266, Average CER: 0.823378 Average WER: 0.9272
Epoch  2: Average loss: 4.0076, Average CER: 0.679294 Average WER: 0.8462
Epoch  3: Average loss: 3.6146, Average CER: 0.614216 Average WER: 0.8040
Epoch  4: Average loss: 2.9051, Average CER: 0.446937 Average WER: 0.7264
Epoch  5: Average loss: 2.4120, Average CER: 0.381286 Average WER: 0.6491
Epoch  6: Average loss: 2.0948, Average CER: 0.332462 Average WER: 0.6040
Epoch  7: Average loss: 1.8891, Average CER: 0.301625 Average WER: 0.5645
Epoch  8: Average loss: 1.7378, Average CER: 0.277611 Average WER: 0.5341
Epoch  9: Average loss: 1.6393, Average CER: 0.258625 Average WER: 0.5125
Epoch 10: Average loss: 1.5818, Average CER: 0.247516 Average WER: 0.4969

Each epoch takes ~11 minutes to run.
```

Мы пробовали увеличивать словарь только одновременно с увеличением размерностей внутренних слоёв модели. Теперь попробуем увеличить размер словаря при тех же остальных параметрах. Ожидается рост CER и как минимум отсутствие ухудшения WER.

In [None]:
# text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=6000)

# main(output_size=6001, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
#      train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)

```python
text_transform = TextTransformBPE(train_text='train_clean_100_text_clean.txt', vocab_size=6000)

main(output_size=6001, learning_rate=1e-3, batch_size=10, test_batch_size=7, epochs=10,
     train_url="train-clean-100", test_url="test-clean", attention_dim=360, num_layers=10)
```

```
Num Model Parameters 15_006_849

Epoch  1: Average loss: 6.8542, Average CER: 0.995195 Average WER: 0.9975
Epoch  2: Average loss: 5.3233, Average CER: 0.877091 Average WER: 0.9276
Epoch  3: Average loss: 4.7425, Average CER: 0.783739 Average WER: 0.8492
Epoch  4: Average loss: 4.3134, Average CER: 0.742241 Average WER: 0.8131
Epoch  5: Average loss: 3.9123, Average CER: 0.665480 Average WER: 0.7532
Epoch  6: Average loss: 3.4705, Average CER: 0.586426 Average WER: 0.6887
Epoch  7: Average loss: 3.0790, Average CER: 0.517929 Average WER: 0.6345
Epoch  8: Average loss: 2.8149, Average CER: 0.450333 Average WER: 0.5870
Epoch  9: Average loss: 2.6452, Average CER: 0.422564 Average WER: 0.5628
Epoch 10: Average loss: 2.5583, Average CER: 0.406916 Average WER: 0.5453

Each epoch takes ~11 minutes to run.
```

Ошибка CER действительно увеличилась, но, кроме того, немного выросла и WER.