In [1]:
import torch
from tqdm import tqdm

In [2]:
from hw_asr.tests.utils import clear_log_folder_after_use
from hw_asr.utils.object_loading import get_dataloaders
from hw_asr.utils.parse_config import ConfigParser


config_parser = ConfigParser.get_debug_configs()
sample_rate = config_parser.config["preprocessing"]["sr"]
with clear_log_folder_after_use(config_parser):
    dataloaders, _ = get_dataloaders(config_parser, config_parser.get_text_encoder())

1 (0.0%) records are longer then 20.0 seconds. Excluding them.
13243 (46.4%) records are longer then 200 characters. Excluding them.
Filtered 13243(46.4%) records  from dataset
61 (2.3%) records are longer then 20.0 seconds. Excluding them.
292 (10.8%) records are longer then 200 characters. Excluding them.
Filtered 292(10.8%) records  from dataset
41 (1.4%) records are longer then 20.0 seconds. Excluding them.
201 (7.0%) records are longer then 200 characters. Excluding them.
Filtered 201(7.0%) records  from dataset


In [3]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
from hw_asr.model.deep_speech import DeepSpeech2

model = DeepSpeech2(n_feats=128, n_class=28)
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
checkpoint = torch.load(r'saved/models/deep_speech_2/1024_134159/checkpoint-epoch61.pth', map_location=device)
model.load_state_dict(checkpoint['state_dict'])


def move_batch_to_device(batch):
    batch = batch.copy()
    for tensor_for_gpu in ["spectrogram", "text_encoded"]:
        batch[tensor_for_gpu] = batch[tensor_for_gpu].to(device)
    return batch


model = model.to(device)

N_BATCHES = 20
batches = []
for b in dataloaders['val-other']:
    batches.append(b)
    if len(batches) == N_BATCHES:
        break

In [15]:
with torch.no_grad():
    log_probs = []
    log_probs_length = []
    text = []
    for b in tqdm(batches):
        output = model(**move_batch_to_device(b))
        output["log_probs"] = torch.log_softmax(output["logits"], dim=-1)
        output["log_probs_length"] = model.transform_input_lengths(b["spectrogram_length"])
        for i in range(len(b['text'])):
            log_probs.append(output['log_probs'][i])
            log_probs_length.append(output['log_probs_length'][i])
            text.append(b['text'][i])

100%|██████████| 20/20 [01:04<00:00,  3.23s/it]


In [6]:
N_EXAMPLES = N_BATCHES * dataloaders['val-other'].batch_size

In [7]:
from hw_asr.text_encoder.ctc_char_text_encoder import CTCCharTextEncoder

text_encoder = CTCCharTextEncoder()

pred_argmax = []
for i in range(N_EXAMPLES):
    log_prob_vec = torch.argmax(log_probs[i].cpu(), dim=-1).numpy()
    pred_text = text_encoder.ctc_decode_enhanced(log_prob_vec[:log_probs_length[i]])
    pred_argmax.append(pred_text)

pred_beam_search = [text_encoder.ctc_beam_search(log_probs[i], log_probs_length[i], beam_size=5)[0].text for i in tqdm(range(N_EXAMPLES))]

100%|██████████| 640/640 [00:28<00:00, 22.76it/s]


In [102]:
import gzip
import shutil
import os
from pathlib import Path
from speechbrain.utils.data_utils import download_file

LM_MODELS_DIRECTORY = Path('lm_models/')
LM_MODELS_DIRECTORY.mkdir(exist_ok=True)

MODEL_URL = 'https://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz'
VOCAB_URL = 'http://www.openslr.org/resources/11/librispeech-vocab.txt'

MODEL_PATH = LM_MODELS_DIRECTORY / '3-gram.pruned.1e-7.arpa'
VOCAB_PATH = LM_MODELS_DIRECTORY / 'librispeech-vocab.txt'

def download_lm():
    if not MODEL_PATH.exists():
        extract_path = LM_MODELS_DIRECTORY / '3-gram.pruned.1e-7.arpa.gz'
        # Download file
        download_file(MODEL_URL, extract_path)
        # Extract file
        with gzip.open(extract_path, 'rb') as f_in, open(MODEL_PATH, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(str(extract_path))
        # Convert to lowercase
        with open(MODEL_PATH) as f:
            content = f.read()
        with open(MODEL_PATH, 'w') as f:
            f.write(content.lower().replace("\'", '').replace("\"", ''))
    download_file(VOCAB_URL, VOCAB_PATH)


download_lm()

Downloading https://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz to lm_models/3-gram.pruned.1e-7.arpa.gz


3-gram.pruned.1e-7.arpa.gz: 34.1MB [00:08, 3.96MB/s]                            


Downloading http://www.openslr.org/resources/11/librispeech-vocab.txt to lm_models/librispeech-vocab.txt


librispeech-vocab.txt: 1.74MB [00:01, 1.15MB/s]                            


In [103]:
from pyctcdecode import build_ctcdecoder


with open(VOCAB_PATH) as f:
    unigram_list = [t.lower() for t in f.read().strip().split("\n")]


decoder = build_ctcdecoder(
    [''] + text_encoder.alphabet,
    str(MODEL_PATH),
    unigram_list,
)

Using arpa instead of binary LM file, decoder instantiation might be slow.
Alphabet determined to be of regular style.


Loading the LM will be faster if you build a binary file.
Reading /home/ubuntu/asr_project/lm_models/3-gram.pruned.1e-7.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [201]:
from utils import reload

reload('hw_asr')

from hw_asr.text_encoder.ctc_char_text_encoder import CTCCharTextEncoder

encoder = CTCCharTextEncoder()

In [204]:
import multiprocessing

encoder.load_lm()
with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
    pred_lm = encoder.ctc_beam_search_lm(log_probs, log_probs_length, beam_size=2000, pool=pool)

In [209]:
from textblob import TextBlob


def autocorrect_sentence(sentence):
    blob = TextBlob(sentence)
    corrected_sentence = blob.correct()
    return str(corrected_sentence)

pred_corr = [autocorrect_sentence(s) for s in pred_lm]

In [210]:
import numpy as np
from hw_asr.metric.utils import calc_cer, calc_wer


def print_wer_cer(targets, predictions):
    assert len(targets) == len(predictions)
    print(f'examples = {len(targets)}')
    wer = np.mean([calc_wer(target, prediction) for target, prediction in zip(targets, predictions)])
    cer = np.mean([calc_cer(target, prediction) for target, prediction in zip(targets, predictions)])
    print(f'WER = {wer * 100:.3f}\tCER = {cer * 100:.2f}')


print('ArgMax')
print_wer_cer(text, pred_argmax)
print('BeamSearch')
print_wer_cer(text, pred_beam_search)
print('BeamSearch + LM')
print_wer_cer(text, pred_lm)
print('BeamSearch + LM + correction')
print_wer_cer(text, pred_corr)

ArgMax
examples = 640
WER = 32.091	CER = 12.14
BeamSearch
examples = 640
WER = 31.610	CER = 11.96
BeamSearch + LM
examples = 640
WER = 21.997	CER = 9.76
BeamSearch + LM + correction
examples = 640
WER = 22.722	CER = 10.39


In [336]:
N = N_EXAMPLES
N = 10
pred_lm = []

text_encoder.alpha_len = 2.35
text_encoder.alpha_lm = 0.5

text_encoder.use_lm = True
for i in range(N):
    print(f'{i + 1}/{N}')
    pred_lm.append(text_encoder.ctc_beam_search(log_probs[i], log_probs_length[i], beam_size=30)[0].text)

print('ArgMax')
print_wer_cer(text[:N], pred_argmax[:N])
print('BeamSearch')
print_wer_cer(text[:N], pred_beam_search[:N])
print('BeamSearch + LM')
print_wer_cer(text[:N], pred_lm[:N])

1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
10/10
ArgMax
examples = 10
WER = 33.289	CER = 12.69
BeamSearch
examples = 10
WER = 31.825	CER = 11.85
BeamSearch + LM
examples = 10
WER = 32.225	CER = 12.54


In [328]:
print_wer_cer(text[:N], [text_encoder.ctc_beam_search(log_probs[i], log_probs_length[i], beam_size=30)[0].text for i in tqdm(range(N))])

100%|██████████| 100/100 [00:27<00:00,  3.65it/s]

examples = 100
WER = 34.067	CER = 13.43





In [322]:
lm.log_s('i', eos=False) / 3

-0.37425800000000226

In [299]:
pred_beam_search[N - 1]

'but now the brandon was a ful swing'

In [297]:
pred_lm[-1]

'but now the brandon was a fuol swing'