reference : https://github.com/NVIDIA/NeMo/blob/main/examples/asr/conf/conformer/conformer_ctc_bpe.yaml

In [1]:
# ML Libraries
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
from torchaudio import transforms
from torch.utils.data import Dataset
from torchmetrics.text.wer import WordErrorRate
from torchmetrics.text.cer import CharErrorRate
from torch.cuda.amp import autocast, GradScaler


# Support Libraries
import math
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm
import gc
from itertools import groupby

# Python Scripts
from conf_model import ConformerEncoder, LSTMDecoder#, ConvASRDecoder
from conf_utils import *

In [2]:
total_devices = torch.cuda.device_count()
print(f"Available GPU Devices : {total_devices}")
DEVICES = [i for i in range(0,total_devices)]
print(f"Using devices : {DEVICES}")
DEVICE = f"cuda:{DEVICES[0]}"

Available GPU Devices : 1
Using devices : [0]


In [3]:
TRAIN_BS = 20*len(DEVICES)
TEST_BS = 10*len(DEVICES)
EPOCHS = 1000
NUM_WORKERS = 4*len(DEVICES)
start_epoch = 0

In [4]:
metadata_train = pd.read_csv("/media/rathna/New Volume/datasets/Librispeech/metadata_train_clean_100.tsv", sep = '\t', header = None)
metadata_dev = pd.read_csv("/media/rathna/New Volume/datasets/Librispeech/metadata_test_clean.tsv", sep = '\t', header = None)

metadata_train = metadata_train[metadata_train[2]<=16].reset_index(drop=True)


metadata_train = metadata_train.sample(frac=1).reset_index(drop=True)

print(metadata_train)

metadata_dev = metadata_dev[metadata_dev[2]<=16].reset_index(drop=True)



metadata_dev = metadata_dev.sample(frac=1).reset_index(drop=True)

print(metadata_dev)

                                                       0  \
0      /media/rathna/New Volume/datasets/Librispeech/...   
1      /media/rathna/New Volume/datasets/Librispeech/...   
2      /media/rathna/New Volume/datasets/Librispeech/...   
3      /media/rathna/New Volume/datasets/Librispeech/...   
4      /media/rathna/New Volume/datasets/Librispeech/...   
...                                                  ...   
26747  /media/rathna/New Volume/datasets/Librispeech/...   
26748  /media/rathna/New Volume/datasets/Librispeech/...   
26749  /media/rathna/New Volume/datasets/Librispeech/...   
26750  /media/rathna/New Volume/datasets/Librispeech/...   
26751  /media/rathna/New Volume/datasets/Librispeech/...   

                                                       1       2      3  
0      through which i saw shrubs and a grass plat lo...  15.250  16000  
1      and struck his stick on the floor again my mot...  14.500  16000  
2      and i felt a kind of panic on seeing the pale ... 

USE sentence_piece_final.ipynb TO BUILD THE TOKENIZER AND BEFORE RUNNING THE CODE

In [5]:
my_file = open("/home/rathna/Desktop/tokenizer/libri_100/tokenizer.vocab", "r")
  
# reading the file
data = my_file.read()
  

vocab = data.split("\n")
print(vocab)

vocab_f = []
for i in range(3, len(vocab)-1):
    vocab_f.append(vocab[i].split("\t")[0])
print(vocab_f)

vocab_f = vocab_f+[' ']
vocab = sorted(vocab_f)
num_embeddings = len(vocab)
print(num_embeddings)
vocab_ids = [int(i) for i in range(num_embeddings)]
vocab_dict = dict(zip(vocab,vocab_ids))
print(f"Character mappings : {vocab_dict}")

['<unk>\t0', '<s>\t0', '</s>\t0', '▁\t-1.875', 'e\t-2.51473', 's\t-2.74813', 't\t-2.83066', 'a\t-2.84131', 'i\t-2.916', 'o\t-3.0554', 'r\t-3.05788', 'l\t-3.20473', 'd\t-3.30153', 'h\t-3.45137', 'u\t-3.65498', 'm\t-3.66756', 'c\t-3.69663', 'n\t-3.69821', '▁the\t-3.98861', 'y\t-4.05982', 'p\t-4.06069', 'b\t-4.18664', 'g\t-4.30007', '▁w\t-4.30797', 'f\t-4.32524', 'on\t-4.64682', 'er\t-4.66921', 'w\t-4.78911', '▁and\t-4.85567', 'k\t-4.87774', 'en\t-4.88871', '▁of\t-4.91166', 'an\t-4.92246', '▁to\t-4.94232', 'ing\t-4.98547', 'v\t-5.73803', 'x\t-6.52142', 'j\t-6.56745', 'q\t-7.45048', 'z\t-7.45048', '']
['▁', 'e', 's', 't', 'a', 'i', 'o', 'r', 'l', 'd', 'h', 'u', 'm', 'c', 'n', '▁the', 'y', 'p', 'b', 'g', '▁w', 'f', 'on', 'er', 'w', '▁and', 'k', 'en', '▁of', 'an', '▁to', 'ing', 'v', 'x', 'j', 'q', 'z']
38
Character mappings : {' ': 0, 'a': 1, 'an': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'en': 7, 'er': 8, 'f': 9, 'g': 10, 'h': 11, 'i': 12, 'ing': 13, 'j': 14, 'k': 15, 'l': 16, 'm': 17, 'n': 18, '

In [6]:
import sentencepiece as spm

s = spm.SentencePieceProcessor(model_file='/home/rathna/Desktop/tokenizer/libri_100/tokenizer.model')


import re
def sent_piece_tokenizer(text):
    text_list = re.split(" ", text)
    encoded_words = []
    for word in text_list:
        temp = s.encode(word, out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1)
        encoded_words += temp
        encoded_words.append(' ')
    encoded_words = encoded_words[:-1]
    #print(encoded_words)
    mapped_words = []
    for i in range(len(encoded_words)):
        #print(i, ':', encoded_words[i])
        mapped_words.append(vocab_dict[encoded_words[i]])
    return mapped_words

In [7]:
feat_dict = {
        "sample_rate":16000,
        "n_mels":80, #as per reference
    }

# time_masks = [torchaudio.transforms.TimeMasking(time_mask_param=15, p=0.05) for _ in range(10)]
# train_transform = nn.Sequential(
#    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=80, hop_length=160), #80 filter banks, 25ms window size, 10ms hop
#    torchaudio.transforms.FrequencyMasking(freq_mask_param=27),
#    *time_masks,
#  )
train_transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=80, hop_length=160)
validation_transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=80, hop_length=160)

In [8]:
class MyDataset(Dataset):
    """
    The Class will act as the container for our dataset. It will take your dataframe, the root path, and also the transform function for transforming the dataset.
    """
    def __init__(self, data_frame, char_dict, transform=None):
        self.data_frame = data_frame
        #self.root_dir = root_dir
        self.transform = transform
        #self.max_transcript_len = N
        self.char_dict = char_dict
    def __len__(self):
        # Return the length of the dataset
        return len(self.data_frame)
    
    def __getitem__(self, idx):
        # Return the observation based on an index. Ex. dataset[0] will return the first element from the dataset, in this case the image and the label.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        file_path = self.data_frame.iloc[idx, 0]
        waveform, sample_rate = torchaudio.load(file_path, normalize = True)
        if sample_rate!=16000:
            waveform = transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
        mel_spec = self.transform(waveform) # (batch, n_mels, time)
        mel_spec = mel_spec.squeeze(0).transpose(0,1)  # (time, n_mels)
        mel_spec_len = ((mel_spec.shape[0] - 1) // 2 - 1) // 2
        
        transcript = self.data_frame.iloc[idx,1]
        transcript = str(transcript)
        input_transcript_list = sent_piece_tokenizer(transcript)
#         for char in transcript:
#             input_transcript_list.append(self.char_dict[char])
        
        label_tensor = torch.LongTensor(input_transcript_list)
        label_len = len(input_transcript_list)
    
        return mel_spec, label_tensor, mel_spec_len, label_len, transcript

In [9]:
# train_size = int(len(metadata)*0.99)
# dev_size = len(metadata) - train_size

# if not os.path.exists(metadata_path):
#     metadata_train = metadata[:train_size]
#     metadata_dev   = metadata[train_size:]
#     metadata_train.to_csv("lj_metadata_train.tsv", sep='\t', header=False, index=False)
#     metadata_dev.to_csv("lj_metadata_test.tsv", sep='\t', header=False, index=False)
# else:
#     print("Using existing metadata!")
#     metadata_train = pd.read_csv("lj_metadata_train.tsv", sep='\t', header=None, encoding='utf-8')
#     metadata_dev  = pd.read_csv("lj_metadata_test.tsv", sep='\t', header=None, encoding='utf-8')
    
trainset = MyDataset(metadata_train, vocab_dict, transform=train_transform)
devset = MyDataset(metadata_dev, vocab_dict, transform=validation_transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size = TRAIN_BS, shuffle = True, collate_fn = collate_batch, drop_last=True, num_workers=NUM_WORKERS, pin_memory=True)
devloader  = torch.utils.data.DataLoader(devset,  batch_size = TEST_BS, shuffle = True, collate_fn = collate_batch, drop_last=True, num_workers=NUM_WORKERS, pin_memory=True)

In [10]:
# encoder_params = {
#     "d_input": 80,
#     "d_model": 144,
#     "num_layers": 16,
#     "conv_kernel_size": 32,
#     "dropout": 0.1,
#     "num_heads": 4
# }

# decoder_params = {
#     "d_encoder": 144,
#     "d_decoder": 320,
#     "num_layers": 1,
#     "num_classes":len(vocab)+1
# }

encoder_params = {
    "d_input": 80,
    "d_model": 176,
    "num_layers": 16,
    "conv_kernel_size": 31,
    "dropout": 0.1,
    "num_heads": 4
}

decoder_params = {
    "d_encoder": 176,
    "d_decoder": 320,
    "num_layers": 1,
    "num_classes":len(vocab)+1
}

In [11]:
class ConvASRDecoder(nn.Module):
#   '''
#     LSTM Decoder
#     Parameters:
#       d_encoder (int): Output dimension of the encoder
#       d_decoder (int): Hidden dimension of the decoder
#       num_layers (int): Number of LSTM layers to use in the decoder
#       num_classes (int): Number of output classes to predict
    
#     Inputs:
#       x (Tensor): (batch_size, time, d_encoder)
    
#     Outputs:
#       Tensor (batch_size, time, num_classes): Class prediction logits
  
#   '''
    def __init__(self, d_encoder=144, d_decoder=320, num_layers=1, num_classes=29):
        super(ConvASRDecoder, self).__init__()
        #self.conv = nn.Conv1d(input_size=d_encoder, hidden_size=d_decoder, num_layers=num_layers, batch_first=True)
        #in_channels, out_channels, kernel_size,
        self.conv = nn.Conv1d(in_channels=d_encoder, out_channels=d_decoder, kernel_size=num_layers, bias=True)
        self.linear = nn.Linear(d_decoder, num_classes)

    def forward(self, x):
        x = self.conv(x.transpose(1, 2)).transpose(1,2)
        decoder_cem = x # TRIAL FOR CEM
        logits = self.linear(x)
        return logits, decoder_cem

In [12]:
encoder = ConformerEncoder(
                      d_input=encoder_params['d_input'],
                      d_model=encoder_params['d_model'],
                      num_layers=encoder_params['num_layers'],
                      conv_kernel_size=encoder_params['conv_kernel_size'], 
                      dropout=encoder_params['dropout'],
                      num_heads=encoder_params['num_heads']
                    )
  
# decoder = LSTMDecoder(
#                   d_encoder=decoder_params['d_encoder'], 
#                   d_decoder=decoder_params['d_decoder'], 
#                   num_layers=decoder_params['num_layers'],
#                   num_classes= decoder_params['num_classes'])

decoder = ConvASRDecoder(
                  d_encoder=decoder_params['d_encoder'], 
                  d_decoder=decoder_params['d_decoder'], 
                  num_layers=decoder_params['num_layers'],
                  num_classes= decoder_params['num_classes'])


In [13]:
encoder = encoder.to(DEVICE)
decoder = decoder.to(DEVICE)

In [14]:
char_decoder =  GreedyCharacterDecoder().eval()
criterion = nn.CTCLoss(blank=len(vocab), zero_infinity=True)
#optimizer = torch.optim.AdamW(list(encoder.parameters()) + list(decoder.parameters()), lr=5e-4, betas=(.9, .98), eps = 1e-09, weight_decay=1e-6)

#CHANGED FROM NEMO

optimizer = torch.optim.AdamW(list(encoder.parameters()) + list(decoder.parameters()), lr=5.0, betas=(.9, .98), eps = 1e-09, weight_decay=1e-3)
scheduler = TransformerLrScheduler(optimizer, encoder_params['d_model'], 10000)

In [15]:
model_size(encoder, 'Encoder')
model_size(decoder, 'Decoder')

Encoder - num_params: 14.71M,  size: 56.1MB
Decoder - num_params: 0.07M,  size: 0.26MB


In [16]:
print(encoder)
print(decoder)

ConformerEncoder(
  (conv_subsample): Conv2dSubsampling(
    (conv2d_1): Conv2d(1, 176, kernel_size=(3, 3), stride=(2, 2))
    (relu): ReLU()
    (conv2d_2): Conv2d(176, 176, kernel_size=(3, 3), stride=(2, 2))
  )
  (linear_proj): Linear(in_features=3344, out_features=176, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0): ConformerBlock(
      (ff1): FeedForwardBlock(
        (layer_norm): LayerNorm((176,), eps=6.1e-05, elementwise_affine=True)
        (linear_1): Linear(in_features=176, out_features=704, bias=True)
        (silu): SiLU()
        (dropout_1): Dropout(p=0.1, inplace=False)
        (linear_2): Linear(in_features=704, out_features=176, bias=True)
        (dropout_2): Dropout(p=0.1, inplace=False)
      )
      (attention): RelativeMultiHeadAttention(
        (W_q): Linear(in_features=176, out_features=176, bias=True)
        (W_k): Linear(in_features=176, out_features=176, bias=True)
        (W_v): Linear(in_features=176, out_features=

In [17]:
gc.collect()

0

In [18]:
torch.cuda.set_device(DEVICE)
criterion = criterion.to(DEVICE)
char_decoder = char_decoder.to(DEVICE)
torch.cuda.empty_cache()

checkpoint_load_path = "conf_s_libri_best_wer.pt"

if(os.path.exists(checkpoint_load_path)):
    start_epoch, best_loss, best_wer = load_checkpoint(encoder, decoder, optimizer, scheduler, checkpoint_load_path, DEVICE)
    print(f'Resuming training from checkpoint starting at epoch {start_epoch}.')
    print(f'Current model WER : {best_wer}%')

In [19]:
def train(encoder, decoder, char_decoder, optimizer, scheduler, criterion, grad_scaler, train_loader, device): 
    wer = WordErrorRate()
    cer = CharErrorRate()
    # batch accumulation parameter #https://kozodoi.me/blog/20210219/gradient-accumulation
    accum_iter = 4  #ADDED

    encoder.train()
    decoder.train()
    avg_loss = 0
    avg_wer = 0
    avg_cer = 0
    batch_count = 0
    for batch in tqdm(train_loader):
        batch_count += 1
        scheduler.step()
        gc.collect()
        spectrograms, labels, input_lengths, label_lengths, references, mask = batch 

        spectrograms = spectrograms.squeeze(1).to(device)
        labels = labels.to(device)
        input_lengths = torch.tensor(input_lengths).to(device)
        label_lengths = torch.tensor(label_lengths).to(device)
        mask = mask.to(device)
    
        outputs, _ = encoder(spectrograms, mask)
        outputs, _ = decoder(outputs)
        loss = criterion(F.log_softmax(outputs, dim=-1).transpose(0, 1), labels, input_lengths, label_lengths)
        #loss.backward()
        # normalize loss to account for batch accumulation
        loss = loss / accum_iter #ADDED
        
        
        grad_scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 0.1)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 0.1)
        #if (i+1) % args.accumulate_iters == 0:
        if ((batch_count) % accum_iter == 0) or (batch_count == len(train_loader)):#ADDED
            grad_scaler.step(optimizer)
            grad_scaler.update()
            optimizer.zero_grad()
        #avg_loss.update(loss.detach().item())
        
        #use for small datasets
        
#         optimizer.step()
#         optimizer.zero_grad()
        
        avg_loss += loss.detach().item()
        
        inds = char_decoder(outputs.detach())
        # print("train shape:",inds.shape)
        predictions = []
        for sample in inds:
            #print(sample.shape)
            predictions.append(int_to_text(sample, len(vocab), vocab))
        avg_wer += wer(predictions, references) * 100
        avg_cer += cer(predictions, references) * 100

    avg_loss = avg_loss/batch_count
    avg_wer = avg_wer/batch_count
    avg_cer = avg_cer/batch_count
    print(f'Avg WER: {avg_wer}%, Avg Loss: {avg_loss}')  
    for i in range(5):
        print('Prediction: ', predictions[i])
        print('Reference: ', references[i])
    
    # Print metrics and predictions 
    del spectrograms, labels, input_lengths, label_lengths, references, outputs, inds, predictions
    return avg_wer, avg_cer, avg_loss
    
def validate(encoder, decoder, char_decoder, criterion, test_loader, device):
    ''' Evaluate model on test dataset. '''

    wer = WordErrorRate()
    cer = CharErrorRate()
        
    avg_loss = 0
    avg_wer = 0
    batch_count = 0
    avg_cer = 0
    
    encoder.eval()
    decoder.eval()
    for batch in tqdm(test_loader):
        gc.collect()
        batch_count += 1
        spectrograms, labels, input_lengths, label_lengths, references, mask = batch 
  
    # Move to GPU
        spectrograms = spectrograms.to(device)
        labels = labels.to(device)
        input_lengths = torch.tensor(input_lengths).to(device)
        label_lengths = torch.tensor(label_lengths).to(device)
        mask = mask.to(device)

        with torch.no_grad():
            outputs, _ = encoder(spectrograms, mask)
            outputs, _ = decoder(outputs)
            loss = criterion(F.log_softmax(outputs, dim=-1).transpose(0, 1), labels, input_lengths, label_lengths)
            avg_loss += loss.item()

            inds = char_decoder(outputs.detach())
            # print("validation shape:", inds.shape)
            predictions = []
            for sample in inds:
                predictions.append(int_to_text(sample, len(vocab), vocab))

            avg_wer += wer(predictions, references) * 100
            avg_cer += cer(predictions, references) * 100
    print(".............................TEST PREDICTIONS...........................")
    for i in range(5):
        print('Prediction: ', predictions[i])
        print('Reference: ', references[i])
    print("************************************************************************")
    
    return avg_wer/batch_count, avg_cer/batch_count, loss/batch_count 

In [20]:
checkpoint_save_path = "conf_s_libri_chk.pt"
best_loss_save_path = "conf_s_libri_best_loss.pt"
best_wer_save_path = "conf_s_libri_best_wer.pt"

In [21]:
best_loss = float('inf')
best_wer = float('inf')
optimizer.zero_grad()

use_amp = True
variational_noise_std = 0.0001

# Mixed Precision Setup
if use_amp:
    print('Using Mixed Precision')
grad_scaler = GradScaler(enabled=use_amp)

optimizer.zero_grad()
for epoch in range(start_epoch, EPOCHS):
    print(f"Epoch : {epoch+1}/{EPOCHS}")
    torch.cuda.empty_cache()
    
    #variational noise for regularization - COMMENTING ON MAY 19
#     add_model_noise(encoder, std=variational_noise_std, gpu=True)
#     add_model_noise(decoder, std=variational_noise_std, gpu=True)

    
    # Train/Validation loops
    #wer, cer, loss = train(encoder_parallel, decoder_parallel, char_decoder, optimizer, scheduler, criterion, trainloader, DEVICE) 
    wer, cer, loss = train(encoder, decoder, char_decoder, optimizer, scheduler, criterion, grad_scaler, trainloader, DEVICE) 
    valid_wer, valid_cer, valid_loss = validate(encoder, decoder, char_decoder, criterion, devloader, DEVICE)
    print(f'Epoch {epoch} - Valid WER: {valid_wer}%, Valid CER: {valid_cer}%, Valid Loss: {valid_loss}, Train WER: {wer}%, Train CER: {cer}%, Train Loss: {loss}')  
    
    # Save best model
    if valid_loss <= best_loss:
        print('Validation loss improved, saving best model.')
        best_loss = valid_loss
        save_checkpoint(encoder, decoder, optimizer, scheduler, valid_loss, epoch+1, best_loss_save_path, valid_wer, valid_cer, vocab_dict)
    
    if epoch%3==0:
        print(f'Saving checkpoint at epoch:{epoch+1}')
        save_checkpoint(encoder, decoder, optimizer, scheduler, valid_loss, epoch+1, checkpoint_save_path, valid_wer, valid_cer, vocab_dict)
    
    if valid_wer <= best_wer:
        print(f"Validation WER improved, saving best model.")
        best_wer = valid_wer
        save_checkpoint(encoder, decoder, optimizer, scheduler, valid_loss, epoch+1, best_wer_save_path, valid_wer, valid_cer, vocab_dict)

Using Mixed Precision
Epoch : 1/1000


100%|███████████████████████████████████████| 1337/1337 [12:53<00:00,  1.73it/s]


Avg WER: 99.84564208984375%, Avg Loss: 0.7495051676779428
Prediction:   t   h  t          t  h   t ss t s     h t h h  
Reference:  before the majesty of the law and the damning evidence of his guilt despite his social standing and the wealth to sustain it he sees himself alone without friend or sympathiser
Prediction:        ta h t t  ta t  e a h th  ta   t t       t e h  t  t     
Reference:  while he scarce knew if he were the more impressed with her launching it under missus stringhams nose or with her hope that he would allow to london the honour of discovery the less expansive of the white waistcoats propounded the theory that they saw in london
Prediction:     s s                      a s   t  
Reference:  perceiving her still to look doubtful and grave he added though frederick does not leave bath with us he will probably remain but a very short time perhaps only a few days behind us
Prediction:    h  t        s  t  a t  h  h t  
Reference:  yet obtaining the information he des

100%|█████████████████████████████████████████| 242/242 [00:36<00:00,  6.63it/s]


.............................TEST PREDICTIONS...........................
Prediction:   he ho a a a  o 
Reference:  those fellows are all very loyal even mainhall
Prediction:   h h ta a ae to h to a t ta he o 
Reference:  these escapades are not for old gamewell lad his day has come to twilight
Prediction:    h    o h to ha a h   h       
Reference:  to meet the needs of this conflict wretchedness has invented a language of combat which is slang
Prediction:      h    ta      a  h h to a ho  ta  t
Reference:  and hence we find the same sort of clumsiness in the timaeus of plato which characterizes the philosophical poem of lucretius
Prediction:   h  to  ta to h  e ho h to ta  
Reference:  at once the goat gave a leap escaped from the soldiers and with bowed head rushed upon the boolooroo
************************************************************************
Epoch 0 - Valid WER: 98.57537841796875%, Valid CER: 77.99987030029297%, Valid Loss: 0.011167665012180805, Train WER: 99.8456420898

100%|███████████████████████████████████████| 1337/1337 [11:22<00:00,  1.96it/s]


Avg WER: 98.8380126953125%, Avg Loss: 0.6544259240935788
Prediction:   oo o o h o  o o o  o o o o o o
Reference:  then instead of applying as he should have done to the states general who sate close to his own door
Prediction:   o o  o o o o o  o o o o o o o o o o o e o o o o  os o  o  o o o o  o
Reference:  our selling the three hundred cakes corrected rebecca you did as much as i no i didnt rebecca randall i just sat at the gate and held the horse yes but whose horse was it that took us to north riverboro
Prediction:    o o o e se e o o  o o o o o o o o e o o o o o o o a oe
Reference:  with this luigi purchased books and pencils he applied his imitative powers to everything and like giotto when young he drew on his slate sheep houses and trees
Prediction:   o o o o o o o o o o   oe   ao o o o o  o o o o o h o o o o o o he o e o o
Reference:  i know theres them as is born t own the land and them as is born to sweat ont here missus poyser paused to gasp a little and i know its christen

100%|█████████████████████████████████████████| 242/242 [00:37<00:00,  6.54it/s]


.............................TEST PREDICTIONS...........................
Prediction:   oo o o o o  oe oe
Reference:  alexander groaned i meant to but somehow i couldnt
Prediction:    o o o e o o o o o e o o o o o  
Reference:  and with it i leave you a name sif the friendly i shall hope to drink with you sometime in valhalla
Prediction:   o o  o oo o o o o  o o o o o o oo oe
Reference:  he wouldnt search so dont worry replied cyril quietly and the two looked at each other and knew that it was so
Prediction:   oe e o o e o e o e th o a e o o o o o o oe 
Reference:  to do so is to lose god altogether because god becomes intolerable when we seek to measure and to comprehend his infinite majesty
Prediction:   o o o o o o o  oe o o o o o o o o o o o o o o o o oe oe oe o o o
Reference:  the large letter contains indeed entirely feeble and ill drawn figures that is merely childish and failing work of an inferior hand it is not characteristic of gothic or any other school
*********************

100%|███████████████████████████████████████| 1337/1337 [11:00<00:00,  2.02it/s]


Avg WER: 100.68856048583984%, Avg Loss: 0.6483278477824974
Prediction:  a a a a a a a a a a a o a a a a a a a a a a a a a a a a a a a a a a a a a a o
Reference:  to set off the dignity of the prince and for the greater glory of the kings majesty then might not your worship said she be one of those that without stirring a step serve their king and lord in his court
Prediction:  h a a a a a a a es a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a se
Reference:  it was his habit to go from one fore castle mess to another and to insist upon having rather more than his share of the choice morsels from each in a short time he came to the repair shop very much the worse for wear with an impaired digestion
Prediction:  h a a a a a a a a a a a a a a a a a a a e a a a a o a a a sa sa a a a a o a a a a a a a a a a a a a
Reference:  closely following those of the early french supple airships there are several other craft which have become more or less recognised by the 

100%|█████████████████████████████████████████| 242/242 [00:36<00:00,  6.66it/s]


.............................TEST PREDICTIONS...........................
Prediction:   a a a a a si
Reference:  take him out thorkel and let him taste your sword
Prediction:  h a a a a a a a ai a  a
Reference:  i made her for only twenty oars because i thought few men would follow me for i was young fifteen years old
Prediction:   a a a a a h a a 
Reference:  to all these inquiries the count responded in the affirmative
Prediction:  h a a a a a a ad a sha e a a a a a a a a a a a a a a i
Reference:  a stage meal is popular because it proves to the audience that the actors even when called charles hawtrey or owen nares are real people just like you and me
Prediction:   ai h     o o a i
Reference:  it was in a corner that he lay among weeds and nettles
************************************************************************
Epoch 2 - Valid WER: 100.50077819824219%, Valid CER: 76.22796630859375%, Valid Loss: 0.01138945110142231, Train WER: 100.68856048583984%, Train CER: 75.2426986694336%,

100%|███████████████████████████████████████| 1337/1337 [10:44<00:00,  2.07it/s]


Avg WER: 102.62889099121094%, Avg Loss: 0.6509268800982273
Prediction:       o o   o    o o     h     o o o  
Reference:  and rough and tumble nature of the social boom the boom as in itself required that would be the note the subject of the process a comparatively minor question anything was boomable enough when nothing else was more so
Prediction:                
Reference:  it was he who invented the athletes admirable rules
Prediction:     o  o o o o     o o o o   o  o  o  o o o       o o 
Reference:  the crayfish stuck his tail into the mud he often did this when he was surprised it seemed to help him think when he had thought for a while he waved his big pinching claws and said
Prediction:     o o  o       o            o            o o  
Reference:  not from any intention on the part of the discoverer since neither he who buried the gold nor he who worked in the field intended that the money should be found but as i said it happened
Prediction:    o o o o o o o o o o o o o o o o 

100%|█████████████████████████████████████████| 242/242 [00:37<00:00,  6.54it/s]


.............................TEST PREDICTIONS...........................
Prediction:      
Reference:  men should not speculate about the nature of god
Prediction:    o o o  o o      
Reference:  a montfichet a montfichet gamewell to the rescue
Prediction:          
Reference:  could it mean to last a love set pendulous between sorrow and sorrow
Prediction:      o   o o   o  o o o o   o o      
Reference:  philip therefore read diligently in the astor library planned literary works that should compel attention and nursed his genius
Prediction:       o     o        
Reference:  i refer to the thermometer it indicates the figure is obliterated
************************************************************************
Epoch 3 - Valid WER: 99.98868560791016%, Valid CER: 81.5785903930664%, Valid Loss: 0.010802826844155788, Train WER: 102.62889099121094%, Train CER: 75.58157348632812%, Train Loss: 0.6509268800982273
Validation loss improved, saving best model.
Saving checkpoint at epoch:4
Epoc

100%|███████████████████████████████████████| 1337/1337 [11:45<00:00,  1.90it/s]


Avg WER: 102.01261138916016%, Avg Loss: 0.6558482706680526
Prediction:   a a aa o a a a aa a aa a aa a a aa a a a a a a a a a a a a a a a o
Reference:  judge mullowny addressed the prisoners with many high sounding words about the seriousness of obstructing the traffic in the national capital and inadvertently slipped into a discourse on russia and the dangers of revolution
Prediction:   a o a a a a a a a a aa a ao a a a a aa  aa aa o a a a aa a a a a a o o a 
Reference:  who can follow an animal which can traverse the sea of ice and inhabit caves and dens where no man would venture to intrude
Prediction:   a a aa a a a a a a a a aa a a a a a a a a aa a a oa a a a a a a a a a a a
Reference:  had struck deadly blows at the heart of each others empire and harried the inmost provinces up to the gates of each others capitals the persian had turned the wild hordes of the avars loose on thrace
Prediction:  h a a a a a a a a  a a a a a a a oa aa a aa a a a a a a a a aa a a a a a a
Reference: 

100%|█████████████████████████████████████████| 242/242 [00:38<00:00,  6.33it/s]


.............................TEST PREDICTIONS...........................
Prediction:   a   
Reference:  robin entered the hut dragging the unwilling esquire after him
Prediction:   a  a 
Reference:  indeed there were only one or two strangers who could be admitted among the sisters without producing the same result
Prediction:   a a 
Reference:  nancys curly chestnut crop shone in the sun and olives thick black plaits looked blacker by contrast
Prediction:   a  a 
Reference:  tis now winter out of doors thought the tree
Prediction:   a  a 
Reference:  by reason and affection
************************************************************************
Epoch 4 - Valid WER: 98.49299621582031%, Valid CER: 92.96858215332031%, Valid Loss: 0.012517726980149746, Train WER: 102.01261138916016%, Train CER: 77.20291137695312%, Train Loss: 0.6558482706680526
Validation WER improved, saving best model.
Epoch : 6/1000


  8%|███                                     | 104/1337 [00:57<11:21,  1.81it/s]


KeyboardInterrupt: 