Created with assistance from tutorials and adapted from Sources:
1. https://github.com/Kyubyong/g2p - used for base notebook layout and initial introduction into structure of a G2P model
2. https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e#58f2 - assistance in understanding various concepts and functions in [1]
2. https://github.com/bentrevett/pytorch-seq2seq
3. https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

4. Further assistance on DataLoader and BucketIterator from: https://colab.research.google.com/github/gmihaila/ml_things/blob/master/notebooks/pytorch/pytorchtext_bucketiterator.ipynb#scrollTo=ChQWVc4IUUPb


# Imports and Initialization of Config Class/File

In [853]:
from io import open
import unicodedata
import string
import re
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
from distance import levenshtein
import os
import math

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils import data
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
#from torchtext.data import BucketIterator


seed = '4'
if seed is not None:
    random.seed(seed)

In [854]:
#cfg class which is easliy translatible into a cfg file

class Config:
    seed = '5'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dict_name = 'rcrl_apd.1.4.1.txt'
    epochs = 35
    batch_size = 128
    hidden_dim = 128
    embed_dim = 64
    dropout = 0.5
    dec_max_len = 30
    MAX_LENGTH = 20
    teacher_forcing_ratio = 0.5
    n_layers = 2
    lr = 0.001


cfg = Config()

In [855]:
print(cfg.device)

cuda


# Preparation of Datasets(using torchtext)

In [856]:
def dict_sorting (dict_file_name):
    dict_file = open(dict_file_name, 'r')
    lines_dict = dict_file.readlines()
    dict_file.close()

    graphemes = []
    phonemes = []

    for i in range(0, len(lines_dict)):
        lines_dict[i] = lines_dict[i].split()
        graphemes.append([*lines_dict[i][0]])
        phonemes.append(lines_dict[i][1:])
    phonemes = [a for b in phonemes for a in b]
    graphemes = [a for b in graphemes for a in b]
    graphemes = sorted(set(graphemes))
    phonemes = sorted(set(phonemes))
    return graphemes, phonemes

dict_file_name = cfg.dict_name
g_seq, p_seq = dict_sorting(dict_file_name)
cfg.graphemes = ["<pad>", "<unk>", "</s>"] + g_seq
cfg.phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + p_seq

#Index to grapheme and phones for vectors
cfg.graph2index = {g: idx for idx, g in enumerate(cfg.graphemes)}
cfg.index2graph = {idx: g for idx, g in enumerate(cfg.graphemes)}

cfg.phone2index = {p: idx for idx, p in enumerate(cfg.phonemes)}
cfg.index2phone = {idx: p for idx, p in enumerate(cfg.phonemes)}


print(cfg.graphemes)
print(cfg.phonemes)
cfg.g_vocab_size = len(cfg.graphemes)
cfg.p_vocab_size = len(cfg.phonemes)
print(cfg.g_vocab_size, cfg.p_vocab_size)
print(cfg.phone2index)
print(cfg.index2phone)

['<pad>', '<unk>', '</s>', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'á', 'ä', 'è', 'é', 'ê', 'ë', 'í', 'ï', 'ó', 'ô', 'ö', 'ú', 'û']
['<pad>', '<unk>', '<s>', '</s>', '2:', '9', '9y', '@', '@i', '@u', 'A:', 'E', 'N', 'O', 'Of', 'S', 'Z', 'a', 'b', 'd', 'e', 'f', 'g', 'h_', 'i', 'i@', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'u', 'u@', 'v', 'w', 'x', 'y', 'z', '{']
43 43
{'<pad>': 0, '<unk>': 1, '<s>': 2, '</s>': 3, '2:': 4, '9': 5, '9y': 6, '@': 7, '@i': 8, '@u': 9, 'A:': 10, 'E': 11, 'N': 12, 'O': 13, 'Of': 14, 'S': 15, 'Z': 16, 'a': 17, 'b': 18, 'd': 19, 'e': 20, 'f': 21, 'g': 22, 'h_': 23, 'i': 24, 'i@': 25, 'j': 26, 'k': 27, 'l': 28, 'm': 29, 'n': 30, 'p': 31, 'r': 32, 's': 33, 't': 34, 'u': 35, 'u@': 36, 'v': 37, 'w': 38, 'x': 39, 'y': 40, 'z': 41, '{': 42}
{0: '<pad>', 1: '<unk>', 2: '<s>', 3: '</s>', 4: '2:', 5: '9', 6: '9y', 7: '@', 8: '@i', 9: '@u', 10: 'A:', 11: 'E', 12: 'N', 13:

In [857]:
def DataLoading(dict_file_name):
    def sortingWP (d):
        w, p = [], []
        for i in range(0, len(d)):
            #w.append(d[i][0])
            w.append(' '.join(d[i][0]))
            p.append((' '.join(d[i][1:])))
        return w,p

    with open(dict_file_name) as f:
        dict_lines = f.readlines()
    vocab_len = len(dict_lines)
    print(vocab_len)
    random.shuffle(dict_lines)

    #Potential to add 'n to train dataset
    
    for i in range(0, len(dict_lines)):
        dict_lines[i] = dict_lines[i].split()
    train_data_lines, test_data_lines, eval_data_lines = [], [], []
    train_data_lines = dict_lines[0:int(0.8*vocab_len)]
    test_data_lines = dict_lines[int(0.8*vocab_len):int(0.9*vocab_len)]
    eval_data_lines = dict_lines[int(0.9*vocab_len):]

    train_word, train_phonemes = sortingWP(train_data_lines)
    test_word, test_phonemes = sortingWP(test_data_lines)
    eval_word, eval_phonemes = sortingWP(eval_data_lines)
    
    return train_word, train_phonemes, test_word, test_phonemes, eval_word, eval_phonemes

In [858]:
train_word, train_phonemes, test_word, test_phonemes, eval_word, eval_phonemes = DataLoading(cfg.dict_name)

#Sanity Check
print(len(test_phonemes))
print(len(train_phonemes))
print(len(eval_phonemes))
print(eval_word[0])
print(eval_phonemes[0])

24175
2417
19340
2418
p r i v a a t r e g
p r i f A: t r { x


# Data Encoder & Decoder
converts data to their dictionary equivalents based on indices(And decoder which will be used when finally checking sequences)

In [859]:
def data_encoder(seq, isWord):
    # Automatically encoders sequence with graph2index if words
    tokenized_seq = []
    if isWord: 
        seq = [*seq] + ['</s>']
        seq = [i for i in seq if i!=" "]
        for i in seq:
            a = cfg.graph2index[i]
            tokenized_seq.append(a)
    #Else simply add end of sequence token to to phoneme sequences
    else:
        a = '<s> ' + str(seq) +' </s>'
        seq = a.split(" ")
        ans = ""
        for i in seq:
            if i== 'o': i="O"
            elif i== 'h': i="h_"
            a = cfg.phone2index[i]
            #ans = ans +", " + a
            tokenized_seq.append(a)
        #tokenized_seq = ans

    #Tokenize sequence
    return tokenized_seq

In [860]:
print(train_word[0])
print(train_phonemes[0])
a = data_encoder(train_word[0], 1)
b = data_encoder(train_phonemes[0], 0)
print(a)
print(b)

o n t s n a p
O n t s n a p
[18, 17, 23, 22, 17, 4, 19, 2]
[2, 13, 30, 34, 33, 30, 17, 31, 3]


In [861]:
def data_decoder(sequence, isWord):
    """Converts index sequence back into corresponding letter tokens"""
    if isWord: tokenizer = cfg.index2graph
    else: tokenizer = cfg.index2phone
    converted_sequence = []
    for i in sequence:
        if tokenizer[i] == "</s>": break
        a = tokenizer[i]
        converted_sequence.append(a)
    return converted_sequence

In [862]:
a = [14, 4, 19, 23, 8, 12, 17, 2]
b = [27, 17, 31, 34, 8, 30, 3]
a = data_decoder(a, 1)
b = data_decoder(b, 0)
print(a)
print(b)

['k', 'a', 'p', 't', 'e', 'i', 'n']
['k', 'a', 'p', 't', '@i', 'n']


# Dataset Class 
(Adapted from https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#dataset-class)

In [863]:
class G2PData (data.Dataset):
    def __init__(self, graphemes, phonemes):
        self.graphemes = graphemes
        self.phonemes = phonemes

    def __len__(self):
        return len(self.graphemes)

    def __getitem__(self, index):
        graphemes = self.graphemes[index]
        phonemes = self.phonemes[index]

        #Fetches encoded versions
        grapheme_vector = data_encoder(graphemes, 1)
        phoneme_vector = data_encoder(phonemes, 0)

        #Omits </s> character
        decoder_inputs = phoneme_vector[:-1]
        phoneme_vector = phoneme_vector[1:]

        #Used for padding purposes
        g_vec_len = len(grapheme_vector) 
        p_vec_len = len(phoneme_vector)
        
        return grapheme_vector, phoneme_vector, decoder_inputs, g_vec_len, p_vec_len, graphemes, phonemes


In [864]:
trainDataset = G2PData(train_word, train_phonemes)
testDataset = G2PData(test_word, test_phonemes)
evalDataset = G2PData(eval_word, eval_phonemes)

Padding: To ensure datasets in the same batch are of the same length (Could also use bucketiterator to choose strings)

# Model Initialization
Based on seq2seq tutorial for Machine Translation (https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb)

Potential implementation of Attention: https://www.kaggle.com/code/omershect/learning-pytorch-seq2seq-with-m5-data-set/notebook

In [865]:
class Encoder(nn.Module):
    def __init__(self, embed_dim, hidden_dim, g_vocab_size, n_layers, dropout):
        super().__init__()
        self.embed = embed_dim
        self.hidden = hidden_dim
        self.embed = nn.Embedding(g_vocab_size, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first = True)
    
    def forward(self, graph_seq, graph_seq_len):
        embed_inputs = self.embed(graph_seq)
        inputs = self.dropout(embed_inputs)

        #https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html
        #packs padded sequences into tensor
        input_tensor = pack_padded_sequence(inputs, graph_seq_len, batch_first=True, enforce_sorted=False)
        output, (hidden, context) = self.lstm(input_tensor)

        return hidden, context

In [866]:
class Decoder(nn.Module):
    def __init__(self, embed_dim, hidden_dim, p_vocab_size, n_layers, dropout):
        super().__init__()
        self.embed = embed_dim
        self.hidden = hidden_dim
        self.embed = nn.Embedding(p_vocab_size, embed_dim)
        self.dropout = nn.Dropout(dropout) 
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first = True)
        self.fc = nn.Linear(hidden_dim ,p_vocab_size) #Predicts output
    
    def forward(self, decoder_inputs, hidden_init, context_init):

        
        embed_inputs = self.embed(decoder_inputs)
        inputs = self.dropout(embed_inputs)

        #is already a tensor

        output, (hidden, context) = self.lstm(inputs, (hidden_init, context_init))


        #Scaling output
        activation_output = self.fc(output)
        
        
        return activation_output, hidden,context

In [867]:
class G2PModel(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.enc = encoder
        self.dec = decoder
        self.device = device
        
    def forward(self, graph_seq, graph_seq_len,phone_seq_len, decoder_inputs, phoneme_target_vec = None, training = False, teacher_forcing = cfg.teacher_forcing_ratio):
        
        #Obtain hidden and context vectors from encoder
        hidden_init, context_init = self.enc(graph_seq, graph_seq_len)
        hidden, context = hidden_init, context_init

        max_len = max(phone_seq_len)

        phoneme_input_vec = decoder_inputs[:, :1]
        outputs = [] 
        phone_pred_seq = []
        if training:
            for i in range(0, max_len):

                output, hidden, context = self.dec(phoneme_input_vec ,hidden, context)
                outputs.append(output)
                #phone_pred = torch.tensor(output.argmax(-1))

                if random.random() > teacher_forcing: 
                    phoneme_input_vec = phoneme_target_vec[:,i]
                    
                else:  phoneme_input_vec = decoder_inputs[:,i]
                phoneme_input_vec = torch.unsqueeze(phoneme_input_vec,1)

        else: #for evaluation/prediction
            for i in range(1, cfg.dec_max_len):
                output, hidden, context = self.dec(phoneme_input_vec ,hidden, context)
                
                phone_pred = output.argmax(-1)
                outputs.append(output)
                phone_pred_seq.append(phone_pred)
                phoneme_input_vec = phone_pred
                #print(i)
                #print(phoneme_input_vec.shape)
            phone_pred_seq = torch.cat(phone_pred_seq, 1)
            


        output = torch.cat(outputs, 1)
        
        return output, phone_pred_seq
        
    

In [868]:
# def init_weights(m):
#     for name, param in m.named_parameters():
#         nn.init.uniform_(param.data, -0.08, 0.08)
        
# model.apply(init_weights)

# Iterators Initialization and Padding

Padding_data takes a batch and pads it for every iteration

In [869]:
def padding_data(batch):

    #Each sequence has a form:
    # grapheme_vector, phoneme_vector, decoder_inputs, g_vec_len, p_vec_len, graphemes, phonemes

    def get_components(batch, index):
        ans = []
        for i in batch:
            ans.append(i[index])
        return ans
    
    def pad_seq(batch, index, max_len):
        #ans = np.empty(cfg.batch_size)
        ans = []
        no_zeros_to_add = 0
        for i in batch:
            no_zeros_to_add = max_len - len(i[index])
            ans.append(i[index] + [0] * no_zeros_to_add)
        return torch.LongTensor(ans)
    
    #input_lens = get_components(batch, 3)
    grapheme_lens = [len(g[0]) for g in batch]

    #output_lens = get_components(batch, 4)
    phonemes_lens = [len(p[1]) for p in batch]

    graphemes = get_components(batch, 5)
    phonemes = get_components(batch, -1)

    input_maxlen = max(grapheme_lens)
    output_maxlen = max(phonemes_lens)
    padded_inputs = pad_seq(batch, 0, input_maxlen)
    padded_outputs = pad_seq(batch, 1, output_maxlen)
    padded_decoder_inputs = pad_seq(batch, 2, output_maxlen)

    return padded_inputs, padded_outputs, padded_decoder_inputs, grapheme_lens, phonemes_lens, graphemes, phonemes

In [870]:
#Data Loader Implementation
#Shuffling not required as already loaded in a shuffled manner

train_iter =  data.DataLoader(trainDataset,batch_size=cfg.batch_size, shuffle=True, collate_fn=padding_data)
test_iter = data.DataLoader( testDataset,batch_size=cfg.batch_size, shuffle=False, collate_fn=padding_data)
eval_iter = data.DataLoader(evalDataset,batch_size=cfg.batch_size, shuffle=False, collate_fn=padding_data)

# Training and Evaluation Functions

In [871]:
encoder = Encoder(cfg.embed_dim, cfg.hidden_dim, cfg.g_vocab_size, cfg.n_layers, cfg.dropout)
decoder = Decoder(cfg.embed_dim, cfg.hidden_dim, cfg.g_vocab_size, cfg.n_layers, cfg.dropout)

model = G2PModel(encoder, decoder, cfg.device)
print(torch.cuda.is_available())
model.to(device=cfg.device)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')


True
The model has 473,899 trainable parameters


Optimizer and Criterion Initializer

In [872]:
optimizer = optim.Adam(model.parameters(), lr=cfg.lr)
criterion = nn.CrossEntropyLoss(ignore_index=0) #Ignores index corresponding to padding

In [873]:
def train(model, iterator, optimizer, criterion, device):
    model.train() #sets model in training model
    
    loss_epoch = 0
    #print("Training")

    for i, batch in enumerate(iterator):
        grapheme_vector, phoneme_vector, decoder_inputs, g_vec_len, p_vec_len, graphemes, phonemes = batch

        
        #Placing vectors in GPU to streamline process
        grapheme_vector = grapheme_vector.to(device)
        phoneme_vector = phoneme_vector.to(device)
        decoder_inputs = phoneme_vector.to(device)

        optimizer.zero_grad() #Sets all gradients to zero

        #graph_seq, graph_seq_len, decoder_inputs, training = False, teacher_forcing = cfg.teacher_forcing_ratio

        phoneme_pred,_ = model(grapheme_vector, g_vec_len,p_vec_len, decoder_inputs,phoneme_vector, True)

        #phoneme_pred is in shape (batchsize, N, p_vocab_size) -> need to drop the last diameter
        # print(f"real shape: {phoneme_vector.shape}")
        # print(f"pred shape: {phoneme_pred.shape}")
        phoneme_pred = phoneme_pred.view(-1, phoneme_pred.shape[-1])
        phoneme_vector = phoneme_vector.view(-1)

        loss = criterion(phoneme_pred, phoneme_vector)

        

        loss.backward()

        optimizer.step()

        #Print loss every 50 batches
        # if (i % 50 == 0) and (i != 0) and i <100 : print(f" {i} batches completed: train loss: {loss}")
        # elif (i % 50 == 0) and (i != 0) : print(f"{i} batches completed: train loss: {loss}")

        loss_epoch += loss.item()


    
    return loss_epoch
        
        




In [874]:
def evaluate(model, iterator, device):
    model.eval()

    with torch.no_grad():

        for i, batch in enumerate(iterator):
            
            grapheme_vector, phoneme_vector, decoder_inputs, g_vec_len, p_vec_len, graphemes, phonemes = batch

            grapheme_vector = grapheme_vector.to(device)
            phoneme_vector = phoneme_vector.to(device)
            decoder_inputs = phoneme_vector.to(device)

            optimizer.zero_grad() #Sets all gradients to zero

            #graph_seq, graph_seq_len, decoder_inputs, training = False, teacher_forcing = cfg.teacher_forcing_ratio

            phoneme_pred, phoneme_pred_sequence = model(grapheme_vector, g_vec_len,p_vec_len, decoder_inputs,phoneme_vector, False) #False means only prediction and no training & teacher-forcing
            
            # First move tensor to CPU then to numpy array for decoding source: https://stackoverflow.com/questions/49768306/pytorch-tensor-to-numpy-array

            print(f"batch: {i}")
            for j in range(5):
                print(data_decoder(phoneme_vector[j].cpu().numpy(), 0))
                print(data_decoder(phoneme_pred_sequence[j].cpu().numpy(), 0))

    

In [875]:
#Phoneme Error Rate
def determine_PER(true_target, pred_target):
    total_phonemes, errors = 0, 0
    for c_true, c_pred in zip(true_target, pred_target):
        total_phonemes += len(c_true)
        errors += levenshtein(c_true, c_pred)

    PER = errors/total_phonemes
    return PER, errors

In [876]:
prev_best_loss = math.inf
for i in range(0, cfg.epochs):
    print(f"Epoch {i+1}")
    loss = train(model, train_iter, optimizer, criterion, cfg.device)
    #evaluate(model, eval_iter, cfg.device)
    print(loss)
    if prev_best_loss > loss:
        prev_best_loss = loss
        print("Loss decreased")


Epoch 1
424.0776467323303
Loss decreased
Epoch 2
293.98299288749695
Loss decreased
Epoch 3
219.5517120361328
Loss decreased
Epoch 4
173.93285381793976
Loss decreased
Epoch 5
141.00932800769806
Loss decreased
Epoch 6
116.83236026763916
Loss decreased
Epoch 7
98.74001979827881
Loss decreased
Epoch 8
84.99255195260048
Loss decreased
Epoch 9
73.69068428874016
Loss decreased
Epoch 10
65.2973039150238
Loss decreased
Epoch 11
58.738361060619354
Loss decreased
Epoch 12
52.65112681686878
Loss decreased
Epoch 13
47.99060997366905
Loss decreased
Epoch 14
44.106254264712334
Loss decreased
Epoch 15
40.88694451749325
Loss decreased
Epoch 16
37.782272189855576
Loss decreased
Epoch 17
34.815278336405754
Loss decreased
Epoch 18
32.69939823448658
Loss decreased
Epoch 19
30.412136405706406
Loss decreased
Epoch 20
28.270658925175667
Loss decreased
Epoch 21
26.727284617722034
Loss decreased
Epoch 22
24.834209382534027
Loss decreased
Epoch 23


KeyboardInterrupt: 

In [877]:
evaluate(model, eval_iter, cfg.device)

batch: 0
['p', 'r', 'i', 'f', 'A:', 't', 'r', '{', 'x']
['p', 'r', 'i', 'f', 'A:', 'r', 't', 'x', '@']
['r', '{', 'x', 's', 'A:', 'k', '@']
['r', '{', 'x', 's', 'A:', 'k', '@']
['s', 'a', 'f', 'A:', 'r', 'i']
['s', 'a', 'f', 'A:', 'r', '@']
['a', 'r', 'p', 'E', 'd', 'Z', 'i', 'u']
['a', 'r', 'p', '@', 'x', '@', 'x']
['l', '@', 'N', 'k', '@', 'r', 'k', 'a', 'n', 't']
['l', '@', 'N', 'k', '@', 'r', 'k', 'a', 'n', 't']
batch: 1
['f', '{', 'r', 'd', '@', 'r']
['f', '@', 'r', 'd', '@', 'r']
['v', '{', 'x', 'k', 'O', 'm']
['v', '{', 'x', 'k', 'O', 'm']
['{', 'x']
['{', 'x']
['v', '{', 'r', 'k', 'i']
['v', '{', 'r', 'k', 'i']
['i@', 'v', '@']
['i@', 'v', '@']
batch: 2
['d', 'r', 'u@', 'm']
['d', 'r', 'u@', 'm']
['b', '@', 'k', 'l', '@', 'N', 'k']
['b', '@', 'k', 'l', '@', 'N', 'k']
['s', 'a', 'x', '@', 'A:', 'r', 'd', '@']
['s', 'a', 'x', '@', 'A:', 'r', 'd', '@']
['v', '9', 'r', 'x', 'r', 'i@', 'p']
['v', '9', 'r', 'x', 'r', 'i@', 'p']
['d', '{', 'r', 'd', '@']
['d', '{', 'r', 'd', '@']
batc