We'll write a simple template for seq2seq using PyTorch. For demonstration, we attack the g2p task. G2p is a task of converting graphemes (spelling) to phonemes (pronunciation). It's a very good source for this purpose as it's simple enough for you to up and run. If you want to know more about g2p, see my repo

In [1]:
__author__ = "kyubyong"
__address__ = "https://github.com/kyubyong/nlp_made_easy"
__email__ = "kbpark.linguist@gmail.com"

In [140]:
import numpy as np
np.set_printoptions(threshold=1000)
from tqdm import tqdm_notebook as tqdm
from distance import levenshtein
import os
import math
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils import data

In [6]:
torch.__version__

'1.0.1'

# Hyperparameters

In [8]:
class Hparams:
    batch_size = 128
    enc_maxlen = 20
    dec_maxlen = 20
    num_epochs = 10
    hidden_units = 128
    graphemes = ["<pad>", "<unk>", "</s>"] + list("abcdefghijklmnopqrstuvwxyz")
    phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
                    'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
                    'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH',
                    'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1',
                    'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW',
                    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
    lr = 0.001
    logdir = "log/04"
hp = Hparams()

# Prepare Data

In [9]:
import nltk
# nltk.download('cmudict')# <- if you haven't downloaded, do this.
from nltk.corpus import cmudict
cmu = cmudict.dict()
cmu["refuse"]

[['R', 'AH0', 'F', 'Y', 'UW1', 'Z'],
 ['R', 'EH1', 'F', 'Y', 'UW2', 'Z'],
 ['R', 'IH0', 'F', 'Y', 'UW1', 'Z']]

In [10]:
def load_vocab():
    g2idx = {g: idx for idx, g in enumerate(hp.graphemes)}
    idx2g = {idx: g for idx, g in enumerate(hp.graphemes)}

    p2idx = {p: idx for idx, p in enumerate(hp.phonemes)}
    idx2p = {idx: p for idx, p in enumerate(hp.phonemes)}

    return g2idx, idx2g, p2idx, idx2p # note that g and p mean grapheme and phoneme, respectively.

In [11]:
def prepare_data():
    words = [" ".join(list(word)) for word, prons in cmu.items()]
    prons = [" ".join(prons[0]) for word, prons in cmu.items()]
    indices = list(range(len(words)))
    from random import shuffle
    shuffle(indices)
    words = [words[idx] for idx in indices]
    prons = [prons[idx] for idx in indices]
    num_train, num_test = int(len(words)*.8), int(len(words)*.1)
    train_words, eval_words, test_words = words[:num_train], \
                                          words[num_train:-num_test],\
                                          words[-num_test:]
    train_prons, eval_prons, test_prons = prons[:num_train], \
                                          prons[num_train:-num_test],\
                                          prons[-num_test:]    
    return train_words, eval_words, test_words, train_prons, eval_prons, test_prons

In [12]:
train_words, eval_words, test_words, train_prons, eval_prons, test_prons = prepare_data()
print(train_words[0])
print(train_prons[0])

i n d o n e s i a ' s
IH2 N D OW0 N IY1 ZH AH0 Z


In [13]:
def drop_lengthy_samples(words, prons, enc_maxlen, dec_maxlen):
    """We only include such samples less than maxlen."""
    _words, _prons = [], []
    for w, p in zip(words, prons):
        if len(w.split()) + 1 > enc_maxlen: continue
        if len(p.split()) + 1 > dec_maxlen: continue # 1: <EOS>
        _words.append(w)
        _prons.append(p)
    return _words, _prons          

In [14]:
train_words, train_prons = drop_lengthy_samples(train_words, train_prons, hp.enc_maxlen, hp.dec_maxlen)
# We do NOT apply this constraint to eval and test datasets.

# Data Loader

In [53]:
def encode(inp, type, dict):
    '''type: "x" or "y"'''
#     inp_str = inp.decode("utf-8")
    if type=="x": tokens = inp.split() + ["</s>"]
    else: tokens = ["<s>"] + inp.split() + ["</s>"]

    x = [dict.get(t, dict["<unk>"]) for t in tokens]
    return x


In [163]:
class G2pDataset(Dataset):

    def __init__(self, words, prons):
        """
        words: list of words. e.g., ["word", ]
        prons: list of prons. e.g., ['W ER1 D',]
        maxlen: scalar.
        """
        self.words = words
        self.prons = prons

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        g2idx, idx2g, p2idx, idx2p = load_vocab()
        
        word, pron = self.words[idx], self.prons[idx]
        x = encode(word, "x", g2idx)
        y = encode(word, "y", g2idx)
        decoder_input, y = y[:-1], y[1:]

        x_seqlen, y_seqlen = len(x), len(y)
                
        return x, x_seqlen, word, decoder_input, y, y_seqlen, pron

In [164]:
train_dataset = G2pDataset(train_words, train_prons)
eval_dataset = G2pDataset(eval_words, eval_prons)
# test_dataset = G2pDataset(test_words, test_prons)

In [265]:
def pad(batch):
    '''Pads to the longest sample'''
    f = lambda x: [sample[x] for sample in batch]
    x_seqlens = f(1)
    y_seqlens = f(5)
    words = f(2)
    prons = f(-1)
    
    x_maxlen = np.array(x_seqlens).max()
    y_maxlen = np.array(y_seqlens).max()
    
    f = lambda x, seqlen: [sample[x]+[0]*(seqlen-len(sample[x])) for sample in batch]
    x = f(0, x_maxlen)
    decoder_inputs = f(3, y_maxlen)
    y = f(4, y_maxlen)
    
    f = torch.IntTensor
    return f(x), f(x_seqlens), words, f(decoder_inputs), f(y), f(y_seqlens), prons

In [266]:
train_loader = DataLoader(train_dataset, batch_size=hp.batch_size, shuffle=True, collate_fn=trim)
eval_loader = DataLoader(eval_dataset, batch_size=hp.batch_size, shuffle=False, collate_fn=trim)

In [267]:
train_iter = iter(train_loader)
eval_iter = iter(eval_loader)

# Model

In [306]:
class Encoder():
    pass

class Decoder():
    pass

class Net():
    pass

In [307]:
tf.reset_default_graph()
class Net:
    def __init__(self, params):
        self.g2idx, self.idx2g, self.p2idx, self.idx2p = load_vocab()
        self.params = params
    
    def encode(self, xs):
        '''
        xs: tupple of 
            x: (N, T)
            seqlens: (N,)
            word: (N,)
            
        returns last hidden state of shape (N, hidden_units)    
        '''
        with tf.variable_scope("encode"):
            x, seqlens, words = xs
            x = tf.one_hot(x, len(self.g2idx))
            cell = tf.contrib.rnn.GRUCell(self.params["hidden_units"])
            outputs, last_hidden = tf.nn.dynamic_rnn(cell, x, seqlens, dtype=tf.float32)
        
        return last_hidden, words
        
    
    def decode(self, ys, h0=None):
        '''
        ys: tupple of
            decoder_inputs: (N, T)
            y: (N, T)
            seqlens: (N,)
            pron: (N,)
            
        returns last hidden state of shape (N, hidden_units)  
        '''
        decoder_inputs, y, seqlens, prons = ys
            
        with tf.variable_scope("decode"):
            inputs = tf.one_hot(decoder_inputs, len(self.p2idx))
            cell = tf.contrib.rnn.GRUCell(self.params["hidden_units"])
            outputs, _ = tf.nn.dynamic_rnn(cell, inputs, initial_state=h0, dtype=tf.float32)

            # projection
            logits = tf.layers.dense(outputs, len(self.p2idx))
            preds = tf.to_int32(tf.argmax(logits, axis=-1))
        
        return logits, preds, y, prons
        
    
    def forward(self, xs, ys):
        last_hidden, word = self.encode(xs)
        logits, preds, y, prons = self.decode(ys, h0=last_hidden)
        return word, logits, preds, y, prons

NameError: name 'tf' is not defined

# Train & Evaluate

In [305]:
train_batches = data.DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
eval_batches = data.DataLoader(eval_dataset, batch_size=params["batch_size"], shuffle=True)

In [0]:
# create a iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_batches.output_types, eval_batches.output_shapes)
xs, ys = iter.get_next()

# create the initialisation operations
train_init_op = iter.make_initializer(train_batches)
eval_init_op = iter.make_initializer(eval_batches)

In [34]:
m = Model(params)
words, logits, preds, y, prons = m.forward(xs, ys)

# train
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
mask = tf.to_float(tf.not_equal(y, m.p2idx["<PAD>"])) # 0: <pad>
loss = tf.reduce_sum(ce*mask) / (tf.reduce_sum(mask)+1e-7)

global_step = tf.train.get_or_create_global_step()
train_op = tf.train.AdamOptimizer(params["lr"]).minimize(loss, global_step=global_step)

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.


In [35]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_init_op)
    
    sv = tf.train.Saver()
    sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))
    
    while True:
        try:
            # training
            _, _gs = sess.run([train_op, global_step])
            
            
            # evaluation
            if _gs%params["eval_steps"]==0:
                _loss = sess.run(loss)
                print("="*10, "global step=", _gs, "="*10)
                print("train loss= %.2f" % _loss)
                
                sess.run(eval_init_op)
                _words, _preds, _prons = sess.run([words, preds, prons])
                
                ## logging
                _w = _words[0].decode('utf-8')
                _gt = _prons[0].decode('utf-8')
                _p = " ".join(m.idx2p[each] for each in _preds[0]).split("<EOS>")[0]
                print("input:", _w)
                print("expected:", _gt)
                print("got:", _p)
                print()
            
        except tf.errors.OutOfRangeError: break


train loss= 3.54
input: tydings
expected: T AY1 D IH0 NG Z
got: R 

train loss= 3.26
input: tydings
expected: T AY1 D IH0 NG Z
got: R L L AH0 

train loss= 3.18
input: tydings
expected: T AY1 D IH0 NG Z
got: R L AH0 AH0 

train loss= 3.14
input: tydings
expected: T AY1 D IH0 NG Z
got: R L AH0 AH0 

train loss= 3.09
input: tydings
expected: T AY1 D IH0 NG Z
got: R L L AH0 

train loss= 3.04
input: tydings
expected: T AY1 D IH0 NG Z
got: R L N AH0 N 

train loss= 2.99
input: tydings
expected: T AY1 D IH0 NG Z
got: S AH0 L AH0 N 

train loss= 2.95
input: tydings
expected: T AY1 D IH0 NG Z
got: K AH0 L AH0 N 

train loss= 2.91
input: tydings
expected: T AY1 D IH0 NG Z
got: B AE1 L AH0 N 

train loss= 2.87
input: tydings
expected: T AY1 D IH0 NG Z
got: B AE1 L AH0 N 

train loss= 2.83
input: tydings
expected: T AY1 D IH0 NG Z
got: B AE1 L AH0 N 

train loss= 2.79
input: tydings
expected: T AY1 D IH0 NG Z
got: B AE1 L AH0 N 

train loss= 2.75
input: tydings
expected: T AY1 D IH0 NG Z
got: B 

KeyboardInterrupt: ignored

# Inference

In [0]:
tf.reset_default_graph()

In [0]:
test_batches = input_fn(test_words, "", params["maxlen"], params["batch_size"], \
                                                          shuffle=False, num_repeat=1)

In [0]:
# create a iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(test_batches.output_types, test_batches.output_shapes)
xs, ys = iter.get_next()

# create the initialisation operations
test_init_op = iter.make_initializer(test_batches)

(<tf.Tensor 'IteratorGetNext:0' shape=(?, ?) dtype=int32>,
 <tf.Tensor 'IteratorGetNext:1' shape=(?,) dtype=int32>,
 <tf.Tensor 'IteratorGetNext:2' shape=(?,) dtype=string>)

In [0]:
m = Model(params)
last_hidden, words = m.encode(xs)

preds = tf.fill((tf.shape(last_hidden)[0], 1), m.p2idx["<BOS>"])
for t in range(params["maxlen"]):
    logits, _preds, _, _ = m.decode(preds, h0=last_hidden, mode="infer")
    preds = _preds

    



ValueError: Variable decode/rnn/gru_cell/gates/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "<ipython-input-11-b0c029ca6eb3>", line 44, in decode
    outputs, _ = tf.nn.dynamic_rnn(cell, inputs, initial_state=h0, dtype=tf.float32)
  File "<ipython-input-15-12d5d59b25f7>", line 6, in <module>
    logits, preds, _, _ = m.decode(inputs, h0=last_hidden, mode="infer")
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)


In [0]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(test_init_op)
    
    while True:
        try:
            # training
            _preds = sess.run([preds])