In [0]:
import numpy as np
import tensorflow as tf
# tf.enable_eager_execution()

In [2]:
tf.__version__

'1.13.0-rc0'

# Hyperparameters

In [0]:
params = dict()
params["batch_size"] = 16
params["maxlen"] = 100
params["num_epochs"] = 10
params["hidden_units"] = 32
params["graphemes"] = ["<PAD>", "<UNK>", "<BOS>", "<EOS>"] + list("abcdefghijklmnopqrstuvwxyz")
params["phonemes"] = ["<PAD>", "<UNK>", "<BOS>", "<EOS>"] + ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
                'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
                'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH',
                'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW0', 'OW1',
                'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH0', 'UH1', 'UH2', 'UW',
                'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
params["lr"] = 0.001
params["eval_steps"] = 100

# Prepare Data

In [0]:
import nltk
# nltk.download('cmudict')
from nltk.corpus import cmudict
cmu = cmudict.dict()

In [0]:
def load_vocab():
    g2idx = {g: idx for idx, g in enumerate(params["graphemes"])}
    idx2g = {idx: g for idx, g in enumerate(params["graphemes"])}

    p2idx = {p: idx for idx, p in enumerate(params["phonemes"])}
    idx2p = {idx: p for idx, p in enumerate(params["phonemes"])}

    return g2idx, idx2g, p2idx, idx2p

In [0]:
def prepare_data():
    words = [word for word, prons in cmu.items()]
    prons = [" ".join(prons[0]) for word, prons in cmu.items()]
    indices = list(range(len(words)))
    from random import shuffle
    shuffle(indices)
    words = [words[idx] for idx in indices]
    prons = [prons[idx] for idx in indices]
    num_train, num_test = int(len(words)*.8), int(len(words)*.1)
    train_words, eval_words, test_words = words[:num_train], \
                                          words[num_train:-num_test],\
                                          words[-num_test:]
    train_prons, eval_prons, test_prons = prons[:num_train], \
                                          prons[num_train:-num_test],\
                                          prons[-num_test:]    
    return train_words, eval_words, test_words, train_prons, eval_prons, test_prons

In [0]:
train_words, eval_words, test_words, train_prons, eval_prons, test_prons = prepare_data()

# Data Loader

In [0]:
def generator_fn(words, prons, maxlen):
  '''
  words: list of words. e.g., ["word", ]
  prons: list of prons. e.g., ['W ER1 D',]
  maxlen: scalar.
  '''
  g2idx, idx2g, p2idx, idx2p = load_vocab()
    
  for word, pron in zip(words, prons):
      graphemes = list(word.decode('utf-8')) + ["<EOS>"]
      phonemes = pron.decode('utf-8').split() + ["<EOS>"]
      if max(len(graphemes), len(phonemes)) > maxlen: continue

      x = [g2idx.get(g, g2idx["<UNK>"]) for g in graphemes]
      y = [p2idx.get(p, p2idx["<UNK>"]) for p in phonemes]

      yield (x, len(x), word), (y, len(y), pron)



In [0]:
def input_fn(words, prons, maxlen, batch_size, shuffle=True, num_repeat=1):
    '''Batchify data
    words: list of words. e.g., ["word", ]
    prons: list of prons. e.g., ['W ER1 D',]
    maxlen: scalar.
    batch_size: scalar.
    shuffle: boolean
    num_repeat: int.
    '''
    shapes = (  ([None], (), ()),
                ([None], (), ())    )
    types = (   (tf.int32, tf.int32, tf.string),
                (tf.int32, tf.int32, tf.string)    )
    paddings = (    (0, 0, ''),
                    (0, 0, '')     )

    dataset = tf.data.Dataset.from_generator(
        generator_fn,
        output_shapes=shapes,
        output_types=types,
        args=(words, prons, maxlen))
        
    if shuffle:
        dataset = dataset.shuffle(64*batch_size)

    dataset = dataset.repeat(num_repeat)
    dataset = dataset.padded_batch(batch_size, shapes, paddings).prefetch(1)

    return dataset



# Model

In [0]:
tf.reset_default_graph()
class Model:
    def __init__(self, params):
        self.g2idx, self.idx2g, self.p2idx, self.idx2p = load_vocab()
        self.params = params
    
    def encode(self, xs):
        '''
        xs: tupple of 
            x: (N, T)
            seqlens: (N,)
            sent: (N,)
            
        returns last hidden state of shape (N, hidden_units)    
        '''
        with tf.variable_scope("encode"):
            x, seqlens, words = xs
            x = tf.one_hot(x, len(self.g2idx))
            cell = tf.contrib.rnn.GRUCell(self.params["hidden_units"])
            outputs, last_hidden = tf.nn.dynamic_rnn(cell, x, seqlens, dtype=tf.float32)
        
        return last_hidden, words
        
    
    def decode(self, ys, h0=None):
        '''
        ys: tupple of 
            y: (N, T)
            seqlens: (N,)
            sent: (N,)
            
        returns last hidden state of shape (N, hidden_units)  
        '''
        y, seqlens, prons = ys
        
        # decoder inputs <- shifted right
        inputs = tf.concat(  (tf.ones_like(y[:, :1])*self.p2idx["<BOS>"], y[:, :-1]),  -1   )
            
        with tf.variable_scope("decode"):
            inputs = tf.one_hot(inputs, len(self.p2idx))
            cell = tf.contrib.rnn.GRUCell(self.params["hidden_units"])
            outputs, _ = tf.nn.dynamic_rnn(cell, inputs, initial_state=h0, dtype=tf.float32)

            # projection
            logits = tf.layers.dense(outputs, len(self.p2idx))
            preds = tf.to_int32(tf.argmax(logits, axis=-1))
        
        return logits, preds, y, prons
        
    
    def forward(self, xs, ys):
        last_hidden, word = self.encode(xs)
        logits, preds, y, prons = self.decode(ys, h0=last_hidden)
        return word, logits, preds, y, prons

# Train & Evaluate

In [0]:
train_batches = input_fn(train_words, train_prons, params["maxlen"], params["batch_size"], \
                                                          shuffle=True, num_repeat=params["num_epochs"])
eval_batches = input_fn(eval_words, eval_prons, params["maxlen"], params["batch_size"], \
                                                          shuffle=False, num_repeat=1)

In [0]:
# create a iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_batches.output_types, eval_batches.output_shapes)
xs, ys = iter.get_next()

# create the initialisation operations
train_init_op = iter.make_initializer(train_batches)
eval_init_op = iter.make_initializer(eval_batches)

In [34]:
m = Model(params)
words, logits, preds, y, prons = m.forward(xs, ys)

# train
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
mask = tf.to_float(tf.not_equal(y, m.p2idx["<PAD>"])) # 0: <pad>
loss = tf.reduce_sum(ce*mask) / (tf.reduce_sum(mask)+1e-7)

global_step = tf.train.get_or_create_global_step()
train_op = tf.train.AdamOptimizer(params["lr"]).minimize(loss, global_step=global_step)

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.


In [35]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_init_op)
    
    sv = tf.train.Saver()
    sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))
    
    while True:
        try:
            # training
            _, _gs = sess.run([train_op, global_step])
            
            
            # evaluation
            if _gs%params["eval_steps"]==0:
                _loss = sess.run(loss)
                print("="*10, "global step=", _gs, "="*10)
                print("train loss= %.2f" % _loss)
                
                sess.run(eval_init_op)
                _words, _preds, _prons = sess.run([words, preds, prons])
                
                ## logging
                _w = _words[0].decode('utf-8')
                _gt = _prons[0].decode('utf-8')
                _p = " ".join(m.idx2p[each] for each in _preds[0]).split("<EOS>")[0]
                print("input:", _w)
                print("expected:", _gt)
                print("got:", _p)
                print()
            
        except tf.errors.OutOfRangeError: break


train loss= 3.54
input: tydings
expected: T AY1 D IH0 NG Z
got: R 

train loss= 3.26
input: tydings
expected: T AY1 D IH0 NG Z
got: R L L AH0 

train loss= 3.18
input: tydings
expected: T AY1 D IH0 NG Z
got: R L AH0 AH0 

train loss= 3.14
input: tydings
expected: T AY1 D IH0 NG Z
got: R L AH0 AH0 

train loss= 3.09
input: tydings
expected: T AY1 D IH0 NG Z
got: R L L AH0 

train loss= 3.04
input: tydings
expected: T AY1 D IH0 NG Z
got: R L N AH0 N 

train loss= 2.99
input: tydings
expected: T AY1 D IH0 NG Z
got: S AH0 L AH0 N 

train loss= 2.95
input: tydings
expected: T AY1 D IH0 NG Z
got: K AH0 L AH0 N 

train loss= 2.91
input: tydings
expected: T AY1 D IH0 NG Z
got: B AE1 L AH0 N 

train loss= 2.87
input: tydings
expected: T AY1 D IH0 NG Z
got: B AE1 L AH0 N 

train loss= 2.83
input: tydings
expected: T AY1 D IH0 NG Z
got: B AE1 L AH0 N 

train loss= 2.79
input: tydings
expected: T AY1 D IH0 NG Z
got: B AE1 L AH0 N 

train loss= 2.75
input: tydings
expected: T AY1 D IH0 NG Z
got: B 

KeyboardInterrupt: ignored

# Inference

In [0]:
tf.reset_default_graph()

In [0]:
test_batches = input_fn(test_words, "", params["maxlen"], params["batch_size"], \
                                                          shuffle=False, num_repeat=1)

In [0]:
# create a iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(test_batches.output_types, test_batches.output_shapes)
xs, ys = iter.get_next()

# create the initialisation operations
test_init_op = iter.make_initializer(test_batches)

(<tf.Tensor 'IteratorGetNext:0' shape=(?, ?) dtype=int32>,
 <tf.Tensor 'IteratorGetNext:1' shape=(?,) dtype=int32>,
 <tf.Tensor 'IteratorGetNext:2' shape=(?,) dtype=string>)

In [0]:
m = Model(params)
last_hidden, words = m.encode(xs)

preds = tf.fill((tf.shape(last_hidden)[0], 1), m.p2idx["<BOS>"])
for t in range(params["maxlen"]):
    logits, _preds, _, _ = m.decode(preds, h0=last_hidden, mode="infer")
    preds = _preds

    



ValueError: Variable decode/rnn/gru_cell/gates/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "<ipython-input-11-b0c029ca6eb3>", line 44, in decode
    outputs, _ = tf.nn.dynamic_rnn(cell, inputs, initial_state=h0, dtype=tf.float32)
  File "<ipython-input-15-12d5d59b25f7>", line 6, in <module>
    logits, preds, _, _ = m.decode(inputs, h0=last_hidden, mode="infer")
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)


In [0]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(test_init_op)
    
    while True:
        try:
            # training
            _preds = sess.run([preds])