# Neural Machine Translation

褚则伟 zeweichu@gmail.com

本段代码作为给稀牛学院学员参考之用。
本代码实现了Sequence to Sequence模型的一个参考，如有任何bug建议学员自行修复，也欢迎email汇报给我。

### load数据

In [12]:
import nltk
import sys
import re
train_file = "./data/train.txt"
enc = sys.getdefaultencoding()
def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r', encoding = enc) as f:
        for line in f:
            line = line.strip().split("\t")
            en.append(["BOS"] + re.findall(r"[\w']+|[.,!?;]", line[0]) + ["EOS"])
            cn.append(["BOS"] + [c for c in line[1]] + ["EOS"])
    return en, cn
train_en, train_cn = load_data(train_file)
num_train = len(train_en)

In [13]:
train_en[:3]

[['BOS', 'Anyone', 'can', 'do', 'that', '.', 'EOS'],
 ['BOS', 'How', 'about', 'another', 'piece', 'of', 'cake', '?', 'EOS'],
 ['BOS', 'She', 'married', 'him', '.', 'EOS']]

In [14]:
train_cn[:3]

[['BOS', '任', '何', '人', '都', '可', '以', '做', '到', '。', 'EOS'],
 ['BOS', '要', '不', '要', '再', '來', '一', '塊', '蛋', '糕', '？', 'EOS'],
 ['BOS', '她', '嫁', '给', '了', '他', '。', 'EOS']]

### 构建vocabulary

In [34]:
import os
import collections
import pickle

def make_dir(path):
    try:
        os.mkdir(path)
    except OSError:
        pass
    
model_dir = "seq2seq"
make_dir(model_dir)

def build_dict(sentences, max_words=50000):
    # Flatten the 2 dims list and count words
    word_count = collections.Counter([word for sentence in sentences for word in sentence])
    ls = word_count.most_common(max_words)
    total_words = len(ls) + 1
    word_dict = {w[0]: index+1 for (index, w) in enumerate(ls)}
    word_dict["UNK"] = 0
    return word_dict, total_words

vocab_file = os.path.join(model_dir, "vocab.pkl")
en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
    
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

### 将word转换成index

In [16]:
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    '''
        Encode the sequences. 
    '''
    length = len(en_sentences)
    out_en_sentences = []
    out_cn_sentences = []

    for i in range(length):
        en_seq = [en_dict[w] if w in en_dict else 0 for w in en_sentences[i]]
        cn_seq = [cn_dict[w] if w in cn_dict else 0 for w in cn_sentences[i]]
        out_en_sentences.append(en_seq)
        out_cn_sentences.append(cn_seq)

    # sort sentences by english lengths
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
       
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)

In [17]:
train_en[:5]

[[1, 3456, 3, 2],
 [1, 2199, 123, 2],
 [1, 2230, 123, 2],
 [1, 1255, 123, 2],
 [1, 2199, 123, 2]]

In [18]:
train_cn[:5]

[[1, 7, 86, 440, 5, 3, 2],
 [1, 118, 1367, 220, 2],
 [1, 981, 2027, 7, 3, 2],
 [1, 238, 238, 220, 2],
 [1, 150, 189, 220, 2]]

### 把数据转换成batch

In [19]:
batch_size = 128
import numpy as np

# get minibatches of 
def get_minibatches(n, minibatch_size, shuffle=False):
    idx_list = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches

def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)
    max_len = np.max(lengths)

    x = np.zeros((n_samples, max_len)).astype('int32')
    x_mask = np.zeros((n_samples, max_len)).astype('float32')
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq
        x_mask[idx, :lengths[idx]] = 1.0
    return x, x_mask

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_mask = prepare_data(mb_en_sentences)
        mb_y, mb_y_mask = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_mask, mb_y, mb_y_mask))
    return all_ex
train_data = gen_examples(train_en, train_cn, batch_size)

tf.contrib.seq2seq

In [20]:
import tensorflow as tf
from tensorflow.contrib import rnn

class Encoder:
    def __init__(self, embedding, hidden_size, num_layers = 1):
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell = rnn.GRUCell(self.hidden_size)
        
    def __call__(self, inputs, seq_length, state=None):
        out = tf.nn.embedding_lookup(self.embedding, inputs)
        for i in range(self.num_layers):
            out, state = tf.nn.dynamic_rnn(self.cell, out, sequence_length=seq_length, initial_state=state, dtype=tf.float32)
        return out, state

class Decoder:
    def __init__(self, embedding, hidden_size, num_layers=1, max_length=15):
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell = rnn.GRUCell(hidden_size)
        self.linear = tf.Variable(tf.random_normal(shape=(self.hidden_size, cn_total_words))*0.1)
        
        
    def __call__(self, inputs, state, encoder_state): # context vector
        
        out = tf.nn.embedding_lookup(self.embedding, inputs)
        out = tf.tile(tf.expand_dims(encoder_state, 1), (1, tf.shape(out)[1], 1))

        for i in range(self.num_layers):
#             state = tf.concat([state, encoder_state], 1)
            out, state = tf.nn.dynamic_rnn(self.cell, out, initial_state=state, dtype=tf.float32)
    
        out = tf.tensordot(out, self.linear, axes=[[2], [0]])
        return out, state

class Seq2Seq:
    def __init__(self, hidden_size, num_layers, embed_words_en, embed_words_cn):
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.max_length = 15
        self.grad_clip = 5.0
        
        with tf.device("/cpu:0"):
            with tf.name_scope("place_holder"):
                self.encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int64, name="encoder_inputs")
                self.encoder_length = tf.placeholder(shape=(None, ), dtype=tf.int64, name="encoder_length")
                self.decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int64, name="decoder_inputs")
                self.decoder_target = tf.placeholder(shape=(None, None), dtype=tf.int64, name="decoder_target")
                self.decoder_mask = tf.placeholder(shape=(None, None), dtype=tf.float32, name="decoder_mask")

            with tf.name_scope("embedding"):
                self.embedding_en = tf.get_variable(name="embedding_en", dtype=tf.float32, shape=(en_total_words, hidden_size),
                                                    initializer=tf.constant_initializer(embed_words_en))
                self.embedding_cn = tf.get_variable(name="embedding_cn", dtype=tf.float32, shape=(cn_total_words, hidden_size),
                                                    initializer=tf.constant_initializer(embed_words_cn))
            with tf.name_scope("encoder-decoder"):
                self.encoder = Encoder(self.embedding_en, self.hidden_size, self.num_layers)
                self.decoder = Decoder(self.embedding_cn + self.hidden_size, self.hidden_size, self.num_layers)

            with tf.variable_scope("seq2seq-train"):
                encoder_outputs, encoder_state = self.encoder(self.encoder_inputs, self.encoder_length)
                tf.get_variable_scope().reuse_variables()
                decoder_state = encoder_state
                word_indices = self.decoder_inputs

                decoder_outputs, decoder_state = self.decoder(word_indices, decoder_state, encoder_state)

                # decoder_outputs.append(decoder_out)
                decoder_outputs = tf.concat(decoder_outputs, 1)

            with tf.name_scope("cost"):
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=decoder_outputs, labels=self.decoder_target)

                self.cost = tf.reduce_mean(loss * self.decoder_mask)

                tvars = tf.trainable_variables()
                grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), self.grad_clip)
                optimizer = tf.train.RMSPropOptimizer(learning_rate=0.01)
                self.train_op = optimizer.apply_gradients(zip(grads, tvars))

            with tf.variable_scope("seq2seq-generate"):
                self.generate_outputs = []
                decoder_state = encoder_state
                word_indices = tf.expand_dims(self.decoder_inputs[:, 0], 1)
                for i in range(self.max_length):
                    decoder_out, decoder_state = self.decoder(word_indices, decoder_state, encoder_state)
                    softmax_out = tf.nn.softmax(decoder_out[:, 0, :])
                    word_indices = tf.expand_dims(tf.cast(tf.argmax(softmax_out, -1), dtype=tf.int64), 1)
                    self.generate_outputs.append(word_indices)
                self.generate_outputs = tf.concat(self.generate_outputs, 0)
            
            
    def train(self, sess, encoder_inputs, encoder_length, decoder_inputs, decoder_target, decoder_mask):
        _, cost = sess.run([self.train_op, self.cost], feed_dict={
            self.encoder_inputs: encoder_inputs, 
            self.encoder_length: encoder_length,
            self.decoder_inputs: decoder_inputs,
            self.decoder_target: decoder_target,
            self.decoder_mask: decoder_mask
        })
        return cost
    
    def generate(self, sess, encoder_inputs, encoder_length):
        decoder_inputs = np.asarray([[en_dict["BOS"]]*15], dtype="int64")
        if encoder_inputs.ndim == 1:
            encoder_inputs = encoder_inputs.reshape((1, -1))
            encoder_length = encoder_length.reshape((-1))
        generate = sess.run([self.generate_outputs],
                           feed_dict={self.encoder_inputs: encoder_inputs,
                                      self.decoder_inputs: decoder_inputs,
                                      self.encoder_length: encoder_length})[0]
        return generate
            

In [21]:
tf.reset_default_graph()
hidden_size = 50
num_layers = 1
emb_en = np.random.uniform(low=-0.1, high=0.1, size=(en_total_words, hidden_size))
emb_cn = np.random.uniform(low=-0.1, high=0.1, size=(cn_total_words, hidden_size))
model = Seq2Seq(hidden_size, num_layers, emb_en, emb_cn)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
epoch = 0
n_epochs = 30
# print(sess.run(model.decoder_state))
while epoch < n_epochs:
    epoch += 1
    total_loss = 0 
    total_num_ins = 0
    for (encoder_inputs, encoder_length, mb_y, mb_y_mask) in train_data:
        decoder_inputs = mb_y[:, :-1]
        decoder_target = mb_y[:, 1:]
#         print(encoder_length.sum(1).shape)
        loss = model.train(sess, encoder_inputs, encoder_length.sum(1), decoder_inputs, decoder_target, mb_y_mask[:, :-1])
        total_loss += loss
        total_num_ins += mb_y.shape[0]
    print("training loss: {}".format(total_loss / total_num_ins))
    
        

training loss: 0.03970086322732714
training loss: 0.028367880345014272
training loss: 0.0263932841220972
training loss: 0.0251889954592351
training loss: 0.02416398723250302
training loss: 0.023301005224019098
training loss: 0.02259101703950644
training loss: 0.02194472657898407
training loss: 0.021422825971572027
training loss: 0.020968513766675727
training loss: 0.020595596094695155
training loss: 0.020229067049752917
training loss: 0.01993888716224235
training loss: 0.019631148360760353
training loss: 0.01938906716995276
training loss: 0.019149154426718024
training loss: 0.018967886288676416
training loss: 0.018717446872194347
training loss: 0.018582233196196337
training loss: 0.018403751141582214
training loss: 0.01825184116817625
training loss: 0.01808356840940415
training loss: 0.017968122263387155
training loss: 0.017837092625925022
training loss: 0.017693190673734156
training loss: 0.017614874468694277
training loss: 0.017466924773402417
training loss: 0.01735667388191294
train

## 测试一些句子

In [24]:
encoder_inputs = [inv_en_dict[c] for c in train_data[11][0][2]]
print(encoder_inputs)
encoder_inputs = [en_dict.get(e, 0) for e in encoder_inputs]
encoder_inputs = np.asarray(encoder_inputs).reshape(1, -1)
encoder_length = np.asarray([encoder_inputs.shape[1]]).reshape(-1)
res = model.generate(sess, encoder_inputs, encoder_length).flatten()

res = [inv_cn_dict[r] for r in res]
print(res)

['BOS', "That's", 'what', 'Tom', 'promised', '.', 'EOS']
['那', '是', '汤', '汤', '汤', '汤', '汤', '汤', '姆', '姆', '。', 'EOS', 'EOS', 'UNK', 'UNK']


# Homework
- 我的代码全部是用的train dataset，同学们请尝试使用dev set做early stopping，存下在dev set上最好的模型。然后在test set上尝试生成一些句子，记录下一些有趣的结果。
- 由于我的模型是一个基本的sequence to sequence模型，效果不会特别好。请同学们找几个方向尝试改进模型。以下是几个建议尝试的方向
    - 把encoder改成bidirectional RNN
    - 我用的是GRUCell，同学们可以尝试RNNCell和LSTMCell看看效果如何。
    - 给decoder加上attention，可以参考tf.contrib.seq2seq
    - 尝试beam search in generate
    - try multi layer RNN for encoder or decoder
    - 同学们也可自行寻找网上的论文看看有没有别的好方法
- 我的代码不保证没有bug，所有同学们如果发现有任何的bug欢迎汇报给我，更鼓励同学们在交流群中讨论（批判）我的代码，更重要的是改进我的代码。