In [1]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense

In [2]:
# Load both corpora as single strings; the two files are presumably
# line-aligned (one example per line) -- confirm against the data files.
with open("letters_source.txt", mode="r", encoding="utf-8") as f:
    source_data = f.read()

with open("letters_target.txt", mode="r", encoding="utf-8") as p:
    target_data = p.read()

In [None]:
# Preview the first 30 lines of the source corpus.
source_data.split('\n')[:30]

In [None]:
# Preview the first 30 lines of the target corpus.
target_data.split('\n')[:30]

In [3]:
def extract_character_vocab(data):
    """Build id<->character lookup tables for a newline-separated corpus.

    Ids 0-3 are reserved for the special tokens '<PAD>', 'UNK', '<GO>' and
    '<EOS>' (in that order); every distinct character found in ``data``
    (newlines excluded) follows in sorted order.

    Args:
        data: the whole corpus as one string, lines separated by '\\n'.

    Returns:
        (int_to_vocab, vocab_to_int): two dicts that are inverses of each other.
    """
    # NOTE: 'UNK' has no angle brackets, unlike the other special tokens.
    # It is kept as-is because the encoding cell looks the token up by this
    # exact key.
    special_words = ['<PAD>', 'UNK', '<GO>', '<EOS>']

    # Sort the characters so ids are identical on every run. Iterating a bare
    # set of strings gives an order that changes between interpreter sessions,
    # which would silently invalidate any saved model or encoded data.
    characters = sorted({character for line in data.split('\n') for character in line})

    int_to_vocab = dict(enumerate(special_words + characters))
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}

    return int_to_vocab, vocab_to_int

In [4]:
# Build separate id<->character tables for the source and the target corpus.
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

In [None]:
# Inspect the source character -> id table.
print(source_letter_to_int)

In [5]:
# Encode each line as a list of character ids; characters missing from the
# vocabulary fall back to the 'UNK' id.
source_int = [[source_letter_to_int.get(letter, source_letter_to_int['UNK']) for letter in line]
              for line in source_data.split('\n')]

# Every target sequence is terminated with <EOS>. The inference decoder stops
# on the <EOS> id, so the model has to see it at the end of each training
# target; otherwise it never learns to stop and always decodes up to the
# maximum length.
target_int = [[target_letter_to_int.get(letter, target_letter_to_int['UNK']) for letter in line]
              + [target_letter_to_int['<EOS>']]
              for line in target_data.split('\n')]

In [None]:
# Inspect the source id -> character table.
source_int_to_letter

In [None]:
# Inspect the target id -> character table.
target_int_to_letter

In [None]:
# Preview the first 10 encoded source sequences.
source_int[:10]

In [None]:
# Preview the first 10 encoded target sequences.
target_int[:10]

In [6]:
# Sanity check: list the distinct characters that occur in the target corpus.
# The order of a set is arbitrary, so the printed order is not meaningful.
special_words = ['<PAD>', 'UNK', '<GO>', '<EOS>']
set_words = list({character for line in target_data.split('\n') for character in line})
print(set_words)


['j', 's', 'q', 'd', 'v', 'x', 'w', 'o', 'p', 'k', 'l', 'c', 'u', 'y', 'i', 'f', 't', 'z', 'r', 'a', 'h', 'b', 'n', 'g', 'm', 'e']


In [7]:
def get_inputs():
    """Create the placeholders that feed the seq2seq graph.

    Returns:
        A 6-tuple ``(inputs, targets, learning_rate, target_sequence_length,
        max_target_sequence_length, source_sequence_length)``. All entries are
        placeholders except ``max_target_sequence_length``, which is the
        maximum of ``target_sequence_length`` computed inside the graph.
    """
    # Rank-2 int32 id matrices with both dimensions left open
    # (presumably batch x time -- confirm against the batching code).
    inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="inputs")
    targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets")

    # No shape is given for the learning rate.
    learning_rate = tf.placeholder(dtype=tf.float32, name="learning_rate")

    # Rank-1 int32 length vectors.
    target_sequence_length = tf.placeholder(dtype=tf.int32, shape=(None,), name="target_sequence_length")
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name="max_target_len")
    source_sequence_length = tf.placeholder(dtype=tf.int32, shape=(None,), name="source_sequence_length")

    return (
        inputs,
        targets,
        learning_rate,
        target_sequence_length,
        max_target_sequence_length,
        source_sequence_length,
    )

In [8]:
def get_encoder_layer(input_data, rnn_size, num_layers, source_sequence_length, source_vocab_size, encoding_embedding_size):
    """Embed the source ids and run them through a stacked LSTM encoder.

    Args:
        input_data: source id tensor passed to ``embed_sequence``.
        rnn_size: number of units in each LSTM cell.
        num_layers: number of LSTM cells stacked in the encoder.
        source_sequence_length: per-example lengths passed to ``dynamic_rnn``.
        source_vocab_size: vocabulary size for the embedding table.
        encoding_embedding_size: width of each embedding vector.

    Returns:
        (encoder_output, encoder_state) exactly as returned by ``tf.nn.dynamic_rnn``.
    """
    # Embedding lookup for the source sequence.
    embedded_source = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)

    # One fresh LSTM cell per layer; every cell uses the same seeded
    # uniform(-0.1, 0.1) initializer.
    layer_cells = []
    for _ in range(num_layers):
        layer_cells.append(
            tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        )
    stacked_cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    encoder_output, encoder_state = tf.nn.dynamic_rnn(
        stacked_cell,
        embedded_source,
        sequence_length=source_sequence_length,
        dtype=tf.float32,
    )
    return encoder_output, encoder_state
                                        

In [9]:
def process_decoder_input(data, vocab_to_int, batch_size):
    """Turn target ids into decoder inputs: drop the last column, prepend <GO>.

    Args:
        data: rank-2 target id tensor.
        vocab_to_int: target vocabulary; must contain the '<GO>' key.
        batch_size: number of rows to keep and to create <GO> entries for.

    Returns:
        The tensor formed by concatenating a ``[batch_size, 1]`` column of
        <GO> ids with ``data`` minus its last column, along axis 1.
    """
    go_column = tf.fill([batch_size, 1], vocab_to_int['<GO>'])
    # An end index of -1 on the second axis excludes the final column.
    without_last_step = tf.strided_slice(data, begin=[0, 0], end=[batch_size, -1], strides=[1, 1])
    return tf.concat([go_column, without_last_step], axis=1)

In [10]:
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size, target_sequence_length, max_target_sequence_length,
                  encoder_state, decoder_input):
    """Build the training decoder and the greedy inference decoder.

    Both decoders share the same embedding matrix, stacked LSTM cell and
    output projection; the second one is created under the reused "decode"
    variable scope.

    Args:
        target_letter_to_int: target vocabulary; must contain '<GO>' and '<EOS>'.
        decoding_embedding_size: width of each decoder embedding vector.
        num_layers: number of LSTM cells stacked in the decoder.
        rnn_size: number of units in each LSTM cell.
        target_sequence_length: per-example target lengths for the TrainingHelper.
        max_target_sequence_length: upper bound on decode steps for both decoders.
        encoder_state: initial decoder state (the encoder's final state).
        decoder_input: batch-major id tensor fed to the training decoder.

    Returns:
        (training_decoder_output, predicting_decoder_output): the final
        outputs of ``dynamic_decode`` for the two decoders.
    """
    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

    # 2. Stacked RNN cell for the decoder
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))

        return decoder_cell
    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])

    # 3. Fully connected output layer projecting onto the target vocabulary
    output_layer = Dense(target_vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    # 4. Training decoder (fed the ground-truth decoder_input)
    with tf.variable_scope("decode"):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input, sequence_length=target_sequence_length, time_major=False)

        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell, training_helper, encoder_state, output_layer)

        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)

    # 5. Inference decoder (feeds back its own greedy predictions)
    with tf.variable_scope("decode", reuse=True):
        # Take the batch size from the initial state instead of a notebook
        # global: the start tokens must have exactly one entry per row of the
        # state they are decoded from. It is deliberately not taken from
        # decoder_input, which exists only to feed the training decoder.
        state_batch_size = tf.shape(tf.contrib.framework.nest.flatten(encoder_state)[0])[0]
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [state_batch_size], name="start_tokens")

        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings, start_tokens, target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell, predicting_helper, encoder_state, output_layer)

        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)

    return training_decoder_output, predicting_decoder_output

In [11]:
def seq2seq_model(input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length, source_vocab_size,
                 target_vocab_size, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers):
    """Wire the encoder and the decoders into one seq2seq graph.

    Args:
        input_data: source id tensor for the encoder.
        targets: target id tensor; converted into decoder inputs here.
        lr: accepted for interface compatibility; not used in this function.
        target_sequence_length: per-example target lengths.
        max_target_sequence_length: upper bound on decode steps.
        source_sequence_length: per-example source lengths.
        source_vocab_size: size of the source vocabulary.
        target_vocab_size: accepted for interface compatibility; not used here
            (the decoder derives the size from the vocabulary dict).
        encoder_embedding_size: width of the encoder embedding vectors.
        decoder_embedding_size: width of the decoder embedding vectors.
        rnn_size: number of units in each LSTM cell.
        num_layers: number of stacked LSTM cells.

    Returns:
        (training_decoder_output, predicting_decoder_output) from ``decoding_layer``.

    Note:
        ``target_letter_to_int`` and ``batch_size`` are still read from module
        scope because the signature has no parameters for them; both must be
        defined before this function is called.
    """
    # Encode the source; only the final state is handed to the decoder.
    # The embedding widths come from this function's own arguments rather than
    # from similarly named notebook globals.
    _, encoder_state = get_encoder_layer(input_data, rnn_size, num_layers, source_sequence_length,
                                         source_vocab_size, encoder_embedding_size)

    # Pre-process the targets into decoder inputs.
    decoder_input = process_decoder_input(targets, target_letter_to_int, batch_size)

    # Hand the encoder state and the decoder inputs to the decoder.
    training_decoder_output, predicting_decoder_output = decoding_layer(target_letter_to_int,
                                                                        decoder_embedding_size,
                                                                        num_layers, rnn_size,
                                                                        target_sequence_length,
                                                                        max_target_sequence_length,
                                                                        encoder_state,
                                                                        decoder_input)
    return training_decoder_output, predicting_decoder_output

    

In [12]:
# Hyperparameters
epoches = 50  # presumably the number of training epochs -- the training loop is not visible here
batch_size = 128  # rows per batch; read as a global while the graph is built
rnn_size = 50  # units per LSTM cell
num_layers = 2  # stacked LSTM cells in the encoder and in the decoder
encoding_embedding_size = 15  # encoder embedding width
decoding_embedding_size = 15  # decoder embedding width
learning_rate = 0.001  # presumably fed to the learning_rate placeholder -- confirm in the training cell

In [13]:
train_graph = tf.Graph()

with train_graph.as_default():
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_inputs()
    training_decoder_output, predicting_decoder_output = seq2seq_model(input_data, targets, lr, target_sequence_length, max_target_sequence_length,
                                                                      source_sequence_length, len(source_letter_to_int), len(target_letter_to_int),
                                                                      encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers)
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name="prediction")
    
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name="mask")