# 文本生成
基于RNN对文本进行学习，从而让模型自动来写文章

# 1 读取数据

In [1]:
def load_data(fname):
    """Read a text file and return its whitespace-separated tokens.

    Parameters
    ----------
    fname : str
        Path of the text file to read.

    Returns
    -------
    list of str
        The file contents split on any run of whitespace; an empty file
        yields an empty list.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between systems and can raise or silently
    # garble non-ASCII text.
    with open(fname, 'r', encoding='utf-8') as f:
        return f.read().split()

In [2]:
# Load the pre-tokenised corpus as a flat list of tokens.
text = load_data(fname='data/Android_split')

In [3]:
# Show the first ten tokens as a quick sanity check of the loaded data.
print("First 10th words: {}".format(text[0:10]))

First 10th words: ['岗位', '职责', '负责', '安卓产品', '规划', '设计', '跟进', '完成', '上', '线']


# 2 数据预处理

In [4]:
# Build the vocabulary and the word <-> integer-id lookup tables.
vocab = set(text)
# Number the words in sorted order: iteration order of a set of strings
# changes between interpreter runs (string hash randomisation), which would
# make the ids -- and anything trained on them -- irreproducible across
# sessions.
vocab_to_int = {w: idx for idx, w in enumerate(sorted(vocab))}
# Derive the reverse table from the forward one so the two always agree.
int_to_vocab = {idx: w for w, idx in vocab_to_int.items()}

In [5]:
# Report the corpus length and the number of distinct tokens, one per line.
print('Total words: {}'.format(len(text)),
      'Vocab size: {}'.format(len(vocab)),
      sep='\n')

Total words: 280420
Vocab size: 9393


In [6]:
# Encode the corpus as integer ids using the vocabulary lookup table.
int_text = list(map(vocab_to_int.__getitem__, text))

# 3 构建网络

In [7]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf
import numpy as np

In [8]:
# Abort early when the installed TensorFlow is older than 1.0.
assert LooseVersion('1.0') <= LooseVersion(tf.__version__), \
    'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Report the default GPU device, or warn when TensorFlow finds none.
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')

TensorFlow Version: 1.0.0


  import sys


# 输入层

In [9]:
def get_inputs():
    """Create the placeholders that feed the graph.

    Returns
    -------
    tuple
        ``(inputs, targets, learning_rate)``: two int32 placeholders of
        shape ``[None, None]`` named 'inputs' and 'targets', followed by a
        float32 placeholder named 'learning_rate'.
    """
    token_shape = [None, None]
    input_ph = tf.placeholder(tf.int32, token_shape, name='inputs')
    target_ph = tf.placeholder(tf.int32, token_shape, name='targets')
    lr_ph = tf.placeholder(tf.float32, name='learning_rate')
    return input_ph, target_ph, lr_ph

# RNN Cell

In [10]:
def get_init_cell(batch_size, rnn_size):
    """Build the stacked RNN cell and its zero initial state.

    The stack currently holds a single ``BasicLSTMCell``.

    Parameters
    ----------
    batch_size
        Size of each batch; forwarded to ``cell.zero_state``.
    rnn_size
        Number of hidden units in the LSTM layer.

    Returns
    -------
    tuple
        ``(cell, initial_state)`` where ``initial_state`` is the float32
        zero state passed through ``tf.identity`` under the name
        'initial_state'.
    """
    layers = [tf.contrib.rnn.BasicLSTMCell(rnn_size)]
    cell = tf.contrib.rnn.MultiRNNCell(layers)

    zero_state = cell.zero_state(batch_size, tf.float32)
    return cell, tf.identity(zero_state, 'initial_state')

# Word Embedding

In [11]:
def get_embed(input_data, vocab_size, embed_dim):
    """Embed word ids, since the vocabulary is large.

    Parameters
    ----------
    input_data
        Input tensor of word ids to look up.
    vocab_size
        Number of words in the vocabulary (rows of the embedding matrix).
    embed_dim
        Embedding dimension (columns of the embedding matrix).

    Returns
    -------
    The result of looking up ``input_data`` in a new embedding variable
    initialised uniformly in [-1, 1).
    """
    initial_values = tf.random_uniform(
        shape=[vocab_size, embed_dim], minval=-1, maxval=1)
    embedding_matrix = tf.Variable(initial_values)
    return tf.nn.embedding_lookup(embedding_matrix, input_data)

# Build RNN

In [12]:
def build_rnn(cell, inputs):
    """Unroll the RNN cell over a batch of inputs.

    Parameters
    ----------
    cell
        The RNN cell to run.
    inputs
        The input batch fed to ``tf.nn.dynamic_rnn``.

    Returns
    -------
    tuple
        ``(outputs, final_state)`` where ``final_state`` has been passed
        through ``tf.identity`` under the name 'final_state'.
    """
    # NOTE(review): only ``dtype`` is given here; no ``initial_state`` is
    # passed to dynamic_rnn. Confirm that starting from the default state on
    # every run is intended.
    rnn_result = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    outputs, last_state = rnn_result
    return outputs, tf.identity(last_state, name='final_state')

# Build Neural Network

In [13]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """Assemble the whole network: embedding -> RNN -> linear projection.

    Parameters
    ----------
    cell
        The RNN cell handed to ``build_rnn``.
    rnn_size
        Accepted for interface compatibility; not used in this function.
    input_data
        Input tensor of word ids.
    vocab_size
        Vocabulary size; also the width of the output projection.
    embed_dim
        Embedding dimension.

    Returns
    -------
    tuple
        ``(logits, final_state)``; the logits come from a fully connected
        layer with no activation function.
    """
    rnn_outputs, final_state = build_rnn(
        cell, get_embed(input_data, vocab_size, embed_dim))

    # activation_fn=None keeps the projection linear (raw logits).
    logits = tf.contrib.layers.fully_connected(
        rnn_outputs, vocab_size, activation_fn=None)
    return logits, final_state

# 构造batch

In [14]:
def get_batches(int_text, batch_size, seq_length):
    """Split an encoded corpus into (input, target) training batches.

    Trailing tokens that do not fill a complete batch are dropped. Targets
    are the inputs shifted left by one token; the very last target wraps
    around to the first input token.

    Parameters
    ----------
    int_text : sequence of int
        The corpus encoded as integer word ids.
    batch_size : int
        Number of sequences per batch.
    seq_length : int
        Number of tokens per sequence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_batch, 2, batch_size, seq_length)``; index 0 on
        the second axis holds the inputs, index 1 the targets.

    Raises
    ------
    ValueError
        If ``batch_size`` or ``seq_length`` is not positive, or if
        ``int_text`` is too short to fill a single batch.
    """
    if batch_size <= 0 or seq_length <= 0:
        raise ValueError('batch_size and seq_length must be positive')

    words_per_batch = batch_size * seq_length
    n_batch = len(int_text) // words_per_batch
    if n_batch == 0:
        # Fail with a clear message instead of an obscure indexing error on
        # an empty array further down.
        raise ValueError(
            'int_text has {} tokens but at least {} are needed for one batch'
            .format(len(int_text), words_per_batch))

    inputs = np.array(int_text[:n_batch * words_per_batch])
    # Shift left by one; np.roll moves the first token to the final slot.
    targets = np.roll(inputs, -1)

    def _to_batches(flat):
        # Each row of the (batch_size, -1) layout is one contiguous stream of
        # text; cut every row into n_batch windows and put the batch index
        # first.
        return flat.reshape(batch_size, n_batch, seq_length).transpose(1, 0, 2)

    return np.stack((_to_batches(inputs), _to_batches(targets)), axis=1)

# 4 模型训练

In [21]:
# --- Model size ---
# RNN size
rnn_size = 512
# Embedding dimension size
embed_dim = 200

# --- Batching ---
# Batch size
batch_size = 128
# Sequence length
seq_length = 15

# --- Optimisation ---
# Number of epochs
num_epochs = 200
# Learning rate
learning_rate = 0.01

# --- Logging ---
# Show stats once every this many batches
show_every_n_batches = 100

In [22]:
from tensorflow.contrib import seq2seq

# Assemble the training graph: inputs, network, loss and optimiser.
train_graph = tf.Graph()
with train_graph.as_default():
    # Vocabulary size, taken from the id -> word table.
    vocab_size = len(int_to_vocab)
    # Input placeholders.
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    # The batch dimension is read from the fed input at run time.
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Word probabilities, exposed under the name 'probs' for generation.
    probs = tf.nn.softmax(logits, name='probs')

    # Sequence loss with a weight of one at every position.
    cost = seq2seq.sequence_loss(
        logits=logits,
        targets=targets,
        weights=tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Adam optimiser driven by the learning-rate placeholder.
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    # Clip every available gradient element-wise to [-1, 1] before applying.
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [
        (tf.clip_by_value(grad, clip_value_min=-1., clip_value_max=1.), var)
        for grad, var in gradients
        if grad is not None
    ]
    train_op = optimizer.apply_gradients(capped_gradients)

In [36]:
# Pre-compute all (input, target) batches once.
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        # Evaluate the initial state at the start of each epoch; the first
        # batch's inputs are fed only to supply the batch dimension.
        state = sess.run(initial_state, feed_dict={input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate,
            }
            train_loss, state, _ = sess.run(
                fetches=[cost, final_state, train_op], feed_dict=feed)

            # Log only on every <show_every_n_batches>-th step, counted
            # across epochs.
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches != 0:
                continue
            print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                epoch_i,
                batch_i,
                len(batches),
                train_loss))

Epoch   0 Batch    0/109   train_loss = 9.147
Epoch   0 Batch  100/109   train_loss = 4.171
Epoch   1 Batch   91/109   train_loss = 3.119
Epoch   2 Batch   82/109   train_loss = 2.554
Epoch   3 Batch   73/109   train_loss = 2.344


KeyboardInterrupt: 