# 基于Embedding的RNN生成混搭歌词
基于词粒度对RNN进行训练，文本采用中文歌词的分词文本。模型中加入Embedding层来降低输入词的维度。

# 1 读取数据

In [1]:
def load_data(fname, encoding='utf-8'):
    '''
    Read a whitespace-separated token file and return its tokens.

    Parameters
    ---
    fname: path of the text file to read
    encoding: text encoding used to decode the file. It is passed
        explicitly so that reading non-ASCII lyrics does not depend on
        the platform's default codec.

    Returns
    ---
    A list of tokens, split on any run of whitespace (empty for an
    empty file).
    '''
    with open(fname, 'r', encoding=encoding) as f:
        return f.read().split()

In [4]:
# Load the word-segmented lyrics corpus as one flat list of tokens
text = load_data('data/split.txt')

In [5]:
# Preview the first ten tokens of the corpus
print("前10个词: {}".format(text[:10]))

前10个词: ['疯狂', '世界', '.', '如果说', '了', '后悔', '.', '是不是', '一切', '就']


# 2 数据预处理

In [6]:
# Build the vocabulary and the word <-> id lookup tables.
# Ids are assigned over the *sorted* vocabulary: iterating a set of strings
# yields a different order in each interpreter session (hash randomization),
# which would make the ids disagree with a model restored from a checkpoint.
vocab = set(text)
vocab_to_int = {w: idx for idx, w in enumerate(sorted(vocab))}
int_to_vocab = {idx: w for w, idx in vocab_to_int.items()}

In [7]:
# Report the corpus length in tokens and the number of distinct tokens
print('Total words: {}'.format(len(text)))
print('Vocab size: {}'.format(len(vocab)))

Total words: 103558
Vocab size: 11143


In [8]:
# Encode the corpus as a list of integer word ids
int_text = list(map(vocab_to_int.__getitem__, text))

# 3 构建网络

In [9]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf
import numpy as np

In [10]:
# Require TensorFlow 1.0 or newer before building the graph
installed_version = LooseVersion(tf.__version__)
assert installed_version >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow版本: {}'.format(tf.__version__))

# Report the default GPU, or warn when none is available
gpu_name = tf.test.gpu_device_name()
if gpu_name:
    print('默认GPU设备: {}'.format(gpu_name))
else:
    warnings.warn('未发现GPU，请使用GPU进行训练！')

TensorFlow版本: 1.0.0
默认GPU设备: /gpu:0


## 输入层

In [11]:
def get_inputs():
    '''
    Create the placeholders that feed the network.

    Returns
    ---
    A tuple (inputs, targets, learning_rate): two int32 placeholders of
    shape [None, None] named 'inputs' and 'targets', and a float32
    placeholder named 'learning_rate'.
    '''
    placeholders = (
        tf.placeholder(tf.int32, [None, None], name='inputs'),
        tf.placeholder(tf.int32, [None, None], name='targets'),
        tf.placeholder(tf.float32, name='learning_rate'),
    )
    return placeholders

## RNN Cell

In [63]:
def get_init_cell(batch_size, rnn_size):
    '''
    Build the RNN cell and its zero initial state.

    Parameters
    ---
    batch_size: size of each batch
    rnn_size: number of hidden units in the LSTM cell

    Returns
    ---
    (cell, initial_state): a MultiRNNCell wrapping a single BasicLSTMCell,
    and the cell's float32 zero state passed through tf.identity so that
    it carries the name 'initial_state'.
    '''
    layers = [tf.contrib.rnn.BasicLSTMCell(rnn_size)]
    cell = tf.contrib.rnn.MultiRNNCell(layers)

    zero_state = cell.zero_state(batch_size, tf.float32)
    # Named so the tensor can be looked up by name later
    return cell, tf.identity(zero_state, name='initial_state')

## Word Embedding

In [64]:
def get_embed(input_data, vocab_size, embed_dim):
    '''
    Map word ids to trainable embedding vectors.

    Parameters
    ---
    input_data: tensor of word ids to look up
    vocab_size: number of rows in the embedding table
    embed_dim: length of each embedding vector

    Returns
    ---
    The result of looking up input_data in a [vocab_size, embed_dim]
    variable initialised uniformly between -1 and 1.
    '''
    initial_table = tf.random_uniform([vocab_size, embed_dim], minval=-1, maxval=1)
    embedding = tf.Variable(initial_table)
    return tf.nn.embedding_lookup(embedding, input_data)

## Build RNN

In [65]:
def build_rnn(cell, inputs):
    '''
    Run the RNN cell over a batch of input sequences.

    Parameters
    ---
    cell: the RNN cell to unroll
    inputs: the batch of inputs fed to the RNN

    Returns
    ---
    (outputs, final_state): the outputs of tf.nn.dynamic_rnn, and its
    final state passed through tf.identity so that it carries the name
    'final_state'.
    '''
    # Only dtype is given to dynamic_rnn; no initial_state argument is passed.
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)

    return rnn_outputs, tf.identity(last_state, name='final_state')

## Build Neural Network

In [66]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    '''
    Assemble the network: embedding -> RNN -> linear output layer.

    Parameters
    ---
    cell: the RNN cell
    rnn_size: number of RNN hidden units (not used inside this function)
    input_data: input tensor of word ids
    vocab_size: vocabulary size; also the width of the output layer
    embed_dim: size of the embedding vectors

    Returns
    ---
    (logits, final_state): the un-activated output of the fully connected
    layer, and the final RNN state.
    '''
    rnn_outputs, final_state = build_rnn(
        cell, get_embed(input_data, vocab_size, embed_dim))

    # activation_fn=None keeps the layer linear, so it emits raw logits
    logits = tf.contrib.layers.fully_connected(
        inputs=rnn_outputs, num_outputs=vocab_size, activation_fn=None)

    return logits, final_state

## 构造batch

在这里，我们将采用以下方式进行batch的构造：如果我们有一个1-20的序列，传入参数batch_size=3, seq_length=2，则希望返回如下的四维数组，形状为 (batch个数, 2, batch_size, seq_length)。无法凑成完整batch的末尾元素（这里是19和20）会被丢弃，最后一个目标值会回绕到序列的第一个元素。

分为了三个batch，每个batch中包含了输入和对应的目标输出。
get_batches([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 3, 2)

[

  # First Batch
  [
    # Batch of Input
    [[ 1  2], [ 7  8], [13 14]]
    # Batch of targets
    [[ 2  3], [ 8  9], [14 15]]
  ]

  # Second Batch
  [
    # Batch of Input
    [[ 3  4], [ 9 10], [15 16]]
    # Batch of targets
    [[ 4  5], [10 11], [16 17]]
  ]

  # Third Batch
  [
    # Batch of Input
    [[ 5  6], [11 12], [17 18]]
    # Batch of targets
    [[ 6  7], [12 13], [18  1]]
  ]
]

In [67]:
def get_batches(int_text, batch_size, seq_length):
    '''
    Split an encoded corpus into (input, target) training batches.

    Parameters
    ---
    int_text: sequence of integer word ids
    batch_size: number of sequences per batch
    seq_length: number of tokens per sequence

    Returns
    ---
    A numpy array of shape (n_batch, 2, batch_size, seq_length), where
    index 0 on the second axis holds the inputs and index 1 the targets.
    Targets are the inputs shifted left by one token; the very last
    target wraps around to the first token. Trailing tokens that do not
    fill a whole batch are dropped.

    Raises
    ---
    ValueError: if batch_size or seq_length is not positive, or if
        int_text is too short to fill a single batch.
    '''
    words_per_batch = batch_size * seq_length
    if batch_size <= 0 or seq_length <= 0:
        raise ValueError(
            'batch_size and seq_length must be positive, got {} and {}'.format(
                batch_size, seq_length))

    n_batch = len(int_text) // words_per_batch
    if n_batch == 0:
        raise ValueError(
            'need at least {} tokens to build one batch, got {}'.format(
                words_per_batch, len(int_text)))

    # Keep only as many tokens as fill complete batches
    int_text = np.array(int_text[:words_per_batch * n_batch])

    # Targets: next token for every position, wrapping around at the end
    int_text_targets = np.zeros_like(int_text)
    int_text_targets[:-1] = int_text[1:]
    int_text_targets[-1] = int_text[0]

    # One row per sequence slot, then cut the rows into n_batch column chunks
    x = np.split(int_text.reshape(batch_size, -1), n_batch, -1)
    y = np.split(int_text_targets.reshape(batch_size, -1), n_batch, -1)

    return np.stack((x, y), axis=1)

# 4 模型训练

In [68]:
# Number of passes over the full set of training batches
num_epochs = 100
# Number of sequences in each batch
batch_size = 64
# Number of hidden units in the LSTM cell
rnn_size = 512
# Length of each word embedding vector
embed_dim = 200
# Number of tokens in each training sequence
seq_length = 20
# Learning rate fed to the optimizer
# NOTE(review): the recorded training log shows the loss rising again after
# roughly epoch 20; this rate may be too high for stable training - confirm.
learning_rate = 0.01
# Print the training loss once every this many batch steps (counted across epochs)
show_every_n_batches = 100

In [69]:
from tensorflow.contrib import seq2seq

# Build the training graph: inputs, network, loss, and a clipped Adam update.
train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab) # number of distinct tokens
    input_text, targets, lr = get_inputs() # input placeholders
    input_data_shape = tf.shape(input_text)
    # Build the RNN cell; the batch size is taken from the input's run-time shape
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Softmax probabilities, named 'probs' so they can be fetched by name later
    probs = tf.nn.softmax(logits, name='probs')

    # Loss: sequence loss with a weight of 1 for every position of the input
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer; the learning rate is supplied through the `lr` placeholder
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient clipping: clamp every gradient value to [-1, 1] and skip
    # variables that have no gradient
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

In [70]:
# Slice the encoded corpus into (input, target) batches
batches = get_batches(int_text, batch_size, seq_length)
# Path prefix under which the trained model is saved
save_dir = './save'

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    n_batches = len(batches)

    for epoch in range(num_epochs):
        # Re-evaluate the initial state at the start of every epoch
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Only report once every `show_every_n_batches` steps overall
            global_step = epoch * n_batches + batch_i
            if global_step % show_every_n_batches != 0:
                continue
            print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                epoch,
                batch_i,
                n_batches,
                train_loss))
    # Save the trained model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/80   train_loss = 9.318
Epoch   1 Batch   20/80   train_loss = 5.227
Epoch   2 Batch   40/80   train_loss = 3.993
Epoch   3 Batch   60/80   train_loss = 2.916
Epoch   5 Batch    0/80   train_loss = 2.104
Epoch   6 Batch   20/80   train_loss = 1.980
Epoch   7 Batch   40/80   train_loss = 1.770
Epoch   8 Batch   60/80   train_loss = 1.329
Epoch  10 Batch    0/80   train_loss = 1.064
Epoch  11 Batch   20/80   train_loss = 0.848
Epoch  12 Batch   40/80   train_loss = 0.723
Epoch  13 Batch   60/80   train_loss = 0.528
Epoch  15 Batch    0/80   train_loss = 0.480
Epoch  16 Batch   20/80   train_loss = 0.480
Epoch  17 Batch   40/80   train_loss = 0.394
Epoch  18 Batch   60/80   train_loss = 0.364
Epoch  20 Batch    0/80   train_loss = 0.410
Epoch  21 Batch   20/80   train_loss = 0.451
Epoch  22 Batch   40/80   train_loss = 0.568
Epoch  23 Batch   60/80   train_loss = 0.858
Epoch  25 Batch    0/80   train_loss = 1.164
Epoch  26 Batch   20/80   train_loss = 1.154
Epoch  27 

In [71]:
def get_tensors(loaded_graph):
    '''
    Look up the tensors needed for generation in a loaded graph.

    Parameters
    ---
    loaded_graph: the TensorFlow graph loaded from file

    Returns
    ---
    A tuple (inputs, initial_state, final_state, probs) holding the
    tensors named 'inputs:0', 'initial_state:0', 'final_state:0' and
    'probs:0'.
    '''
    tensor_names = ('inputs:0', 'initial_state:0', 'final_state:0', 'probs:0')
    return tuple(loaded_graph.get_tensor_by_name(name) for name in tensor_names)

In [72]:
def pick_word(probabilities, int_to_vocab):
    '''
    Sample the next word according to the predicted distribution.

    Parameters
    ---
    probabilities: 1-D array with one probability per word id
    int_to_vocab: mapping from word id to word

    Returns
    ---
    The word whose id was drawn at random, with each id weighted by its
    probability.
    '''
    # A single draw is enough; sampling extra values only to discard them
    # wastes work on every generated word.
    word_id = np.random.choice(len(probabilities), p=probabilities)
    return int_to_vocab[int(word_id)]

In [78]:
# Number of words to generate
gen_length = 300

# Word used to start the generated text
prime_word = '离开'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(save_dir + '.meta')
    loader.restore(sess, save_dir)

    # Fetch the tensors needed for generation
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word]
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate one word per iteration
    for n in range(gen_length):
        # Context: at most the last `seq_length` generated words, as a batch of one
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Predict
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})

        # `probabilities` is (batch, time, vocab) with a batch of one, so pick
        # batch 0 and then the last time step. Indexing the first axis with
        # the time position fails as soon as the context holds two words.
        pred_word = pick_word(probabilities[0][dyn_seq_length - 1], int_to_vocab)

        gen_sentences.append(pred_word)

    # Join the words, turn ';' into line breaks, and drop '.' and spaces
    lyrics = ' '.join(gen_sentences)
    lyrics = lyrics.replace(';', '\n')
    lyrics = lyrics.replace('.', ' ')
    lyrics = lyrics.replace(' ', '')

    print(lyrics)

离开就说不要爱我掉进爱情悬崖跌太深爬不出来下降的速度太快那颜色我已全然龙你说我若一个人会比较自由我不懂你说什么反正不会松手我背你走到最后能不能别想太多会不会手牵着手晚一点才到尽头你说我说像手牵手
距离的噪音化为乌有
你说你说我听个够
这世界，身邊還會有谁，都梦想没有那的你永远那么想你曾停下过吗你说你会哭不是因为在乎不是因为在乎不是因为在乎朦胧的时间
和你都回来吧
你的什么
而你已经不爱我最安静都还不懂不想太多我已经
是你把你万一没万一我我最的鱼自由
我只想你
笑为我
听不到
我想逃往哪里逃
你都看不见你都要结束我不
我就像
不要说
其实我的快乐不要问我不要问我我不要再想我的快乐为什么我却还在流
爱是笑呵呵的风
然后哎呀呀的痛
直到你太多把我放
如果你天生
(间奏)我说你已经听你听我的把心你看宇宙(你在
