# RNNLM
Recurrent Neural Network Language Model
RNN による言語モデルです。
文章の集団を学習させることで、それっぽい文章を生成できます。

これが発展して Seq2Seq のデコーダー部分になっていきます。

In [1]:
import tensorflow as tf
from tensorflow.python.layers import core as layers_core
import numpy as np
import os
import random
import collections

# Create Model

In [2]:
# Model hyperparameters.
vocab_size = 1000     # number of token ids: rows of the embedding table and width of the output layer
embedding_dim = 256   # length of each token's embedding vector
hidden_dim = 1024     # number of units in the GRU cell

In [3]:
# Graph inputs and outputs.
# in_ph / out_ph carry integer token ids as rank-2 tensors of unknown size,
# len_ph carries one integer per row, and gen_start_token_ph is a scalar id
# used only by the generation graph.
in_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='in_ph')
out_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='out_ph')
len_ph = tf.placeholder(dtype=tf.int32, shape=(None,), name='len_ph')
gen_start_token_ph = tf.placeholder(dtype=tf.int32, shape=(), name='gen_start_token_ph')

In [4]:
def debug(ops):
    """Evaluate a graph node on a tiny fixed batch and print the result.

    A temporary session is opened, every global variable is freshly
    initialized, and ``ops`` is run with two hard-coded 3-token sequences fed
    to ``in_ph``, ``out_ph`` and ``len_ph``. The node's name, its static shape
    and the computed value are then printed.

    Args:
        ops: The graph node to evaluate; it must expose ``name`` and ``shape``.
    """
    sample_feed = {
        in_ph: [[30, 40, 50], [160, 170, 180]],
        out_ph: [[40, 50, 60], [170, 180, 190]],
        len_ph: [3, 3],
    }
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        value = session.run(ops, feed_dict=sample_feed)
    print('## {}\nshape: {}'.format(ops.name, ops.shape))
    print(value)

In [5]:
# embeddings - turns character ids into distributed-representation vectors.
# After the lookup the data has the form [batch_size, sentence_len, embedding_dim].
embeddings = tf.Variable(
    initial_value=tf.random_normal(shape=[vocab_size, embedding_dim], stddev=1),
    name='embeddings',
    dtype=tf.float32,
)
in_embedded = tf.nn.embedding_lookup(params=embeddings, ids=in_ph)
debug(in_embedded)

## embedding_lookup:0
shape: (?, ?, 256)
[[[-1.51875842 -0.9256826   1.67430234 ..., -0.39987257  0.12857436
   -0.55193132]
  [ 1.42736435 -1.72631562 -0.07338642 ...,  1.14886475 -1.46833229
   -0.39282385]
  [ 0.2599524  -0.49925283 -0.35491866 ...,  0.16615726  1.20080113
    2.04619074]]

 [[ 0.07657067  0.82344168 -0.09656296 ..., -0.41994748 -0.46351427
    0.40188897]
  [-0.65720195  0.70287919  0.63894361 ..., -1.8827759   1.86026025
    1.41338992]
  [ 0.17521305  0.81392652  0.81880534 ..., -0.94512004  0.80815339
   -0.29899299]]]


In [6]:
# RNN part: one GRU cell unrolled over the embedded input sequence.
cell = tf.nn.rnn_cell.GRUCell(num_units=hidden_dim, kernel_initializer=tf.orthogonal_initializer)
rnn_out, final_state = tf.nn.dynamic_rnn(
    cell,
    in_embedded,
    sequence_length=len_ph,
    dtype=tf.float32,
    scope='rnn',
)

# Pass the hidden states through a fully connected layer to get a score per
# vocabulary entry (the i-th output unit scores the token with id i).
output_layer = layers_core.Dense(units=vocab_size, use_bias=False, name='output_layer')
onehot_logits = output_layer.apply(rnn_out)
debug(onehot_logits)

# Highest-scoring token id at every position.
output_ids_op = tf.argmax(onehot_logits, axis=-1)

## output_layer/Tensordot:0
shape: (?, ?, 1000)
[[[ 0.00221155  0.09382153  0.03604724 ..., -0.22709875 -0.06939974
   -0.18812199]
  [-0.06599759  0.1990689   0.02569382 ..., -0.1297736  -0.07503378
   -0.27143532]
  [-0.00870639  0.11945572  0.25159195 ..., -0.03676797 -0.10855646
    0.0134067 ]]

 [[ 0.01965541  0.00715457  0.05620908 ..., -0.12843172  0.17167172
    0.14248312]
  [ 0.02611106 -0.30134672  0.17909557 ...,  0.03745769  0.08064837
    0.19177547]
  [ 0.1086482  -0.21623132  0.19076024 ..., -0.07495454  0.11627872
    0.3164745 ]]]


In [7]:
# Cross entropy between the logits and the target ids, averaged into a
# single scalar training loss.
# NOTE(review): the mean is taken over every element and is not masked by
# len_ph, so padded positions would count if a batch ever held sequences
# shorter than the longest one — confirm all sequences are full length.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=onehot_logits,
    labels=out_ph,
)
loss_op = tf.reduce_mean(input_tensor=cross_entropy, name='loss')
debug(loss_op)

## loss:0
shape: ()
6.93211


In [8]:
# RNN used at generation time. It is built from the same cell, embeddings and
# output_layer objects as the training graph.
beam_width = 20
gen_max_len = 500  # upper bound on the number of decoding steps
start_tokens = tf.ones([1], tf.int32) * gen_start_token_ph  # batch size is 1 at generation time

decoder = tf.contrib.seq2seq.BeamSearchDecoder(
    cell=cell,
    embedding=embeddings,
    start_tokens=start_tokens,
    end_token=0,  # dummy
    initial_state=cell.zero_state(beam_width, tf.float32),
    beam_width=beam_width,
    output_layer=output_layer,
)

# dynamic_decode returns a tuple; only its first element (the decoder output)
# is needed. The step limit comes from gen_max_len, which was previously
# defined but ignored in favour of a second hard-coded 500.
beam_decoder_output = tf.contrib.seq2seq.dynamic_decode(
    decoder=decoder,
    maximum_iterations=gen_max_len,
    scope='generator_decode'
)[0]
generate_op = beam_decoder_output.predicted_ids

# Load and Convert Data

In [9]:
# Data settings.
data_path = 'data/natsume.txt'  # training corpus (plain text)
sentence_len = 50               # number of tokens per training sequence
batch_size = 512                # number of sequences per batch

In [10]:
class Tokenizer:
    """Character-level tokenizer that maps characters to integer ids and back.

    The ids 0-3 are treated as the special tokens ``<pad>``, ``<bos>``,
    ``<eos>`` and ``<unk>``; a vocabulary passed to the constructor is
    expected to start with those four entries (``from_text`` guarantees it).
    """

    # Reserved tokens, in id order, placed at the front of every vocabulary
    # built by from_text.
    SPECIAL_TOKENS = ('<pad>', '<bos>', '<eos>', '<unk>')

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: Sequence of tokens; a token's index is its id.
        """
        self.vocab = vocab
        self.rev_dict = {c: i for i, c in enumerate(vocab)}
        self.pad = 0
        self.bos = 1
        self.eos = 2
        self.unk = 3

    @classmethod
    def from_text(cls, text, max_vocab_size=None):
        """Build a tokenizer from the most frequent characters of ``text``.

        Args:
            text: Corpus to count characters in. May be empty, in which case
                the vocabulary holds only the special tokens.
            max_vocab_size: Maximum vocabulary size including the special
                tokens. Defaults to the module-level ``vocab_size``.

        Returns:
            A new tokenizer whose vocabulary is the special tokens followed
            by the characters in descending order of frequency.
        """
        if max_vocab_size is None:
            # Resolves to the module-level model setting, not the property.
            max_vocab_size = vocab_size
        char_budget = max(max_vocab_size - len(cls.SPECIAL_TOKENS), 0)
        char_freq_tuples = collections.Counter(text).most_common(char_budget)
        # A comprehension (rather than unpacking zip(*...)) also works when
        # there are no characters at all.
        chars = [char for char, _ in char_freq_tuples]
        return cls(list(cls.SPECIAL_TOKENS) + chars)

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def text2id(self, text):
        """Convert a string to a list of ids; unknown characters map to ``unk``."""
        return [self.rev_dict.get(c, self.unk) for c in text]

    def id2text(self, ids):
        """Convert an iterable of ids back to a string."""
        return ''.join(self.vocab[i] for i in ids)

In [11]:
# Read the whole corpus as a single string with line breaks removed.
# The encoding is stated explicitly so decoding the Japanese text does not
# depend on the platform's default encoding.
# NOTE(review): assumes the corpus file is stored as UTF-8 — confirm.
with open(data_path, encoding='utf-8') as f:
    text = f.read().replace('\n', '')

# Build the character vocabulary from the corpus, then encode the corpus.
tokenizer = Tokenizer.from_text(text)
ids = tokenizer.text2id(text)

def split_ndlist(ndlist, size):
    """Cut a sequence into consecutive, non-overlapping chunks of ``size`` items.

    Trailing items that do not fill a complete chunk are dropped.

    Args:
        ndlist: Sliceable sequence to split.
        size: Number of items per chunk.

    Returns:
        A list of numpy arrays, one per complete chunk, in original order.
    """
    chunks = []
    last_start = len(ndlist) - size
    for start in range(0, last_start + 1, size):
        chunks.append(np.array(ndlist[start:start + size]))
    return chunks

# Build (1st char, 2nd char), (2nd char, 3rd char), ... pairs so that the
# input at each time step is trained against the token of the next time step.
in_sequence_list = split_ndlist(ids[:-1], size=sentence_len)
out_sequence_list = split_ndlist(ids[1:], size=sentence_len)

# Group the sequences into chunks of batch_size.
in_batch_list = split_ndlist(in_sequence_list, size=batch_size)
out_batch_list = split_ndlist(out_sequence_list, size=batch_size)

# One dict per batch: the inputs, the shifted targets and each row's length.
batch_list = [
    {
        'in': in_batch,
        'out': out_batch,
        'len': np.array(list(map(len, in_batch))),
    }
    for in_batch, out_batch in zip(in_batch_list, out_batch_list)
]

In [12]:
# Sanity checks on the tokenizer and the batches.
# Encode a short string; characters missing from the vocabulary map to the <unk> id.
print(tokenizer.text2id('こんにちは😁'))
# Decode a list of ids back to text.
print(tokenizer.id2text([33, 27, 8, 51, 14, 3]))
# Number of batches available per epoch, and a look at the first one.
print('batch list num: {}'.format(len(batch_list)))
print(batch_list[0])

[33, 27, 8, 51, 14, 3]
こんにちは<unk>
batch list num: 129
{'in': array([[  3,  77,   8, ...,  17, 224,  38],
       [ 12,  16,  55, ...,   4, 317,  14],
       [491,   3, 120, ...,  27,  25,  18],
       ..., 
       [ 19,  25,  12, ..., 190, 255, 165],
       [ 11,  23,   4, ...,  10,  49, 266],
       [ 30,  12,  15, ...,   4,  14,  55]]), 'out': array([[ 77,   8,   3, ..., 224,  38,  12],
       [ 16,  55,  46, ..., 317,  14, 491],
       [  3, 120,   3, ...,  25,  18,   7],
       ..., 
       [ 25,  12,  10, ..., 255, 165,  11],
       [ 23,   4,  19, ...,  49, 266,  30],
       [ 12,  15,  13, ...,  14,  55, 109]]), 'len': array([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
       50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
       50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
       50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
       50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 

# Training

In [None]:
# Training settings.
learning_rate = 0.001                  # step size handed to the Adam optimizer
max_epoch = 50                         # number of passes over batch_list
log_dir = 'tmp/rnnlm/log/'             # where summaries are written
save_path = 'tmp/rnnlm/model.ckpt'     # checkpoint written during training

In [None]:
# Make sure the checkpoint directory and the summary log directory exist.
# exist_ok=True replaces the separate isdir() check, so an already existing
# directory is not an error. An empty name (e.g. save_path without a
# directory component) is skipped because os.makedirs('') would raise.
for directory in (os.path.dirname(save_path), log_dir):
    if directory:
        os.makedirs(directory, exist_ok=True)

# Step counter incremented by the optimizer; excluded from the trainable variables.
global_step = tf.Variable(initial_value=0, trainable=False, name='global_step')

# Adam update that minimizes the training loss.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss=loss_op, global_step=global_step)

# Record the loss as a scalar summary and merge all registered summaries.
tf.summary.scalar(name='loss', tensor=loss_op)
summary_op = tf.summary.merge_all()

In [None]:
# Without this check an empty batch_list would only surface as a NameError
# on `loss` at the first progress print.
if not batch_list:
    raise ValueError('batch_list is empty: the corpus is too short for sentence_len/batch_size')

# Lowest single-batch loss seen so far; infinity so that the first batch always counts.
min_loss = float('inf')
# The session is deliberately left open so later cells can keep using `sess`.
sess = tf.Session()
summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())
for epoch in range(max_epoch):
    # Visit the batches in a different order every epoch.
    random.shuffle(batch_list)
    for batch in batch_list:
        feed_dict = {
            in_ph: batch['in'],
            out_ph: batch['out'],
            len_ph: batch['len'],
        }
        _, loss, summary, step = sess.run([train_op, loss_op, summary_op, global_step], feed_dict)
        summary_writer.add_summary(summary, step)
        # Write a checkpoint whenever a batch sets a new lowest loss.
        if loss < min_loss:
            saver.save(sess, save_path)
            min_loss = loss
    # Push buffered summaries to disk once per epoch.
    summary_writer.flush()
    # Report 1-based epochs so the final line reads max_epoch/max_epoch;
    # the loss shown is that of the epoch's last batch.
    print('epoch {}/{} - loss: {}'.format(epoch + 1, max_epoch, loss))


# Restore

In [14]:
# Open a fresh session, initialize every variable, then overwrite the
# variables with the values stored in the checkpoint.
load_path = 'learned_model/rnnlm/model.ckpt'
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(sess=sess, save_path=load_path)

INFO:tensorflow:Restoring parameters from learned_model/rnnlm/model.ckpt


# Generate

In [None]:
# Generate text starting from a single character.
start_char = '私'
generated_ids = sess.run(
    generate_op,
    feed_dict={gen_start_token_ph: tokenizer.text2id(start_char)[0]},
)
# Keep index 0 of the first and last axes and every entry of the middle axis.
generated_ids = generated_ids[0, :, 0]
generated_text = start_char + tokenizer.id2text(generated_ids)
print(generated_text)