# Data Preparing

In [1]:
import numpy as np
import jieba
import tensorflow as tf
import re

In [2]:
# word embedding look up loading
word2vec_lookup = np.load('word2vec_gensim.npy')
word2vec_lookup = word2vec_lookup[None][0]

In [3]:
# one hot encoding
words2oneHot = dict()
words = np.array(list(word2vec_lookup.keys()))
voca_size = len(words)
for i in range(voca_size):
    words2oneHot[words[i]] = np.zeros(voca_size).astype('int')
    words2oneHot[words[i]][i] = 1
oneHotPos2vec_lookup = dict()
oneHotPos2words = dict()
for key, val in words2oneHot.items():
    oneHotPos2vec_lookup[np.argmax(words2oneHot[key])] = word2vec_lookup[key]
    oneHotPos2words[np.argmax(words2oneHot[key])] = key

In [4]:
vec_list = [list(oneHotPos2vec_lookup[i]) for i in range(len(oneHotPos2vec_lookup.keys()))]

In [5]:
# books vectors
sent_len = 10

with open('101Matthew.htm', 'r') as f1:
    data = list(f1)
f1.close()
with open('102Mark.htm', 'r') as f2:
    data += list(f2)
f2.close()
with open('103Luke.htm', 'r') as f3:
    data += list(f3)
f3.close()
with open('104John.htm', 'r') as f4:
    data += list(f4)
f4.close()
sentences = list()
for i in range(len(data)):
    data[i] = re.sub(r'[<>『』(）《*/"=!@#%:.？…、“，\n”‘’: |[-]|\u3000]|[a-zA-Z0-9]|[\u3000]', r'', data[i])
    data[i]=re.split('。', data[i])
    for element in data[i]:
        if element == '':
            data[i].remove(element)
    sentences += data[i]
print('Pre-processing done.')
sent_seg = list()
jieba.set_dictionary('dict.txt.big')
sent_seg_too_long = list()
f_s = list()
for i in range(len(sentences)):
    s = sentences[i]
    s = list(jieba.cut(s))
    if len(s) > sent_len:
        f = int(len(s)/sent_len)
        f_ = sent_len - len(s) + int(len(s)/sent_len)*sent_len
        for j in range(f):
            sent_seg.append(s[j*sent_len:j*sent_len+sent_len])
            f_s.append(0)
        unknowns = ['<unknown>' for k in range(f_)]
        if len(unknowns) >= sent_len/2:
            continue
        sent_seg.append(s[j*sent_len+sent_len:]+unknowns)
        sent_seg_too_long.append(s)
    else:
        f_ = 0
        unknowns = ['<unknown>' for j in range(sent_len-len(s))]
        s += unknowns
        sent_seg.append(s)
    f_s.append(f_)
sent_seg = np.array(sent_seg)
print('word segmentation done.', 'len =', len(sent_seg), len(sent_seg_too_long))
sent_seg.shape

Building prefix dict from /mnt/hgfs/進階軟體開發專題/NLP/dict.txt.big ...
Loading model from cache /tmp/jieba.u1ad5655d51ba17ccc0ad09208c097d82.cache


Pre-processing done.


Loading model cost 1.458 seconds.
Prefix dict has been built succesfully.


word segmentation done. len = 6910 1104


(6910, 10)

In [6]:
__ = dict()
for i in f_s:
    if i in __:
        __[i] += 1
    else:
        __[i] = 1
__

{0: 5806, 1: 247, 2: 251, 3: 278, 4: 328}

In [7]:
def batch_X(i, batch):
    batch_x = list()
    for s in sent_seg[i:i+batch]:
        v = list()
        for w in s:
            if w is '<unknown>':
                v.append(vec_list[np.random.randint(len(vec_list))])
            elif w in word2vec_lookup:
                v.append(word2vec_lookup[w])
            else:
                v.append(vec_list[np.random.randint(len(vec_list))])
        batch_x.append(v)
    batch_x = np.array(batch_x)
    return batch_x
def batch_Y(i, batch):
    batch_y = list()
    for s in sent_seg[i+1:i+batch+1]:
        v = list()
        for w in s:
            if w is '<unknown>':
                v.append(np.random.randint(len(vec_list)))
            elif w in words2oneHot:
                v.append(np.argmax(words2oneHot[w]))
            else:
                v.append(np.random.randint(len(vec_list)))
        batch_y.append(v)
    batch_y = np.array(batch_y)
    return batch_y

In [8]:
batch_x = batch_X(0, 50)
batch_x2 = batch_X(1, 50)
batch_y = batch_Y(0, 50)
batch_x.shape, batch_x[45], batch_x[1]==batch_x2[0] # (batch size, sent len, vec len)

((50, 10, 1024),
 array([[ 0.00139729, -0.00304426,  0.00319758, ...,  0.00546759,
          0.0030041 , -0.00779329],
        [ 0.00154612, -0.00377121,  0.00481181, ...,  0.00671196,
          0.00388792, -0.00995091],
        [ 0.07981644, -0.14293131,  0.19046581, ...,  0.29229102,
          0.17591159, -0.42922705],
        ..., 
        [ 0.0024073 , -0.00417157,  0.00470921, ...,  0.00731867,
          0.00446155, -0.01117776],
        [ 0.07981644, -0.14293131,  0.19046581, ...,  0.29229102,
          0.17591159, -0.42922705],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]], dtype=float32),
 array([[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ..., 
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,

# LSTM

In [9]:
import tensorflow as tf

In [10]:
vec_size = len(word2vec_lookup['<unknown>'])
oneHot_size = len(words2oneHot['<unknown>'])
enc_len = sent_len
dec_len = enc_len
n_layer1 = 1024
l_r = 1e-3
epoch = 500
batch_size = 128

In [11]:
# generate random weight and bias
def _weight(shape):
    initial = tf.truncated_normal(shape, stddev=1.)
    return tf.Variable(initial)

def _bias(shape):
    initial = tf.constant(1., shape=shape)
    return tf.Variable(initial)

In [12]:
# word vectors constant
vecs = tf.constant(vec_list)

# input - word vectors
word_vec = [tf.placeholder(tf.float32, shape=[None, vec_size], name='word_vec') for i in range(enc_len)]    

# encoder input
w_enc = _weight([vec_size, n_layer1])
b_enc = _bias([n_layer1])
enc_inp = [tf.nn.dropout(tf.sigmoid(tf.matmul(word_vec[i], w_enc)+b_enc), keep_prob=.8, name='enc_inp') \
           for i in range(enc_len)]

# training target - word one-hot
word_target = [tf.placeholder(tf.int32, shape=[None], name='word_target') for i in range(dec_len)]

# one-hot to word vectors
target = [tf.nn.embedding_lookup(vecs, word_target[i]) for i in range(dec_len-1)]
target = [tf.zeros_like(target[0], dtype=np.float32, name="GO")] + target

# decoder input
w_dec = _weight([vec_size, n_layer1])
b_dec = _bias([n_layer1])
dec_inp = [tf.nn.dropout(tf.nn.relu(tf.matmul(target[i], w_dec)+b_dec), keep_prob=.8, name='dec_inp') \
           for i in range(dec_len)]

In [13]:
n_hiddens = 3
cells = [tf.contrib.rnn.BasicLSTMCell(n_layer1, forget_bias=1.0) for i in range(n_hiddens)]
cell = tf.contrib.rnn.MultiRNNCell(cells)

dec_outputs, dec_memory = tf.contrib.legacy_seq2seq.basic_rnn_seq2seq(
        enc_inp, 
        dec_inp, 
        cell
    )

w_dec_o = _weight([n_layer1, oneHot_size])
b_dec_o = _weight([oneHot_size])
dec_outputs = [tf.matmul(dec_outputs[i], w_dec_o)+b_dec_o for i in range(dec_len)]

In [14]:
loss = list()
loss_all = 0
accu = list()
for y_hat, y_real in zip(dec_outputs, word_target):
    #loss.append(tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_hat, labels=y_real)))
    print(y_hat, y_real)
    #accu += tf.reduce_mean(tf.cast(tf.equal(y_hat, y_real), tf.float32))
    loss_all += tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_hat, labels=y_real)
loss_all = tf.reduce_mean(loss_all/dec_len)
lr_decay = 0.92  # default: 0.9 . Simulated annealing.
momentum = 0.5  # default: 0.0 . Momentum technique in weights update
op = tf.train.RMSPropOptimizer(l_r, decay=lr_decay, momentum=momentum).minimize(loss_all)
#ops = [tf.train.AdamOptimizer(l_r).minimize(l) for l in loss]
#op3 = tf.train.AdagradOptimizer(l_r).minimize(loss)

Tensor("add_20:0", shape=(?, 6306), dtype=float32) Tensor("word_target:0", shape=(?,), dtype=int32)
Tensor("add_21:0", shape=(?, 6306), dtype=float32) Tensor("word_target_1:0", shape=(?,), dtype=int32)
Tensor("add_22:0", shape=(?, 6306), dtype=float32) Tensor("word_target_2:0", shape=(?,), dtype=int32)
Tensor("add_23:0", shape=(?, 6306), dtype=float32) Tensor("word_target_3:0", shape=(?,), dtype=int32)
Tensor("add_24:0", shape=(?, 6306), dtype=float32) Tensor("word_target_4:0", shape=(?,), dtype=int32)
Tensor("add_25:0", shape=(?, 6306), dtype=float32) Tensor("word_target_5:0", shape=(?,), dtype=int32)
Tensor("add_26:0", shape=(?, 6306), dtype=float32) Tensor("word_target_6:0", shape=(?,), dtype=int32)
Tensor("add_27:0", shape=(?, 6306), dtype=float32) Tensor("word_target_7:0", shape=(?,), dtype=int32)
Tensor("add_28:0", shape=(?, 6306), dtype=float32) Tensor("word_target_8:0", shape=(?,), dtype=int32)
Tensor("add_29:0", shape=(?, 6306), dtype=float32) Tensor("word_target_9:0", shape=(

In [15]:
op#, ops

<tf.Operation 'RMSProp' type=NoOp>

In [16]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [None]:
import time
for epo in range(epoch):
    print('Epoch', epo+1)
    t_start = time.time()
    for i in range(int(len(sent_seg)/batch_size)):
        batch_x = batch_X(i*batch_size, batch_size)
        batch_y = batch_Y(i*batch_size, batch_size)
        feed_dict = {word_vec[t]: batch_x[:, t] for t in range(enc_len)}
        feed_dict.update({word_target[t]: batch_y[:, t] for t in range(dec_len)})
        
        _ = sess.run([op], feed_dict=feed_dict)
        #lo, ac = sess.run([loss, accu], feed_dict=feed_dict)
        if (i+1)%50 == 0:
            lo = sess.run([loss_all], feed_dict=feed_dict)
            print('Epoch:', epo+1, 
                  ', iter:', i+1, 
                  ', loss:', lo)
        #if (epo+1)%10 == 0:
    #-----test-----
    batch_x = batch_X(1886, 1)
    batch_y = batch_Y(1886, 1)
    feed_dict = {word_vec[t]: batch_x[:, t] for t in range(enc_len)}
    feed_dict.update({word_target[t]: batch_y[:, t] for t in range(dec_len)})
    test = sess.run([dec_outputs], feed_dict=feed_dict)
    test = test[0]
    w_ = list()
    for i in range(sent_len):
        w_.append(test[i][0])
    print('Input: '+''.join(sent_seg[1886]))
    print('Output: '+''.join([oneHotPos2words[np.argmax(w)] for w in w_ ]))
    t_end = time.time()
    print('--------------------------Time span:', t_end-t_start, '--------------------------')

Epoch 1
Epoch: 1 , iter: 50 , loss: [7.243679]
Input: 恐怕他的門徒來把他偷了去
Output: 肥肥肥．．．．．．．
--------------------------Time span: 1119.944514989853 --------------------------
Epoch 2
Epoch: 2 , iter: 50 , loss: [38.62384]
Input: 恐怕他的門徒來把他偷了去
Output: 有有和和和和和和和和
--------------------------Time span: 1147.1416835784912 --------------------------
Epoch 3
Epoch: 3 , iter: 50 , loss: [6.9828448]
Input: 恐怕他的門徒來把他偷了去
Output: 耶穌說說說<unknown><unknown><unknown><unknown><unknown><unknown>
--------------------------Time span: 1097.3119313716888 --------------------------
Epoch 4
Epoch: 4 , iter: 50 , loss: [6.8125558]
Input: 恐怕他的門徒來把他偷了去
Output: 耶穌說說說說說說的說說
--------------------------Time span: 1096.0096981525421 --------------------------
Epoch 5
Epoch: 5 , iter: 50 , loss: [6.0003629]
Input: 恐怕他的門徒來把他偷了去
Output: 的的的的的的的的的的
--------------------------Time span: 1097.9076302051544 --------------------------
Epoch 6
Epoch: 6 , iter: 50 , loss: [5.835309]
Input: 恐怕他的門徒來把他偷了去
Output: 他們說說你你你你你你你
----------------

In [None]:
saver = tf.train.Saver()
saver.save(sess, './models/'+time.strftime('%Y.%m.%d_%H%M%S')+'novel')