# Data preparing

In [1]:
import numpy as np
import jieba
import tensorflow as tf

In [2]:
# word embedding look up loading
word2vec_lookup = np.load('word2vec_gensim.npy')
word2vec_lookup = word2vec_lookup[None][0]

In [3]:
# one hot encoding
words2oneHot = dict()
words = np.array(list(word2vec_lookup.keys()))
voca_size = len(words)
for i in range(voca_size):
    words2oneHot[words[i]] = np.zeros(voca_size).astype('int')
    words2oneHot[words[i]][i] = 1
oneHotPos2vec_lookup = dict()
oneHotPos2words = dict()
for key, val in words2oneHot.items():
    oneHotPos2vec_lookup[np.argmax(words2oneHot[key])] = word2vec_lookup[key]
    oneHotPos2words[np.argmax(words2oneHot[key])] = key

In [4]:
vec_list = [list(oneHotPos2vec_lookup[i]) for i in range(len(oneHotPos2vec_lookup.keys()))]

In [5]:
# romance condor heroes vectors
with open('romance_condor_heroes.txt', 'r') as f:
    data = list(f)
sentences = list()
for i in range(len(data)):
    sentences += data[i].replace(' ', '').split('。')
print('Pre-processing done.')
sent_seg = list()
jieba.set_dictionary('dict.txt.big')
sent_seg_too_long = list()
for i in range(len(sentences)):
    s = sentences[i]
    s = list(jieba.cut(s))
    if len(s) > 50:
        sent_seg_too_long.append(s)
    else:
        unknowns = ['<unknown>' for j in range(50-len(s))]
        s += unknowns
        sent_seg.append(s)
sent_seg = np.array(sent_seg)
print('word segmentation done.', 'len =', len(sent_seg), len(sent_seg_too_long))

Building prefix dict from /media/warrents/6A085EA7085E7255/GoogleDriveSync/WarrenTS/Code/Code/Python2.7/IPython_Notebooks/MachineLearning/pecu_course/Text_mining_LSTM/dict.txt.big ...
Loading model from cache /tmp/jieba.u86847c3b15d467aa53d8311fcf6be389.cache


Pre-processing done.


Loading model cost 1.172 seconds.
Prefix dict has been built succesfully.


word segmentation done. len = 19416 2207


 - Batch data function

In [6]:
def batch_X(i, batch):
    batch_x = list()
    for s in sent_seg[i:i+batch]:
        v = list()
        for w in s:
            if w in word2vec_lookup:
                v.append(word2vec_lookup[w])
            else:
                v.append(word2vec_lookup['<unknown>'])
        batch_x.append(v)
    batch_x = np.array(batch_x)
    return batch_x
def batch_Y(i, batch):
    batch_y = list()
    for s in sent_seg[i+1:i+batch+1]:
        v = list()
        for w in s:
            if w in words2oneHot:
                v.append(words2oneHot[w])
            else:
                v.append(words2oneHot['<unknown>'])
        batch_y.append(v)
    batch_y = np.array(batch_y)
    return batch_y

In [7]:
batch_x = batch_X(0, 50)
batch_y = batch_Y(0, 50)
batch_x.shape, batch_x[45] # (batch size, sent len, vec len)

((50, 50, 64), array([[-2.12412047,  1.26558292,  0.48184228, ..., -1.78398168,
         -0.11021218, -0.40292478],
        [ 0.24595512, -0.22340411,  0.46395406, ..., -0.27114129,
         -0.14347288, -0.52333629],
        [ 0.70083529, -0.13018842,  1.39388931, ..., -0.0787876 ,
          0.99347192, -0.10601542],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]], dtype=float32))

## tf.embedding_lookup test

In [8]:
params = tf.constant([[10,20,30,40], [40,30,20,50]])
ids1 = tf.constant([[0,0,0,1], [0,0,1,0]])
#ids2 = tf.constant([0,3,3,2,3,1,2])
with tf.Session() as sess:
    tt = sess.run(tf.nn.embedding_lookup(params,ids1))
    print(tt)
    #print(sess.run(tf.nn.embedding_lookup(params,ids2)))

[[[10 20 30 40]
  [10 20 30 40]
  [10 20 30 40]
  [40 30 20 50]]

 [[10 20 30 40]
  [10 20 30 40]
  [40 30 20 50]
  [10 20 30 40]]]


In [9]:
params2 = tf.constant(vec_list)

ids3 = tf.constant([0, 1])
ids4 = tf.arg_max(tf.constant([[1, 0, 0], [0, 1, 0]]), dimension=1)
with tf.Session() as sess:
    _ = sess.run(tf.nn.embedding_lookup(params2,ids3))
    _2 = sess.run(tf.nn.embedding_lookup(params2,ids4))
    print(sess.run(ids4))

[0 1]


# LSTM

In [10]:
import tensorflow as tf

 - Hyperparameters

In [11]:
vec_size = len(word2vec_lookup['<go>'])
oneHot_size = len(words2oneHot['<go>'])
enc_len = 50
dec_len = enc_len
n_layer1 = 256
l_r = 1e-3
epoch = 3
batch_size = 50

 - Weight and bias functions

In [12]:
# generate random weight and bias
def _weight(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def _bias(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

 - Encoder and Decoder input
 <br> 50:50

In [13]:
# word vectors constant
vecs = tf.constant(vec_list)

# input - word vectors
word_vec = [tf.placeholder(tf.float32, shape=[None, vec_size], name='word_vec') for i in range(enc_len)]    

# encoder input
w_enc = _weight([vec_size, n_layer1])
b_enc = _bias([n_layer1])
enc_inp = [tf.nn.dropout(tf.nn.relu(tf.matmul(word_vec[i], w_enc)+b_enc), keep_prob=.8, name='enc_inp') \
           for i in range(enc_len)]

# training target - word one-hot
word_target = [tf.placeholder(tf.int32, shape=[None, oneHot_size], name='word_target') for i in range(dec_len)]

# one-hot to word vectors
target = [tf.nn.embedding_lookup(vecs,tf.arg_max(word_target[i], dimension=1)) for i in range(dec_len-1)]
target = [tf.zeros_like(target[0], dtype=np.float32, name="GO")] + target

# decoder input
w_dec = _weight([vec_size, n_layer1])
b_dec = _bias([n_layer1])
dec_inp = [tf.nn.dropout(tf.nn.relu(tf.matmul(target[i], w_dec)+b_dec), keep_prob=.8, name='dec_inp') \
           for i in range(dec_len)]

 - Seq2seq model

In [14]:
n_hiddens = 2
cells = [tf.contrib.rnn.BasicLSTMCell(n_layer1, forget_bias=1.0) for i in range(n_hiddens)]
cell = tf.contrib.rnn.MultiRNNCell(cells)

dec_outputs, dec_memory = tf.contrib.legacy_seq2seq.basic_rnn_seq2seq(
        enc_inp, 
        dec_inp, 
        cell
    )

w_dec_o = _weight([n_layer1, oneHot_size])
b_dec_o = _weight([oneHot_size])
dec_outputs = [tf.nn.softmax(tf.matmul(dec_outputs[i], w_dec_o)+b_dec_o, name='dec_out') \
               for i in range(dec_len)]

 - Loss function and optimizer

In [15]:
loss = 0
accu = 0
for y_hat, y_real in zip(dec_outputs, word_target):
    loss += tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_hat, labels=y_real))
    accu += tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_hat, 1), tf.argmax(y_real, 1)), tf.float32))

op = tf.train.RMSPropOptimizer(l_r).minimize(loss)

### - ***Training***

In [None]:
init = tf.global_variables_initializer()
#with tf.Session() as sess:
sess = tf.Session()
sess.run(init)
for epo in range(epoch):
    # len(sent_seg) == 19416
    for i in range(int(len(sent_seg)/batch_size)):
        batch_x = batch_X(i*batch_size, batch_size)
        batch_y = batch_Y(i*batch_size, batch_size)
        feed_dict = {word_vec[t]: batch_x[:, t] for t in range(enc_len)}
        feed_dict.update({word_target[t]: batch_y[:, t] for t in range(dec_len)})
        _ = sess.run([op], feed_dict=feed_dict)
        lo, ac = sess.run([loss, accu], feed_dict=feed_dict)
        if (i+1)%50 == 0:
            print(i+1)
            lo, ac = sess.run([loss, accu], feed_dict=feed_dict)
            print(lo, ac)
        #if (epo+1)%10 == 0:

50
464.766 0.0
100
464.766 0.0
150
432.642 32.14
200
432.035 32.74


words2oneHot
word2vec_lookup
oneHotPos2words
oneHotPos2vec_lookup
sent_seg
sent_seg_too_long