## 导入数据

In [1]:
import tensorflow as tf
import tensorflow.contrib as tfc
import os
from load_data import *


os.environ['CUDA_VISIBLE_DEVICES'] = '1' #使用 GPU 
def use_gpu_polite(using_rate=0.6):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = using_rate
    return config

In [2]:
# 导入数据，train data 是一个字典， ebd embdding 是词向量矩阵，作为embedding层的初始参数
train_data, test_data, ebd_weights = load_imdb_data()
print(train_data.keys())

Load pickle data from ../data/imdb_data_3col.pkl
Original 134957 words in vocabulary.
After truncated low frequent word:
words num: 40000/134957; words freq: 0.981
Words exit in w2v file: 39210/40004, rate: 98.015198%
Shape of weight matrix: (40006, 50)
Train data shape: (25000, 500) label length: 25000
Test data shape: (25000, 500) label length: 25000
dict_keys(['data', 'data_len', 'label'])


## text bilstm 模型，使用layers接口

In [3]:
class TextBiLstm():
    def __init__(self, class_num, ebd_weights, lstm_units=128):
        self.seq_input = tf.placeholder(dtype=tf.int32, shape=[None, None],
                                        name='sequence_input')
        self.seq_length = tf.placeholder(dtype=tf.int32, shape=[None], name='seq_length')
        self.sparse_label_input = tf.placeholder(dtype=tf.int32, shape=[None],
                                                 name='sparse_label')
        self.global_step = tf.Variable(0, trainable=False)
        self.global_step_op = tf.assign(self.global_step, self.global_step+1)
        # 使用动态指数递减学习率
        self.learning_rate = tf.train.exponential_decay(0.0015, self.global_step, decay_steps=10, 
                                       decay_rate=0.9, staircase=True)
        
        embedding_dim = ebd_weights.shape[1]
        with tf.name_scope('embedding'):
            self.W = tf.Variable(initial_value=ebd_weights, name='W')
            # batch * seq_len * emb_dim
            self.embedding_layer = tf.nn.embedding_lookup(self.W, self.seq_input)
        
        # 正反 双向的 LSTM cell，加入了dropout
        fw_cell = tfc.rnn.DropoutWrapper(tfc.rnn.LSTMCell(num_units=lstm_units), output_keep_prob=0.8)
        bw_cell = tfc.rnn.DropoutWrapper(tfc.rnn.LSTMCell(num_units=lstm_units), output_keep_prob=0.8)
        
        outputs, status = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell, self.embedding_layer,
                                                           sequence_length=self.seq_length, dtype=tf.float32)
        # 将输出的 序列 和 最终状态 正反向拼接
        self.outputs_concat = tf.concat(outputs, axis=-1)
        (f_c, f_h), (b_c, b_h) = status
        self.status_concat = tf.concat([f_h, b_h], axis=-1)
        
        with tf.name_scope('output'):
            self.logits = tf.layers.dense(self.status_concat, class_num)
            self.prediction = tf.argmax(self.logits, axis=-1, output_type=tf.int32)

        with tf.name_scope('loss'):
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.sparse_label_input,
                                                                    logits=self.logits)
            self.loss_sum = tf.reduce_sum(losses)
            self.loss = tf.reduce_mean(losses, name='loss')

        with tf.name_scope('accuracy'):
            correct_prediction = tf.equal(self.prediction, self.sparse_label_input)
            self.correct_num = tf.reduce_sum(tf.cast(correct_prediction, tf.float16))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float16))
        
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        gs_vs = optimizer.compute_gradients(self.loss)
        self.train_op = optimizer.apply_gradients(gs_vs)

## 开始训练

In [4]:
train_x, train_y = train_data['data'], [1 if i == 'pos' else 0 for i in train_data['label']]
test_x, test_y = test_data['data'], [1 if i == 'pos' else 0 for i in test_data['label']]
train_x_len, test_x_len = train_data['data_len'], test_data['data_len']
lstm_model = TextBiLstm(class_num=2, ebd_weights=ebd_weights, lstm_units=128)
batch_size = 2000
epoch_max = 50

config = use_gpu_polite(0.8)
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    epoch_now = sess.run(lstm_model.global_step)
    
    while epoch_now < epoch_max:
        # 在训练集上按batch训练完所有，算作一个epoch
        batch_num = train_x.shape[0] // batch_size
        for i in range(batch_num+1):
            s_i = i * batch_size
            e_i = min((i+1)*batch_size, train_x.shape[0])
            if s_i >= e_i:
                continue
            in_x, in_y = train_x[s_i: e_i, :], train_y[s_i: e_i]
            in_x_len = train_x_len[s_i: e_i]
            feed_dict = {lstm_model.seq_input: in_x, lstm_model.sparse_label_input: in_y,
                        lstm_model.seq_length: in_x_len}
            sess.run(lstm_model.train_op, feed_dict)
        epoch_now = sess.run(lstm_model.global_step_op)
        
        if epoch_now % 10 == 0:  # 每10轮观察一下训练集测试集loss 和 acc
            # 训练集总的损失和acc也要分步测，否则内存不够
            batch_num = train_x.shape[0] // batch_size
            train_total_loss = 0
            train_total_correct = 0
            for i in range(batch_num+1):
                s_i = i * batch_size
                e_i = min((i+1)*batch_size, train_x.shape[0])
                if s_i >= e_i:
                    continue
                in_x, in_y = train_x[s_i: e_i, :], train_y[s_i: e_i]
                in_x_len = train_x_len[s_i: e_i]
                feed_dict = {lstm_model.seq_input: in_x, lstm_model.sparse_label_input: in_y,
                            lstm_model.seq_length: in_x_len}

                train_loss_one, train_correct_one = sess.run([lstm_model.loss_sum, lstm_model.correct_num], feed_dict)
                train_total_loss += train_loss_one
                train_total_correct += train_correct_one
            train_loss = train_total_loss / train_x.shape[0]
            train_acc = train_total_correct / train_x.shape[0]

            # 测试集的损失和acc
            batch_num = test_x.shape[0] // batch_size
            test_total_loss = 0
            test_total_correct = 0
            for i in range(batch_num+1):
                s_i = i * batch_size
                e_i = min((i+1)*batch_size, test_x.shape[0])
                if s_i >= e_i:
                    continue
                in_x, in_y = test_x[s_i: e_i, :], test_y[s_i: e_i]
                in_x_len = test_x_len[s_i: e_i]
                feed_dict = {lstm_model.seq_input: in_x, lstm_model.sparse_label_input: in_y,
                            lstm_model.seq_length: in_x_len}

                test_loss_one, test_correct_one = sess.run([lstm_model.loss_sum, lstm_model.correct_num], feed_dict)
                test_total_loss += test_loss_one
                test_total_correct += test_correct_one
            test_loss = test_total_loss / test_x.shape[0]
            test_acc = test_total_correct / test_x.shape[0]
            
            lr_now = sess.run(lstm_model.learning_rate)
            print('Epoch %d, train loss %.4f, acc %.4f; test loss %.4f, acc %.4f. lr: %f' % 
                  (epoch_now, train_loss, train_acc, test_loss, test_acc, lr_now))   

Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead
Epoch 10, train loss 0.6926, acc 0.4984; test loss 0.6929, acc 0.4986. lr: 0.001350
Epoch 20, train loss 0.6921, acc 0.4987; test loss 0.6929, acc 0.4985. lr: 0.001215
Epoch 30, train loss 0.6904, acc 0.4985; test loss 0.6929, acc 0.4995. lr: 0.001093
Epoch 40, train loss 0.6859, acc 0.5121; test loss 0.6916, acc 0.5062. lr: 0.000984
Epoch 50, train loss 0.6447, acc 0.5545; test loss 0.6822, acc 0.5338. lr: 0.000886


## 模型用了一个很简单的双向LSTM，维度大小128  
## 训练了50轮但还是只有55%左右，可见拟合得十分慢  