## 导入数据

In [1]:
from load_data import *
import tensorflow as tf
import os


os.environ['CUDA_VISIBLE_DEVICES'] = '1' #使用 GPU 0
def use_gpu_polite(using_rate=0.6):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = using_rate
    return config



In [2]:
# 导入数据，train data 是一个字典， ebd embdding 是词向量矩阵，作为embedding层的初始参数
train_data, test_data, ebd_weights = load_imdb_data()
print(train_data.keys())

Load pickle data from ../data/imdb_data_3col.pkl
Original 134957 words in vocabulary.
After truncated low frequent word:
words num: 40000/134957; words freq: 0.981
Words exit in w2v file: 39210/40004, rate: 98.015198%
Shape of weight matrix: (40006, 50)
Train data shape: (25000, 500) label length: 25000
Test data shape: (25000, 500) label length: 25000
dict_keys(['data', 'data_len', 'label'])


## TextCNN模型，完全细节到每个参数设计，注意各个参数维度

In [3]:
class TextCNN():
    def __init__(self, seq_length, class_num, ebd_weights, filter_num, filter_sizes = [2, 3, 4]):
        seed_num = 7  # 参数初始化种子

        self.seq_input = tf.placeholder(dtype=tf.int32, shape=[None, seq_length],
                                        name='sequence_input')
        self.sparse_label_input = tf.placeholder(dtype=tf.int32, shape=[None],
                                                 name='sparse_label')
        self.global_step = tf.Variable(0, trainable=False)
        self.global_step_op = tf.assign(self.global_step, self.global_step+1)

        embedding_dim = ebd_weights.shape[1]
        with tf.name_scope('embedding'):
            self.W = tf.Variable(initial_value=ebd_weights, name='W')
            self.embedding_layer = tf.nn.embedding_lookup(self.W, self.seq_input)
            # batch * seq_len * emb_dim * in_channel(1)
            self.embedding_layer_expand = tf.expand_dims(self.embedding_layer, axis=-1)

        pool_layers = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope('conv-maxpool-%s' % i):
                # flter: seq_size * emb_dim * in_channel * out_channelout_channel
                filter_shape = [filter_size, embedding_dim, 1, filter_num]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1,
                                                         seed=seed_num), name='W')
                b = tf.Variable(tf.constant(0.1, shape=[filter_num]), name='b')
                conv = tf.nn.conv2d(self.embedding_layer_expand, W,
                                         strides=[1, 1, 1, 1], padding='VALID', name='conv')
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                pooling = tf.nn.max_pool(h, ksize=[1, seq_length-filter_size+1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name='pool')
                pool_layers.append(pooling)

        # batch * 1 * 1 * all --> batch * all
        all_dim = len(filter_sizes) * filter_num
        self.pool_flatten = tf.reshape(tf.concat(pool_layers, axis=-1), shape=[-1, all_dim])

        with tf.name_scope('output'):
            W = tf.Variable(tf.truncated_normal([all_dim, class_num], seed=seed_num, name='W'))
            b = tf.Variable(tf.constant(0.1, shape=[class_num]), name='b')
            self.logits = tf.nn.xw_plus_b(self.pool_flatten, W, b, name='logits')
            self.prediction = tf.argmax(self.logits, axis=-1, output_type=tf.int32, name='prediction')

        with tf.name_scope('loss'):
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.sparse_label_input, logits=self.logits)
            self.loss_sum = tf.reduce_sum(losses)
            self.loss = tf.reduce_mean(losses, name='loss')

        with tf.name_scope('accuracy'):
            correct_predction = tf.equal(self.prediction, self.sparse_label_input)
            self.correct_num = tf.reduce_sum(tf.cast(correct_predction, tf.float16), name='correct_num')
            self.accuracy = tf.reduce_mean(tf.cast(correct_predction, tf.float16), name='accuracy')
        
        self.learning_rate = 1e-3
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        gs_vs = optimizer.compute_gradients(self.loss)
        self.train_op = optimizer.apply_gradients(gs_vs)

## 开始训练，观察训练集和测试集上的loss 和 acc

In [4]:
train_x, train_y = train_data['data'], [1 if i == 'pos' else 0 for i in train_data['label']]
test_x, test_y = test_data['data'], [1 if i == 'pos' else 0 for i in test_data['label']]
cnn_model = TextCNN(seq_length=train_x.shape[1], class_num=2, ebd_weights=ebd_weights, filter_num=32)
batch_size = 1000
epoch_max = 50


config = use_gpu_polite()
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    epoch_now = sess.run(cnn_model.global_step)
    
    while epoch_now < epoch_max:
        # 在训练集上按batch训练完所有，算作一个epoch
        batch_num = train_x.shape[0] // batch_size
        for i in range(batch_num+1):
            s_i = i * batch_size
            e_i = min((i+1)*batch_size, train_x.shape[0])
            if s_i >= e_i:
                continue
            in_x, in_y = train_x[s_i: e_i, :], train_y[s_i: e_i]
            feed_dict = {cnn_model.seq_input: in_x, cnn_model.sparse_label_input: in_y}
            sess.run(cnn_model.train_op, feed_dict)

        epoch_now = sess.run(cnn_model.global_step_op)  # 跑完了一个epoch，epoch+1
        
        if epoch_now % 10 == 0:  # 每10轮观察一下训练集测试集loss 和 acc
            # 训练集总的损失和acc也要分步测，否则内存不够
            batch_num = train_x.shape[0] // batch_size
            train_total_loss = 0
            train_total_correct = 0
            for i in range(batch_num+1):
                s_i = i * batch_size
                e_i = min((i+1)*batch_size, train_x.shape[0])
                if s_i >= e_i:
                    continue
                in_x, in_y = train_x[s_i: e_i, :], train_y[s_i: e_i]
                feed_dict = {cnn_model.seq_input: in_x, cnn_model.sparse_label_input: in_y}

                train_loss_one, train_correct_one = sess.run([cnn_model.loss_sum, cnn_model.correct_num], feed_dict)
                train_total_loss += train_loss_one
                train_total_correct += train_correct_one
            train_loss = train_total_loss / train_x.shape[0]
            train_acc = train_total_correct / train_x.shape[0]

            # 测试集的损失和acc
            batch_num = test_x.shape[0] // batch_size
            test_total_loss = 0
            test_total_correct = 0
            for i in range(batch_num+1):
                s_i = i * batch_size
                e_i = min((i+1)*batch_size, test_x.shape[0])
                if s_i >= e_i:
                    continue
                in_x, in_y = test_x[s_i: e_i, :], test_y[s_i: e_i]
                feed_dict = {cnn_model.seq_input: in_x, cnn_model.sparse_label_input: in_y}

                test_loss_one, test_correct_one = sess.run([cnn_model.loss_sum, cnn_model.correct_num], feed_dict)
                test_total_loss += test_loss_one
                test_total_correct += test_correct_one
            test_loss = test_total_loss / test_x.shape[0]
            test_acc = test_total_correct / test_x.shape[0]

            # 查看一组 中间参数
            flatten_param, logits = sess.run([cnn_model.pool_flatten, cnn_model.logits], 
                                             {cnn_model.seq_input: train_x[0:1, :]})
            print('Epoch %d, train loss %.4f, acc %.4f; test loss %.4f, acc %.4f' % 
                  (epoch_now, train_loss, train_acc, test_loss, test_acc))
            print(flatten_param, '\n', logits)


Epoch 10, train loss 4.5929, acc 0.6203; test loss 5.1768, acc 0.5987
[[5.095503   0.62826145 3.9841487  5.5019765  2.220422   4.743357
  5.8036704  3.8505344  4.7289195  4.699203   4.436813   4.1560545
  5.133413   3.617564   4.3160853  5.2199793  3.594837   3.6712592
  4.0461044  4.77945    3.2232623  2.431066   6.8121934  6.1613665
  3.3625274  3.9509478  3.7393603  4.708541   6.1360946  4.6546593
  4.4054646  3.2134683  5.3733535  5.9744163  5.67858    7.0276327
  4.220739   5.2743993  5.61334    5.665866   4.211244   5.8552175
  7.701244   4.1235967  7.8108835  4.97903    4.7353644  7.0625434
  4.47971    5.093168   4.5620117  6.091928   5.3831916  0.49923554
  8.651499   6.2799897  1.7130096  5.8069344  2.5869129  6.419062
  5.96157    4.238068   3.4507482  3.3257768  5.517814   5.8900013
  5.8953958  6.2903595  5.375534   6.1365404  5.9723315  4.97507
  6.4751463  5.5900445  6.282504   4.571477   7.2361274  5.486262
  6.6281266  6.0197763  5.3447547  5.01732    4.02223    5.8766