## 导入数据

In [1]:
from load_data import *
import tensorflow as tf
import os


# Restrict this process to the GPU whose CUDA index is 1. Inside the process
# that single visible card is enumerated as device 0.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
def use_gpu_polite(using_rate=0.6):
    """Build a session config that does not grab the whole GPU up front.

    Args:
        using_rate: value stored in
            ``gpu_options.per_process_gpu_memory_fraction`` (default 0.6).

    Returns:
        A ``tf.ConfigProto`` with ``allow_growth`` enabled and the memory
        fraction set to ``using_rate``.
    """
    session_config = tf.ConfigProto()
    gpu_opts = session_config.gpu_options
    # The two settings are independent of each other.
    gpu_opts.per_process_gpu_memory_fraction = using_rate
    gpu_opts.allow_growth = True
    return session_config



In [2]:
# Load the data. train_data is a dict (its keys are printed below); ebd_weights
# is the word-vector matrix used as the initial value of the embedding layer.
train_data, test_data, ebd_weights = load_imdb_data()
print(train_data.keys())

Load pickle data from ../data/imdb_data_3col.pkl
Original 134957 words in vocabulary.
After truncated low frequent word:
words num: 40000/134957; words freq: 0.981
Words exit in w2v file: 39210/40004, rate: 98.015198%
Shape of weight matrix: (40006, 50)
Train data shape: (25000, 500) label length: 25000
Test data shape: (25000, 500) label length: 25000
dict_keys(['data', 'data_len', 'label'])


## TextCNN模型，使用layers接口实现

In [3]:
class TextCNN_layer():
    """TextCNN classifier built in TensorFlow 1.x graph mode.

    Constructing an instance adds the whole model to the default graph:
    embedding lookup -> one ``tf.layers.conv2d`` + max-pooling branch per
    filter size -> concatenation -> hand-written linear output layer ->
    sparse softmax cross-entropy loss, accuracy metrics and an Adam train op.

    Tensors / ops exposed on the instance:
        seq_input: int32 placeholder, shape [None, seq_length] (token ids).
        sparse_label_input: int32 placeholder, shape [None] (class ids).
        global_step / global_step_op: a non-trainable counter and the op that
            increments it by one. It is NOT passed to the optimizer, so it only
            changes when ``global_step_op`` is run explicitly.
        W: embedding variable initialised from ``ebd_weights``.
        pool_flatten: concatenated pooled features,
            shape [batch, len(filter_sizes) * filter_num].
        logits / prediction: class scores and their int32 argmax.
        loss_sum / loss: summed and mean cross-entropy over the fed batch.
        correct_num / accuracy: number and fraction of correct predictions
            in the fed batch (float32).
        learning_rate: the Adam learning rate.
        train_op: one Adam update on ``loss``.
    """

    def __init__(self, seq_length, class_num, ebd_weights, filter_num,
                 filter_sizes=(2, 3, 4), learning_rate=1e-3):
        """Build the graph.

        Args:
            seq_length: fixed length of every input sequence.
            class_num: number of output classes.
            ebd_weights: 2-D array-like; its second dimension is taken as the
                embedding size and its values initialise the embedding variable.
            filter_num: number of filters per convolution branch.
            filter_sizes: heights (in tokens) of the convolution kernels; one
                conv + pool branch is built per entry.
            learning_rate: learning rate handed to ``tf.train.AdamOptimizer``.
        """
        # Fixed op-level seed shared by every random initialiser below.
        seed_num = 7

        self.seq_input = tf.placeholder(dtype=tf.int32, shape=[None, seq_length],
                                        name='sequence_input')
        self.sparse_label_input = tf.placeholder(dtype=tf.int32, shape=[None],
                                                 name='sparse_label')
        self.global_step = tf.Variable(0, trainable=False)
        self.global_step_op = tf.assign(self.global_step, self.global_step+1)

        embedding_dim = ebd_weights.shape[1]
        with tf.name_scope('embedding'):
            # NOTE(review): the variable inherits the dtype of ebd_weights;
            # this presumably is float32 -- confirm against load_imdb_data.
            self.W = tf.Variable(initial_value=ebd_weights, name='W')
            self.embedding_layer = tf.nn.embedding_lookup(self.W, self.seq_input)
            # batch * seq_len * emb_dim * in_channel(1)
            self.embedding_layer_expand = tf.expand_dims(self.embedding_layer, axis=-1)

        pool_layers = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope('conv-maxpool-%s' % i):
                # The kernel spans the full embedding width, so with VALID
                # padding the output is
                # batch * (seq_len - filter_size + 1) * 1 * filter_num.
                conv = tf.layers.conv2d(self.embedding_layer_expand, filter_num, [filter_size, embedding_dim],
                                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1, seed=seed_num),
                                bias_initializer=tf.constant_initializer(0.1),
                                activation=tf.nn.relu, padding='VALID')
                # Pool over the whole remaining sequence axis (max over time).
                pool = tf.layers.max_pooling2d(conv, pool_size=[seq_length-filter_size+1, 1],
                                               strides=[1, 1], padding='VALID')
                pool_layers.append(pool)

        all_dim = len(filter_sizes) * filter_num
        self.pool_flatten = tf.reshape(tf.concat(pool_layers, -1), shape=[-1, all_dim])

        # The output layer is written with explicit variables (not
        # tf.layers.dense) so its initialisation is fully under our control.
        with tf.name_scope('output'):
            # name= belongs on the Variable, not on the initial-value op.
            W = tf.Variable(tf.truncated_normal([all_dim, class_num], seed=seed_num), name='W')
            b = tf.Variable(tf.constant(0.1, shape=[class_num]), name='b')
            self.logits = tf.nn.xw_plus_b(self.pool_flatten, W, b, name='logits')
            self.prediction = tf.argmax(self.logits, axis=-1, output_type=tf.int32, name='prediction')

        with tf.name_scope('loss'):
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.sparse_label_input, logits=self.logits)
            self.loss_sum = tf.reduce_sum(losses)
            self.loss = tf.reduce_mean(losses, name='loss')

        with tf.name_scope('accuracy'):
            correct_prediction = tf.equal(self.prediction, self.sparse_label_input)
            # float32, not float16: float16 cannot represent every integer
            # above 2048, so counts for large batches (and any running total
            # kept in that dtype by the caller) would be silently rounded.
            correct_as_float = tf.cast(correct_prediction, tf.float32)
            self.correct_num = tf.reduce_sum(correct_as_float, name='correct_num')
            self.accuracy = tf.reduce_mean(correct_as_float, name='accuracy')

        self.learning_rate = learning_rate
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        gs_vs = optimizer.compute_gradients(self.loss)
        self.train_op = optimizer.apply_gradients(gs_vs)

## 开始训练

In [4]:
def iter_batches(data, labels, batch_size):
    """Yield consecutive (data, labels) slices of at most batch_size rows, in order."""
    for start in range(0, data.shape[0], batch_size):
        # Slicing clamps at the end, so the last batch may be shorter.
        stop = start + batch_size
        yield data[start:stop, :], labels[start:stop]


def evaluate(sess, model, data, labels, batch_size):
    """Return (mean loss, accuracy) of model over the whole data set.

    The data is fed batch by batch because a single pass over everything does
    not fit in memory. Totals are kept as Python numbers so they do not
    inherit a low-precision dtype from the fetched values.
    """
    total_loss = 0.0
    total_correct = 0
    for in_x, in_y in iter_batches(data, labels, batch_size):
        feed_dict = {model.seq_input: in_x, model.sparse_label_input: in_y}
        loss_one, correct_one = sess.run([model.loss_sum, model.correct_num], feed_dict)
        total_loss += float(loss_one)
        total_correct += int(correct_one)
    sample_num = data.shape[0]
    return total_loss / sample_num, total_correct / sample_num


# Labels are the strings 'pos' / anything else; map them to class ids 1 / 0.
train_x, train_y = train_data['data'], [1 if i == 'pos' else 0 for i in train_data['label']]
test_x, test_y = test_data['data'], [1 if i == 'pos' else 0 for i in test_data['label']]
cnn_model = TextCNN_layer(seq_length=train_x.shape[1], class_num=2, ebd_weights=ebd_weights, filter_num=32)
batch_size = 1000
epoch_max = 50
eval_every = 10  # report train/test loss and accuracy every this many epochs


config = use_gpu_polite()
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # The model's global_step is used here as an epoch counter.
    epoch_now = sess.run(cnn_model.global_step)

    while epoch_now < epoch_max:
        # One epoch = one ordered pass over the training set, batch by batch.
        for in_x, in_y in iter_batches(train_x, train_y, batch_size):
            feed_dict = {cnn_model.seq_input: in_x, cnn_model.sparse_label_input: in_y}
            sess.run(cnn_model.train_op, feed_dict)

        epoch_now = sess.run(cnn_model.global_step_op)  # epoch finished: counter += 1

        if epoch_now % eval_every == 0:
            train_loss, train_acc = evaluate(sess, cnn_model, train_x, train_y, batch_size)
            test_loss, test_acc = evaluate(sess, cnn_model, test_x, test_y, batch_size)

            # Also dump the pooled features and logits of the first training
            # sample, to compare against the other TextCNN implementation.
            flatten_param, logits = sess.run([cnn_model.pool_flatten, cnn_model.logits],
                                             {cnn_model.seq_input: train_x[0:1, :]})
            print('Epoch %d, train loss %.4f, acc %.4f; test loss %.4f, acc %.4f' %
                  (epoch_now, train_loss, train_acc, test_loss, test_acc))
            print(flatten_param, '\n', logits)


Epoch 10, train loss 4.5887, acc 0.6209; test loss 5.1708, acc 0.5989
[[5.098266   0.6267031  3.973796   5.505822   2.2480958  4.749883
  5.784947   3.8512633  4.734154   4.691539   4.447296   4.1583285
  5.139508   3.5924542  4.315198   5.1952605  3.5926905  3.703297
  4.0715146  4.777809   3.2233436  2.4266162  6.819762   6.15767
  3.3594055  3.9477615  3.7517774  4.710288   6.134476   4.665935
  4.40662    3.2140205  5.3673306  5.9726353  5.72242    7.0440216
  4.2133174  5.263072   5.6293364  5.663528   4.213584   5.853416
  7.70406    4.1801705  7.8236217  4.993251   4.7438684  7.057441
  4.4783754  5.1031847  4.5588865  6.079002   5.352418   0.51727957
  8.656329   6.2806416  1.7194276  5.798382   2.59123    6.4119115
  5.9181786  4.2614393  3.4503758  3.3350475  5.5210953  5.8865037
  5.852014   6.280078   5.3702617  6.1314893  5.965096   4.957735
  6.4592805  5.5918875  6.2671504  4.5609756  7.2407146  5.4812913
  6.6172347  6.0109396  5.342335   4.9947124  4.027345   5.875778


**两个4-开头的文件都是实现TextCNN，这个使用的是layers接口，另一个扎扎实实地写conv和dense的每个参数。**  
**两种方法使用的参数初始化方法完全一样，随机种子也一样，都设置成7。期待训练结果在loss和acc的数值上完全一致。**  
**但是，还是存在细微差异，虽然总体上差别不大（几乎小数点后三位）。**  
**同时还打出了pooling之后的tensor，以及logits。发现pooling之后就有细微差距了，应该就是conv层产生了不同。**  
**猜测一：可能是因为conv有32个kernel，那里随机初始化两种方法有偏差。**  
**猜测二：还有其它参数的初始化存在随机性，或者训练过程本身带有随机性。因为两次运行同一个文件，flatten层的输出也不一样。**  
**可以看到，layers接口集成度高，方便很多。事实上，它的底层实现可能就和自己写的差不多。所以之后尽量都用layers接口去实现模型。**