In [1]:
import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics
import pickle

In [2]:
class TCNNConfig(object):
    """Hyper-parameters for the text-CNN model and its training loop."""

    # --- input representation ---
    vocab_size = 5000  # vocabulary size
    seq_length = 800  # sequence length
    embedding_dim = 64  # word-vector dimension

    # --- network shape ---
    num_filters = 256  # number of convolution kernels
    kernel_size = 5  # convolution kernel size
    hidden_dim = 128  # number of units in the fully connected layer
    num_classes = 3  # number of target classes

    # --- optimisation ---
    learning_rate = 1e-3  # learning rate
    dropout_keep_prob = 0.5  # dropout keep probability
    batch_size = 64  # examples per training batch
    num_epochs = 10  # total number of epochs

    # --- logging ---
    print_per_batch = 10  # report metrics every this many iterations


In [3]:
class TextCNN(object):
    """Text classification with a CNN: embedding -> 1-D convolution -> global max pool -> dense.

    Constructing an instance builds the whole TensorFlow graph. The tensors and
    ops callers interact with are exposed as attributes:

    * ``input_x``    -- int32 placeholder, shape ``[None, config.seq_length]``.
    * ``input_y``    -- float32 placeholder, shape ``[None, config.num_classes]``.
    * ``keep_prob``  -- float32 placeholder, dropout keep probability.
    * ``logits``     -- un-normalised class scores.
    * ``y_pred_cls`` -- predicted class index per example.
    * ``loss``       -- mean softmax cross-entropy over the batch.
    * ``optim``      -- Adam training op minimising ``loss``.
    * ``acc``        -- fraction of examples whose predicted class matches ``input_y``.
    """

    def __init__(self, config):
        """Create the input placeholders and build the graph.

        Args:
            config: object providing ``seq_length``, ``num_classes``,
                ``vocab_size``, ``embedding_dim``, ``num_filters``,
                ``kernel_size``, ``hidden_dim`` and ``learning_rate``.
        """
        self.config = config

        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.cnn()

    def cnn(self):
        """Build the model, loss, optimizer and accuracy ops on top of the placeholders."""
        # Word embedding; the lookup is pinned to the CPU.
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("cnn"):
            # Convolution layer.
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # Global max pooling over axis 1. `axis` is the supported keyword;
            # `reduction_indices` is a deprecated alias for it.
            gmp = tf.reduce_max(conv, axis=1, name='gmp')

        with tf.name_scope("score"):
            # Fully connected layer, followed by dropout and ReLU.
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # Classifier.
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class

        with tf.name_scope("optimize"):
            # Loss function: softmax cross-entropy averaged over the batch.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer.
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy: compare the arg-max of the labels with the predicted class.
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            
            

In [4]:
# Pickled dataset file read by the data-loading cell.
param_saving_path = '../data/param-classify.dat'

# Checkpoint directory and the path prefix passed to the saver.
save_dir = './checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation')

In [5]:
def batch_iter(x, y, batch_size=64, shuffle=True):
    """Yield aligned ``(x_batch, y_batch)`` mini-batches covering the whole data set once.

    Args:
        x: examples; anything ``np.asarray`` can turn into an array indexed
            along its first axis.
        y: labels with the same length as ``x``.
        batch_size: maximum number of rows per batch; the last batch may be
            smaller. Must be positive.
        shuffle: when True (the default) rows are shuffled with a single
            permutation applied to both ``x`` and ``y``; when False the
            original order is kept.

    Yields:
        Tuples ``(x_batch, y_batch)`` of NumPy arrays. Nothing is yielded for
        empty input.

    Raises:
        ValueError: if ``batch_size`` is not positive or ``x`` and ``y``
            differ in length. Because this is a generator, the error surfaces
            on the first iteration rather than at call time.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {!r}".format(batch_size))

    # Accept plain sequences as well as arrays; fancy indexing below needs ndarrays.
    x = np.asarray(x)
    y = np.asarray(y)

    data_len = len(x)
    if len(y) != data_len:
        raise ValueError(
            "x and y must have the same length, got {} and {}".format(data_len, len(y)))

    # The original batch-count formula produced one empty batch for empty input.
    if data_len == 0:
        return

    if shuffle:
        indices = np.random.permutation(np.arange(data_len))
        x = x[indices]
        y = y[indices]

    for start_id in range(0, data_len, batch_size):
        end_id = min(start_id + batch_size, data_len)
        yield x[start_id:end_id], y[start_id:end_id]

In [6]:
def get_time_dif(start_time):
    """Return the wall-clock time elapsed since ``start_time``.

    ``start_time`` is a ``time.time()`` timestamp; the result is a
    ``timedelta`` rounded to whole seconds.
    """
    elapsed_seconds = time.time() - start_time
    return timedelta(seconds=int(round(elapsed_seconds)))


def feed_data(x_batch, y_batch, keep_prob):
    """Map one batch and a dropout keep probability onto the global model's placeholders.

    Returns a dict suitable for the ``feed_dict`` argument of ``Session.run``.
    """
    return {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob,
    }


def evaluate(sess, x_, y_):
    """Compute the mean loss and accuracy of the global model over ``x_``/``y_``.

    The data is processed in batches of 128 with dropout disabled
    (keep probability 1.0). Per-batch metrics are weighted by batch size so
    the result is the average over all examples.

    Returns:
        ``(mean_loss, mean_accuracy)``.
    """
    n_examples = len(x_)
    loss_sum = 0.0
    acc_sum = 0.0
    for xb, yb in batch_iter(x_, y_, 128):
        n_in_batch = len(xb)
        batch_loss, batch_acc = sess.run(
            [model.loss, model.acc],
            feed_dict=feed_data(xb, yb, 1.0),
        )
        loss_sum += batch_loss * n_in_batch
        acc_sum += batch_acc * n_in_batch

    return loss_sum / n_examples, acc_sum / n_examples

In [7]:
def train():
    """Train the global ``model`` on the global ``x``/``y`` data with early stopping.

    The first 10% of the rows of ``x``/``y`` form the validation split and the
    rest the training split. Every ``config.print_per_batch`` iterations the
    loss/accuracy of the current training batch and of the whole validation
    split are printed; whenever validation accuracy reaches a new best, the
    weights are saved to ``save_path``. Training ends after
    ``config.num_epochs`` epochs, or earlier once more than 1000 iterations
    pass without a validation improvement.

    Relies on the module-level names ``x``, ``y``, ``config``, ``model``,
    ``save_dir`` and ``save_path``. Returns ``None``.
    """
    saver = tf.train.Saver()
    os.makedirs(save_dir, exist_ok=True)

    validation_rate = 0.1
    idx = int(x.shape[0] * validation_rate)
    x_train, x_val = x[idx:], x[:idx]
    y_train, y_val = y[idx:], y[:idx]

    print(x_train.shape)
    print(x_val.shape)
    print(y_train.shape)
    print(y_val.shape)

    # The context manager closes the session (and frees its resources) on
    # every exit path, including exceptions and early stopping.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0  # number of optimisation steps taken so far
        best_acc_val = 0.0  # best validation accuracy seen so far
        last_improved = 0  # iteration at which validation accuracy last improved
        require_improvement = 1000  # stop early after this many iterations without improvement

        stop_early = False
        for epoch in range(config.num_epochs):
            print('Epoch:', epoch + 1)
            for x_batch, y_batch in batch_iter(x_train, y_train, config.batch_size):
                if total_batch % config.print_per_batch == 0:
                    # Metrics are measured with dropout disabled. A separate feed
                    # dict is used so the optimisation step below still runs with
                    # the configured dropout (reusing one dict silently trained
                    # this batch with keep_prob == 1.0).
                    eval_feed = feed_data(x_batch, y_batch, 1.0)
                    loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=eval_feed)
                    loss_val, acc_val = evaluate(session, x_val, y_val)

                    if acc_val > best_acc_val:
                        # Save the best result so far.
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=save_path)
                        print("Save model!")
                        improved_str = '*'
                    else:
                        improved_str = ''

                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>4.4}, Train Acc: {2:>5.4%},' \
                          + ' Val Loss: {3:>4.4}, Val Acc: {4:>5.4%}, Time: {5} {6}'
                    print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

                # Run the optimisation step with dropout enabled.
                train_feed = feed_data(x_batch, y_batch, config.dropout_keep_prob)
                session.run(model.optim, feed_dict=train_feed)
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # Early stop: validation accuracy has not improved for too long.
                    print("No optimization for a long time, auto-stopping...")
                    stop_early = True
                    break
            if stop_early:
                break

In [8]:
# Default hyper-parameters; vocab_size and seq_length are overridden below with
# the values stored in the dataset file.
config = TCNNConfig()
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open(param_saving_path, 'rb') as f:
    data = pickle.load(f)

# Examples and labels. They are indexed with a permutation array below, so they
# are presumably NumPy arrays -- TODO confirm.
x = data['X']
y = data['Y']
print(len(x))
# Shuffle examples and labels with the same permutation so pairs stay aligned.
# NOTE(review): no random seed is set, so the order (and therefore the
# train/validation split taken in train()) differs between runs.
P = np.random.permutation(len(x))
x = x[P]
y = y[P]

# Size the model from the dataset's own vocabulary and sequence length.
wordToID = data['wordToID']
seq_length = data['seq_length']
config.vocab_size = len(wordToID)
config.seq_length = seq_length

# Build the model graph for this configuration.
model = TextCNN(config)

15000
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Instructions for updating:
Use the retry module or similar alternatives.


In [9]:
# Run training; per-iteration metrics and "Save model!" messages are printed as it progresses.
train()

(13500, 800)
(1500, 800)
(13500, 3)
(1500, 3)
Training and evaluating...
Epoch: 1
Save model!
Iter:      0, Train Loss: 1.097, Train Acc: 42.1875%, Val Loss: 1.099, Val Acc: 32.6000%, Time: 0:00:04 *
Save model!
Iter:     10, Train Loss: 1.094, Train Acc: 37.5000%, Val Loss: 1.098, Val Acc: 34.0000%, Time: 0:00:18 *
Save model!
Iter:     20, Train Loss: 1.077, Train Acc: 45.3125%, Val Loss: 1.086, Val Acc: 38.3333%, Time: 0:00:34 *
Save model!
Iter:     30, Train Loss: 1.061, Train Acc: 56.2500%, Val Loss: 1.058, Val Acc: 61.6000%, Time: 0:00:50 *
Iter:     40, Train Loss: 1.013, Train Acc: 46.8750%, Val Loss: 0.975, Val Acc: 59.8000%, Time: 0:01:04 
Iter:     50, Train Loss: 0.7824, Train Acc: 67.1875%, Val Loss: 0.8201, Val Acc: 59.1333%, Time: 0:01:17 
Iter:     60, Train Loss: 0.7701, Train Acc: 59.3750%, Val Loss: 0.7119, Val Acc: 59.8667%, Time: 0:01:33 
Save model!
Iter:     70, Train Loss: 0.6987, Train Acc: 64.0625%, Val Loss: 0.6899, Val Acc: 62.4000%, Time: 0:01:47 *
Save mo

Save model!
Iter:    730, Train Loss: 0.2358, Train Acc: 95.3125%, Val Loss: 0.6333, Val Acc: 77.4667%, Time: 0:16:11 *
Iter:    740, Train Loss: 0.3518, Train Acc: 87.5000%, Val Loss: 0.6369, Val Acc: 76.4667%, Time: 0:16:25 
Iter:    750, Train Loss: 0.3315, Train Acc: 84.3750%, Val Loss: 0.6356, Val Acc: 77.0000%, Time: 0:16:38 
Iter:    760, Train Loss: 0.2312, Train Acc: 90.6250%, Val Loss: 0.6393, Val Acc: 77.1333%, Time: 0:16:51 
Iter:    770, Train Loss: 0.2917, Train Acc: 90.6250%, Val Loss: 0.6508, Val Acc: 75.2000%, Time: 0:17:03 
Iter:    780, Train Loss: 0.1641, Train Acc: 95.3125%, Val Loss: 0.6236, Val Acc: 76.6000%, Time: 0:17:16 
Iter:    790, Train Loss: 0.2028, Train Acc: 89.0625%, Val Loss: 0.6346, Val Acc: 75.6000%, Time: 0:17:29 
Iter:    800, Train Loss: 0.162, Train Acc: 96.8750%, Val Loss: 0.6188, Val Acc: 76.8667%, Time: 0:17:41 
Iter:    810, Train Loss: 0.241, Train Acc: 93.7500%, Val Loss: 0.6383, Val Acc: 75.9333%, Time: 0:17:53 
Iter:    820, Train Loss: 

Iter:   1490, Train Loss: 0.0624, Train Acc: 98.4375%, Val Loss: 0.8605, Val Acc: 75.1333%, Time: 0:32:40 
Iter:   1500, Train Loss: 0.07063, Train Acc: 96.8750%, Val Loss: 0.9003, Val Acc: 76.0000%, Time: 0:32:52 
Iter:   1510, Train Loss: 0.03114, Train Acc: 98.4375%, Val Loss: 0.9323, Val Acc: 74.1333%, Time: 0:33:04 
Iter:   1520, Train Loss: 0.01073, Train Acc: 100.0000%, Val Loss: 0.9003, Val Acc: 75.4667%, Time: 0:33:17 
Iter:   1530, Train Loss: 0.02592, Train Acc: 98.4375%, Val Loss: 0.9008, Val Acc: 74.8667%, Time: 0:33:30 
Iter:   1540, Train Loss: 0.0152, Train Acc: 100.0000%, Val Loss: 0.9398, Val Acc: 75.4000%, Time: 0:33:42 
Iter:   1550, Train Loss: 0.03059, Train Acc: 98.4375%, Val Loss: 0.9336, Val Acc: 75.7333%, Time: 0:33:55 
Iter:   1560, Train Loss: 0.05194, Train Acc: 98.4375%, Val Loss: 0.9136, Val Acc: 74.2000%, Time: 0:34:08 
Iter:   1570, Train Loss: 0.03241, Train Acc: 100.0000%, Val Loss: 0.9325, Val Acc: 75.3333%, Time: 0:34:21 
Iter:   1580, Train Loss: 0