In [1]:
import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics
import pickle

In [2]:
class TCNNConfig(object):
    """Hyper-parameters for the text CNN and its training loop.

    Every value is a plain class attribute, so it can be read from the class
    or overridden on an instance (this notebook replaces ``vocab_size`` and
    ``seq_length`` on the instance after loading the data file).
    """

    # --- model architecture ---
    # Size of each word embedding vector.
    embedding_dim = 64
    # Number of token ids per input sequence.
    seq_length = 800
    # Number of target classes.
    num_classes = 3
    # Number of convolution filters.
    num_filters = 256
    # Width of the 1-D convolution kernel.
    kernel_size = 5
    # Number of rows in the embedding table.
    vocab_size = 5000

    # Number of units in the fully connected layer.
    hidden_dim = 128

    # --- optimisation ---
    # Keep probability fed to the dropout layer during training.
    dropout_keep_prob = 0.5
    # Learning rate handed to the Adam optimizer.
    learning_rate = 1e-3

    # Examples per training mini-batch.
    batch_size = 64
    # Maximum number of passes over the training data.
    num_epochs = 10

    # --- logging cadence, counted in training batches ---
    # Evaluate and print metrics every this many batches.
    print_per_batch = 10
    # Write a TensorBoard summary every this many batches.
    save_per_batch = 10


In [3]:
class TextCNN(object):
    """Text classification CNN built with the TF1 graph API.

    Pipeline: embedding lookup -> 1-D convolution -> max over the sequence
    axis -> dense + dropout + ReLU -> dense logits.

    Constructing an instance adds all ops to the current default graph and
    exposes these tensors/ops as attributes:

    * ``input_x``    -- int32 placeholder, shape ``[None, config.seq_length]``;
      the integer ids looked up in the embedding table.
    * ``input_y``    -- float32 placeholder, shape ``[None, config.num_classes]``;
      per-example label vector (its argmax is treated as the true class).
    * ``keep_prob``  -- float32 placeholder passed to the dropout layer.
    * ``logits``     -- un-normalised class scores.
    * ``y_pred_cls`` -- predicted class index per example.
    * ``loss``       -- mean softmax cross entropy over the batch.
    * ``optim``      -- Adam training op minimising ``loss``.
    * ``acc``        -- fraction of examples in the batch predicted correctly.

    Args:
        config: object providing ``seq_length``, ``num_classes``, ``vocab_size``,
            ``embedding_dim``, ``num_filters``, ``kernel_size``, ``hidden_dim``
            and ``learning_rate`` (e.g. a ``TCNNConfig`` instance).
    """

    def __init__(self, config):
        self.config = config

        # Graph inputs; the leading ``None`` dimension is the batch size.
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.cnn()

    def cnn(self):
        """Add the model, loss, optimizer and accuracy ops to the graph."""
        # Word embedding; the table and the lookup are pinned to the CPU.
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("cnn"):
            # Convolution along the sequence axis.
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # Global max pooling: reduce over axis 1 (the sequence positions).
            # ``axis`` replaces the deprecated ``reduction_indices`` alias; the
            # op name 'gmp' and the result are unchanged.
            gmp = tf.reduce_max(conv, axis=1, name='gmp')

        with tf.name_scope("score"):
            # Fully connected layer, followed by dropout and ReLU.
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # Classifier head.
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class index

        with tf.name_scope("optimize"):
            # Loss: softmax cross entropy, averaged over the batch.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer.
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy: share of examples whose predicted class equals argmax(input_y).
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            
            

In [4]:
# Pickled dataset read by the data-loading cell.
param_saving_path = '../data/param-classify.dat'

# Checkpoint directory, and the path prefix under which train() saves the
# model with the best validation accuracy.
save_dir = './checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation')

In [5]:
def batch_iter(x, y, batch_size=64):
    """Yield shuffled ``(x_batch, y_batch)`` mini-batches covering the data once.

    ``x`` and ``y`` are shuffled with the same random permutation (drawn from
    NumPy's global RNG), so row ``i`` of every ``x_batch`` still corresponds
    to row ``i`` of its ``y_batch``. The final batch may be smaller than
    ``batch_size``.

    Args:
        x: array-like of examples, indexed along its first axis.
        y: array-like of labels with the same length as ``x``.
        batch_size: maximum number of examples per batch; must be positive.

    Yields:
        Tuples ``(x_batch, y_batch)`` of NumPy arrays. Nothing is yielded for
        empty input.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length or ``batch_size`` is
            not positive. Because this is a generator, the error surfaces
            when iteration starts.
    """
    # Accept plain lists as well as arrays; fancy indexing below needs ndarrays.
    x = np.asarray(x)
    y = np.asarray(y)

    data_len = len(x)
    if len(y) != data_len:
        raise ValueError(
            "x and y must have the same length, got {} and {}".format(data_len, len(y)))
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {}".format(batch_size))

    indices = np.random.permutation(data_len)
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    # Stepping by batch_size yields ceil(data_len / batch_size) batches and,
    # unlike the previous float arithmetic, zero batches for empty input.
    for start_id in range(0, data_len, batch_size):
        end_id = start_id + batch_size  # slicing clamps to data_len
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

In [6]:
def get_time_dif(start_time):
    """Return the wall-clock time elapsed since ``start_time``.

    Args:
        start_time: a timestamp previously obtained from ``time.time()``.

    Returns:
        A ``timedelta`` rounded to the nearest whole second.
    """
    elapsed_seconds = int(round(time.time() - start_time))
    return timedelta(seconds=elapsed_seconds)


def feed_data(x_batch, y_batch, keep_prob):
    """Build a feed dict binding one batch to the global ``model``'s placeholders.

    Args:
        x_batch: value fed to ``model.input_x``.
        y_batch: value fed to ``model.input_y``.
        keep_prob: value fed to ``model.keep_prob`` (the dropout keep probability).

    Returns:
        A new dict mapping the three placeholders to the given values.
    """
    return {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob,
    }


def evaluate(sess, x_, y_):
    """Compute the global ``model``'s mean loss and accuracy over a dataset.

    The data is processed in batches of up to 128 examples with dropout
    disabled (keep probability 1.0). Per-batch metrics are weighted by the
    batch length so a smaller final batch does not skew the averages.

    Args:
        sess: session used to run ``model.loss`` and ``model.acc``.
        x_: inputs, in a form accepted by ``batch_iter``.
        y_: labels aligned with ``x_``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` over all ``len(x_)`` examples.
    """
    n_examples = len(x_)
    loss_sum = 0.0
    acc_sum = 0.0

    for xs, ys in batch_iter(x_, y_, 128):
        n_in_batch = len(xs)
        batch_loss, batch_acc = sess.run(
            [model.loss, model.acc],
            feed_dict=feed_data(xs, ys, 1.0),
        )
        loss_sum += batch_loss * n_in_batch
        acc_sum += batch_acc * n_in_batch

    return loss_sum / n_examples, acc_sum / n_examples

In [7]:
def train():
    """Train the global ``model`` with periodic validation and early stopping.

    Relies on notebook globals defined in other cells: ``model``, ``config``,
    ``x``, ``y``, ``save_dir`` and ``save_path``.

    Behaviour:

    * The first 10% of the rows of ``x``/``y`` are held out for validation;
      the remainder is the training set.
    * Every ``config.save_per_batch`` batches a TensorBoard summary of the
      current training batch is written under ``tensorboard/textcnn``.
    * Every ``config.print_per_batch`` batches the loss/accuracy on the
      current training batch and on the validation set are printed, and the
      model is checkpointed to ``save_path`` whenever validation accuracy
      improves on the best seen so far.
    * Training stops after ``config.num_epochs`` epochs, or earlier once more
      than 1000 batches have passed without a validation improvement.

    The session and the summary writer are closed when training ends, even
    if an exception interrupts it.
    """
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Hold out the leading slice for validation.
    validation_rate = 0.1
    idx = int(x.shape[0] * validation_rate)
    x_train = x[idx:]
    x_val = x[:idx]
    y_train = y[idx:]
    y_val = y[:idx]

    print(x_train.shape)
    print(x_val.shape)
    print(y_train.shape)
    print(y_val.shape)

    session = tf.Session()
    try:
        session.run(tf.global_variables_initializer())
        writer.add_graph(session.graph)

        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0  # number of training batches processed so far
        best_acc_val = 0.0  # best validation accuracy seen so far
        last_improved = 0  # value of total_batch at the last improvement
        require_improvement = 1000  # stop early after this many batches without improvement

        stop_early = False
        for epoch in range(config.num_epochs):
            print('Epoch:', epoch + 1)
            batch_train = batch_iter(x_train, y_train, config.batch_size)
            for x_batch, y_batch in batch_train:
                feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

                if total_batch % config.save_per_batch == 0:
                    # Record the scalar summaries for TensorBoard.
                    s = session.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # Measure loss/accuracy with dropout disabled. A separate
                    # feed dict is used so the training feed keeps its dropout
                    # probability; overwriting keep_prob in ``feed_dict`` made
                    # the optimizer step below run without dropout.
                    eval_feed_dict = feed_data(x_batch, y_batch, 1.0)
                    loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=eval_feed_dict)
                    loss_val, acc_val = evaluate(session, x_val, y_val)

                    if acc_val > best_acc_val:
                        # New best validation accuracy: checkpoint the model.
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=save_path)
                        print("Save model!")
                        improved_str = '*'
                    else:
                        improved_str = ''

                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>4.4}, Train Acc: {2:>5.2%},' \
                          + ' Val Loss: {3:>4.4}, Val Acc: {4:>5.2%}, Time: {5} {6}'
                    print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

                # Optimisation step, always with the training keep probability.
                session.run(model.optim, feed_dict=feed_dict)
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # No validation improvement for too long: stop early.
                    print("No optimization for a long time, auto-stopping...")
                    stop_early = True
                    break
            if stop_early:
                break
    finally:
        # Flush pending summaries and release the session's resources.
        writer.close()
        session.close()

In [8]:
# Load the pickled dataset and build the model from its vocabulary/sequence size.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load data files from a trusted source.
config = TCNNConfig()
with open(param_saving_path, 'rb') as f:
    data = pickle.load(f)

x, y = data['X'], data['Y']
print(len(x))

# Shuffle examples and labels with one shared permutation so pairs stay
# aligned; train() later takes the first 10% of rows as the validation set.
# NOTE(review): the permutation is unseeded, so the split differs on every run.
P = np.random.permutation(len(x))
x, y = x[P], y[P]

# Replace the default vocabulary size and sequence length with the values
# stored alongside the data.
wordToID, seq_length = data['wordToID'], data['seq_length']
config.vocab_size, config.seq_length = len(wordToID), seq_length

model = TextCNN(config)

15000
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Instructions for updating:
Use the retry module or similar alternatives.


In [9]:
train()

(13500, 800)
(1500, 800)
(13500, 3)
(1500, 3)
Training and evaluating...
Epoch: 1
Save model!
Iter:      0, Train Loss:  1.1, Train Acc: 32.81%, Val Loss: 1.098, Val Acc: 35.33%, Time: 0:00:07 *
Iter:     10, Train Loss: 1.089, Train Acc: 39.06%, Val Loss: 1.097, Val Acc: 32.40%, Time: 0:00:23 
Save model!
Iter:     20, Train Loss: 1.076, Train Acc: 59.38%, Val Loss: 1.079, Val Acc: 58.27%, Time: 0:00:40 *
Iter:     30, Train Loss: 1.028, Train Acc: 64.06%, Val Loss: 1.031, Val Acc: 57.67%, Time: 0:00:55 
Save model!
Iter:     40, Train Loss: 0.9044, Train Acc: 67.19%, Val Loss: 0.9084, Val Acc: 63.20%, Time: 0:01:09 *
Iter:     50, Train Loss: 0.7553, Train Acc: 64.06%, Val Loss: 0.7621, Val Acc: 60.73%, Time: 0:01:25 
Save model!
Iter:     60, Train Loss: 0.7358, Train Acc: 59.38%, Val Loss: 0.6763, Val Acc: 65.27%, Time: 0:01:41 *
Iter:     70, Train Loss: 0.569, Train Acc: 75.00%, Val Loss: 0.6544, Val Acc: 64.60%, Time: 0:01:56 
Save model!
Iter:     80, Train Loss: 0.6878, Train 

Iter:    770, Train Loss: 0.3459, Train Acc: 84.38%, Val Loss: 0.6511, Val Acc: 76.40%, Time: 0:18:22 
Iter:    780, Train Loss: 0.2286, Train Acc: 92.19%, Val Loss: 0.6435, Val Acc: 75.67%, Time: 0:18:34 
Iter:    790, Train Loss: 0.233, Train Acc: 92.19%, Val Loss: 0.6447, Val Acc: 75.53%, Time: 0:18:47 
Iter:    800, Train Loss: 0.2458, Train Acc: 93.75%, Val Loss: 0.6694, Val Acc: 75.07%, Time: 0:18:59 
Iter:    810, Train Loss: 0.2958, Train Acc: 87.50%, Val Loss: 0.6558, Val Acc: 75.80%, Time: 0:19:11 
Iter:    820, Train Loss: 0.2761, Train Acc: 92.19%, Val Loss: 0.6628, Val Acc: 75.33%, Time: 0:19:23 
Iter:    830, Train Loss: 0.2831, Train Acc: 89.06%, Val Loss: 0.6529, Val Acc: 75.47%, Time: 0:19:36 
Iter:    840, Train Loss: 0.1485, Train Acc: 93.75%, Val Loss: 0.6333, Val Acc: 76.27%, Time: 0:19:48 
Epoch: 5
Iter:    850, Train Loss: 0.1546, Train Acc: 95.31%, Val Loss: 0.6429, Val Acc: 75.27%, Time: 0:20:00 
Iter:    860, Train Loss: 0.121, Train Acc: 98.44%, Val Loss: 0.6

Iter:   1560, Train Loss: 0.1575, Train Acc: 96.88%, Val Loss: 0.9247, Val Acc: 76.20%, Time: 0:34:50 
Iter:   1570, Train Loss: 0.05706, Train Acc: 96.88%, Val Loss: 0.9041, Val Acc: 76.00%, Time: 0:35:03 
Iter:   1580, Train Loss: 0.03391, Train Acc: 98.44%, Val Loss: 0.8539, Val Acc: 76.00%, Time: 0:35:15 
Iter:   1590, Train Loss: 0.04286, Train Acc: 98.44%, Val Loss: 0.8526, Val Acc: 75.60%, Time: 0:35:28 
Iter:   1600, Train Loss: 0.1182, Train Acc: 96.88%, Val Loss: 0.8679, Val Acc: 75.40%, Time: 0:35:40 
Iter:   1610, Train Loss: 0.03595, Train Acc: 98.44%, Val Loss: 0.8741, Val Acc: 75.47%, Time: 0:35:52 
Iter:   1620, Train Loss: 0.2124, Train Acc: 93.75%, Val Loss: 0.9026, Val Acc: 76.13%, Time: 0:36:05 
Iter:   1630, Train Loss: 0.1065, Train Acc: 96.88%, Val Loss: 0.9044, Val Acc: 75.47%, Time: 0:36:17 
Iter:   1640, Train Loss: 0.04242, Train Acc: 98.44%, Val Loss: 0.873, Val Acc: 75.60%, Time: 0:36:29 
Iter:   1650, Train Loss: 0.1858, Train Acc: 92.19%, Val Loss: 0.8789

In [10]:
# train()

In [11]:
# train()

In [12]:
# with open(param_saving_path, 'rb') as f:
#     data = pickle.load(f)
    
# data_save = {"wordToID": data['wordToID'], "seq_length": data['seq_length']}

# new_saving_path = "../data/param-classify-test.dat"
# with open(new_saving_path, 'wb') as f:
#     pickle.dump(data_save, f, pickle.HIGHEST_PROTOCOL)