In [1]:
import io
import math
import os
import sys

import numpy as np
import tensorflow as tf

# Show INFO-level TensorFlow log messages; this notebook reports its progress
# (data loading, variable names, training metrics) through tf.logging.info.
tf.logging.set_verbosity(tf.logging.INFO)


In [2]:
class Vocab(object):
  """Word -> integer id lookup table loaded from a vocabulary file.

  The file holds one `word<TAB>count` entry per line. Every entry that is
  kept receives the next free id, starting at 0, in file order.
  """

  def __init__(self, filename, num_word_threshold):
    """Loads the vocabulary from `filename`.

    Args:
      filename: Path to the UTF-8 encoded vocabulary file.
      num_word_threshold: Entries whose count is below this value are
        dropped. The '<UNK>' entry is kept regardless of its count.
    """
    self._id_to_word = {}
    self._word_to_id = {}
    # Remains -1 when the file contains no '<UNK>' entry.
    self._unk = -1
    self._num_word_threshold = num_word_threshold
    self._read_dict(filename)

  def _read_dict(self, filename):
    """Populates the lookup tables from `filename`.

    Raises:
      Exception: If the same word is listed more than once.
      ValueError: If a non-blank line is not `word<TAB>count`.
    """
    # io.open decodes the file as UTF-8 on both Python 2 and Python 3, so the
    # words are text (unicode) objects without a Python 2-only str.decode().
    with io.open(filename, 'r', encoding='utf-8') as f:
      for line in f:
        line = line.strip('\r\n')
        if not line:
          # Tolerate blank lines (e.g. a trailing empty line).
          continue
        word, occurence = line.split('\t')
        occurence = int(occurence)
        if word != '<UNK>' and occurence < self._num_word_threshold:
          continue
        if word in self._word_to_id:
          raise Exception('There shouldn\'t be duplicate word in dict.')
        idx = len(self._id_to_word)
        if word == '<UNK>':
          self._unk = idx
        self._word_to_id[word] = idx
        self._id_to_word[idx] = word

  @property
  def unk(self):
    """Id of the '<UNK>' entry, or -1 if the file did not define one."""
    return self._unk

  def word_to_id(self, word):
    """Returns the id of `word`, falling back to the unknown-word id."""
    return self._word_to_id.get(word, self._unk)

  def size(self):
    """Returns the number of words kept in the vocabulary."""
    return len(self._word_to_id)

  def encode(self, sentence, data_type):
    """Converts `sentence` into a list of ids.

    Args:
      sentence: Text to encode.
      data_type: 'word-level' splits `sentence` on single spaces;
        'char-level' treats every character as one token.

    Returns:
      A list with one id per token.

    Raises:
      Exception: If `data_type` is neither of the two supported values.
    """
    if data_type == 'word-level':
      word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split(' ')]
    elif data_type == 'char-level':
      word_ids = [self.word_to_id(cur_word) for cur_word in sentence]
    else:
      raise Exception('%s is not supported' % (data_type))
    return word_ids

In [3]:
class CategoryVocab(object):
  """Category (label) name -> integer id table loaded from a file.

  The file holds one `label<TAB>id` entry per line; the ids are taken from
  the file as-is.
  """

  def __init__(self, filename):
    """Loads the table.

    Args:
      filename: Path to the UTF-8 encoded category file.
    """
    self._category_to_id = {}
    # io.open yields decoded text on both Python 2 and Python 3, replacing the
    # Python 2-only str.decode('utf-8') call.
    with io.open(filename, 'r', encoding='utf-8') as f:
      for line in f:
        line = line.strip('\r\n')
        if not line:
          # Tolerate blank lines (e.g. a trailing empty line).
          continue
        label, idx = line.split('\t')
        self._category_to_id[label] = int(idx)

  def category_to_id(self, category_name):
    """Returns the id of `category_name`.

    Raises:
      Exception: If `category_name` was not listed in the category file.
    """
    if category_name not in self._category_to_id:
      raise Exception("%s is not in our label list." % category_name)
    return self._category_to_id[category_name]

  def get_class_num(self):
    """Returns the number of distinct category names."""
    return len(self._category_to_id)

In [4]:
class TextDataSet(object):
  """In-memory dataset of fixed-length id sequences with integer labels.

  Each line of the data file is `label<TAB>content`. The content is encoded
  with `vocab`, truncated to `num_timesteps` ids and right-padded with the
  vocabulary's unknown-word id up to that length.
  """

  def __init__(self, filename, vocab, category_vocab, num_timesteps, data_type):
    """Loads and shuffles the dataset.

    Args:
      filename: Path to the UTF-8 encoded data file.
      vocab: Object providing `encode(content, data_type)` and `unk`.
      category_vocab: Object providing `category_to_id(label)`.
      num_timesteps: Length every encoded example is cut or padded to.
      data_type: Either 'word-level' or 'char-level'.
    """
    self._vocab = vocab
    self._category_vocab = category_vocab
    self._inputs = []
    self._outputs = []
    self._indicator = 0  # Start offset of the next batch.
    self._num_timesteps = num_timesteps
    self._data_type = data_type
    assert self._data_type in ['word-level', 'char-level']
    self._parse_file(filename)

  def _parse_file(self, filename):
    """Reads `filename`, encodes every example and shuffles the result."""
    tf.logging.info('Loading data from %s', filename)
    # io.open yields decoded text on both Python 2 and Python 3, replacing the
    # Python 2-only str.decode('utf-8') call.
    with io.open(filename, 'r', encoding='utf-8') as f:
      for line in f:
        line = line.strip('\r\n')
        if not line:
          # Tolerate blank lines (e.g. a trailing empty line).
          continue
        # Split on the first tab only, so a tab inside the text itself does
        # not abort loading with an unpacking error.
        label, content = line.split('\t', 1)
        id_label = self._category_vocab.category_to_id(label)
        id_words = self._vocab.encode(content, self._data_type)
        id_words = id_words[0:self._num_timesteps]
        # Right-pad short examples with the unknown-word id.
        id_words = id_words + [self._vocab.unk] * (
            self._num_timesteps - len(id_words))
        self._inputs.append(id_words)
        self._outputs.append(id_label)

    self._inputs = np.asarray(self._inputs, dtype=np.int32)
    self._outputs = np.asarray(self._outputs, dtype=np.int32)
    self._random_shuffle()

  def _random_shuffle(self):
    """Applies one random permutation to inputs and labels alike."""
    p = np.random.permutation(len(self._inputs))
    self._inputs = self._inputs[p]
    self._outputs = self._outputs[p]

  def next(self, batch_size):
    """Returns the next `batch_size` examples.

    When fewer than `batch_size` examples remain, the leftover ones are
    dropped: the data is reshuffled and reading restarts at offset 0.

    Args:
      batch_size: Number of examples to return; must not exceed the dataset
        size.

    Returns:
      A `(batch_inputs, batch_outputs)` pair of arrays whose first dimension
      is `batch_size`.
    """
    if self._indicator + batch_size > len(self._inputs):
      self._random_shuffle()
      self._indicator = 0

    end_indicator = self._indicator + batch_size
    assert end_indicator <= len(self._inputs), (
        'batch_size is larger than the dataset')

    batch_inputs = self._inputs[self._indicator: end_indicator]
    batch_outputs = self._outputs[self._indicator: end_indicator]
    self._indicator = end_indicator
    return batch_inputs, batch_outputs

In [5]:
def get_default_params():
  """Returns the default hyper-parameters for the text classifier.

  Returns:
    A `tf.contrib.training.HParams` object with one attribute per entry of
    `defaults` below.
  """
  defaults = {
      'num_embedding_nodes': 16,   # Width of each embedding vector.
      'num_timesteps': 600,        # Ids per example after cut/pad.
      'num_lstm_nodes': [32, 32],  # Units of each recurrent layer.
      'num_lstm_layers': 2,        # Number of stacked recurrent layers.
      'num_fc_nodes': 32,          # Units of the hidden dense layer.
      'batch_size': 100,
      'cell_type': 'lstm',
      'clip_lstm_grads': 1.0,      # Global-norm gradient clipping value.
      'learning_rate': 0.001,
  }
  return tf.contrib.training.HParams(**defaults)

def create_rnn_cell(num_lstm_node, cell_type):
  """Creates a single recurrent cell.

  Args:
    num_lstm_node: Number of units of the cell.
    cell_type: 'lstm' for a `BasicLSTMCell`, 'gru' for a `GRUCell`.

  Returns:
    The newly created cell.

  Raises:
    Exception: If `cell_type` is neither 'lstm' nor 'gru'.
  """
  if cell_type == 'lstm':
    return tf.contrib.rnn.BasicLSTMCell(num_lstm_node, state_is_tuple=True)
  elif cell_type == 'gru':
    # Use the requested unit count; this branch previously referenced an
    # undefined name and raised NameError.
    return tf.contrib.rnn.GRUCell(num_lstm_node)
  else:
    raise Exception("%s has not been supported" % cell_type)

def dropout(cell, keep_prob):
  """Wraps `cell` in a `DropoutWrapper` acting on the cell's outputs.

  Args:
    cell: The recurrent cell to wrap.
    keep_prob: Value passed on as the wrapper's `output_keep_prob`.

  Returns:
    The wrapped cell.
  """
  wrapped_cell = tf.contrib.rnn.DropoutWrapper(
      cell, output_keep_prob=keep_prob)
  return wrapped_cell

In [6]:
def create_model(hps, vocab_size, num_classes):
    """Builds the embedding -> stacked RNN -> dense classifier graph.

    Args:
      hps: Hyper-parameter object. This function reads `num_timesteps`,
        `batch_size`, `num_embedding_nodes`, `num_lstm_nodes`,
        `num_lstm_layers`, `cell_type`, `num_fc_nodes`, `clip_lstm_grads`
        and `learning_rate` from it.
      vocab_size: Number of rows of the embedding matrix.
      num_classes: Number of units of the final dense layer (the logits).

    Returns:
      A 3-tuple `((inputs, labels, keep_prob), (loss, accuracy, train_op),
      global_step)`: the placeholders to feed, the tensors/op to fetch, and
      the step-counter variable.
    """
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    # Both placeholders have fully static shapes, so every fed batch must
    # hold exactly `batch_size` examples of `num_timesteps` ids each.
    inputs  = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    labels = tf.placeholder(tf.int32, (batch_size,))
    # Fed at run time; used for both the RNN-output dropout and the dropout
    # in the fully-connected block below.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    # Non-trainable int64 scalar handed to apply_gradients as the step counter.
    global_step = tf.Variable(
      tf.zeros([], tf.int64), name='global_step', trainable=False)

    # Sets up the embedding layer.
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embeddings',
            [vocab_size, hps.num_embedding_nodes],
            tf.float32)
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)

    # Sets up LSTM network.
    # Half-width of the uniform initializer for the recurrent variables; note
    # that it is derived from the size of the last layer only.
    scale = 1.0 / math.sqrt(hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        # One dropout-wrapped cell per layer; hps.num_lstm_nodes needs at
        # least hps.num_lstm_layers entries.
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            cell = dropout(cell, keep_prob)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)
        initial_state = cell.zero_state(hps.batch_size, tf.float32)
        _rnn_outputs, _ = tf.nn.dynamic_rnn(cell,
                                           embed_inputs,
                                           initial_state=initial_state)
        # Keep only index -1 along the second axis, i.e. the output of the
        # final time step (dynamic_rnn is called with its default layout).
        # NOTE(review): TextDataSet right-pads short examples, so this step
        # may correspond to padding rather than real text.
        last = _rnn_outputs[:, -1, :]

    # Sets up the fully-connected layer.
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        fc = tf.layers.dense(last, hps.num_fc_nodes, name='fc1')
        # Dropout is applied before the ReLU non-linearity here.
        fc = tf.contrib.layers.dropout(fc, keep_prob)
        fc = tf.nn.relu(fc)
        logits = tf.layers.dense(fc, num_classes, name='fc2')

    with tf.variable_scope('metrics'):
        # Mean cross-entropy over the batch; `labels` are integer class ids.
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)
        loss = tf.reduce_mean(softmax_loss)
        y_pred = tf.argmax(tf.nn.softmax(logits), 1, output_type=tf.int32)
        correct_pred = tf.equal(labels, y_pred)
        # Fraction of examples in the batch that were classified correctly.
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('accuracy', accuracy)

    with tf.variable_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            tf.logging.info("variable name: %s" % (var.name))
        # Despite the hyper-parameter's name, the clipping covers the
        # gradients of all trainable variables, not only the LSTM ones.
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss, tvars), hps.clip_lstm_grads)
        # One histogram summary per (clipped) gradient.
        for grad, var in zip(grads, tvars):
            tf.summary.histogram('%s_grad' % (var.name), grad)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars),
                                             global_step=global_step)

    return ((inputs, labels, keep_prob),
          (loss, accuracy, train_op),
          global_step)


In [7]:
# File locations for the word-level variant of the dataset.
word_level_train_file = 'cnews_data/word-level/cnews.train.txt'
word_level_val_file = 'cnews_data/word-level/cnews.val.txt'
word_level_test_file  = 'cnews_data/word-level/cnews.test.txt'
word_level_vocab_file = 'cnews_data/word-level/cnews.vocab.txt'
word_level_category_file = 'cnews_data/word-level/cnews.category.txt'
word_level_output_folder = 'cnews_data/word-level/dir_runs'

# File locations for the char-level variant of the dataset.
char_level_train_file = 'cnews_data/char-level/cnews.train.txt'
char_level_val_file = 'cnews_data/char-level/cnews.val.txt'
char_level_test_file  = 'cnews_data/char-level/cnews.test.txt'
char_level_vocab_file = 'cnews_data/char-level/cnews.vocab.txt'
char_level_category_file = 'cnews_data/char-level/cnews.category.txt'
char_level_output_folder = 'cnews_data/char-level/dir_runs'

# Which variant to train on, and the minimum count a word needs to be kept
# in the vocabulary.
data_type = 'word-level'
num_word_threshold = 10

if data_type == 'word-level':
    train_file = word_level_train_file
    val_file = word_level_val_file
    test_file = word_level_test_file
    vocab_file = word_level_vocab_file
    category_file = word_level_category_file
    output_folder = word_level_output_folder
elif data_type == 'char-level':
    train_file = char_level_train_file
    val_file = char_level_val_file
    test_file = char_level_test_file
    vocab_file = char_level_vocab_file
    category_file = char_level_category_file
    output_folder = char_level_output_folder
else:
    # Fail fast: otherwise the path variables would stay undefined, or keep
    # values left over from an earlier run of this cell.
    raise ValueError('Unsupported data_type: %s' % data_type)

# makedirs also creates missing parent directories.
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

hps = get_default_params()
vocab = Vocab(vocab_file, num_word_threshold)
category_vocab = CategoryVocab(category_file)

vocab_size = vocab.size()
num_classes = category_vocab.get_class_num()
tf.logging.info("vocab_size: %d" % vocab_size)
tf.logging.info("num_classes: %d" % num_classes)

train_dataset = TextDataSet(train_file, vocab, category_vocab, hps.num_timesteps, data_type)
val_dataset = TextDataSet(val_file, vocab, category_vocab, hps.num_timesteps, data_type)
test_dataset = TextDataSet(test_file, vocab, category_vocab, hps.num_timesteps, data_type)

# Build the graph once; `placeholders` is (inputs, labels, keep_prob) and
# `metrics` is (loss, accuracy, train_op).
placeholders, metrics, global_step = create_model(hps, vocab_size, num_classes)

loss, accuracy, train_op = metrics
summary_op = tf.summary.merge_all()
init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10)

# Dropout keep probabilities fed through the keep_prob placeholder.
train_keep_rate_for_dropout = 0.8
test_keep_rate_for_dropout = 1.0

INFO:tensorflow:vocab_size: 77323
INFO:tensorflow:num_classes: 10
INFO:tensorflow:Loading data from cnews_data/word-level/cnews.train.txt
INFO:tensorflow:Loading data from cnews_data/word-level/cnews.val.txt
INFO:tensorflow:Loading data from cnews_data/word-level/cnews.test.txt
Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
INFO:tensorflow:variable name: embedding/embeddings:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: lstm_nn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: fc/fc1/kernel:0
INFO:tensorflow:variable name: fc/fc1/bias:0
INFO:tensorflow:variable name: fc/fc2/kernel:0
INFO:tensorflow:variable name: fc/fc2/bias:0
I

In [9]:
num_train_iters = 10000
num_test_iters = 100
num_val_iters = 50
with tf.Session() as sess:
    sess.run(init_op)
    writer = tf.summary.FileWriter(output_folder, sess.graph)
    try:
        for i in range(num_train_iters):
            batch_inputs, batch_labels = train_dataset.next(hps.batch_size)
            # Pair each placeholder (inputs, labels, keep_prob) with its value.
            input_vals = (batch_inputs, batch_labels, train_keep_rate_for_dropout)
            feed_dict = dict(zip(placeholders, input_vals))
            fetches = [global_step, loss, accuracy, train_op, summary_op]
            outputs = sess.run(fetches, feed_dict)

            global_step_val, loss_val, accuracy_val = outputs[0:3]
            summary_val = outputs[4]
            if global_step_val % 100 == 0:
                # Persist the merged summaries; they were previously fetched
                # but never handed to the writer, so the event file held only
                # the graph.
                writer.add_summary(summary_val, global_step_val)
                tf.logging.info('Step: %5d, loss: %3.5f, accuracy: %4.5f'
                                % (global_step_val, loss_val, accuracy_val))
    finally:
        # Flush pending events and release the event file even if training
        # is interrupted.
        writer.close()

INFO:tensorflow:Step:   100, loss: 2.22502, accuracy: 0.15000
INFO:tensorflow:Step:   200, loss: 2.19878, accuracy: 0.08000
INFO:tensorflow:Step:   300, loss: 2.23281, accuracy: 0.11000
INFO:tensorflow:Step:   400, loss: 2.13852, accuracy: 0.19000
INFO:tensorflow:Step:   500, loss: 2.27273, accuracy: 0.10000
INFO:tensorflow:Step:   600, loss: 2.24877, accuracy: 0.07000
INFO:tensorflow:Step:   700, loss: 2.21698, accuracy: 0.10000
INFO:tensorflow:Step:   800, loss: 2.10033, accuracy: 0.15000
INFO:tensorflow:Step:   900, loss: 2.00499, accuracy: 0.29000
INFO:tensorflow:Step:  1000, loss: 1.87917, accuracy: 0.19000
INFO:tensorflow:Step:  1100, loss: 1.83067, accuracy: 0.16000
INFO:tensorflow:Step:  1200, loss: 1.85928, accuracy: 0.18000
INFO:tensorflow:Step:  1300, loss: 1.77081, accuracy: 0.18000
INFO:tensorflow:Step:  1400, loss: 2.13553, accuracy: 0.17000
INFO:tensorflow:Step:  1500, loss: 1.67256, accuracy: 0.31000
INFO:tensorflow:Step:  1600, loss: 1.64716, accuracy: 0.31000
INFO:ten

In [None]:
'''
word-level: filter=100 vocab_size=17067
char-level: filter=100 vocab_size=3583
        word    char
Train   99.7%   98.9%
Valid   92.7%   94.4%
Test    93.2%   95%
'''