In [1]:
import numpy as np
import os
import re
from collections import defaultdict
import random

In [2]:
%tensorflow_version 1.x
import tensorflow as tf

TensorFlow 1.x selected.


In [3]:
MAX_DOC_LENGTH = 500
NUM_CLASSES = 20
path = '\\20news-bydate\\'

In [4]:
class RNN:
  def __init__(self, vocab_size, embedding_size, lstm_size, batch_size):
    self._vocab_size = vocab_size
    self._embedding_size = embedding_size
    self._lstm_size = lstm_size
    self._batch_size = batch_size

    self._data = tf.placeholder(tf.int32, shape = [batch_size, MAX_DOC_LENGTH])
    self._labels = tf.placeholder(tf.int32, shape = [batch_size, ])
    self._sentence_lengths = tf.placeholder(tf.int32, shape = [batch_size, ])
    self._final_tokens = tf.placeholder(tf.int32, shape = [batch_size, ])

  def embedding_layer(self, indices):
    pretrained_vectors = []
    pretrained_vectors.append(np.zeros(self._embedding_size))
    np.random.seed(2021)

    for _ in range(self._vocab_size + 1):
      pretrained_vectors.append(np.random.normal(loc = 0., scale = 1., size = self._embedding_size))

    pretrained_vectors = np.array(pretrained_vectors)

    with tf.variable_scope('embedding', reuse = tf.AUTO_REUSE):
      self._embedding_matrix = tf.get_variable(
          name = 'embedding',
          shape = (self._vocab_size + 2, self._embedding_size),
          initializer = tf.constant_initializer(pretrained_vectors)
      )

    return tf.nn.embedding_lookup(self._embedding_matrix, indices)

  def LSTM_layer(self, embeddings):
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(self._lstm_size)
    zero_state = tf.zeros(shape = (self._batch_size, self._lstm_size))
    initial_state = tf.contrib.rnn.LSTMStateTuple(zero_state, zero_state)

    lstm_inputs = tf.unstack(tf.transpose(embeddings, perm = [1, 0, 2]))
    with tf.variable_scope('lstm', reuse = tf.AUTO_REUSE):
      lstm_outputs, last_state = tf.nn.static_rnn (
          cell = lstm_cell,
          inputs = lstm_inputs,
          initial_state = initial_state,
          sequence_length = self._sentence_lengths
      )

    lstm_outputs = tf.unstack(tf.transpose(lstm_outputs, perm = [1, 0, 2]))
    lstm_outputs = tf.concat(lstm_outputs, axis = 0)

    mask = tf.sequence_mask(
        lengths = self._sentence_lengths,
        maxlen = MAX_DOC_LENGTH,
        dtype = tf.float32
    )

    mask = tf.concat(tf.unstack(mask, axis = 0), axis = 0)
    mask = tf.expand_dims(mask, -1)

    lstm_outputs = mask * lstm_outputs
    lstm_outputs_split = tf.split(lstm_outputs, num_or_size_splits = self._batch_size)
    lstm_outputs_sum = tf.reduce_sum(lstm_outputs_split, axis = 1)
    lstm_outputs_average = lstm_outputs_sum / tf.expand_dims(tf.cast(self._sentence_lengths, tf.float32), -1)

    return lstm_outputs_average

  def build_graph(self):
    embeddings = self.embedding_layer(self._data)
    lstm_outputs = self.LSTM_layer(embeddings)

    with tf.variable_scope('final_layer_weights', reuse = tf.AUTO_REUSE):
      weights = tf.get_variable(
          name = 'final_layer_weights',
          shape = (self._lstm_size, NUM_CLASSES),
          initializer = tf.random_normal_initializer(seed = 2021)
      )

    with tf.variable_scope('final_layer_biases', reuse = tf.AUTO_REUSE):
      biases = tf.get_variable(
          name = 'final_layer_biases',
          shape = (NUM_CLASSES),
          initializer = tf.random_normal_initializer(seed = 2021)
      )

    logits = tf.matmul(lstm_outputs, weights) + biases

    labels_one_hot = tf.one_hot(
        indices = self._labels,
        depth = NUM_CLASSES,
        dtype = tf.float32
    )

    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels = labels_one_hot,
        logits = logits
    )

    loss = tf.reduce_mean(loss)

    probs = tf.nn.softmax(logits)
    predicted_labels = tf.argmax(probs, axis = 1)
    predicted_labels = tf.squeeze(predicted_labels)

    return predicted_labels, loss

  def trainer(self, loss, learning_rate):
    with tf.variable_scope('optimizer', reuse = tf.AUTO_REUSE):
      train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
      return train_op

In [5]:
class DataReader:
  def __init__(self, path, batch_size):
    self._batch_size = batch_size
    with open(path, encoding = 'ISO-8859-1') as f:
      d_lines = f.read().splitlines()
    self._data = []
    self._labels = []
    self._sentence_lengths = []
    self._final_tokens = []
    
    for data_id, line in enumerate(d_lines):
      features = line.split('<fff>')
      label, doc_id, sentence_length = int(features[0]), int(features[1]), int(features[2])
      tokens = [int(token) for token in features[3].split()]
      final_token = tokens[sentence_length - 1]
      self._data.append(tokens)
      self._labels.append(label)
      self._sentence_lengths.append(sentence_length)
      self._final_tokens.append(final_token)
    
    self._data = np.array(self._data)
    self._labels = np.array(self._labels)
    self._sentence_lengths = np.array(self._sentence_lengths)
    self._final_tokens = np.array(self._final_tokens)

    self._num_epoch = 0
    self._current_part = 0
  
  def next_batch(self):
    start = self._current_part * self._batch_size
    end = start + self._batch_size
    self._current_part += 1

    if end + self._batch_size > len(self._data):
      self._num_epoch += 1
      self._current_part = 0
      indices = range(len(self._data))
      random.seed(2021)
      random.shuffle(list(indices))
      tmp_data = []
      tmp_labels = []
      tmp_sentence_lengths = []
      tmp_final_tokens = []
      for idx in indices:
        tmp_data.append(self._data[idx])
        tmp_labels.append(self._labels[idx])
        tmp_sentence_lengths.append(self._sentence_lengths[idx])
        tmp_final_tokens.append(self._final_tokens[idx])
      self._data, self._labels, self._sentence_lengths, self._final_tokens = tmp_data, tmp_labels, tmp_sentence_lengths, tmp_final_tokens
    
    return self._data[start:end], self._labels[start:end], self._sentence_lengths[start:end], self._final_tokens[start:end]


In [6]:
def load_dataset():
  train_data_reader = DataReader(
      path = path + 'w2v\\20news-train-encoded.txt',
      batch_size = 50
  )
  test_data_reader = DataReader(
      path = path + 'w2v\\20news-test-encoded.txt',
      batch_size = 50
  )
  
  return train_data_reader, test_data_reader


In [7]:
with open(path + 'w2v\\vocab-raw.txt', encoding = 'ISO-8859-1') as f:
  vocab_size = len(f.read().splitlines())

  tf.random.set_random_seed(2021)
  rnn = RNN(
      vocab_size = vocab_size,
      embedding_size = 300,
      lstm_size = 50,
      batch_size = 50
  )
  predicted_labels, loss = rnn.build_graph()
  train_op = rnn.trainer(loss = loss, learning_rate = 0.01)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels inpu

In [8]:
with tf.Session() as sess:
  train_data_reader, test_data_reader = load_dataset()

  step = 0
  MAX_STEP = 11000

  sess.run(tf.global_variables_initializer())
  while step < MAX_STEP:
    next_train_batch = train_data_reader.next_batch()
    train_data, train_labels, train_sentence_lengths, train_final_tokens = next_train_batch
    plabels_eval, loss_eval, _ = sess.run(
        [predicted_labels, loss, train_op],
        feed_dict = {
            rnn._data: train_data,
            rnn._labels: train_labels,
            rnn._sentence_lengths: train_sentence_lengths,
            rnn._final_tokens: train_final_tokens
        }
    )
    step += 1
    if step % 20 == 0:
      print('step: ' + str(step) +' - loss: ', str(loss_eval))
    if train_data_reader._current_part == 0:
      num_true_preds = 0
      while True:
        next_test_batch = test_data_reader.next_batch()
        test_data, test_labels, test_sentence_lenghts, test_final_tokens = next_test_batch

        test_plabels_eval = sess.run(
            predicted_labels,
            feed_dict = {
                rnn._data: test_data,
                rnn._labels: test_labels,
                rnn._sentence_lengths: test_sentence_lenghts,
                rnn._final_tokens: test_final_tokens
            }
        )
        matches = np.equal(test_plabels_eval, test_labels)
        num_true_preds += np.sum(matches.astype(float))

        if test_data_reader._current_part == 0:
          break
      
      print('Epoch: ', train_data_reader._num_epoch)
      print('Accuracy on test data: ', num_true_preds * 100. / len(test_data_reader._data))

step: 20 - loss:  0.0012361328
step: 40 - loss:  0.4900295
step: 60 - loss:  5.7572503
step: 80 - loss:  0.86787254
step: 100 - loss:  3.8929074
step: 120 - loss:  5.416351
step: 140 - loss:  3.0255537
step: 160 - loss:  3.612365
step: 180 - loss:  3.6553693
step: 200 - loss:  9.0240965
step: 220 - loss:  3.5073512
Epoch:  1
Accuracy on test data:  4.84599044078598
step: 240 - loss:  3.8009467
step: 260 - loss:  2.95274
step: 280 - loss:  2.6134386
step: 300 - loss:  2.9435427
step: 320 - loss:  3.3065193
step: 340 - loss:  1.9001852
step: 360 - loss:  3.9714172
step: 380 - loss:  2.4186883
step: 400 - loss:  1.2801465
step: 420 - loss:  2.5218585
step: 440 - loss:  2.4225063
Epoch:  2
Accuracy on test data:  9.373340414232608
step: 460 - loss:  2.1379087
step: 480 - loss:  2.3613937
step: 500 - loss:  2.5775368
step: 520 - loss:  1.3846714
step: 540 - loss:  2.2108655
step: 560 - loss:  2.1179504
step: 580 - loss:  1.4321709
step: 600 - loss:  2.5070975
step: 620 - loss:  3.0569947
st