In [1]:
import numpy as np
import random
import os

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
tf.compat.v1.disable_eager_execution()

In [3]:
class MLP:
  def __init__(self, vocab_size, hidden_size, NUM_CLASSES):
    self._vocab_size = vocab_size
    self._hidden_size = hidden_size
    self.NUM_CLASSES = NUM_CLASSES
  def build_graph(self):
    self._X = tf.placeholder(tf.float32, shape = [None, self._vocab_size])
    self._real_y = tf.placeholder(tf.int32, shape = [None, ])

    with tf.variable_scope('w1', reuse = tf.AUTO_REUSE):
      weights_1 = tf.get_variable(
          name = 'weight_input_hidden',
          shape = (self._vocab_size, self._hidden_size),
          initializer = tf.random_normal_initializer(seed = 2021)
      )

    with tf.variable_scope('b1', reuse = tf.AUTO_REUSE):
      biases_1 = tf.get_variable(
          name = 'biases_input_hidden',
          shape = (self._hidden_size),
          initializer = tf.random_normal_initializer(seed = 2021)
      )

    with tf.variable_scope('w2', reuse = tf.AUTO_REUSE):
      weights_2 = tf.get_variable(
          name = 'weight_input_output',
          shape = (self._hidden_size, self.NUM_CLASSES),
          initializer = tf.random_normal_initializer(seed = 2021)
      )
    
    with tf.variable_scope('b2', reuse = tf.AUTO_REUSE):
      biases_2 = tf.get_variable(
          name = 'biases_input_hidden',
          shape = (self.NUM_CLASSES),
          initializer = tf.random_normal_initializer(seed = 2021)
      )

    hidden = tf.matmul(self._X, weights_1) + biases_1
    hidden = tf.sigmoid(hidden)
    logits = tf.matmul(hidden, weights_2) + biases_2
    
    labels_one_hot = tf.one_hot(indices = self._real_y, depth = self.NUM_CLASSES, dtype = tf.float32)
    loss = tf.nn.softmax_cross_entropy_with_logits(labels = labels_one_hot, logits = logits)
    loss = tf.reduce_mean(loss)

    probs = tf.nn.softmax(logits)
    predicted_labels = tf.argmax(probs, axis = 1)
    predicted_labels = tf.squeeze(predicted_labels)

    return predicted_labels, loss

  def trainer(self, loss, learning_rate):
    with tf.variable_scope('op', reuse = tf.AUTO_REUSE):
      train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return train_op

In [4]:
path = '/datasets/20news-bydate'
with open(path + '/words_idfs.txt') as f:
  vocab_size = len(f.read().splitlines())

mlp = MLP(
    vocab_size = vocab_size,
    hidden_size = 50,
    NUM_CLASSES = 20
)

predicted_labels, loss = mlp.build_graph()

train_op = mlp.trainer(loss = loss, learning_rate = 0.1)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [5]:
class DataReader:
  def __init__(self, path, batch_size, vocab_size):
    self._batch_size = batch_size
    with open(path, encoding = 'ISO-8859-1') as f:
      d_lines = f.read().splitlines()
    self._data = []
    self._labels = []
    
    for data_id, line in enumerate(d_lines):
      vector = [0.0 for _ in range(vocab_size)]
      features = line.split('<fff>')
      label, doc_id = int(features[0]), int(features[1])
      tokens = features[2].split()
      for token in tokens:
        index, value = int(token.split(':')[0]), float(token.split(':')[1])
        vector[index] = value
      self._data.append(vector)
      self._labels.append(label)
    
    self._data = np.array(self._data)
    self._labels = np.array(self._labels)

    self._num_epoch = 0
    self._batch_id = 0
  
  def next_batch(self):
    start = self._batch_id * self._batch_size
    end = start + self._batch_size
    self._batch_id += 1

    if end + self._batch_size > len(self._data):
      end = len(self._data)
      self._num_epoch += 1
      self._batch_id = 0
      indices = range(len(self._data))
      random.seed(2021)
      random.shuffle(list(indices))
      tmp_data = []
      tmp_labels = []
      for idx in indices:
        tmp_data.append(self._data[idx])
        tmp_labels.append(self._labels[idx])
      self._data, self._labels = tmp_data, tmp_labels
    
    return self._data[start:end], self._labels[start:end]


In [6]:
def load_dataset():
  train_data_reader = DataReader(
      path = path + '/20news_train_tfidf.txt',
      batch_size = 50,
      vocab_size = vocab_size
  )
  test_data_reader = DataReader(
      path = path + '/20news_test_tfidf.txt',
      batch_size = 50,
      vocab_size = vocab_size
  )
  
  return train_data_reader, test_data_reader


In [7]:
def save_parameters(name, value, epoch):
  filename = name.replace(':', '-colon-') + '-epoch-{}.txt'.format(epoch)
  if len(value.shape) == 1:
    string_form = ','.join([str(number) for number in value])
  else:
    string_form = '\n'.join([','.join([str(number) for number in value[row]]) for row in range(value.shape[0])])
  if not os.path.exists(path + '/saved_paras/' + filename.split('/')[0]):
    os.mkdir(path + '/saved_paras/' + filename.split('/')[0])
  with open(path + '/saved_paras/' + filename, 'w') as f:
    f.write(string_form)

In [8]:
with tf.Session() as sess:
  train_data_reader, test_data_reader = load_dataset()
  step, MAX_STEP = 0, 30_000
  
  sess.run(tf.global_variables_initializer())
  while step < MAX_STEP:
    train_data, train_labels = train_data_reader.next_batch()
    plabels_eval, loss_eval, _ = sess.run(
        [predicted_labels, loss, train_op],
        feed_dict = {
            mlp._X: train_data,
            mlp._real_y: train_labels
        }
    )
    step += 1
    if step % 100 == 0: print('step: {}, loss: {}'.format(step, loss_eval))
  
  trainable_variables = tf.trainable_variables()
  for variable in trainable_variables:
    if not os.path.exists(path + '/saved_paras/'):
      os.mkdir(path + '/saved_paras/')
    save_parameters(
        name = variable.name,
        value = variable.eval(),
        epoch = train_data_reader._num_epoch
    )
  


step: 100, loss: 3.380756139755249
step: 200, loss: 4.678699016571045
step: 300, loss: 3.776603937149048
step: 400, loss: 0.6784253120422363
step: 500, loss: 2.983941078186035
step: 600, loss: 0.7472978830337524
step: 700, loss: 3.0822641849517822
step: 800, loss: 0.0062517630867660046
step: 900, loss: 5.262300968170166
step: 1000, loss: 0.16384920477867126
step: 1100, loss: 0.023481659591197968
step: 1200, loss: 0.8647221922874451
step: 1300, loss: 0.00037774283555336297
step: 1400, loss: 0.2628019452095032
step: 1500, loss: 0.011880683712661266
step: 1600, loss: 0.15504354238510132
step: 1700, loss: 0.002276916755363345
step: 1800, loss: 0.00021758633374702185
step: 1900, loss: 4.3450108933029696e-05
step: 2000, loss: 3.78542099497281e-05
step: 2100, loss: 0.00030328790307976305
step: 2200, loss: 0.0004473777371458709
step: 2300, loss: 0.002018778584897518
step: 2400, loss: 0.005366782657802105
step: 2500, loss: 0.0011066037695854902
step: 2600, loss: 7.411694969050586e-05
step: 2700

In [17]:
def restore_parameters(name, epoch):
  filename = name.replace(':', '-colon-') + '-epoch-{}.txt'.format(epoch)
  with open(path + '/saved_paras/' + filename) as f:
    lines = f.read().splitlines()
  if len(lines) == 1:
    value = [float(number) for number in lines[0].split(',')]
  else:
    value = [[float(number) for number in lines[row].split(',')] for row in range(len(lines))]
  return value

In [49]:
with tf.Session() as sess:
  trainable_variables = tf.trainable_variables()
  for variable in trainable_variables:
    saved_value = restore_parameters(variable.name, epoch = 132)
    assign_op = variable.assign(saved_value)
    sess.run(assign_op)

Tensor("Assign_40:0", shape=(13973, 50), dtype=float32_ref)
Tensor("Assign_41:0", shape=(50,), dtype=float32_ref)
Tensor("Assign_42:0", shape=(50, 20), dtype=float32_ref)
Tensor("Assign_43:0", shape=(20,), dtype=float32_ref)


In [47]:
test_data_reader = DataReader(
  path = path + '/20news_test_tfidf.txt',
  batch_size = 50,
  vocab_size = vocab_size
)

with tf.Session() as sess:
  epoch = 10

  trainable_variables = tf.trainable_variables()
  for variable in trainable_variables:
    saved_value = restore_parameters(variable.name, epoch = 132)
    assign_op = variable.assign(saved_value)
    sess.run(assign_op)

  num_true_preds = 0
  while True:
    test_data, test_labels = test_data_reader.next_batch()
    test_plabels_eval = sess.run(
        predicted_labels,
        feed_dict = {
            mlp._X: test_data,
            mlp._real_y: test_labels
        }
    )
    matches = np.equal(test_plabels_eval, test_labels)
    num_true_preds += np.sum(matches.astype('float'))

    if test_data_reader._batch_id == 0:
      break
  
  print('epoch: ', epoch)
  print('accuracy on test data: ', num_true_preds / len(test_data_reader._data))

epoch:  10
accuracy on test data:  0.7683218268720128
