In [67]:
from __future__ import print_function 
import numpy as np
import pandas as pd
import tensorflow as tf 

## Load the dataset (without word embedding and one-hot labels)

In [68]:
label_num = 8

labels = ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']
label_dict = {labels[i]: i for i in range(8)}
index_dict = {i: labels[i] for i in range(8)}
print(label_dict)
print(index_dict)

In [69]:
f = np.load('corpus_all_9999.npz')

In [71]:
# retrieve raw datasets (without embeddings) from the data file

train_doc = f['train_doc']
valid_doc = f['valid_doc']
test_doc = f['test_doc']

## Check classes distributions 

In [1]:
def get_class_distribution(data, index_dict):
    d = {index_dict[key]:0 for key in index_dict}
    for doc in data:
        d[index_dict[doc[1]]] += 1
    return d

In [None]:
train_classes_distri = get_class_distribution(train_doc, index_dict)
print(train_classes_distri)

In [None]:
validation_classes_distri = get_class_distribution(valid_doc, index_dict)
print(validation_classes_distri)

In [None]:
test_classes_distri = get_class_distribution(test_doc, index_dict)
print(test_classes_distri)

## Build the vocabulary

In [81]:
from collections import Counter 

In [2]:
# API for corpuse format conversion 

def map_word_and_index(input_doc, top=10000):
    """ construct words and indices mapping dictionaries 
    """
    counts_new = []            
    words_count_list = Counter([word for doc in input_doc for sent in doc[0] for word in sent]).most_common(top)
    word2index = {item[0]: index for index, item in enumerate(words_count_list, 1)}
    index2word = {index: item[0] for index, item in enumerate(words_count_list, 1)}
    return word2index, index2word

def convert2words(doc):
    """ flatten a document into a list of words 
    """
    return [word for sent in doc for word in sent]

def doc2index(doc, word2index):
    """ map words to indices for the flatten document 
    """
    return [[word2index[word]] for word in doc]

def convert_corpus(corpus, word2index):
    corpus_words = [(convert2words(doc[0]), doc[1]) for doc in corpus]
    return np.asarray([doc2index(doc[0], word2index) for doc in corpus_words])

In [None]:
word2index, index2word = map_word_and_index(train_dc, 10000)

In [94]:
# datasets with each document as a list of word indices (no sentence structure)

train_wordasindex = convert_corpus(train_doc, word2index)
valid_wordasindex = convert_corpus(valid_doc, word2index)
test_wordasindex = convert_corpus(test_doc, word2index)

## Embed words 

In [1]:
from gensim.models import Word2Vec

In [3]:
model = Word2Vec.load("word2vec_model")

In [125]:
def embed_from_index(model, idx):
    """ apply embedding to a word represented as an index 
    """
    if idx != 0:
        return model.wv[index2word[idx]]
    else:
        return np.zeros(shape=model.wv[index2word[1]].shape)

def embed_corpus(corpus, word2index, model):
    """ apply word embedding to the dataset 
    """
    temp = list(corpus)
    return np.asarray([[embed_from_index(model, idx) for sent in doc for idx in sent] for doc in temp])

In [115]:
# embed words with trained Word2Vec model (with sentence structure)
# This part is only for hierarchical model which is subject to future exploration

# train_wordasindex_embed = embed_corpus(train_doc, word2index, model)

# valid_wordasindex_embed = embed_corpus(valid_doc, word2index, model)

# test_wordasindex_embed = embed_corpus(test_doc, word2index, model)

## Convert labels 

In [116]:
def encode_label(label, size):
    """ one-hot encode the given label 
    """
    l = [0]*size
    l[label] = 1
    return l

def encode_class(corpus, size):
    """ apply one-hot encoding to the dataset labels 
    """
    return np.asarray([encode_label(doc[1], size) for doc in corpus])

In [117]:
train_label = encode_class(train_doc, 8)
valid_label = encode_class(valid_doc, 8)
test_label = encode_class(test_doc, 8)

In [121]:
# maximum doc length in terms of number of words

min_count = 100
max_count = 0

for doc in valid_wordasindex:
    if len(doc) < min_count:
        min_count = len(doc)
    if len(doc) > max_count:
        max_count = len(doc)

print(min_count)
print(max_count)


16
6686


## Save the flattened dataset and reload

In [None]:
np.savez('corpus_wordasindex_all_ 9999', train_wordasindex=train_wordasindex,
         valid_wordasindex=valid_wordasindex, test_wordasindex=test_wordasindex)

In [None]:
# reload the flattened dataset, use it after you save it once 

d = np.load('corpus_wordasindex_all_ 9999')

train_wordasindex = d['train_wordasindex']
valid_wordasindex = d['valid_wordasindex']
test_wordasindex = d['test_wordasindex']

## Classification model 

In [177]:
max_features = 10000

learning_rate = 0.001
maxlen = 6700
batch_size = 50
total_batch = int(train_wordasindex.shape[0]/batch_size)
input_dims = 100
num_hidden= 50
epochs = 100

index=0

### Padding with 0s

In [128]:
def pad_with_zeros(sequence, maxlen):
    """ pad text body with preceding 0s 
    """
    if len(sequence) > maxlen:
        raise Exception
    else:
        return [[0]]*(maxlen-len(sequence))+sequence

def corpus_pad_with_zeros(corpus, maxlen):
    """ apply zero padding for the whole dataset 
    """
    return np.asarray([pad_with_zeros(sent, maxlen) for sent in corpus])

In [131]:
# zero padding on the texts

train_corpus = corpus_pad_with_zeros(train_wordasindex, maxlen)

valid_corpus = corpus_pad_with_zeros(valid_wordasindex, maxlen)

test_corpus = corpus_pad_with_zeros(test_wordasindex, maxlen)

In [134]:
# embed the dataset (without sentence structure)

train_corpus_embed = embed_corpus(train_corpus, word2index, model)

valid_corpus_embed = embed_corpus(valid_corpus, word2index, model)

test_corpus_embed = embed_corpus(test_corpus, word2index, model)

In [137]:
def next_batch(data, index, size):
    """ return next batch in format: index, x batch, y batch
    """
    if index + size <= data[0].shape[0]:
        return index+size, data[0][index:index+size], data[1][index:index+size]
    else:
        return index+size-data[0].shape[0], np.concatenate((data[0][index:],data[0][:index+size-data[0].shape[0]]), 0), \
    np.concatenate((data[1][index:],data[1][:index+size-data[1].shape[0]]), 0)

def length(sequence):
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

## Model (LSTM)

For recurrent neural networks, tensorflow wants a data format of [Batch Size, Sequence Length, Input Dimension].

In [154]:
x = tf.placeholder(tf.float32, [None, maxlen, input_dims])
y = tf.placeholder(tf.float32, [None, 8])

Constructing a LSTM cell, simply call the "tf.contrib.rnn.LSTMCell" function with given arguments for size of the hidden state, "state_is_tuple=True" will get both the hidden state and the cell state.

You can also easily construct RNN variants, for example, call "tf.conctrib.rnn.GRUCell" for a GRU network, all the rest is the same. 

In [155]:
with tf.variable_scope('cell_def', reuse=True):
    cell_fw = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
    # cell_bw = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)

Building a forward LSTM network, call "tf.nn.dynamic_rnn" with the constructed LSTM cell and input tensor, specifying "sequence_length" will dynamically unroll the network to a matching length to the current input during computation. "val" will be a sequence of outputs and "state" will be the last hidden state from the LSTM.

In [157]:
with tf.variable_scope('lstm_def'):
    val, state = tf.nn.dynamic_rnn(cell_fw, x, dtype=tf.float32, sequence_length=length(x))    # val are the hidden states

apply nonlinear transformation to the LSTM output (we choose mean of the hidden states here, you can try out different variants such as the final hidden state, with or without transformation, etc) and render for classification using softmax.

In [163]:
W = tf.Variable(tf.truncated_normal(shape=[50, 32]))
b = tf.Variable(tf.constant(0.0, shape=[32]))

V = tf.Variable(tf.truncated_normal(shape=[32, 8]))
c = tf.Variable(tf.constant(0.0, shape=[8]))

# dropout_rate = tf.placeholder(tf.float32)

h = tf.nn.relu(tf.matmul(tf.reduce_mean(val, axis=1), W) + b)
# h = tf.nn.relu(tf.matmul(state[1], W) + b)
u = tf.matmul(h, V) + c
p = tf.nn.softmax(u)
pred = tf.argmax(p, 1)
loss = tf.reduce_mean(tf.reduce_sum(-tf.cast(y, tf.float32)*tf.log(p), 1))

In [164]:
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [165]:
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, tf.argmax(y, 1)), tf.float32))

In [None]:
# instantiate model saver to save the model in a session 
saver = tf.train.Saver()

### Training 

In [166]:
sess = tf.InteractiveSession()

In [167]:
init = tf.global_variables_initializer()
sess.run(init)

In [None]:
for i in range(epochs+1):
    xloss = 0
    
    for j in range(total_batch):
        # need to incoporate y in the batches and expand to 8 classes 
        index, x_, y_ = next_batch((train_corpus_embed, train_label), index, batch_size)
        _, xloss, acc_train = sess.run([optimizer, loss, accuracy], feed_dict={x: x_, y: y_})
        
        if j % 10 == 0:
            print("epoch %d, run %d, loss %g" % (i, j, xloss))
            
    if i % 2 == 0:
        acc_val = sess.run(accuracy, feed_dict={x:valid_corpus_embed, y:valid_label})
        print("epoch %d, Training acc: %g, Validation acc: %g " % (i, acc_train, acc_val))
        
save_path = saver.save(sess, "models/model.ckpt")
print("Model saved in file: %s" % save_path)

sess.close()

epoch 0, run 0, loss 0.889945
epoch 0, run 10, loss 0.940488
epoch 0, run 20, loss 0.737024
epoch 0, run 30, loss 1.28468
epoch 0, Validation acc: 44.4%
epoch 1, run 0, loss 0.989343
epoch 1, run 10, loss 0.927828
epoch 1, run 20, loss 0.820058


### Test

In [None]:
with tf.Session() as sess:
    # Restore variables from disk.
    saver.restore(sess, "models/model.ckpt")
    print("Model restored.")

    acc = sess.run(accuracy, feed_dict={x:test_corpus_embed, y:test_label})
    print("Test acc: %g" % (acc))

## Questions

- What are the benefits and downsides of the RNN-based representation over the bag of words representation used last week? 
- How would availability of data affect your answer?
- One possible architectural variant is to use only the final hidden state of the RNN as the document representation (i.e., x) rather than the average of the hidden states over time. How does this work? What are the potential benefits and downsides to this representation?
- Try different RNN architectures, e.g., simple Elman RNNs or GRUs or LSTMs. Which ones work best?
- What happens if you use a bidirectional LSTM (i.e., the dashed arrows in the figure)?

## Playground

### Bidirectional LSTM 

In [None]:
x = tf.placeholder(tf.float32, [None, maxlen, input_dims])
y = tf.placeholder(tf.float32, [None, 8])

Bidirectional LSTM needs a forward LSTM cell and a backward LSTM cell (they are the same here). 

In [None]:
with tf.variable_scope('cells_def', reuse=True):
    cell_fw = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
    cell_bw = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)

Build a bidirectional LSTM network is as easy as calling the "tf.bidirectional_dynamic_rnn" function with the foward and backward cells, the returned outputs and states are both a tuple of two, each contains the sequence of output states and the final hidden state for the corresponding direction.

In [None]:
with tf.variable_scope('bidirlstm_def'):
    outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, x, dtype=tf.float32, sequence_length=length(x))    # val are the h_ts 

You can choose how to use the outputs and hidden states from both directions. Here we are only using the output states and we take the mean between the forward and backward directions. 

In [None]:
val = (outputs[0] + outputs[1])/2

Up to this point, you should get the new representation of the document, classify it through further nonlinear transformation and softmax.

In [None]:
W = tf.Variable(tf.truncated_normal(shape=[50, 32]))
b = tf.Variable(tf.constant(0.0, shape=[32]))

V = tf.Variable(tf.truncated_normal(shape=[32, 8]))
c = tf.Variable(tf.constant(0.0, shape=[8]))

# dropout_rate = tf.placeholder(tf.float32)

h = tf.nn.relu(tf.matmul(tf.reduce_mean(val, axis=1), W) + b)
# h = tf.nn.relu(tf.matmul(state[1], W) + b)
u = tf.matmul(h, V) + c
p = tf.nn.softmax(u)
pred = tf.argmax(p, 1)
loss = tf.reduce_mean(tf.reduce_sum(-tf.cast(y, tf.float32)*tf.log(p), 1))

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, tf.argmax(y, 1)), tf.float32))
saver = tf.train.Saver()

### Training 

In [None]:
sess = tf.InteractiveSession()

In [None]:
init = tf.global_variables_initializer()
sess.run(init)

In [None]:
for i in range(epochs+1):
    xloss = 0
    
    for j in range(total_batch):
        # need to incoporate y in the batches and expand to 8 classes 
        index, x_, y_ = next_batch((train_corpus_embed, train_label), index, batch_size)
        _, xloss, acc_train = sess.run([optimizer, loss, accuracy], feed_dict={x: x_, y: y_})
        
        if j % 10 == 0:
            print("epoch %d, run %d, loss %g" % (i, j, xloss))
            
    if i % 2 == 0:
        acc_val = sess.run(accuracy, feed_dict={x:valid_corpus_embed, y:valid_label})
        print("epoch %d, Training acc: %g, Validation acc: %g " % (i, acc_train, acc_val))
        
save_path = saver.save(sess, "models/bidir_model.ckpt")
print("Model saved in file: %s" % save_path)

sess.close()

### Tests

In [None]:
with tf.Session() as sess:
    # Restore variables from disk.
    saver.restore(sess, "models/bidir_model.ckpt")
    print("Model restored.")

    acc = sess.run(accuracy, feed_dict={x:test_corpus_embed, y:test_label})
    print("Test acc: %g" % (acc))