In [67]:
from __future__ import print_function 
import numpy as np
import pandas as pd
import tensorflow as tf 

## Load the dataset

In [68]:
# Number of target classes (the one-hot labels and the output layer further down are 8 wide).
label_num = 8

In [69]:
# Open the corpus archive; the individual arrays are read from it by key in the next cell.
# NOTE(review): if the stored arrays are object-dtype (nested Python lists), recent NumPy
# releases need np.load(..., allow_pickle=True) -- confirm against the file before changing.
f = np.load('corpus_all_9999.npz')

In [71]:
# Train / validation / test splits. Later cells treat every document as a pair:
# doc[0] is iterated as sentences of words, doc[1] is used as the class label.
train_doc = f['train_doc']
valid_doc = f['valid_doc']
test_doc = f['test_doc']

## Build the vocabulary

In [81]:
from collections import Counter 

In [2]:
# Helpers for converting the corpus between word and index formats

def map_word_and_index(input_doc, top=10000):
    """Build word<->index lookup tables from the most frequent words of a corpus.

    Args:
        input_doc: iterable of documents; ``doc[0]`` of each document must be an
            iterable of sentences, each sentence an iterable of hashable words.
        top: maximum vocabulary size; only the ``top`` most common words are kept.

    Returns:
        A ``(word2index, index2word)`` pair of dicts that are inverses of each
        other. Indices start at 1 in order of decreasing frequency, so index 0
        is never assigned to a word.
    """
    word_counts = Counter(
        word for doc in input_doc for sent in doc[0] for word in sent)

    word2index = {}
    index2word = {}
    # Start numbering at 1 so that 0 stays free (it is used as padding later on).
    for index, (word, _count) in enumerate(word_counts.most_common(top), 1):
        word2index[word] = index
        index2word[index] = word
    return word2index, index2word

def convert2words(doc):
    """Flatten a document given as a sequence of sentences into one flat word list."""
    words = []
    for sentence in doc:
        words.extend(sentence)
    return words

def doc2index(doc, word2index):
    """Replace every word of ``doc`` by a one-element list holding its index.

    A word missing from ``word2index`` raises ``KeyError``.
    """
    indexed = []
    for word in doc:
        indexed.append([word2index[word]])
    return indexed

def convert_corpus(corpus, word2index):
    """Convert every document of a corpus into its index-encoded form.

    Args:
        corpus: iterable of documents whose first element (``doc[0]``) is a
            sequence of sentences, each a sequence of words.
        word2index: mapping from word to integer index.

    Returns:
        A 1-D object ndarray with one entry per document; each entry is the
        list produced by ``doc2index`` (``[[i1], [i2], ...]``).
    """
    indexed_docs = [doc2index(convert2words(doc[0]), word2index) for doc in corpus]

    # Documents differ in length, so the object array is built explicitly:
    # np.asarray on ragged nested lists raises on recent NumPy, and on
    # equal-length input it would yield a 3-D integer array instead of lists.
    result = np.empty(len(indexed_docs), dtype=object)
    for position, indexed in enumerate(indexed_docs):
        result[position] = indexed
    return result

In [None]:
# Build the vocabulary from the training split only, keeping the 10000 most common words.
word2index, index2word = map_word_and_index(train_doc, 10000)

In [94]:
# Index-encode all three splits with the vocabulary built above.
# convert_corpus looks words up directly in word2index, so a word that is
# missing from the vocabulary raises KeyError.
train_wordasindex = convert_corpus(train_doc, word2index)
valid_wordasindex = convert_corpus(valid_doc, word2index)
test_wordasindex = convert_corpus(test_doc, word2index)

## Embed words 

In [1]:
from gensim.models import Word2Vec

In [3]:
# Load a saved gensim Word2Vec model; its `wv` vectors are looked up by word in embed_from_index.
model = Word2Vec.load("word2vec_model")

In [125]:
def embed_from_index(model, idx):
    """Return the embedding vector for a vocabulary index.

    Index 0 is treated as padding and yields an all-zero vector shaped like a
    real word vector. Any other index is translated to its word through the
    module-level ``index2word`` table and fetched from ``model.wv``.
    """
    if idx == 0:
        # Borrow the shape from the vector of the word stored at index 1.
        reference_vector = model.wv[index2word[1]]
        return np.zeros(shape=reference_vector.shape)
    word = index2word[idx]
    return model.wv[word]

def embed_corpus(corpus, word2index, model):
    """Embed an index-encoded corpus with ``embed_from_index``.

    Every document is iterated as a sequence of entries, and every element of
    each entry is embedded, giving one flat list of vectors per document.
    ``word2index`` is accepted but not used by this function.

    Returns:
        ``np.asarray`` of the per-document vector lists.
    """
    embedded_docs = []
    for doc in list(corpus):
        vectors = []
        for sent in doc:
            for idx in sent:
                vectors.append(embed_from_index(model, idx))
        embedded_docs.append(vectors)
    return np.asarray(embedded_docs)

In [115]:
# Embed the (unpadded) index-encoded splits. embed_corpus expects documents
# made of index entries, so it must receive the *_wordasindex arrays rather
# than the raw *_doc corpora, whose documents are (sentences, label) pairs.
train_wordasindex_embed = embed_corpus(train_wordasindex, word2index, model)
valid_wordasindex_embed = embed_corpus(valid_wordasindex, word2index, model)
test_wordasindex_embed = embed_corpus(test_wordasindex, word2index, model)

## Convert labels 

In [116]:
def encode_label(label, size):
    """One-hot encode ``label`` as a list of ``size`` ints (1 at ``label``, 0 elsewhere).

    ``label`` is used as a plain list index, so an out-of-range value raises
    ``IndexError`` and a negative value counts from the end.
    """
    one_hot = [0 for _ in range(size)]
    one_hot[label] = 1
    return one_hot

def encode_class(corpus, size):
    """One-hot encode the label (``doc[1]``) of every document in ``corpus``.

    Returns:
        ``np.asarray`` of the encoded labels, one row of width ``size`` per document.
    """
    encoded = []
    for doc in corpus:
        encoded.append(encode_label(doc[1], size))
    return np.asarray(encoded)

In [117]:
# One-hot encode the labels of every split; the width comes from label_num
# (defined at the top of the notebook) instead of a repeated literal.
train_label = encode_class(train_doc, label_num)
valid_label = encode_class(valid_doc, label_num)
test_label = encode_class(test_doc, label_num)

In [121]:
# Shortest and longest document (in number of words) of the validation split.
# Only the validation split is measured here; the other splits may contain
# longer documents.
doc_lengths = [len(doc) for doc in valid_wordasindex]

# Derive both bounds from the data itself: a fixed starting value for the
# minimum would be reported unchanged whenever every document is longer than it.
min_count = min(doc_lengths) if doc_lengths else 0
max_count = max(doc_lengths) if doc_lengths else 0

print(min_count)
print(max_count)


16
6686


## Dataset

In [None]:
# Persist the index-encoded splits, one key per split.
# NOTE(review): the file name contains a space ('..._all_ 9999'); it is kept
# unchanged in case other code loads the archive under exactly this name.
np.savez('corpus_wordasindex_all_ 9999', train_wordasindex=train_wordasindex,
         valid_wordasindex=valid_wordasindex, test_wordasindex=test_wordasindex)

## Classification model 

In [177]:
# Hyper-parameters and shared state for the classifier cells below.
# NOTE(review): max_features is not referenced by any other visible cell.
max_features = 10000

learning_rate = 0.001
# Length every document is padded to; pad_with_zeros raises for longer documents.
# NOTE(review): the length statistics above only cover the validation split
# (max 6686) -- confirm that no train/test document exceeds 6700.
maxlen = 6700
batch_size = 50
# Number of full mini-batches per pass over the training set.
total_batch = int(train_wordasindex.shape[0]/batch_size)
# Feature size of one time step in the input placeholder.
# NOTE(review): presumably has to equal the Word2Vec vector size -- confirm.
input_dims = 100
# Number of LSTM units.
num_hidden= 50
epochs = 100

# Batch cursor into the training data; read and advanced through next_batch in the training loops.
index=0

### Padding with 0s

In [128]:
def pad_with_zeros(sequence, maxlen):
    """Right-pad an index sequence with ``[0]`` entries up to ``maxlen`` items.

    Args:
        sequence: sequence of one-element index lists (``[[i1], [i2], ...]``).
        maxlen: target length of the returned list.

    Returns:
        A new list of exactly ``maxlen`` entries: the original entries followed
        by as many ``[0]`` padding entries as needed.

    Raises:
        ValueError: if ``sequence`` is already longer than ``maxlen``.
    """
    current_len = len(sequence)
    if current_len > maxlen:
        raise ValueError(
            "sequence length %d exceeds maxlen %d" % (current_len, maxlen))
    # One fresh [0] per slot: `[[0]] * n` would repeat a single shared inner list.
    padding = [[0] for _ in range(maxlen - current_len)]
    # list() keeps the concatenation a plain list append even for non-list input.
    return list(sequence) + padding

def corpus_pad_with_zeros(corpus, maxlen):
    """Pad every document of ``corpus`` to ``maxlen`` entries via ``pad_with_zeros``.

    Returns:
        ``np.asarray`` of the padded documents.
    """
    padded_docs = []
    for document in corpus:
        padded_docs.append(pad_with_zeros(document, maxlen))
    return np.asarray(padded_docs)

In [131]:
# Pad every index-encoded document to maxlen entries (index 0 marks padding).
train_corpus = corpus_pad_with_zeros(train_wordasindex, maxlen)
valid_corpus = corpus_pad_with_zeros(valid_wordasindex, maxlen)
test_corpus = corpus_pad_with_zeros(test_wordasindex, maxlen)

In [134]:
# Replace every index of the padded corpora by its word vector; padding
# index 0 becomes an all-zero vector (see embed_from_index).
# NOTE(review): this materialises maxlen vectors per document in memory -- likely large.
train_corpus_embed = embed_corpus(train_corpus, word2index, model)
valid_corpus_embed = embed_corpus(valid_corpus, word2index, model)
test_corpus_embed = embed_corpus(test_corpus, word2index, model)

In [137]:
def next_batch(data, index, size):
    """Slice the next mini-batch out of ``data``, wrapping around at the end.

    Args:
        data: pair-like where ``data[0]`` holds the inputs and ``data[1]`` the labels.
        index: position of the first sample of the batch.
        size: number of samples in the batch.

    Returns:
        ``(next_index, x_batch, y_batch)``. When the batch would run past the
        end of ``data[0]``, the remainder is taken from the beginning and
        ``next_index`` is the wrapped-around position.
    """
    features = data[0]
    labels = data[1]
    stop = index + size
    total = features.shape[0]

    if stop <= total:
        return stop, features[index:stop], labels[index:stop]

    # Batch crosses the end: glue the tail of the data to the head.
    wrapped = stop - total
    x_batch = np.concatenate((features[index:], features[:wrapped]), 0)
    y_batch = np.concatenate((labels[index:], labels[:stop - labels.shape[0]]), 0)
    return wrapped, x_batch, y_batch

def length(sequence):
    """Count the non-padding time steps of every sequence in a batch.

    A time step counts as used when at least one of its features is non-zero;
    all-zero steps (the padding vectors) contribute nothing.

    Args:
        sequence: tensor reduced over axis 2 (features) and then axis 1 (time),
            i.e. indexed as [batch, time, feature].

    Returns:
        int32 tensor with one length per batch element.
    """
    # 1.0 for steps with any non-zero feature, 0.0 for all-zero steps.
    used = tf.sign(tf.reduce_max(tf.abs(sequence), axis=2))
    # `axis` replaces the deprecated `reduction_indices` alias.
    seq_len = tf.reduce_sum(used, axis=1)
    return tf.cast(seq_len, tf.int32)

In [154]:
# TensorFlow expects the input as [batch size, sequence length, input dimension].
x = tf.placeholder(tf.float32, [None, maxlen, input_dims])
# One-hot labels; the width follows label_num instead of a repeated literal.
y = tf.placeholder(tf.float32, [None, label_num])

In [155]:
# Forward and backward LSTM cells with num_hidden units each.
# NOTE(review): only cell_fw is passed to dynamic_rnn in the visible code;
# cell_bw is never used. Also confirm that reuse=True is intended on this scope.
with tf.variable_scope('cellsdef', reuse=True):
    cell_fw = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
    cell_bw = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)

In [157]:
# Unroll the forward LSTM over the input; sequence_length comes from length(x),
# which counts the non-zero (non-padding) time steps of every sample.
with tf.variable_scope('lstmrnn'):
    val, state = tf.nn.dynamic_rnn(cell_fw, x, dtype=tf.float32, sequence_length=length(x))    # val: per-time-step outputs (h_t); state: final cell state

In [163]:
# Classifier head: RNN outputs -> 32 ReLU units -> label_num logits.
# Layer sizes follow num_hidden / label_num instead of repeated literals.
W = tf.Variable(tf.truncated_normal(shape=[num_hidden, 32]))
b = tf.Variable(tf.constant(0.0, shape=[32]))

V = tf.Variable(tf.truncated_normal(shape=[32, label_num]))
c = tf.Variable(tf.constant(0.0, shape=[label_num]))

# dropout_rate = tf.placeholder(tf.float32)

# Document representation: mean of the RNN outputs over the time axis.
# NOTE(review): the mean runs over all maxlen steps, padded ones included --
# confirm whether it should be normalised by the true sequence length instead.
h = tf.nn.relu(tf.matmul(tf.reduce_mean(val, axis=1), W) + b)
# h = tf.nn.relu(tf.matmul(state[1], W) + b)
u = tf.matmul(h, V) + c
p = tf.nn.softmax(u)
pred = tf.argmax(p, 1)
# Cross-entropy taken from the logits: -y * log(softmax(u)) turns into NaN as
# soon as a probability underflows to 0; the fused op is numerically stable.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=u))

In [164]:
# Training op: one Adam step (step size learning_rate) that minimises the loss.
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [165]:
# Fraction of samples whose predicted class equals the arg-max of the one-hot label.
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, tf.argmax(y, 1)), tf.float32))

### Validation 

In [166]:
# Session used by the training loop below to run the graph.
sess = tf.InteractiveSession()

In [167]:
# Initialise all graph variables before the first training step.
init = tf.global_variables_initializer()
sess.run(init)

In [None]:
# Training with periodic validation: epochs+1 passes of total_batch mini-batch updates each.
for i in range(epochs+1):
    xloss = 0
    acc = 0.0
    
    for j in range(total_batch):
        # next_batch returns the advanced cursor, which is stored back into the
        # module-level `index` so consecutive calls walk (and wrap around) the training set.
        index, x_, y_ = next_batch((train_corpus_embed, train_label), index, batch_size)
        _, xloss = sess.run([optimizer, loss], feed_dict={x: x_, y: y_})
        
        # Report the loss of the current mini-batch every 10 steps.
        if j % 10 == 0:
            print("epoch %d, run %d, loss %g" % (i, j, xloss))
            
    # Every second epoch, measure accuracy on the whole validation set in a single run.
    if i % 2 == 0:
        acc = sess.run(accuracy, feed_dict={x:valid_corpus_embed, y:valid_label})
        print("epoch %d, Validation acc: %g" % (i, acc * 100), end="")
        print("%")

epoch 0, run 0, loss 0.889945
epoch 0, run 10, loss 0.940488
epoch 0, run 20, loss 0.737024
epoch 0, run 30, loss 1.28468
epoch 0, Validation acc: 44.4%
epoch 1, run 0, loss 0.989343
epoch 1, run 10, loss 0.927828
epoch 1, run 20, loss 0.820058


### Test

In [3]:
# NOTE(review): this opens a second session for the test run; presumably the
# variables trained in the earlier session are not carried over -- confirm intent.
sess = tf.InteractiveSession()

NameError: name 'tf' is not defined

In [4]:
# Re-run the variable initialiser in the new session before the loop below.
init = tf.global_variables_initializer()
sess.run(init)

NameError: name 'tf' is not defined

In [None]:
# Same loop as in the validation section, but accuracy is reported on the test split.
# NOTE(review): the model keeps training on the training data inside this loop
# while test accuracy is printed every second epoch -- confirm this is intended.
for i in range(epochs+1):
    xloss = 0
    acc = 0.0
    
    for j in range(total_batch):
        # next_batch returns the advanced cursor, which is stored back into the
        # module-level `index` so consecutive calls walk (and wrap around) the training set.
        index, x_, y_ = next_batch((train_corpus_embed, train_label), index, batch_size)
        _, xloss = sess.run([optimizer, loss], feed_dict={x: x_, y: y_})
        
        # Report the loss of the current mini-batch every 10 steps.
        if j % 10 == 0:
            print("epoch %d, run %d, loss %g" % (i, j, xloss))
            
    # Every second epoch, measure accuracy on the whole test set in a single run.
    if i % 2 == 0:
        acc = sess.run(accuracy, feed_dict={x:test_corpus_embed, y:test_label})
        print("epoch %d, Test acc: %g" % (i, acc * 100), end="")
        print("%")

## Questions

- What are the benefits and downsides of the RNN-based representation over the bag of words representation used last week? 
- How would availability of data affect your answer?
- One possible architectural variant is to use only the final hidden state of the RNN as the document representation (i.e., x) rather than the average of the hidden states over time. How does this work? What are the potential benefits and downsides to this representation?
- Try different RNN architectures, e.g., simple Elman RNNs or GRUs or LSTMs. Which ones work best?
- What happens if you use a bidirectional LSTM (i.e., the dashed arrows in the figure)?