In [2]:
from __future__ import print_function 
import numpy as np
import pandas as pd
import tensorflow as tf 

In [11]:
# Class names, in index order. The position of a name in this list is its
# integer class id everywhere else in the notebook.
labels = ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']

# Derived from `labels` so the count and both lookup tables cannot drift apart.
label_num = len(labels)
label_dict = {name: i for i, name in enumerate(labels)}   # name -> class id
index_dict = {i: name for i, name in enumerate(labels)}   # class id -> name
print(label_dict)
print(index_dict)

{'ooo': 0, 'Too': 1, 'oEo': 2, 'ooD': 3, 'TEo': 4, 'ToD': 5, 'oED': 6, 'TED': 7}
{0: 'ooo', 1: 'Too', 2: 'oEo', 3: 'ooD', 4: 'TEo', 5: 'ToD', 6: 'oED', 7: 'TED'}


In [3]:
# Open the corpus archive (train / valid / test document arrays).
# The stored arrays hold nested Python objects (each entry pairs a list of
# sentences with an int label), so they can only be read back through pickle.
# NumPy releases that default to allow_pickle=False refuse to do that unless
# it is requested explicitly.
# SECURITY: unpickling executes arbitrary code -- only load archives you trust.
f = np.load('corpus_all_9999.npz', allow_pickle=True)

In [4]:
f.files

['test_doc', 'valid_doc', 'train_doc']

In [5]:
train_doc = f['train_doc']

In [6]:
print(type(train_doc))

<class 'numpy.ndarray'>


In [7]:
print(train_doc[0][1])

0


In [8]:
valid_doc = f['valid_doc']

In [9]:
test_doc = f['test_doc']

In [9]:
# Shortest and longest sentence (in tokens) over all training documents.
# Taking min()/max() of the real lengths avoids the old sentinel start value
# of 100, which reported a wrong minimum whenever every sentence was longer
# than 100 tokens.
sent_lengths = [len(sent) for doc in train_doc for sent in doc[0]]
min_count = min(sent_lengths, default=0)   # 0 when there are no sentences
max_count = max(sent_lengths, default=0)

print(min_count)
print(max_count)


1
290


In [10]:
def convert2words(doc):
    """Flatten a document (an iterable of sentences) into one list of words.

    Sentence order and word order inside each sentence are preserved.
    """
    words = []
    for sentence in doc:
        words.extend(sentence)
    return words

In [11]:
# One row per training document: column 0 is the flattened word list,
# column 1 is the document's label.
# The rows pair a variable-length list with an int, so the object array is
# allocated explicitly; recent NumPy releases raise instead of inferring a
# ragged array from nested sequences passed to np.asarray.
train_doc_words = np.empty((len(train_doc), 2), dtype=object)
for row, doc in enumerate(train_doc):
    train_doc_words[row, 0] = convert2words(doc[0])
    train_doc_words[row, 1] = doc[1]

In [12]:
# print(train_doc_words[0][0])

In [13]:
# print(train_doc[0])

## Check class distributions

In [12]:
def get_class_distribution(data, index_dict):
    """Count how many documents fall into each class.

    data: iterable of entries whose element [1] is the integer class id.
    index_dict: mapping from class id to class name.

    Returns a dict keyed by class name. Every name in `index_dict` is
    present, with 0 for classes that never occur in `data`.
    """
    tally = dict.fromkeys(index_dict.values(), 0)
    for entry in data:
        class_name = index_dict[entry[1]]
        tally[class_name] = tally[class_name] + 1
    return tally

In [13]:
train_classes_distri = get_class_distribution(train_doc, index_dict)
print(train_classes_distri)

{'ooo': 966, 'Too': 275, 'oEo': 97, 'ooD': 112, 'TEo': 18, 'ToD': 85, 'oED': 10, 'TED': 16}


In [14]:
validation_classes_distri = get_class_distribution(valid_doc, index_dict)
print(validation_classes_distri)

{'ooo': 86, 'Too': 50, 'oEo': 33, 'ooD': 25, 'TEo': 9, 'ToD': 32, 'oED': 8, 'TED': 7}


In [15]:
test_classes_distri = get_class_distribution(test_doc, index_dict)
print(test_classes_distri)

{'ooo': 73, 'Too': 56, 'oEo': 40, 'ooD': 27, 'TEo': 8, 'ToD': 28, 'oED': 6, 'TED': 11}


## Build the vocabulary

In [14]:
from collections import Counter 

In [15]:
counts_new = []

# Tally every token of the training set sentence by sentence, then keep the
# 10000 most frequent (word, count) pairs, ordered from most to least common.
token_counter = Counter()
for doc in train_doc:
    for sent in doc[0]:
        token_counter.update(sent)

words_count_list = token_counter.most_common(10000)

In [16]:
print(len(words_count_list))
# print(words_count_list)

10000


In [17]:
# Scratch check of enumerate()'s second argument: the counter starts at 1
# instead of 0. Each printed tuple is (position, element). The vocabulary
# tables in this notebook are built with the same enumerate(..., 1) call.
l = [1,2,3,4,4]
for i in enumerate(l, 1):
    print(i)

(1, 1)
(2, 2)
(3, 3)
(4, 4)
(5, 4)


In [18]:
# Two mirrored lookup tables over the frequency-ranked vocabulary.
# Ranks start at 1 (most frequent word), so index 0 is never assigned to a word.
word2index = {}
index2word = {}
for rank, entry in enumerate(words_count_list, 1):
    token = entry[0]          # entry is a (word, count) pair
    word2index[token] = rank
    index2word[rank] = token

In [19]:
print(word2index['and'])
print(index2word[3])

3
and


In [20]:
def doc2index(doc, word2index):
    """Translate a flat list of words into the list of their vocabulary indices.

    Raises KeyError if a word is missing from `word2index`.
    """
    indices = []
    for token in doc:
        indices.append(word2index[token])
    return indices

In [21]:
# One entry per training document: the list of vocabulary indices of its words.
# Documents differ in length, so the 1-D object array is allocated explicitly;
# recent NumPy releases raise instead of inferring a ragged array from nested
# lists passed to np.asarray.
train_wordasindex = np.empty(len(train_doc_words), dtype=object)
for row, doc in enumerate(train_doc_words):
    train_wordasindex[row] = doc2index(doc[0], word2index)

In [22]:
print(train_wordasindex.shape)
print(train_wordasindex[0][0])
# print(train_wordasindex[0])

(1579,)
73


In [24]:
# API for corpus format conversion

def convert2words(doc):
    """Return all words of `doc` (an iterable of sentences) as one flat list,
    keeping the original reading order."""
    flat = []
    for sentence in doc:
        for token in sentence:
            flat.append(token)
    return flat

def doc2index(doc, word2index):
    """Map each word of a flat document to a one-element list holding its index.

    The result has the form [[i0], [i1], ...]: every word index is wrapped in
    its own single-item list. Raises KeyError for a word that is not in
    `word2index`.
    """
    wrapped = []
    for token in doc:
        wrapped.append([word2index[token]])
    return wrapped

def convert_corpus(corpus, word2index):
    """Convert a corpus of (sentences, label) entries into per-document index lists.

    Each document is flattened with convert2words() and translated with
    doc2index(); the labels are not part of the result.

    Returns
    -------
    numpy.ndarray
        A regular array when all documents have the same length (or the corpus
        is empty). Otherwise a 1-D object array with one Python list per
        document. The object array is filled element by element because
        recent NumPy releases raise on ragged input to np.asarray.
    """
    indexed_docs = [doc2index(convert2words(doc[0]), word2index) for doc in corpus]

    if len({len(doc) for doc in indexed_docs}) <= 1:
        return np.asarray(indexed_docs)

    ragged = np.empty(len(indexed_docs), dtype=object)
    for row, doc in enumerate(indexed_docs):
        ragged[row] = doc
    return ragged

In [25]:
train_wordasindex = convert_corpus(train_doc, word2index)
valid_wordasindex = convert_corpus(valid_doc, word2index)
test_wordasindex = convert_corpus(test_doc, word2index)

In [26]:
print(train_wordasindex.shape)
print(len(train_wordasindex[0]))
print(train_wordasindex[0][0])

(1579,)
1364
[73]


## Embed words 

In [27]:
from gensim.models import Word2Vec

In [28]:
model = Word2Vec.load("word2vec_model")

In [29]:
print(len(model.wv.vocab))

10000


In [30]:
print(word2index['computer'])
print(index2word[394])
# print(model.wv[index2word[394]])
print(model.wv[index2word[394]].shape)
# model.most_similar(index2word[394])

394
computer
(100,)


In [31]:
def embed_from_index(model, idx):
    """Return the embedding vector for vocabulary index `idx`.

    Index 0 is the padding index and maps to an all-zero vector. Any other
    index is resolved to its word through the notebook-level `index2word`
    table and looked up in `model.wv`.

    The zero vector copies shape *and dtype* from a real embedding. The
    previous np.zeros(shape=...) call produced float64, which upcast whole
    batches when padding and real vectors were stacked together.
    """
    if idx != 0:
        return model.wv[index2word[idx]]
    return np.zeros_like(model.wv[index2word[1]])

def embed_corpus(corpus, word2index, model):
    """Replace every word index in `corpus` by its embedding vector.

    corpus: iterable of documents; each document is an iterable of
        index containers (e.g. [[i0], [i1], ...]).
    word2index: unused; kept so existing calls keep working.
    model: object whose `.wv` lookup is used by embed_from_index().

    Returns a float32 array of shape (n_docs, doc_len, vector_size) when all
    documents have the same length. Documents of differing length yield a 1-D
    object array holding one (doc_len, vector_size) matrix per document.

    Each document is converted to a compact float32 matrix as soon as it is
    embedded, and the input is no longer copied into a list first. The result
    still grows with n_docs * doc_len * vector_size, so pass batches rather
    than a whole padded corpus when memory is tight.
    """
    embedded_docs = [
        np.asarray(
            [embed_from_index(model, idx) for sent in doc for idx in sent],
            dtype=np.float32,
        )
        for doc in corpus
    ]

    if len({len(doc) for doc in embedded_docs}) <= 1:
        return np.asarray(embedded_docs)

    ragged = np.empty(len(embedded_docs), dtype=object)
    for row, doc in enumerate(embedded_docs):
        ragged[row] = doc
    return ragged

In [32]:
# train_wordasindex_embed = embed_corpus(train_doc, word2index, model)


In [33]:
# print(train_wordasindex_embed.shape)
# print(len(train_wordasindex_embed[0]))
# print(train_wordasindex_embed[0][0])

In [34]:
# print(train_wordasindex_embed[0][0].shape)

In [35]:
# train_wordasindex_embed = embed_corpus(train_doc, word2index, model)
# valid_wordasindex_embed = embed_corpus(valid_doc, word2index, model)
# test_wordasindex_embed = embed_corpus(test_doc, word2index, model)

## Convert labels 

In [33]:
def encode_label(label, size):
    """Return a one-hot list of length `size` with a 1 at position `label`.

    `label` is used as a plain list index, so an out-of-range value raises
    IndexError and a negative value counts from the end.
    """
    onehot = [0 for _ in range(size)]
    onehot[label] = 1
    return onehot

def encode_class(corpus, size):
    """One-hot encode the label (element [1]) of every corpus entry.

    Returns an array with one row per entry and `size` columns.
    """
    rows = []
    for entry in corpus:
        rows.append(encode_label(entry[1], size))
    return np.asarray(rows)

In [34]:
# One-hot label matrices for the three splits. The width comes from
# `label_num` (defined next to the label list) instead of a repeated literal,
# so a change to the label set only has to be made in one place.
train_label = encode_class(train_doc, label_num)
valid_label = encode_class(valid_doc, label_num)
test_label = encode_class(test_doc, label_num)

In [35]:
print(train_label.shape)

(1579, 8)


In [36]:
train_data = (train_wordasindex, train_label)

In [37]:
print(train_data[0].shape)
print(train_data[1].shape)
# print(train_data[0][0])

(1579,)
(1579, 8)


In [38]:
# Shortest and longest validation document, in tokens. The maximum is what a
# padding length has to cover.
# min()/max() over the real lengths replaces the old sentinel start value of
# 100, which reported a wrong minimum whenever every document was longer
# than 100 tokens.
doc_lengths = [len(doc) for doc in valid_wordasindex]
min_count = min(doc_lengths, default=0)   # 0 when the split is empty
max_count = max(doc_lengths, default=0)

print(min_count)
print(max_count)


16
6686


## Classification model 

In [39]:
# --- Input geometry -------------------------------------------------------
max_features = 10000   # same number as the most-common-word cutoff used for the vocabulary
maxlen = 6700          # every document is zero-padded to this many tokens
input_dims = 100       # width of one word-embedding vector

# --- Optimisation ---------------------------------------------------------
learning_rate = 0.001
batch_size = 50
epochs = 100
num_hidden = 50         # units per LSTM cell

# Number of whole batches in one pass over the training set (floor division).
total_batch = train_data[0].shape[0] // batch_size

# Cursor into the training data; next_batch() returns its updated value.
index = 0

In [40]:
print(total_batch)

31


### Padding with 0s

In [41]:
train_data
train_wordasindex
valid_wordasindex
test_wordasindex
train_label
valid_label
test_label
print(train_wordasindex.shape)
# print(train_wordasindex[0])

(1579,)


In [42]:
# Sanity check: index 0 is reserved for padding, so no real word may map to it.
# Each entry of a converted document is a one-element list such as [73].
# Comparing that list to 0 directly is always False, so the old check could
# never fire. Compare the contained value(s) instead; np.asarray also keeps
# the check valid for plain integer entries.
for doc in train_wordasindex:
    for word_index in doc:
        if np.any(np.asarray(word_index) == 0):
            print("failed")
            

In [43]:
[[0]]*5

[[0], [0], [0], [0], [0]]

In [44]:
def pad_with_zeros(sequence, maxlen):
    """Right-pad `sequence` with [0] entries until it has `maxlen` items.

    Returns a new list; `sequence` itself is not modified. Any sequence type
    is accepted (it is copied into a list first), so arrays or tuples are
    concatenated rather than added element-wise or rejected.

    Note: all padding entries reference the same [0] list object, which keeps
    padding cheap. Treat them as read-only.

    Raises
    ------
    ValueError
        If `sequence` is already longer than `maxlen`.
    """
    missing = maxlen - len(sequence)
    if missing < 0:
        raise ValueError(
            "sequence of length %d is longer than maxlen=%d" % (len(sequence), maxlen)
        )
    return list(sequence) + [[0]] * missing

In [45]:
k = [1, 2, 3]
pad_with_zeros(k,5)

[1, 2, 3, [0], [0]]

In [46]:
def corpus_pad_with_zeros(corpus, maxlen):
    """Pad every document of `corpus` to `maxlen` entries via pad_with_zeros()
    and stack the results into one array."""
    padded_docs = []
    for doc in corpus:
        padded_docs.append(pad_with_zeros(doc, maxlen))
    return np.asarray(padded_docs)

In [47]:
train_corpus = corpus_pad_with_zeros(train_wordasindex, maxlen)
valid_corpus = corpus_pad_with_zeros(valid_wordasindex, maxlen)
# test_corpus = corpus_pad_with_zeros(test_wordasindex, maxlen)



In [None]:
# print(train_corpus.shape)
# print(train_label.shape)
# print(train_corpus[0].shape)
# print(train_corpus[3].shape)
# print(train_corpus[45].shape)
# print(train_corpus[90].shape)
# print(train_corpus[0])

In [48]:
# Replace every (padded) word index of the training and validation corpora by
# its embedding vector, all at once.
# NOTE(review): each call builds one vector per token position of every
# document, i.e. n_docs x maxlen vectors held in memory together. The output
# recorded for this cell is a MemoryError, so neither variable exists
# afterwards -- confirm the available RAM or embed smaller slices instead.
train_corpus_embed = embed_corpus(train_corpus, word2index, model)
valid_corpus_embed = embed_corpus(valid_corpus, word2index, model)
# test_corpus_embed = embed_corpus(test_corpus, word2index, model)

MemoryError: 

In [None]:
# print(train_corpus_embed.shape)
# print(train_label.shape)
# print(train_corpus_embed[0].shape)
# print(train_corpus_embed[3].shape)
# print(train_corpus_embed[45].shape)
# print(train_corpus_embed[90].shape)
# print(train_corpus_embed[0])

In [None]:
print(valid_corpus.shape)
print(valid_label.shape)

In [None]:
def next_batch(data, index, size):
    """Return the next batch as (new_index, x_batch, y_batch).

    data: pair (inputs, targets) of arrays with the same first dimension.
    index: position of the first sample of this batch.
    size: number of samples in the batch.

    When the batch runs past the end of the data it wraps around to the
    start, as many times as needed, so a `size` larger than the data set
    still yields exactly `size` samples and a cursor inside the data.
    Previously such a request wrapped only once and returned a short batch.

    Raises
    ------
    ValueError
        If a wrap-around is required but the data set is empty.
    """
    inputs, targets = data[0], data[1]
    total = inputs.shape[0]
    end = index + size

    # Fast path: the batch fits without wrapping, so plain slices suffice.
    if end <= total:
        return end, inputs[index:end], targets[index:end]

    if total == 0:
        raise ValueError("cannot draw a batch from an empty data set")

    # Sample positions modulo the data-set size give the wrap-around order.
    wrapped = np.arange(index, end) % total
    return end % total, inputs[wrapped], targets[wrapped]

In [None]:
def length(sequence):
    """Return the number of non-padding time steps of each sequence in a batch.

    sequence: rank-3 tensor; the reductions below run over axis 2 (features)
        and then axis 1 (time).

    A time step counts as used when at least one of its features is non-zero,
    so an all-zero vector is treated as padding. Returns an int32 tensor with
    one length per batch element.
    """
    # 1.0 where a step has any non-zero feature, 0.0 for all-zero (padding) steps.
    used = tf.sign(tf.reduce_max(tf.abs(sequence), axis=2))
    # `axis` replaces the deprecated `reduction_indices` keyword and matches
    # the reductions used elsewhere in this notebook.
    step_count = tf.reduce_sum(used, axis=1)
    return tf.cast(step_count, tf.int32)

In [None]:
# TensorFlow expects RNN input shaped [batch size, sequence length, input dimension].
x = tf.placeholder(tf.float32, [None, maxlen, input_dims])
# One-hot targets. The width follows `label_num` instead of repeating the literal 8.
y = tf.placeholder(tf.float32, [None, label_num])

In [None]:
# Create a forward and a backward LSTM cell, each with `num_hidden` units.
# NOTE(review): only cell_fw is passed to dynamic_rnn in the visible cells;
# cell_bw is not referenced again -- presumably kept for a bidirectional variant.
# NOTE(review): the scope is opened with reuse=True although nothing named
# 'cellsdef' is created earlier in the visible cells -- confirm this is intended.
with tf.variable_scope('cellsdef', reuse=True):
    cell_fw = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
    cell_bw = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)

In [None]:
# Run the forward LSTM cell over the input placeholder `x`.
# sequence_length comes from length(x), which counts the time steps that
# contain at least one non-zero feature.
# `val` holds the per-time-step outputs; `state` is the final state and is
# indexed as state[0] / state[1] in the following cell.
with tf.variable_scope('lstmrnn'):
    val, state = tf.nn.dynamic_rnn(cell_fw, x, dtype=tf.float32, sequence_length=length(x))    # val are the h_ts 

In [None]:
print(val.shape)
print(len(state))
print(state[0].shape)
print(state[1].shape)

In [None]:
print(length(x))

In [None]:
# Classifier head: mean LSTM output -> ReLU layer (32 units) -> class logits.
# Layer sizes follow `num_hidden` and `label_num` instead of repeating literals.
W = tf.Variable(tf.truncated_normal(shape=[num_hidden, 32]))
b = tf.Variable(tf.constant(0.0, shape=[32]))

V = tf.Variable(tf.truncated_normal(shape=[32, label_num]))
c = tf.Variable(tf.constant(0.0, shape=[label_num]))

# dropout_rate = tf.placeholder(tf.float32)

# Document representation: average of the hidden states over the *real* tokens.
# dynamic_rnn is given sequence_length, so outputs past a document's length
# are zero. A plain mean over all `maxlen` steps would therefore shrink short
# documents towards zero; divide the sum by the true length instead.
# The length is clamped to 1 to avoid dividing by zero on an empty document.
token_counts = tf.cast(tf.maximum(length(x), 1), tf.float32)
doc_repr = tf.reduce_sum(val, axis=1) / tf.expand_dims(token_counts, 1)

h = tf.nn.relu(tf.matmul(doc_repr, W) + b)
# h = tf.nn.relu(tf.matmul(state[1], W) + b)
u = tf.matmul(h, V) + c
p = tf.nn.softmax(u)
pred = tf.argmax(p, 1)
# Cross-entropy computed from the logits: the same quantity as
# mean(sum(-y * log(softmax(u)))), but without log(0) when a probability
# underflows.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=u))

In [None]:
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [None]:
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, tf.argmax(y, 1)), tf.float32))

In [None]:
sess = tf.InteractiveSession()

In [None]:
init = tf.global_variables_initializer()
sess.run(init)

In [None]:
# Training loop.
# Embeddings are looked up per batch from the padded index arrays
# (train_corpus / valid_corpus). Embedding a whole padded corpus at once does
# not fit in memory, so the loop no longer depends on pre-embedded corpora.
# NOTE(review): range(epochs+1) runs epochs+1 passes -- presumably so that the
# last pass (i == epochs) is also validated; confirm.
for i in range(epochs+1):
    xloss = 0
    acc = 0.0

    for j in range(total_batch):
        # Draw the next batch of word indices with their one-hot labels, then
        # embed just this batch.
        index, x_idx, y_ = next_batch((train_corpus, train_label), index, batch_size)
        x_ = embed_corpus(x_idx, word2index, model)
        _, xloss = sess.run([optimizer, loss], feed_dict={x: x_, y: y_})

        if j % 10 == 0:
            print("epoch %d, run %d, loss %g" % (i, j, xloss))

    if i % 2 == 0:
        # Validate in batches for the same memory reason. Weighting each
        # batch accuracy by its size gives the accuracy over the whole split.
        n_valid = valid_corpus.shape[0]
        correct = 0.0
        for start in range(0, n_valid, batch_size):
            xv = embed_corpus(valid_corpus[start:start+batch_size], word2index, model)
            yv = valid_label[start:start+batch_size]
            correct += sess.run(accuracy, feed_dict={x: xv, y: yv}) * len(yv)
        acc = correct / max(n_valid, 1)
        print("epoch %d, Validation acc: %g" % (i, acc * 100), end="")
        print("%")

## Questions

- What are the benefits and downsides of the RNN-based representation over the bag of words representation used last week? 
- How would availability of data affect your answer?
- One possible architectural variant is to use only the final hidden state of the RNN as the document representation (i.e., x) rather than the average of the hidden states over time. How does this work? What are the potential benefits and downsides to this representation?
- Try different RNN architectures, e.g., simple Elman RNNs or GRUs or LSTMs. Which ones work best?
- What happens if you use a bidirectional LSTM (i.e., the dashed arrows in the figure)?