## Practical 2: Text Classification with Word Embedding
<p>Oxford CS - Deep NLP 2017<br>
https://www.cs.ox.ac.uk/teaching/courses/2016-2017/dl/</p>
<p>[Yannis Assael, Brendan Shillingford, Chris Dyer]</p>

In [1]:
import numpy as np
import time
import os
from random import shuffle
import re

In [2]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

### Load TED dataset 

In [6]:
import urllib.request
import zipfile
import lxml.etree

In [7]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB.
# The check only looks for a file with this name in the current working directory.
# NOTE(review): an archive left incomplete by an interrupted download would presumably
# pass this check as well - confirm, and delete the file to force a fresh download.
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

In [72]:
# Parse the XML release straight out of the zip archive (nothing is extracted to disk).
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)

# One transcript and one keyword string per talk.  The XPath results are converted
# to plain `str`: lxml hands back "smart" strings that reference their parent
# element, which would keep the whole tree alive and make `del doc` pointless.
doc_list = [str(text) for text in doc.xpath('//content/text()')]
label_list = [str(text) for text in doc.xpath('//keywords/text()')]
del doc

# The two lists are paired up by position later on; a length mismatch would
# silently attach keywords to the wrong transcripts.
assert len(doc_list) == len(label_list), \
    "got %d transcripts but %d keyword strings" % (len(doc_list), len(label_list))

In [1]:
def get_label(keywords, label_dict):
    """Map the keyword string of a talk to its class id.

    The class name is a three-character code: the first character is 'T'
    when "technology" occurs in the lower-cased keywords, the second is 'E'
    for "entertainment" and the third is 'D' for "design"; a topic that does
    not occur is written as 'o' (e.g. 'ToD', 'ooo').  Topics are matched as
    substrings, so "interface design" counts as "design".

    Arguments:
        keywords: raw keyword string of one talk.
        label_dict: mapping from class name to class id.

    Outputs:
        the entry of label_dict for the matching class name.
    """
    lowered = keywords.lower()
    topic_letters = (('T', "technology"), ('E', "entertainment"), ('D', "design"))
    class_name = ''.join(letter if topic in lowered else 'o'
                         for letter, topic in topic_letters)
    return label_dict[class_name]

In [49]:
# Class names: one character per topic (Technology, Entertainment, Design),
# with 'o' standing for "topic absent".  The class id is the position in this list.
labels = ['ooo', 'Too', 'oEo', 'ooD', 'TEo', 'ToD', 'oED', 'TED']
label_dict = {name: idx for idx, name in enumerate(labels)}

# Replace every keyword string by its class id and pair it with its transcript.
label_list_temp = [get_label(keywords, label_dict) for keywords in label_list]
labelled_doc = list(zip(doc_list, label_list_temp))

talks, business, creativity, curiosity, goal-setting, innovation, motivation, potential, success, work
talks, Planets, TEDx, bacteria, biology, engineering, environment, evolution, exploration, future, innovation, intelligence, microbiology, nature, potential, science
talks, Debate, Guns, activism, big problems, children, choice, community, future, goal-setting, government, law, leadership, marketing, parenting, policy, social change, violence
talks, Brazil, Slavery, art, beauty, community, creativity, culture, design, global issues, humanity, identity, photography, race, social change, society, visualizations
talks, NASA, communication, computers, creativity, design, engineering, exploration, future, innovation, interface design, invention, microsoft, potential, prediction, product design, technology, visualizations
talks, Africa, Internet, community, democracy, development, future, government, identity, leadership, politics, potential
talks, ancient world, animals, biology, biosphere

In [3]:
from random import sample

def divide_dataset(labelled_doc, num_train, num_valid, num_test=None, shuffle=False):
    """Split a list of labelled documents into train / validation / test parts.

    The parts are consecutive, non-overlapping slices of the (optionally
    shuffled) data, taken in the order train, validation, test.

    Arguments:
        labelled_doc: list of documents to split.
        num_train: number of items in the training part.
        num_valid: number of items in the validation part.
        num_test: number of items in the test part; when None, everything
            left after the training and validation parts is used.
        shuffle: when True, split a random permutation of the data instead
            of the original order (the input list itself is not modified).

    Outputs:
        (train, valid, test) tuple of lists.

    Raises:
        ValueError: if a size is negative or the sizes add up to more than
            the number of documents.
    """
    total = len(labelled_doc)
    if num_test is None:
        num_test = total - num_train - num_valid

    if min(num_train, num_valid, num_test) < 0 or num_train + num_valid + num_test > total:
        raise ValueError(
            "cannot split %d documents into %d train / %d valid / %d test"
            % (total, num_train, num_valid, num_test))

    # sample() with k == len(population) returns a shuffled copy.
    temp = sample(labelled_doc, total) if shuffle else labelled_doc

    # Explicit end offsets instead of negative indices: `temp[-0:]` would be the
    # whole list, and slicing up to `-num_test` ignores num_valid altogether.
    valid_end = num_train + num_valid
    test_end = valid_end + num_test
    return temp[:num_train], temp[num_train:valid_end], temp[valid_end:test_end]

In [58]:
# 1585 / 250 / 250 documents for training / validation / test, taken from a
# random permutation of the labelled talks (shuffle=True).
# NOTE(review): no random seed is set in the visible cells, so the split is
# presumably different on every run - confirm whether that is intended.
train_doc_temp, valid_doc_temp, test_doc_temp = divide_dataset(labelled_doc, 1585, 250, 250, shuffle=True)

# Sanity check: sizes of the three splits.
print(len(train_doc_temp), len(valid_doc_temp), len(test_doc_temp))

1585
250
250


## Build vocabulary using training set 

In [None]:
from collections import Counter 

def tokenize_and_lowercase(text):
    """Split raw transcript text into lower-cased, tokenised sentences.

    Parenthesised spans are removed, a prefix of at most 20 characters
    followed by a colon is dropped from the start of each line, the rest is
    cut at '.', and every piece is lower-cased and split on runs of
    characters other than a-z and 0-9.

    Arguments:
        text: raw text, lines separated by newlines.

    Outputs:
        list of sentences, each a list of tokens.  A piece without any
        letter or digit yields an empty token list, which is kept.
    """
    line_pattern = re.compile(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$')

    pieces = []
    for line in re.sub(r'\([^)]*\)', '', text).split('\n'):
        spoken = line_pattern.match(line).group('postcolon')
        pieces += [piece for piece in spoken.split('.') if piece]

    return [re.sub(r"[^a-z0-9]+", " ", piece.lower()).split() for piece in pieces]

def get_most_common_words_list(sentences, num_words):
    """Return the most frequent words of a tokenised corpus.

    Arguments:
        sentences: iterable of sentences, each an iterable of words.
        num_words: maximum number of words to return.

    Outputs:
        list of at most num_words words, most frequent first.
    """
    frequencies = Counter(word for sent in sentences for word in sent)
    return [word for word, _ in frequencies.most_common(num_words)]
    
def replace_unknown_token(sent_list, words_most_common, unknown_token="UNK"):
    """Replace every word outside the vocabulary by the unknown token.

    Arguments:
        sent_list: list of words of one sentence.
        words_most_common: container of known words; it is only used for
            `in` tests, so a set works as well as a list.
        unknown_token: replacement for words that are not in the container.

    Outputs:
        new list of the same length as sent_list.
    """
    filtered_list = []
    for word in sent_list:
        if word in words_most_common:
            filtered_list.append(word)
        else:
            filtered_list.append(unknown_token)
    return filtered_list

def tokenize_and_lowercase_most_common(text, words_most_common):
    """Tokenise a transcript and map out-of-vocabulary words to "UNK".

    Parenthesised spans are removed, a prefix of at most 20 characters
    followed by a colon is dropped from the start of each line, the rest is
    cut at '.', and every piece is lower-cased and split on runs of
    characters other than a-z and 0-9.  Sentences without any token are
    dropped.

    Arguments:
        text: raw text, lines separated by newlines.
        words_most_common: iterable of vocabulary words (list or set).

    Outputs:
        list of non-empty sentences, each a list of tokens in which every
        word outside the vocabulary has been replaced by "UNK".
    """
    # Build the lookup set once per document: a membership test on the
    # original list is a linear scan that would be repeated for every token.
    vocabulary = set(words_most_common)
    unknown_token = "UNK"  # same placeholder as the default of replace_unknown_token

    text_noparens = re.sub(r'\([^)]*\)', '', text)
    sentences = []
    for line in text_noparens.split('\n'):
        # The pattern matches any line, so `m` is never None.
        m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
        for sent_str in m.group('postcolon').split('.'):
            words = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
            if words:
                sentences.append([word if word in vocabulary else unknown_token
                                  for word in words])
    return sentences

def build_dataset(doc_temp, words_most_common):
    """Tokenise a split and drop documents that end up without sentences.

    Arguments:
        doc_temp: iterable of (raw text, label) items.
        words_most_common: vocabulary passed on to
            tokenize_and_lowercase_most_common.

    Outputs:
        list of (sentences, label) tuples with a non-empty sentence list.
    """
    doc_final = []
    for doc in doc_temp:
        sentences = tokenize_and_lowercase_most_common(doc[0], words_most_common)
        label = doc[1]
        if sentences != []:
            doc_final.append((sentences, label))
    return doc_final

In [124]:
# Concatenate all training transcripts and tokenise them once to count words.
input_text = '\n'.join(pair[0] for pair in train_doc_temp)
sentences_ted = tokenize_and_lowercase(input_text)

# Vocabulary: the 9999 most frequent words of the training set.
words_most_common = get_most_common_words_list(sentences_ted, 9999)

# Training set, tokenised again with out-of-vocabulary words replaced.
train_doc = [
    (tokenize_and_lowercase_most_common(pair[0], words_most_common), pair[1])
    for pair in train_doc_temp
]

### Rebuild the vocabulary 

In [None]:
def rebuild_vocab(train_doc):
    """Flatten tokenised documents into one list of sentences.

    Arguments:
        train_doc: iterable of items whose first element is the list of
            sentences of one document.

    Outputs:
        list holding every sentence of every document, in document order.
    """
    sentences = []
    for doc in train_doc:
        sentences.extend(doc[0])
    return sentences

In [149]:
# Flat list of the tokenised training sentences.
sentences = rebuild_vocab(train_doc)

## Model 

### Word2Vec Embedding

In [5]:
import os 
from gensim.models import Word2Vec

def build_word2vec_model(name, sentences=None, min_count=10, size=100):
    """Load the Word2Vec model saved at `name`, or train and save a new one.

    Arguments:
        name: path of the saved model file.
        sentences: tokenised sentences to train on; only needed when no file
            exists at `name`.
        min_count: passed to Word2Vec as `min_count` when training.
        size: passed to Word2Vec as `size` when training.

    Outputs:
        the loaded or newly trained model.

    Raises:
        ValueError: if no file exists at `name` and no sentences were given.

    Note: when the file exists, `sentences`, `min_count` and `size` are
    ignored; delete the file to retrain with other data or settings.
    """
    if os.path.isfile(name):
        return Word2Vec.load(name)

    # Refuse to build (and cache) a model without training data: the saved
    # file would be picked up by every later call instead of a trained model.
    if sentences is None:
        raise ValueError(
            "no saved model at %r and no sentences given to train one" % (name,))

    model = Word2Vec(sentences, min_count=min_count, size=size)
    model.save(name)
    return model

In [None]:
# Train 100-dimensional embeddings on the training sentences, or reuse the
# model already saved as 'word2vec_model' in the working directory.
model = build_word2vec_model('word2vec_model', sentences=sentences, min_count=10, size=100)

## Dataset

In [193]:
# Tokenise the training, validation and test splits with the training
# vocabulary; build_dataset drops documents that end up without sentences.
train_doc, valid_doc, test_doc = [
    build_dataset(split, words_most_common)
    for split in (train_doc_temp, valid_doc_temp, test_doc_temp)
]

In [None]:
# Store the three tokenised splits in one archive under the keys
# train_doc / valid_doc / test_doc.
# NOTE(review): the splits are nested Python lists of varying length, not
# numeric arrays; whether np.savez accepts them presumably depends on the
# installed NumPy version, and loading them back is expected to need
# allow_pickle=True - confirm before relying on this file.
np.savez('corpus_all_9999', train_doc=train_doc, valid_doc=valid_doc, test_doc=test_doc)

## Bag of Means 

In [176]:
def embed_text(model, text):
    """Embed a document as the mean of its word vectors (bag of means).

    Arguments:
        model: Word2Vec model.
        text: input text, a list of sentences, each a list of words.

    Outputs:
        embedded vector: the average of the vectors of all words that the
        model knows.  Words missing from the model vocabulary are skipped;
        when no word is left, a zero vector of length model.vector_size is
        returned.
    """
    # Words can be missing from the model (e.g. below its min_count);
    # looking them up directly would raise KeyError.
    vector_list = [model.wv[word] for sent in text for word in sent if word in model.wv]

    # Nothing to average: return a neutral vector instead of dividing by zero.
    if not vector_list:
        return np.zeros(model.vector_size, dtype=np.float32)

    return sum(vector_list) / len(vector_list)
    
def embed_corpus(model, corpus):
    """Embed every document of a corpus.

    Arguments:
        model: Word2Vec model.
        corpus: iterable of items whose first element is the tokenised text.

    Outputs:
        numpy array built from one embed_text result per document.
    """
    doc_vectors = []
    for doc in corpus:
        doc_vectors.append(embed_text(model, doc[0]))
    return np.asarray(doc_vectors)

def encode_label(label, size):
    """One-hot encode a class id.

    Arguments:
        label: index of the class.
        size: number of classes.

    Outputs:
        list of `size` integers, 1 at position `label` and 0 elsewhere.
    """
    one_hot = [0 for _ in range(size)]
    one_hot[label] = 1
    return one_hot

def encode_class(corpus, size):
    """One-hot encode the label of every document of a corpus.

    Arguments:
        corpus: iterable of items whose second element is the class id.
        size: number of classes.

    Outputs:
        numpy array built from one encode_label result per document.
    """
    one_hot_rows = []
    for doc in corpus:
        one_hot_rows.append(encode_label(doc[1], size))
    return np.asarray(one_hot_rows)

def embedded_with_class(model, doc, size):
    """Turn a tokenised corpus into network inputs and targets.

    Arguments:
        model: Word2Vec model.
        doc: corpus of (tokenised text, class id) items.
        size: number of classes.

    Outputs:
        (embedded documents, one-hot labels) tuple.
    """
    return embed_corpus(model, doc), encode_class(doc, size)

In [199]:
# Embed each split and one-hot encode its labels; every result is an
# (inputs, targets) tuple, and the number of classes is the size of label_dict.

train_doc_embed_with_class = embedded_with_class(model, train_doc, len(label_dict))
valid_doc_embed_with_class = embedded_with_class(model, valid_doc, len(label_dict))
test_doc_embed_with_class = embedded_with_class(model, test_doc, len(label_dict))

In [None]:
# Every *_embed_with_class value is an (inputs, targets) pair of arrays with
# different widths, which NumPy cannot combine into one array, so the inputs
# (_x) and the one-hot targets (_y) are stored under separate keys.
np.savez('embedded_corpus_all_9999',
         train_doc_embedded_x=train_doc_embed_with_class[0],
         train_doc_embedded_y=train_doc_embed_with_class[1],
         valid_doc_embedded_x=valid_doc_embed_with_class[0],
         valid_doc_embedded_y=valid_doc_embed_with_class[1],
         test_doc_embedded_x=test_doc_embed_with_class[0],
         test_doc_embedded_y=test_doc_embed_with_class[1])

## TensorFlow model 

In [204]:
import tensorflow as tf 

In [309]:
# Training configuration.
epoch = 2000            # number of passes over the training set
learning_rate = 0.0001
batch_size = 50
# Whole batches per pass; the training inputs are the first element of
# train_doc_embed_with_class (the name `train_doc_embedded` is never defined).
total_batch = train_doc_embed_with_class[0].shape[0] // batch_size
index = 0               # read position of next_batch in the training data

31


In [205]:
# Graph inputs: x takes rows of 100 floats and y rows of 8 int32 values;
# the leading None leaves the number of rows (batch size) open.
x = tf.placeholder(tf.float32, shape=[None, 100])
y = tf.placeholder(tf.int32, shape=[None, 8])

In [210]:
# Hidden layer parameters: 100 inputs -> 256 units, biases start at zero.
W = tf.Variable(tf.truncated_normal(shape=[100, 256]))
b = tf.Variable(tf.constant(0.0, shape=[256]))

# Output layer parameters: 256 hidden units -> 8 classes, biases start at zero.
V = tf.Variable(tf.truncated_normal(shape=[256, 8]))
c = tf.Variable(tf.constant(0.0, shape=[8]))

In [211]:
# Forward pass: tanh hidden layer h, then linear class scores u.
h = tf.tanh(tf.matmul(x, W) + b)
u = tf.matmul(h, V) + c

# Class probabilities, and the index of the most probable class per row.
p = tf.nn.softmax(u)
pred = tf.argmax(p, 1)

In [217]:
# Cross-entropy between the targets y and the probabilities p, summed per row
# and averaged over the batch.
loss = tf.reduce_mean(tf.reduce_sum(-tf.cast(y, tf.float32)*tf.log(tf.clip_by_value(p, 1e-10, 1.0)), 1))
# p is clipped to [1e-10, 1.0] above so that the log never receives 0.

# Fraction of rows whose predicted class equals the position of the largest entry of y.
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, tf.argmax(y, 1)), tf.float32))

In [219]:
# Training op: one Adam step on `loss` with the configured learning_rate.
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [None]:
# Saver object used after training to write a checkpoint of the session.
saver = tf.train.Saver()

In [263]:
def next_batch(data, index, size):
    """Take the next `size` rows from an (inputs, targets) pair.

    When fewer than `size` rows are left after `index`, the batch is
    completed with rows from the beginning of the data.

    Arguments:
        data: pair of arrays (inputs, targets) with the same first dimension.
        index: position of the first row of the batch.
        size: number of rows in the batch.

    Outputs:
        (index of the row following the batch, x batch, y batch)
    """
    inputs, targets = data[0], data[1]
    batch_end = index + size

    # Plain slice while the batch fits before the end of the data.
    if batch_end <= inputs.shape[0]:
        return batch_end, inputs[index:batch_end], targets[index:batch_end]

    # Otherwise glue the tail of the data to its first rows.
    wrapped = batch_end - inputs.shape[0]
    x_batch = np.concatenate((inputs[index:], inputs[:wrapped]), 0)
    y_batch = np.concatenate((targets[index:], targets[:batch_end - targets.shape[0]]), 0)
    return wrapped, x_batch, y_batch

### Validation

In [None]:
# Validation inputs and one-hot targets as separate names.
x_valid, y_valid = (valid_doc_embed_with_class[0], valid_doc_embed_with_class[1])

In [223]:
# Session in which the graph ops are run by the cells below.
sess = tf.InteractiveSession()

In [280]:
# Give all graph variables their initial values before training starts.
init = tf.global_variables_initializer()
sess.run(init)

In [282]:
# Train for `epoch` passes of `total_batch` mini-batches each, reporting the
# batch loss every 10 batches and the validation accuracy every 100 epochs.
for i in range(epoch):
    xloss = 0
    
    for j in range(total_batch):
        # next_batch returns the inputs together with the one-hot targets and
        # the updated read position, which carries over to the next call
        index, x_, y_ = next_batch(train_doc_embed_with_class, index, batch_size)
        _, xloss = sess.run([optimizer, loss], feed_dict={x: x_, y: y_})
        
        if j % 10 == 0:
            print("epoch %d, run %d, loss %g" % (i, j, xloss))
            
    if i % 100 == 0:
        # accuracy on the whole validation split
        acc = sess.run(accuracy, feed_dict={x:valid_doc_embed_with_class[0], y:valid_doc_embed_with_class[1]})
        print("Validation acc: %g" % (acc * 100), end="")
        print("%")
        
# Checkpoint the session once training is done; the current time in the file
# name keeps separate runs from overwriting each other.
save_path = saver.save(sess, "/tmp/validation_model"+str(time.time())+".ckpt")
print("Model saved in file: %s" % save_path)            

epoch 0, run 0, loss 18.2937
epoch 0, run 10, loss 8.32358
epoch 0, run 20, loss 6.38116
epoch 0, run 30, loss 7.52312
epoch 1, run 0, loss 5.39021
epoch 1, run 10, loss 4.09569
epoch 1, run 20, loss 3.92336
epoch 1, run 30, loss 5.10267
epoch 2, run 0, loss 5.87365
epoch 2, run 10, loss 1.41565
epoch 2, run 20, loss 3.9831
epoch 2, run 30, loss 4.16819
epoch 3, run 0, loss 5.06596
epoch 3, run 10, loss 1.5319
epoch 3, run 20, loss 2.74353
epoch 3, run 30, loss 2.56959
epoch 4, run 0, loss 3.12929
epoch 4, run 10, loss 1.81007
epoch 4, run 20, loss 2.54993
epoch 4, run 30, loss 2.5161
epoch 5, run 0, loss 2.72305
epoch 5, run 10, loss 2.00362
epoch 5, run 20, loss 2.17282
epoch 5, run 30, loss 2.11775
epoch 6, run 0, loss 2.05125
epoch 6, run 10, loss 1.12926
epoch 6, run 20, loss 1.57404
epoch 6, run 30, loss 2.68889
epoch 7, run 0, loss 1.82154
epoch 7, run 10, loss 1.21378
epoch 7, run 20, loss 1.54869
epoch 7, run 30, loss 2.48492
epoch 8, run 0, loss 2.01407
epoch 8, run 10, loss 

### Test

In [295]:
# Test inputs and one-hot targets as separate names.
x_test, y_test = (test_doc_embed_with_class[0], test_doc_embed_with_class[1])

(250, 100)
(250, 8)


In [316]:
# Number of passes over the training set for the run below.
epoch = 3000

In [315]:
# Re-initialise all graph variables, so the run below starts from fresh weights.
init = tf.global_variables_initializer()
sess.run(init)

In [317]:
# Train for `epoch` passes of `total_batch` mini-batches each, reporting the
# batch loss every 30 batches and the test accuracy every 100 epochs.
# NOTE(review): `index` is not reset in this cell, so batching presumably
# continues from wherever the previous run stopped - confirm that is intended.
for i in range(epoch):
    xloss = 0
    acc = 0.0
    
    for j in range(total_batch):
        # next_batch returns the inputs together with the one-hot targets and
        # the updated read position, which carries over to the next call
        index, x_, y_ = next_batch(train_doc_embed_with_class, index, batch_size)
        _, xloss = sess.run([optimizer, loss], feed_dict={x: x_, y: y_})
        
        if j % 30 == 0:
            print("epoch %d, run %d, loss %g" % (i, j, xloss))
            
    if i % 100 == 0:
        # accuracy on the whole test split
        acc = sess.run(accuracy, feed_dict={x:test_doc_embed_with_class[0], y:test_doc_embed_with_class[1]})
        print("Test acc: %g" % (acc * 100), end="")
        print("%")
        
# Checkpoint the session once training is done; the current time in the file
# name keeps separate runs from overwriting each other.
save_path = saver.save(sess, "/tmp/test_model"+str(time.time())+".ckpt")
print("Model saved in file: %s" % save_path)     

Validation acc: 27.2%
Validation acc: 42.4%
Validation acc: 47.6%
Validation acc: 44.8%
Validation acc: 42%
Validation acc: 41.6%
Validation acc: 44%
Validation acc: 47.2%
Validation acc: 41.6%
Validation acc: 43.6%
Validation acc: 42.8%
Validation acc: 40%
Validation acc: 40.8%
Validation acc: 40.8%
Validation acc: 40.4%
Validation acc: 42.4%
Validation acc: 41.2%
Validation acc: 42.8%
Validation acc: 41.6%
Validation acc: 39.6%
Validation acc: 42%
Validation acc: 43.6%
Validation acc: 43.6%
Validation acc: 43.6%
Validation acc: 43.6%
Validation acc: 40.4%
Validation acc: 40.8%
Validation acc: 41.2%
Validation acc: 43.6%
Validation acc: 42.8%


## Questions to answer 

- Compare the learning curves of the model starting from random embeddings, starting from GloVe embeddings (http://nlp.stanford.edu/data/glove.6B.zip; 50 dimensions), or with the embeddings fixed to the GloVe values. Training in batches (e.g. of 50) is more stable. Which model works best on training vs. test? Which model works best on held-out accuracy?
- What happens if you try alternative non-linearities (logistic sigmoid or ReLU instead of tanh)?
- What happens if you add dropout to the network?
- What happens if you vary the size of the hidden layer?
- How would the code change if you wanted to add a second hidden layer?
- How does the training algorithm affect the quality of the model?
- Project the embeddings of the labels onto 2 dimensions and visualise (each row of the projection matrix V corresponds to a label embedding). Do you see anything interesting?

In [332]:
# Variant with a second hidden layer: 100 -> 256 -> 128 -> 8, ReLU
# activations and dropout on the second hidden layer.  The names W, b, V, c,
# h, u, p, pred, loss, accuracy, optimizer and saver are rebound to this graph.
W = tf.Variable(tf.truncated_normal(shape=[100, 256]))
b = tf.Variable(tf.constant(0.0, shape=[256]))

W2 = tf.Variable(tf.truncated_normal(shape=[256, 128]))
b2 = tf.Variable(tf.constant(0.0, shape=[128]))

V = tf.Variable(tf.truncated_normal(shape=[128, 8]))
c = tf.Variable(tf.constant(0.0, shape=[8]))

# Despite its name, this placeholder is passed to tf.nn.dropout as keep_prob:
# it is the probability of KEEPING a unit, so feeding 1.0 turns dropout off.
dropout_rate = tf.placeholder(tf.float32)

h = tf.nn.relu(tf.matmul(x, W) + b)
h2 = tf.nn.relu(tf.matmul(h, W2) + b2)
h2_drop = tf.nn.dropout(h2, keep_prob=dropout_rate)
u = tf.matmul(h2_drop, V) + c
p = tf.nn.softmax(u)
pred = tf.argmax(p, 1)

# Clip the probabilities before the log, as the single-hidden-layer loss does:
# a probability of exactly 0 would otherwise give log(0) = -inf and a NaN loss.
loss = tf.reduce_mean(tf.reduce_sum(-tf.cast(y, tf.float32)*tf.log(tf.clip_by_value(p, 1e-10, 1.0)), 1))
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, tf.argmax(y, 1)), tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
saver = tf.train.Saver()

In [None]:
# Initialise all graph variables, including those of the two-layer model.
init = tf.global_variables_initializer()
sess.run(init)

In [333]:
# Train the two-layer model for `epoch` passes of `total_batch` mini-batches,
# reporting the batch loss every 30 batches and the test accuracy every 100 epochs.
for i in range(epoch):
    xloss = 0
    acc = 0.0
    
    for j in range(total_batch):
        # next_batch returns the inputs together with the one-hot targets and
        # the updated read position; 0.5 is fed as the dropout keep probability
        index, x_, y_ = next_batch(train_doc_embed_with_class, index, batch_size)
        _, xloss = sess.run([optimizer, loss], feed_dict={x: x_, y: y_, dropout_rate: 0.5})
        
        if j % 30 == 0:
            print("epoch %d, run %d, loss %g" % (i, j, xloss))
            
    if i % 100 == 0:
        # accuracy on the whole test split, with dropout switched off (keep probability 1.0)
        acc = sess.run(accuracy, feed_dict={x:test_doc_embed_with_class[0], y:test_doc_embed_with_class[1], dropout_rate: 1.0})
        print("epoch %d, Test acc: %g" % (i, acc * 100), end="")
        print("%")
        
# Checkpoint the session once training is done; the current time in the file
# name keeps separate runs from overwriting each other.
save_path = saver.save(sess, "/tmp/model"+str(time.time())+".ckpt")
print("Model saved in file: %s" % save_path)     

epoch 0, Validation acc: 30.4%
epoch 100, Validation acc: 39.2%
epoch 200, Validation acc: 40.4%
epoch 300, Validation acc: 42%
epoch 400, Validation acc: 44%
epoch 500, Validation acc: 44.8%
epoch 600, Validation acc: 45.2%
epoch 700, Validation acc: 45.2%
epoch 800, Validation acc: 44.8%
epoch 900, Validation acc: 45.2%
epoch 1000, Validation acc: 44.8%
epoch 1100, Validation acc: 44.8%
epoch 1200, Validation acc: 44.8%
epoch 1300, Validation acc: 42%
epoch 1400, Validation acc: 43.2%
epoch 1500, Validation acc: 44%
epoch 1600, Validation acc: 44%
epoch 1700, Validation acc: 42%
epoch 1800, Validation acc: 42.4%
epoch 1900, Validation acc: 39.2%
epoch 2000, Validation acc: 43.2%
epoch 2100, Validation acc: 42.4%
epoch 2200, Validation acc: 42.8%
epoch 2300, Validation acc: 42.4%
epoch 2400, Validation acc: 44.4%
epoch 2500, Validation acc: 43.2%
epoch 2600, Validation acc: 43.2%
epoch 2700, Validation acc: 44%
epoch 2800, Validation acc: 43.6%
epoch 2900, Validation acc: 42%
