Deep Learning
=============

Assignment 5
------------

The goal of this assignment is to train a Word2Vec skip-gram model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [None]:
%matplotlib inline
import os, zipfile, random, math
import collections, random
import numpy as np
from sklearn.manifold import TSNE
from tqdm import tqdm
import tensorflow as tf
from urllib.request import urlretrieve
from matplotlib import pylab

In [None]:
# Base URL that hosts the corpus archive; the file name is appended to it.
url = 'http://mattmahoney.net/dc/'
# Local directory in which downloaded files are stored.
data_root = './dataset/'

class TqdmUpTo(tqdm):
    """Progress bar whose ``update_to`` can be used as a ``urlretrieve`` report hook."""

    def update_to(self, count=1, blockSize=1, totalSize=None):
        """Move the bar to ``count * blockSize`` units.

        count: number of blocks transferred so far.
        blockSize: size of a single block.
        totalSize: overall size; when given it replaces ``self.total``.
        """
        if totalSize is not None:
            self.total = totalSize
        transferred = count * blockSize
        # update() expects an increment, so pass the distance from the
        # current position self.n to the new absolute position.
        self.update(transferred - self.n)

def download_file(filename, expected_bytes=None, force=False):
    """Download ``filename`` from ``url`` into ``data_root`` unless it already exists.

    Args:
        filename: name of the remote file; also used as the local file name.
        expected_bytes: when truthy, the size in bytes the local file must have.
        force: download even if the local file is already present.

    Returns:
        Path of the local file.

    Raises:
        AssertionError: if ``expected_bytes`` is given and the file size differs.
    """
    dest_filename = os.path.join(data_root, filename)
    if force or not os.path.exists(dest_filename):
        # urlretrieve does not create missing directories, so make sure the
        # target directory exists before the first download.
        os.makedirs(os.path.dirname(dest_filename) or '.', exist_ok=True)
        print('Download: %s' % filename)
        # Download under a temporary name so that an interrupted transfer is
        # never mistaken for a complete file by the existence check above.
        partial_filename = dest_filename + '.part'
        try:
            with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1) as t:
                urlretrieve(url+filename, partial_filename,
                            reporthook=t.update_to, data=None)
            os.replace(partial_filename, dest_filename)
        finally:
            # Only present here if the transfer failed part-way.
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
        print('\n%s Download Complete!' % filename)

    if expected_bytes:
        statinfo = os.stat(dest_filename)
        not_expected_bytes_error = 'Failed to verify ' + dest_filename + '. Can you get to it with a browser?'
        assert statinfo.st_size == expected_bytes, not_expected_bytes_error

    return dest_filename

# Fetch the Text8 archive; with force=False an existing local copy is reused.
file = download_file('text8.zip', force=False)

In [None]:
def read_file(filename):
    """Return the whitespace-separated tokens of the first member of a zip archive.

    Args:
        filename: path of the zip archive to read.

    Returns:
        List of strings obtained by decoding the first archive member as
        UTF-8 and splitting it on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # Plain bytes.decode is all that is needed here; it keeps the loader
        # independent of TensorFlow.
        text = archive.read(first_member).decode('utf-8')
    return text.split()

# Load the whole corpus into memory as a list of tokens and report its length.
words = read_file(file)
print('Data size %d' % len(words))

Build the dictionary and replace rare words with the UNK token.

In [None]:
vocabulary_size = 50000

def build_dataset(words):
    """Encode a token sequence as integer ids over a fixed-size vocabulary.

    The vocabulary consists of 'UNK' (id 0) followed by the
    ``vocabulary_size - 1`` most frequent tokens, numbered in order of
    decreasing frequency. Every other token is encoded as 0.

    Returns:
        data: list with the id of every token in ``words``.
        count: list whose first entry is ['UNK', number of tokens mapped to
            UNK] followed by (token, occurrences) pairs.
        dictionary: mapping token -> id.
        reverse_dictionary: mapping id -> token.
    """
    # Counter(words).most_common(k) returns the k most frequent tokens as
    # (token, occurrences) pairs, e.g. for ['a', 'b', 'a', 'a', 'b', 'c']
    # it yields [('a', 3), ('b', 2), ('c', 1)].
    frequent = collections.Counter(words).most_common(vocabulary_size - 1)
    count = [['UNK', -1]] + frequent

    # Ids are handed out in list order, so 'UNK' receives id 0.
    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        token_id = dictionary.get(token)
        if token_id is None:
            # Out-of-vocabulary token: encode as UNK and tally it.
            token_id = 0
            unk_count += 1
        data.append(token_id)
    count[0][1] = unk_count

    reverse_dictionary = {token_id: token for token, token_id in dictionary.items()}

    return data, count, dictionary, reverse_dictionary

# Encode the corpus as integer ids and keep the vocabulary tables.
data, count, dictionary, reverse_dictionary = build_dataset(words)
# The raw token list is not used again below; drop the reference to it.
del words

In [None]:
# Quick sanity check: the first five vocabulary entries and the first five encoded ids.
print('Most common words (+UNK)', count[:5])
print('data:', data[:5])

In [None]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the global ``data`` sequence.

    Every centre word contributes ``num_skips`` (centre, context) pairs whose
    context words are drawn, without repetition, from the ``skip_window``
    positions on either side of the centre.

    Args:
        batch_size: number of pairs to produce; must be a multiple of
            ``num_skips``.
        num_skips: pairs generated per centre word; at most ``2 * skip_window``.
        skip_window: number of words considered on each side of the centre.

    Returns:
        (batch, labels): int32 arrays of shape (batch_size,) and
        (batch_size, 1) holding the centre ids and the context ids.

    Side effects:
        Moves the module-level cursor ``data_index`` (wrapping around the end
        of ``data``) so that consecutive calls use consecutive centre words.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    # Sliding window over the data; appending drops the oldest entry.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window # target label at the center of the buffer.
        targets_to_avoid = [ skip_window ]
        for j in range(num_skips):
            # Re-draw until a window position is found that is neither the
            # centre nor an already used context position.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Priming the window consumed `span` words beyond the centres actually
    # used; rewind by that amount so the next batch does not skip them.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [None]:
print('data: ', [reverse_dictionary[di] for di in data[:32]])


# Print sample batches for both window configurations, first starting at
# token 0 and then starting at token 1.
for start_index in (0, 1):
    for num_skips, skip_window in [(2, 1), (4, 2)]:
        data_index = start_index
        batch, labels = generate_batch(batch_size=16, num_skips=num_skips, skip_window=skip_window)
        print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
        print('    batch:', [reverse_dictionary[bi] for bi in batch])
        print('    labels:', [reverse_dictionary[li] for li in labels.reshape(16)])

In [None]:
# Display the label array returned by the most recent generate_batch call.
labels

In [None]:
# Hyper-parameters of the skip-gram model.
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2

# Validation set: valid_size distinct word ids drawn from the first
# valid_window ids (low ids are the most frequent words).
valid_size = 16
valid_window = 100
valid_examples = np.array(random.sample(range(valid_window), valid_size))
# Passed as num_sampled to the sampled softmax loss below.
num_sampled = 64

graph = tf.Graph()

with graph.as_default(), tf.device('/device:GPU:0'):
    
    # Input data.
    # Centre word ids: [batch_size] = [128]
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    # Context word ids: [batch_size, 1] = [128, 1]
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # Validation word ids: [valid_size] = [16]
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # Variables
    # Embedding matrix: [vocabulary_size, embedding_size] = [50000, 128]
    embeddings = tf.Variable(
        tf.random_uniform(shape=[vocabulary_size, embedding_size],
                          minval=-1.0, maxval=1.0))
    # Output weights: [vocabulary_size, embedding_size] = [50000, 128]
    softmax_weights = tf.Variable(
        tf.truncated_normal(shape=[vocabulary_size, embedding_size],
                            mean=0.0, stddev=1. / math.sqrt(embedding_size)))
    # Output biases: [vocabulary_size] = [50000]
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    
    # Sampled softmax loss, averaged over the batch.
    _loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, 
                                   biases=softmax_biases,
                                   labels=train_labels,
                                   inputs=embed,
                                   num_sampled=num_sampled,
                                   num_classes=vocabulary_size))
    
    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(_loss)
    
    # Compute the similarity between the validation examples and all embeddings.
    # Rows are scaled to unit L2 norm, so the matrix product below is the
    # cosine similarity.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, 
                           tf.transpose(normalized_embeddings))

In [None]:
steps = 1000000
top_k = 8  # number of neighbours printed for every validation word
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
with tf.Session(graph=graph, config=config) as session:
    tf.global_variables_initializer().run()
    print('[Tensorflow]: Initialized!')
    average_loss = 0
    for step in range(steps):
        batch_data, batch_labels = generate_batch(
            batch_size=batch_size, num_skips=num_skips, skip_window=skip_window)
        _, loss = session.run(
            [optimizer, _loss],
            feed_dict={train_dataset: batch_data, train_labels: batch_labels})
        average_loss += loss
        completed = step + 1
        if completed % 2000 == 0:
            # Mean loss over the 2000 batches since the previous report.
            print('Average loss at step %d: %f' % (completed, average_loss / 2000))
            average_loss = 0
        if completed % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by descending similarity and skip the first position.
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                neighbours = ''.join(' %s,' % reverse_dictionary[idx] for idx in nearest)
                print('Nearest to %s' % valid_word + neighbours)
    final_embeddings = normalized_embeddings.eval()

In [None]:
num_points = 400

# Project the embeddings to two dimensions with t-SNE.
tsne = TSNE(perplexity=30, 
            n_components=2, 
            init='pca',
            n_iter=5000, method='exact')
# The slice 1:num_points selects rows 1..399 (399 points) and leaves out
# row 0, which is the 'UNK' entry.
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points])

In [None]:
%matplotlib inline
def plot(embeddings, labels):
    """Draw one annotated scatter point per label on a 15x15 figure.

    embeddings: 2-D array whose rows are (x, y) coordinates; it must have at
        least as many rows as there are labels.
    labels: text to print next to each point, in row order.
    """
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings.'
    pylab.figure(figsize=(15,15))
    # zip stops after the last label, so surplus rows are ignored.
    for text, (x, y) in zip(labels, embeddings):
        pylab.scatter(x, y)
        pylab.annotate(text, xy=(x,y), xytext=(5,2),
                       textcoords='offset points',
                       ha='right', va='bottom')
    pylab.show()
        
# Labels for ids 1..num_points-1, matching the rows passed to t-SNE.
words = [reverse_dictionary[i] for i in range(1, num_points)]
plot(two_d_embeddings, words)

---

Problem
-------

An alternative to skip-gram is another Word2Vec model called [CBOW](http://arxiv.org/abs/1301.3781) (Continuous Bag of Words). In the CBOW model, instead of predicting a context word from a word vector, you predict a word from the sum of all the word vectors in its context. Implement and evaluate a CBOW model trained on the text8 dataset.

---

In [None]:
data_index = 0

def generate_batch(batch_size, bag_window):
    """Build one CBOW training batch from the global ``data`` sequence.

    Each example consists of the ``bag_window`` words on either side of a
    centre word (the input) and the centre word itself (the label).

    Args:
        batch_size: number of examples to produce.
        bag_window: number of context words taken on each side of the centre.

    Returns:
        (batch, labels): int32 arrays of shape (batch_size, 2 * bag_window)
        and (batch_size, 1) holding the context ids and the centre ids.

    Side effects:
        Moves the module-level cursor ``data_index`` (wrapping around the end
        of ``data``) so that consecutive calls use consecutive centre words.
    """
    global data_index
    span =  bag_window * 2 + 1

    batch = np.ndarray(shape=(batch_size, span-1), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # Sliding window over the data; appending drops the oldest entry.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size):
        buffer_list = list(buffer)
        # Remove the centre word from the window; what remains is the context.
        labels[i, 0] = buffer_list.pop(bag_window)
        batch[i] = buffer_list

        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Priming the window consumed `span` words beyond the centres actually
    # used; rewind by that amount so the next batch does not skip them.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [None]:
print('data: ', [reverse_dictionary[di] for di in data[:32]])


# Print a small sample batch for two context sizes, each starting at token 0.
for bag_window in [1, 2]:
    data_index = 0
    batch, labels = generate_batch(batch_size=4, bag_window=bag_window)
    print('\nwith bag_window = %d:' % bag_window)
    print('    batch:', [[reverse_dictionary[w] for w in bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(4)])

In [None]:
# Hyper-parameters of the CBOW model.
batch_size = 128
embedding_size = 128
bag_window = 2

# Validation set: valid_size distinct word ids drawn from the first
# valid_window ids (low ids are the most frequent words).
valid_size = 16
valid_window = 100
valid_examples = np.array(random.sample(range(valid_window), valid_size))
# Passed as num_sampled to the sampled softmax loss below.
num_sampled = 64

graph = tf.Graph()

with graph.as_default(), tf.device('/device:GPU:0'):
    
    # Input data.
    # Context word ids: [batch_size, 2 * bag_window] = [128, 4]
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, bag_window * 2])
    # Centre word ids: [batch_size, 1] = [128, 1]
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # Validation word ids: [valid_size] = [16]
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # Variables
    # Embedding matrix: [vocabulary_size, embedding_size] = [50000, 128]
    embeddings = tf.Variable(
        tf.random_uniform(shape=[vocabulary_size, embedding_size],
                          minval=-1.0, maxval=1.0))
    # Output weights: [vocabulary_size, embedding_size] = [50000, 128]
    softmax_weights = tf.Variable(
        tf.truncated_normal(shape=[vocabulary_size, embedding_size],
                            mean=0.0, stddev=1. / math.sqrt(embedding_size)))
    # Output biases: [vocabulary_size] = [50000]
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Model.
    # Look up embeddings for inputs: one vector per context word.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    
    # Sampled softmax loss, averaged over the batch. The context vectors of
    # each example are summed (axis 1) to form the model input.
    _loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, 
                                   biases=softmax_biases,
                                   labels=train_labels,
                                   inputs=tf.reduce_sum(embed, 1),
                                   num_sampled=num_sampled,
                                   num_classes=vocabulary_size))
    
    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(_loss)
    
    # Compute the similarity between the validation examples and all embeddings.
    # Rows are scaled to unit L2 norm, so the matrix product below is the
    # cosine similarity.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
      normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, 
                           tf.transpose(normalized_embeddings))

In [None]:
steps = 100000
top_k = 8  # number of neighbours printed for every validation word
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
with tf.Session(graph=graph, config=config) as session:
    tf.global_variables_initializer().run()
    print('[Tensorflow]: Initialized!')
    average_loss = 0
    for step in range(steps):
        batch_data, batch_labels = generate_batch(
            batch_size=batch_size, bag_window=bag_window)
        _, loss = session.run(
            [optimizer, _loss],
            feed_dict={train_dataset: batch_data, train_labels: batch_labels})
        average_loss += loss
        completed = step + 1
        if completed % 2000 == 0:
            # Mean loss over the 2000 batches since the previous report.
            print('Average loss at step %d: %f' % (completed, average_loss / 2000))
            average_loss = 0
        if completed % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by descending similarity and skip the first position.
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                neighbours = ''.join(' %s,' % reverse_dictionary[idx] for idx in nearest)
                print('Nearest to %s' % valid_word + neighbours)
    final_embeddings = normalized_embeddings.eval()

In [None]:
num_points = 400

# Project the embeddings to two dimensions with t-SNE.
tsne = TSNE(perplexity=30, 
            n_components=2, 
            init='pca',
            n_iter=5000, method='exact')
# The slice 1:num_points selects rows 1..399 (399 points) and leaves out
# row 0, which is the 'UNK' entry.
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points])

In [None]:
%matplotlib inline
def plot(embeddings, labels):
    """Draw one annotated scatter point per label on a 15x15 figure.

    embeddings: 2-D array whose rows are (x, y) coordinates; it must have at
        least as many rows as there are labels.
    labels: text to print next to each point, in row order.
    """
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings.'
    pylab.figure(figsize=(15,15))
    # zip stops after the last label, so surplus rows are ignored.
    for text, (x, y) in zip(labels, embeddings):
        pylab.scatter(x, y)
        pylab.annotate(text, xy=(x,y), xytext=(5,2),
                       textcoords='offset points',
                       ha='right', va='bottom')
    pylab.show()
        
# Labels for ids 1..num_points-1, matching the rows passed to t-SNE.
words = [reverse_dictionary[i] for i in range(1, num_points)]
plot(two_d_embeddings, words)