In [1]:
import os
import numpy as np
import tensorflow as tf
import utils
import cbow_gen

In [2]:
# Start from an empty default graph so re-running the notebook does not
# collide with variables created by a previous run.
tf.reset_default_graph()

# --- Data source -----------------------------------------------------------
# Both values are handed to cbow_gen.batch_gen; EXPECTED_BYTES is presumably
# the expected size of the downloaded archive (verify against cbow_gen).
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'
EXPECTED_BYTES = 31344016

# --- Model -----------------------------------------------------------------
# number of rows in the embedding / softmax matrices (number of classes)
VOCAB_SIZE = 50000
# dimension of the word embedding vectors
EMBED_SIZE = 128
# the context window: each example carries 2 * SKIP_WINDOW surrounding words
SKIP_WINDOW = 2
# number of negative examples to sample
NUM_SAMPLED = 64

# --- Training --------------------------------------------------------------
BATCH_SIZE = 128
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 100000
# print the running average loss every SKIP_STEP steps
SKIP_STEP = 5000

# --- Visualization ---------------------------------------------------------
# folder name passed to cbow_gen.batch_gen
VISUAL_FLD = 'visualization'
# number of tokens to visualize
NUM_VISUALIZE = 3000

In [3]:
def word2vec(dataset):
    """Build a CBOW word2vec graph on top of `dataset` and train it.

    The model averages the embeddings of the 2 * SKIP_WINDOW context words and
    is trained with sampled-softmax loss to predict the target word, using
    plain gradient descent for NUM_TRAIN_STEPS steps. The running average loss
    is printed every SKIP_STEP steps and the graph is written to
    'graphs/word2vec_simple' for TensorBoard.

    Args:
        dataset: a `tf.data.Dataset` yielding `(around_words, target_words)`
            pairs. The loop below indexes columns 0 .. 2 * SKIP_WINDOW - 1 of
            `around_words`; `target_words` is used as the loss labels.

    Returns:
        None. Training runs inside this function; nothing is returned or saved
        to a checkpoint (the 'checkpoints2' directory is only created).
    """
    # Step 1: get input, output from the dataset
    with tf.name_scope('data'):
        iterator = dataset.make_initializable_iterator()
        around_words, target_words = iterator.get_next()

    # Step 2+3: define weights and embedding lookup.
    with tf.name_scope('embed'):
        embed_matrix = tf.get_variable('embed_matrix', shape=[VOCAB_SIZE, EMBED_SIZE],
                                       initializer=tf.random_uniform_initializer())
        context_embeds = []
        for i in range(2 * SKIP_WINDOW):
            embed = tf.nn.embedding_lookup(embed_matrix, around_words[:, i])
            print('embedding %d shape: %s'%(i,embed.get_shape().as_list()))
            context_embeds.append(embed)
        # Stack along a new trailing axis. Unlike reshape-with-static-shape
        # followed by repeated concat, this also works when the batch
        # dimension is not statically known.
        embeds = tf.stack(context_embeds, axis=2)
        assert embeds.get_shape().as_list()[2]==2*SKIP_WINDOW
        # Average over the context-position axis (dims are not kept, which is
        # the default; the deprecated `keep_dims` argument is not needed).
        avg_embed = tf.reduce_mean(embeds, 2)

    # Step 4: construct variables for the sampled-softmax loss and define the loss
    with tf.name_scope('loss'):
        softmax_weights = tf.get_variable('nce_weight', shape=[VOCAB_SIZE, EMBED_SIZE],
                                          initializer=tf.truncated_normal_initializer(stddev=1.0/(EMBED_SIZE**0.5)))
        softmax_biases = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))
        # define loss function to be SM loss function
        loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(softmax_weights,
                                                         biases=softmax_biases,
                                                         inputs=avg_embed,
                                                         labels=target_words,
                                                         num_sampled=NUM_SAMPLED,
                                                         num_classes=VOCAB_SIZE), name='loss')

    # Step 5: define optimizer
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    # Step 6: train. Kept outside the 'optimizer' name scope so that only the
    # optimizer ops are grouped under it.
    utils.safe_mkdir('checkpoints2')
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(iterator.initializer)
        sess.run(init_op)
        writer = tf.summary.FileWriter('graphs/word2vec_simple', sess.graph)
        try:
            total_loss = 0.0
            for index in range(NUM_TRAIN_STEPS):
                try:
                    loss_batch, _ = sess.run([loss, optimizer])
                except tf.errors.OutOfRangeError:
                    # The dataset is exhausted: restart it and redo this step,
                    # so every index performs exactly one update and the
                    # SKIP_STEP average / report below is never skewed or
                    # skipped. A second OutOfRangeError (empty dataset)
                    # propagates instead of looping silently.
                    sess.run(iterator.initializer)
                    loss_batch, _ = sess.run([loss, optimizer])
                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}:{:5.1f}'.format(index,total_loss/SKIP_STEP))
                    total_loss = 0.0
        finally:
            # Flush and release the event file even if training fails.
            writer.close()

In [4]:
def gen():
    """Yield everything produced by `cbow_gen.batch_gen` for this notebook's settings.

    A zero-argument wrapper that binds the module-level constants, so the
    function itself can be handed to `tf.data.Dataset.from_generator`.
    """
    batches = cbow_gen.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES, VOCAB_SIZE,
                                 BATCH_SIZE, SKIP_WINDOW, VISUAL_FLD)
    yield from batches

def main():
    """Wrap `gen` in a `tf.data.Dataset` and train word2vec on it.

    Each dataset element is a pair of int32 tensors: one of shape
    [BATCH_SIZE, SKIP_WINDOW*2] and one of shape [BATCH_SIZE, 1].
    """
    element_types = (tf.int32, tf.int32)
    context_shape = tf.TensorShape([BATCH_SIZE,SKIP_WINDOW*2])
    target_shape = tf.TensorShape([BATCH_SIZE, 1])
    dataset = tf.data.Dataset.from_generator(gen,
                                             element_types,
                                             (context_shape, target_shape))
    word2vec(dataset)
# Build the dataset and train; this runs NUM_TRAIN_STEPS training steps and
# prints the average loss every SKIP_STEP steps.
main()

embedding 0 shape: [128, 128]
embedding 1 shape: [128, 128]
embedding 2 shape: [128, 128]
embedding 3 shape: [128, 128]
data/text8.zip already exists
Average loss at step 4999:  4.0
Average loss at step 9999:  3.6
Average loss at step 14999:  3.5
Average loss at step 19999:  3.4
Average loss at step 24999:  3.4
Average loss at step 29999:  3.3
Average loss at step 34999:  3.3
Average loss at step 39999:  3.3
Average loss at step 44999:  3.3
Average loss at step 49999:  3.2
Average loss at step 54999:  3.2
Average loss at step 59999:  3.2
Average loss at step 64999:  3.2
Average loss at step 69999:  3.1
Average loss at step 74999:  3.1
Average loss at step 79999:  3.1
Average loss at step 84999:  3.1
Average loss at step 89999:  3.1
Average loss at step 94999:  3.1
Average loss at step 99999:  3.0
