## Word2Vec: skip-gram model

Train a one hidden layer neural network. The weights are the "word vectors"  
There are two training methods: hierarchical softmax and negative sampling. Negative sampling is the one implemented here.  
Negative sampling is a simplified model of Noise Contrastive Estimation (NCE)  

*How to structure tensorflow model*. 
Phase 1: assemble your graph
1. Define placeholders for input and output
2. Define the weights
3. Define the inference model
4. Define loss function
5. Define optimizer

Phase 2: execute the computation (training the model)
1. Initialize all model variables for the first time
2. Feed in the training data. Might involve randomizing the order of data samples
3. Execute the inference model on training data, so it calculates for each training input example the output with the current model parameters
4. Compute the cost
5. Adjust the model parameters to minimize/maximize the cost depending on the model



In [None]:
# Phase 1: assemble the graph
# 1. Define the placeholders.
# Input is the center word and output is the target (context) word. Instead of
# using one-hot vectors, we feed the integer index of each word directly.
center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
# tf.nn.nce_loss expects labels of shape [batch_size, num_true] (num_true=1 here),
# so the targets are declared as a column rather than a flat vector.
# NOTE(review): the batch generator must therefore yield targets shaped
# [BATCH_SIZE, 1] -- confirm against its definition.
target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])

# 2. Define the weights.
# One EMBED_SIZE-dimensional row per vocabulary entry, initialised uniformly in [-1, 1).
embed_matrix = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0))

# 3. Inference (compute the forward path of the graph).
# Each row of embed_matrix is the vector representation of the word at that index,
# so the representation of every center word in the batch is the slice of the
# corresponding rows of the embedding matrix.
# tf.nn.embedding_lookup(params, ids, partition_strategy='mod', name=None, validate_indices=True, max_norm=None)
embed = tf.nn.embedding_lookup(embed_matrix, center_words)

# 4. Define the loss function.
# tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, num_true=1, sampled_values=None, remove_accidental_hits=False, partition_strategy='mod', name='nce_loss')
# The positional order of `labels` and `inputs` has differed between TensorFlow
# releases, so both are passed by keyword below to avoid swapping them.
# NCE needs its own output-layer weights and biases, separate from embed_matrix.
nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev=1.0 / EMBED_SIZE ** 0.5))
nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]))
# nce_loss returns one loss value per example; average them over the batch.
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                     biases=nce_bias,
                                     labels=target_words,
                                     inputs=embed,
                                     num_sampled=NUM_SAMPLED,
                                     num_classes=VOCAB_SIZE))

# 5. Define the optimizer.
optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

# Phase 2: execute the computation (train the model).
with tf.Session() as sess:
    # Variables must be initialised once before the first training step.
    sess.run(tf.global_variables_initializer())
    average_loss = 0.0
    # range() and next() are used instead of xrange() and .next() so the loop
    # runs on both Python 2 and Python 3.
    for index in range(NUM_TRAIN_STEPS):
        # batch[0] holds the center-word indices, batch[1] the target-word indices.
        batch = next(batch_gen)
        # Running `optimizer` applies one gradient step; only the loss value is kept.
        loss_batch, _ = sess.run([loss, optimizer],
                                 feed_dict={center_words: batch[0], target_words: batch[1]})
        average_loss += loss_batch
        if (index + 1) % 2000 == 0:
            # Report the cumulative mean loss over all steps run so far.
            print('Average loss at step {}: {:5.1f}'.format(index + 1, average_loss / (index + 1)))
            

## Name Scope

In [None]:
# Defining the model as above leaves its nodes scattered all over the TensorBoard graph.
# Wrapping related ops in a name scope tells TensorBoard which nodes to group together.
# The general pattern is:
#
#     with tf.name_scope(name_of_that_scope):
#         # declare op_1
#         # declare op_2
#         # ...

# Inputs: center-word indices and their target (context) word indices.
with tf.name_scope('data'):
    center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name='center_words')
    # Column shape [BATCH_SIZE, 1] matches the [batch_size, num_true] labels nce_loss expects.
    target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name='target_words')

# Weights: one EMBED_SIZE-dimensional row per vocabulary entry.
with tf.name_scope('embed'):
    embed_matrix = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0), name='embed_matrix')

# Inference and loss are grouped together under a single scope.
with tf.name_scope('loss'):
    embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')
    nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev=1.0 / math.sqrt(EMBED_SIZE)), name='nce_weight')
    nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_bias')
    # nce_loss returns one loss value per example; the batch mean is named 'loss'.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                         biases=nce_bias,
                                         labels=target_words,
                                         inputs=embed,
                                         num_sampled=NUM_SAMPLED,
                                         num_classes=VOCAB_SIZE), name='loss')

## Build model class

In [None]:
class SkipGramModel:
    """Skeleton of a graph builder for the word2vec skip-gram model.

    Each ``_create_*`` method stands for one graph-assembly step and is
    currently an empty stub that returns ``None``.
    """

    def __init__(self, params):
        """Accept the model configuration ``params``; nothing is stored yet."""
        return None

    def _create_placeholders(self):
        """Step 1: declare the input and output placeholders (stub)."""
        return None

    def _create_embedding(self):
        """Step 2: declare the weights (stub)."""
        return None

    def _create_loss(self):
        """Steps 3 and 4: declare the inference model and the loss function (stub)."""
        return None

    def _create_optimizer(self):
        """Step 5: declare the optimizer (stub)."""
        return None
    

## Visualize the results

In [None]:
from tensorflow.contrib.tensorboard.plugins import projector
# Export part of the trained embedding matrix so TensorBoard's embedding projector can show it.
# Obtain the embedding matrix values after training by evaluating the model's variable.
# NOTE(review): `sess` and `model` are defined outside this cell; `sess` must still be open here.
final_embed_matrix = sess.run(model.embed_matrix)

# Create a new variable to hold the embeddings to visualize. The original notes state
# that it has to be a variable (constants do not work) and that the model's own
# embed_matrix cannot be reused directly; the reason is not documented here.
# Only the first 500 rows are kept -- presumably the 500 most frequent words,
# assuming the vocabulary is indexed by descending frequency (TODO confirm).
embedding_var = tf.Variable(final_embed_matrix[:500], name="embedding")
sess.run(embedding_var.initializer)
config = projector.ProjectorConfig()
summary_writer = tf.summary.FileWriter(LOGDIR)

# Add an embedding entry to the config and point it at the variable by name.
embedding = config.embeddings.add()
embedding.tensor_name = embedding_var.name
 
# Link the embedding to its metadata file. In this case, the file that holds
# the labels for the 500 words kept above.
embedding.metadata_path = LOGDIR + '/vocab_500.tsv'

# Save a configuration file that TensorBoard will read during startup.
projector.visualize_embeddings(summary_writer, config)

# Save only the embedding variable to a checkpoint in LOGDIR.
# The third positional argument of Saver.save is global_step (1 here).
saver_embed = tf.train.Saver([embedding_var])
saver_embed.save(sess, LOGDIR + '/skip-gram.ckpt', 1)
