# Word2Vec - The Skipgram Model

Consider the sentence:
"Hello my name is Gitesh."

With a window size of 1, we have the dataset of (context, target) pairs:
([Hello, name], my), ([my, is], name), ([name, Gitesh], is), ...

The skip-gram model tries to predict each context word from its target word, so the task becomes predicting 'Hello' and 'name' from 'my', predicting 'my' and 'is' from 'name', and so on.

In [22]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import numpy as np
import tensorflow as tf
import matplotlib
matplotlib.use('TKAgg')
from matplotlib import pyplot as plt
from utils import *

In [23]:
# Hyperparameters read by the model-building cell.

# Sizes of the embedding table: one row per vocabulary entry,
# `embedding_size` values per row.
vocabulary_size = 50000
embedding_size = 128

# Number of examples per training step (first dimension of the placeholders).
batch_size = 128

# Number of negative examples to sample for the NCE loss.
num_sampled = 64

In [24]:
# Load the dataset.
# load_data is not defined in this notebook; presumably it comes from the
# star-import of `utils` — TODO confirm.

train_data, val_data, reverse_dictionary = load_data()
# train_data is iterated one entry per training step, so the example count is
# (number of entries) * batch_size. Assumes every entry holds exactly
# batch_size examples — the placeholders in skipgram() require that shape.
print('Number of training examples: ', len(train_data)* batch_size)
print('Number of validation examples: ', len(val_data))
# Sanity check: dump the first training entry (later unpacked as (inputs, labels)).
print(train_data[0])

Loaded train data!
Loaded val data!
Loaded reverse dictionary!
Number of training examples:  3840000
Number of validation examples:  16
(array([ 3081,  3081,    12,    12,     6,     6,   195,   195,     2,
           2,  3134,  3134,    46,    46,    59,    59,   156,   156,
         128,   128,   742,   742,   477,   477, 10572, 10572,   134,
         134,     1,     1, 27350, 27350,     2,     2,     1,     1,
         103,   103,   855,   855,     3,     3,     1,     1, 15068,
       15068,     0,     0,     2,     2,     1,     1,   151,   151,
         855,   855,  3581,  3581,     1,     1,   195,   195,    11,
          11,   191,   191,    59,    59,     5,     5,     6,     6,
       10713, 10713,   215,   215,     7,     7,  1325,  1325,   105,
         105,   455,   455,    20,    20,    59,    59,  2732,  2732,
         363,   363,     7,     7,  3673,  3673,     1,     1,   709,
         709,     2,     2,   372,   372,    27,    27,    41,    41,
          37,    37,   

In [25]:
#Creating the computational graph/Model

def skipgram():
    """Build the skip-gram word2vec model in the current default TensorFlow graph.

    Reads the module-level ``batch_size``, ``vocabulary_size``,
    ``embedding_size``, ``num_sampled`` and ``val_data``.

    Returns:
        A tuple ``(batch_inputs, batch_labels, normalized_embeddings, loss,
        similarity)`` where

        * ``batch_inputs`` is an int32 placeholder of shape ``[batch_size]``
          holding the ids of the target words,
        * ``batch_labels`` is an int32 placeholder of shape ``[batch_size, 1]``
          holding the ids of the context words to predict,
        * ``normalized_embeddings`` is the ``[vocabulary_size, embedding_size]``
          embedding matrix with every row scaled to unit L2 norm,
        * ``loss`` is the scalar mean NCE loss over the batch,
        * ``similarity`` holds, for every entry of ``val_data``, its cosine
          similarity to every row of the embedding matrix.
    """
    batch_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    batch_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    val_dataset = tf.constant(val_data, dtype=tf.int32)

    with tf.variable_scope('word2vec') as scope:
        # Input embeddings: one trainable row per vocabulary entry.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        batch_embeddings = tf.nn.embedding_lookup(embeddings, batch_inputs)

        # Output-side parameters used by the NCE loss.
        weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        biases = tf.Variable(tf.zeros([vocabulary_size]))
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=weights,
                                             biases=biases,
                                             labels=batch_labels,
                                             inputs=batch_embeddings,
                                             num_sampled=num_sampled,
                                             num_classes=vocabulary_size))

        # L2 norm of each row. This must be a *sum* of squares: using the mean
        # yields the RMS, which leaves every "normalized" row with length
        # sqrt(embedding_size) instead of 1.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
        normalized_embeddings = embeddings / norm

        # With unit-length rows the dot product is the cosine similarity.
        val_embeddings = tf.nn.embedding_lookup(normalized_embeddings, val_dataset)
        similarity = tf.matmul(val_embeddings, normalized_embeddings, transpose_b=True)

        return batch_inputs, batch_labels, normalized_embeddings, loss, similarity
        

In [26]:
def run(learning_rate=1.0, log_every=1000, eval_every=5000, top_k=8):
    """Train the skip-gram model on ``train_data`` and return the embeddings.

    The model is built in a fresh ``tf.Graph`` so that calling ``run()`` again
    (e.g. re-executing the notebook cell) does not keep adding duplicate
    variables and ops to the global default graph.

    Args:
        learning_rate: Step size passed to the gradient-descent optimizer.
        log_every: Print the loss averaged over the last ``log_every`` steps
            whenever the step index is a multiple of this value.
        eval_every: Print the nearest neighbours of every validation word
            whenever the step index is a multiple of this value.
        top_k: Number of nearest neighbours to print per validation word.

    Returns:
        The evaluated ``normalized_embeddings`` tensor after the last step.
    """
    with tf.Graph().as_default():
        batch_inputs, batch_labels, normalized_embeddings, loss, similarity = skipgram()
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)

            average_loss = 0.0
            for step, batch_data in enumerate(train_data):
                inputs, labels = batch_data
                feed_dict = {batch_inputs: inputs, batch_labels: labels}
                _, loss_val = sess.run([optimizer, loss], feed_dict)
                average_loss += loss_val

                if step % log_every == 0:
                    # Step 0 reports the single initial loss; later reports
                    # average the `log_every` losses accumulated since the
                    # previous report.
                    if step > 0:
                        average_loss /= log_every
                    print('Loss at iter', step, ':', average_loss)
                    average_loss = 0.0

                if step % eval_every == 0:
                    sim = similarity.eval()
                    for i in range(len(val_data)):
                        # Sort by descending similarity and skip position 0,
                        # which is the validation word itself.
                        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                        print_closest_words(val_data[i], nearest, reverse_dictionary)

            final_embeddings = normalized_embeddings.eval()

    return final_embeddings
                    
                

In [27]:
# Run the full training loop; run() returns the evaluated normalized embeddings.
final_embeddings = run()

Loss at iter 0 : 311.9130859375
Nearest to he: luminance, expending, ballast, pundit, signatory, uniting, doyle, pva,
Nearest to called: threats, membrane, ifc, responds, roms, yakuza, bentine, rohypnol,
Nearest to for: sir, placed, matth, modular, cypher, having, fagan, dunraven,
Nearest to if: mauryan, misrepresented, flaps, rims, netherlands, erupted, sparrow, jehoahaz,
Nearest to state: candidate, internalized, annoyed, doctorow, shem, nationally, facilities, cervix,
Nearest to that: fermium, antares, unexplained, distributes, ambassadors, disturb, spartan, mondo,
Nearest to no: luria, dreamcast, softer, indicating, spook, broch, offshoot, apostasy,
Nearest to d: pn, tunny, trolling, teen, cloning, camelot, tractate, brake,
Nearest to however: hoc, lex, thaddeus, locrian, foreshadowed, rodin, japheth, chaffee,
Nearest to system: mpr, franchised, himmler, commissioned, taiga, stein, localization, amenable,
Nearest to as: sutras, coffey, ym, box, reshaped, uneventful, lodovico, sarah

Loss at iter 26000 : 6.483825495958328
Loss at iter 27000 : 6.505431483507157
Loss at iter 28000 : 5.944219085216522
Loss at iter 29000 : 6.254699397087097


In [28]:

# NOTE(review): TSNE is imported here but not referenced in this cell;
# presumably the projection happens inside visualize_embeddings — TODO confirm
# whether this import is still needed.
from sklearn.manifold import TSNE
# visualize_embeddings is not defined in this notebook; presumably it comes
# from the star-import of `utils` — TODO confirm.
visualize_embeddings(final_embeddings, reverse_dictionary)