Word Embeddings are dense vector representations of words in low dimensional vector space. Word2Vec is the most popular word embedding model. The use of Word2Vec is to group words that semantically similar in vector space. It computes similarities mathematically.




There are two variants :
1. CBOW (Continuous Bag of Words) : It tries to predict a word on based of its neighbours.
2. SkipGram : It tries to predict the neighbours of a given word.



1. Build a 3 layer neural network.
2. The objective of network is to predict the neighbouring word given a word.
3. Remove the last layer and keep the input and hidden layer.
4. Now, input a word from within the vocabulary. The output given at the hidden layer is the ‘word embedding’ of the input word.


In [0]:
import numpy as np
import tensorflow as tf

In [0]:
corpus_raw = 'He is the king . The king is royal . She is the royal queen'
corpus_raw = corpus_raw.lower()

In [0]:
words = []
for word in corpus_raw.split():
    if word != '.': 
        words.append(word)

In [0]:
words=set(words)


In [5]:
words

{'he', 'is', 'king', 'queen', 'royal', 'she', 'the'}

In [0]:
word2int = {}
int2word = {}
vocab_size = len(words)

for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word

In [7]:
print(word2int["queen"])
print(int2word[6])

1
king


In [0]:
raw_sentences = corpus_raw.split('.')
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())


In [9]:
print(sentences)

[['he', 'is', 'the', 'king'], ['the', 'king', 'is', 'royal'], ['she', 'is', 'the', 'royal', 'queen']]


In [0]:
data = []
WINDOW_SIZE = 2

def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

In [0]:
for sentence in sentences:
    for word_index, word in enumerate(sentence):
        for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1] : 
            if nb_word != word:
                data.append([word, nb_word])

In [12]:
data

[['he', 'is'],
 ['he', 'the'],
 ['is', 'he'],
 ['is', 'the'],
 ['is', 'king'],
 ['the', 'he'],
 ['the', 'is'],
 ['the', 'king'],
 ['king', 'is'],
 ['king', 'the'],
 ['the', 'king'],
 ['the', 'is'],
 ['king', 'the'],
 ['king', 'is'],
 ['king', 'royal'],
 ['is', 'the'],
 ['is', 'king'],
 ['is', 'royal'],
 ['royal', 'king'],
 ['royal', 'is'],
 ['she', 'is'],
 ['she', 'the'],
 ['is', 'she'],
 ['is', 'the'],
 ['is', 'royal'],
 ['the', 'she'],
 ['the', 'is'],
 ['the', 'royal'],
 ['the', 'queen'],
 ['royal', 'is'],
 ['royal', 'the'],
 ['royal', 'queen'],
 ['queen', 'the'],
 ['queen', 'royal']]

In [0]:
x_train = [] 
y_train = [] 
for data_word in data:
    x_train.append(to_one_hot(word2int[ data_word[0] ], vocab_size))
    y_train.append(to_one_hot(word2int[ data_word[1] ], vocab_size))


In [0]:
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [0]:
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [16]:
EMBEDDING_DIM = 5 
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))
hidden_representation = tf.add(tf.matmul(x,W1), b1)

Instructions for updating:
Colocations handled automatically by placer.


In [0]:
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation, W2), b2))

In [0]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) 

# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iters = 10000
# train for n_iter iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

In [19]:
print(sess.run(W1))
print(sess.run(b1))

[[ 1.2999856   1.3043436   1.1777382   1.6747016  -1.6034336 ]
 [-0.6418186  -1.6659697   0.6622133  -1.0588969  -1.478851  ]
 [ 0.5652176   0.76805556 -0.64750046 -0.09762229 -0.31200808]
 [ 0.13241744  0.16272512  2.3293061   0.582963    2.2461472 ]
 [ 0.11834886  1.5018452  -0.18658994 -0.33290207 -1.1634549 ]
 [-1.1003639  -2.7193942   0.19062264  1.9049916   1.1136706 ]
 [-0.36808482 -0.16243415 -0.68501765 -1.7686338   0.6701767 ]]
[-1.5962734  1.3971279 -1.0591146 -1.8266429 -1.8087096]


In [0]:
vectors = sess.run(W1 + b1)

In [21]:
vectors

array([[-0.29628778,  2.7014713 ,  0.11862361, -0.1519413 , -3.4121432 ],
       [-2.238092  , -0.26884186, -0.39690125, -2.8855398 , -3.2875605 ],
       [-1.0310558 ,  2.1651835 , -1.706615  , -1.9242651 , -2.1207178 ],
       [-1.463856  ,  1.559853  ,  1.2701916 , -1.2436799 ,  0.43743753],
       [-1.4779246 ,  2.898973  , -1.2457045 , -2.159545  , -2.9721646 ],
       [-2.6966372 , -1.3222663 , -0.86849195,  0.07834876, -0.69503903],
       [-1.9643582 ,  1.2346938 , -1.7441323 , -3.5952768 , -1.1385329 ]],
      dtype=float32)

In [22]:
print(vectors[ word2int['queen'] ])

[-2.238092   -0.26884186 -0.39690125 -2.8855398  -3.2875605 ]


In [0]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1-vec2)**2))

def find_closest(word_index, vectors):
    min_dist = 10000 # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index

In [24]:
print(int2word[find_closest(word2int['queen'], vectors)])

king
