# Word2vec tensorflow impl

In [47]:
import numpy as np
import tensorflow as tf
from nltk.tokenize import sent_tokenize, word_tokenize

In [48]:
corpus_raw = 'He is the king. The king is royal. She is the royal queen.'

# convert to lower case
corpus_raw_lower = corpus_raw.lower()
corpus_raw_lower

'he is the king. the king is royal. she is the royal queen.'

In [49]:
words = set(word for word in word_tokenize(corpus_raw_lower) if word != '.')
print(words)

word2int={}
int2word={}

vocab_size = len(words)

for index, word in enumerate(words):
    word2int[word] = index
    int2word[index] = word

{'she', 'queen', 'the', 'king', 'he', 'royal', 'is'}


In [50]:
print(word2int['queen'])
print(int2word[3])

1
king


In [51]:
sentences = [[word for word in word_tokenize(sent) if word !='.'] for sent in sent_tokenize(corpus_raw_lower)]
sentences

[['he', 'is', 'the', 'king'],
 ['the', 'king', 'is', 'royal'],
 ['she', 'is', 'the', 'royal', 'queen']]

In [52]:
# trainning data
data = []
WINDOW_SIZE = 2

for sentence in sentences:
    for word_idx, word in enumerate(sentence):
        for context_word in sentence[max(word_idx - WINDOW_SIZE, 0):min(word_idx + WINDOW_SIZE, len(sentence)) + 1] : 
            if context_word != word:
                data.append([word, context_word])

print(data)

[['he', 'is'], ['he', 'the'], ['is', 'he'], ['is', 'the'], ['is', 'king'], ['the', 'he'], ['the', 'is'], ['the', 'king'], ['king', 'is'], ['king', 'the'], ['the', 'king'], ['the', 'is'], ['king', 'the'], ['king', 'is'], ['king', 'royal'], ['is', 'the'], ['is', 'king'], ['is', 'royal'], ['royal', 'king'], ['royal', 'is'], ['she', 'is'], ['she', 'the'], ['is', 'she'], ['is', 'the'], ['is', 'royal'], ['the', 'she'], ['the', 'is'], ['the', 'royal'], ['the', 'queen'], ['royal', 'is'], ['royal', 'the'], ['royal', 'queen'], ['queen', 'the'], ['queen', 'royal']]


In [55]:
def to_one_hot(word_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[word_index] = 1
    return temp

In [62]:
x_train = [] # input word
y_train = [] # output word

for input_word, output_word in data:
    x_train.append(to_one_hot(word2int[input_word], vocab_size))
    y_train.append(to_one_hot(word2int[output_word], vocab_size))
                   
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [64]:
print('x shape: ', x_train.shape)
print('y shape: ', y_train.shape)

x shape:  (34, 7)
y shape:  (34, 7)


In [74]:
x = tf.placeholder(tf.float32, shape=(None, vocab_size), name='input_word')
y = tf.placeholder(tf.float32, shape=(None, vocab_size), name='output_word')

EMBEDDING_DIM = 5

W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))

hidden_representation = tf.add(tf.matmul(x, W1), b1)

W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))

prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, W2), b2))

In [87]:
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    
#     cross_entropy_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction)
    cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y * tf.log(prediction), reduction_indices=[1]))
    optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
    
    for _ in range(1000):
        _, cross_lost = sess.run([optimizer, cross_entropy_loss], feed_dict={
            x: x_train, 
            y: y_train
        })
#         print('loss is: ', cross_lost)
    
#     print(sess.run(W1))
#     print(sess.run(b1))
    
    vectors = sess.run(tf.add(W1, b1))
    print(vectors)

[[-0.4616795  -0.7554153  -0.72968066 -0.06520873 -0.793494  ]
 [ 0.9333521  -0.34531945 -0.69240093  0.7342201  -2.0610204 ]
 [ 2.0102563  -0.26171118  0.4310136  -1.6687319  -0.6300094 ]
 [ 0.17167017 -1.1180968  -1.0076189  -0.04077378 -2.2900486 ]
 [-1.5176055  -1.462054   -0.6837474   0.01302123 -1.51083   ]
 [-0.01195743 -1.2185957   1.0184906  -1.202682   -2.8647263 ]
 [ 1.6805906   0.4025997   1.079875    1.0438187  -0.9559208 ]]


In [88]:
print(vectors[word2int['queen']])

[ 0.9333521  -0.34531945 -0.69240093  0.7342201  -2.0610204 ]


In [91]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1-vec2) ** 2))

def find_closest(word_index, vectors):
    min_dist = 100000
    min_index = -1
    
    query_vector = vectors[word_index]
    
    for index, vec in enumerate(vectors):
        if euclidean_dist(vec, query_vector) < min_dist and not np.array_equal(vec, query_vector):
            min_dist = euclidean_dist(vec, query_vector)
            min_index=index
    return min_index

In [93]:
print(int2word[find_closest(word2int['king'], vectors)])
print(int2word[find_closest(word2int['queen'], vectors)])

queen
king
