-
Notifications
You must be signed in to change notification settings - Fork 1
/
Tensorflow_word2vec1.py
89 lines (83 loc) · 3.13 KB
/
Tensorflow_word2vec1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/tutorials/word2vec
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import collections
#
# Skip-gram inputs: one centre-word id per example in a batch of 25.
X = tf.placeholder(dtype=tf.int32, shape=(25,))
# Context-word ids to predict, one per example. Declared as a column of
# shape (batch_size, 1), which is the label layout handed to tf.nn.nce_loss.
Y_ = tf.placeholder(dtype=tf.int32, shape=(25, 1))
# Toy training corpus: one short sentence per list element.
# NOTE: every element needs its own trailing comma. Adjacent string literals
# with no comma between them are silently concatenated by Python, which fuses
# neighbouring sentences and produces bogus tokens such as "dogscats".
sentences = [
    "They are like cats and dogs",
    "cats and dogs are friends",
    "she have cats and dogs",
    "cats and dogs are fighting",
    "cats are sensitive",
    "dog are loyal",
    "cats are liquid",
    "cats are wonderful",
    "dogs are barking",
    "cats are furry",
    "dogs are cute",
    "there have cats and dogs",
    "It's raining cats and dogs",
    "dogs and cats like baths",
    "I love cats and dogs",
    "we all love cats and dogs",
    "he likes cats",
    "she loves dogs",
    "everyone loves cats and dogs",
]
def build_dataset(sent):
    """Turn a list of sentences into skip-gram training pairs.

    All sentences are joined with single spaces and split on whitespace, so
    the corpus is treated as one continuous token stream. Words are numbered
    by descending frequency (id 0 is the most common word).

    For every token that has both a left and a right neighbour, two pairs are
    emitted: ``[centre_id, left_id]`` followed by ``[centre_id, right_id]``.
    E.g. for "the actual code for" the pairs are (actual, the),
    (actual, code), (code, actual), (code, for).

    Args:
        sent: iterable of sentence strings.

    Returns:
        A 3-tuple ``(vocab_size, pairs, vocab)`` where ``vocab_size`` is the
        number of distinct words, ``pairs`` is the list of two-element
        ``[centre_id, context_id]`` lists, and ``vocab`` is the list of words
        indexed by id.
    """
    tokens = " ".join(sent).split()

    # Vocabulary ordered from most to least frequent; list index == word id.
    vocab = [word for word, _ in collections.Counter(tokens).most_common()]
    word_to_id = {word: idx for idx, word in enumerate(vocab)}
    ids = [word_to_id[token] for token in tokens]

    # Slide a window of three over the id stream: (left, centre, right).
    pairs = []
    for left, centre, right in zip(ids, ids[1:], ids[2:]):
        pairs.append([centre, left])
        pairs.append([centre, right])

    return len(word_to_id), pairs, vocab
# Build the vocabulary and the [centre_id, context_id] skip-gram pairs.
len_dic, sgp, rdictionary = build_dataset(sentences)

# Output-side parameters used by the NCE loss: one 2-d weight row and one
# bias per vocabulary word.
W = tf.Variable(tf.random_uniform([len_dic, 2], -1.0, 1.0))
b = tf.Variable(tf.zeros([len_dic]))

# Input-side word vectors (2-d, so they can be plotted directly) and the rows
# selected for the current batch of centre-word ids.
embeddings = tf.Variable(tf.random_uniform([len_dic, 2], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, X)

# Noise-contrastive estimation loss, averaged over the batch, with
# num_sampled=12 randomly drawn negative classes.
# Keyword arguments are deliberate: the positional order of `labels` and
# `inputs` in tf.nn.nce_loss has differed between TensorFlow releases, so
# naming them keeps the call correct regardless of the installed version.
loss = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=W,
        biases=b,
        labels=Y_,
        inputs=embed,
        num_sampled=12,
        num_classes=len_dic,
    )
)
train = tf.train.GradientDescentOptimizer(0.4).minimize(loss)
def generate_batch(size):
    """Sample ``size`` distinct skip-gram pairs from the module-level ``sgp``.

    Args:
        size: number of pairs to draw; must not exceed ``len(sgp)``.

    Returns:
        ``(inputs, labels)`` where ``inputs`` is a flat list of ``size``
        centre-word ids and ``labels`` is a list of ``size`` one-element
        lists holding the matching context-word ids.
    """
    population = len(sgp)
    assert size <= population

    # Sampling without replacement, so no pair appears twice in a batch.
    picked = np.random.choice(population, size, replace=False)

    inputs = [sgp[k][0] for k in picked]      # shape (size,)
    labels = [[sgp[k][1]] for k in picked]    # shape (size, 1)
    return inputs, labels
# Train for 100 steps on random mini-batches of 25 pairs, then read the
# learned vectors out of the graph while the session is still open.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(100):
        batch_inputs, batch_labels = generate_batch(25)
        _, loss_val = sess.run(
            [train, loss],
            feed_dict={X: batch_inputs, Y_: batch_labels},
        )
        # Report progress every tenth step.
        if step % 10 == 0:
            print("Loss :", step, loss_val)
    # The raw embedding matrix is used as-is; no normalisation is applied.
    final_embeddings = sess.run(embeddings)

# Scatter-plot the 2-d vectors of the first ten vocabulary entries, label
# each point with its word, and write the figure to disk.
for word, (x, y) in zip(rdictionary[:10], final_embeddings):
    plt.scatter(x, y)
    plt.annotate(
        word,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.savefig("tf_word2vec.png")