### word2vec是一个简单的神经网络，由三个网络层组成：

1个输入层
1个隐藏层
1个输出层

输入层输入的就是上面我们说的数据对的数字表示，输出到隐藏层。
隐藏层的神经网络单元的数量，其实就是我们所说的embedding size，至于为什么，我们后面简单计算一下就知道。需要注意的是，我们的隐藏层后面不需要使用激活函数。
输出层，我们使用softmax操作，得到每一个预测结果的概率
###### reference:https://github.com/washuwashu/NLP-training/tree/master/task4
##### https://github.com/luozhouyang/machine-learning-notes/tree/master/word2vec
##### https://blog.csdn.net/weixin_41701299/article/details/90267242
##### https://blog.csdn.net/AZRRR/article/details/90293578

In [None]:
import os

import numpy as np
import tensorflow as tf

from .data import SkipGramDataSet

# Training data source. The path is relative to the current working directory
# (os.path.curdir is "."), so the script must be started from the directory
# that contains "word2vec/".
dataset = SkipGramDataSet(os.path.join(os.path.curdir, "word2vec/test.txt"))

# Width of the one-hot input/label vectors; taken from the dataset.
VOCAB_SIZE = dataset.vocab_size
# Number of hidden units, i.e. the dimensionality of the learned embeddings.
EMBEDDING_SIZE = 128
# NOTE(review): not referenced by the visible code; WINDOW_SIZE below is the
# value actually passed to the dataset. Confirm whether this is still needed.
SKIP_WINDOW = 2

# NOTE(review): not referenced by the visible code (presumably intended for a
# sampled loss) -- confirm before relying on it.
NUM_SAMPLED = 64

# Arguments passed to dataset.gen_batch_inputs() for every training step.
BATCH_SIZE = 32
WINDOW_SIZE = 2
# NOTE(review): not referenced by the visible code.
LOG_DIR = "/tmp/word2vec"

# Number of iterations passed to Word2Vec.train().
TRAIN_STEPS = 10000

# Step size handed to the gradient-descent optimizer.
LEARNING_RATE = 0.1


class Word2Vec(object):
  """Minimal word2vec model: one linear hidden layer and a softmax output.

  Inputs and labels are one-hot vectors of width VOCAB_SIZE. The hidden layer
  has EMBEDDING_SIZE units and no activation function; the output layer is a
  full softmax over the vocabulary trained with cross-entropy.

  Attributes:
    graph: the tf.Graph that owns every op created by this instance.
    x, y: float32 placeholders of shape (None, VOCAB_SIZE) for the one-hot
      inputs and labels.
    W1, b1: hidden-layer weights ([VOCAB_SIZE, EMBEDDING_SIZE]) and bias.
    W2, b2: output-layer weights ([EMBEDDING_SIZE, VOCAB_SIZE]) and bias.
    prediction: softmax probabilities over the vocabulary.
    loss: mean cross-entropy over the batch.
    opt: op that applies one gradient-descent step on `loss`.
  """

  def __init__(self):
    """Build the computation graph (placeholders, layers, loss, train op)."""
    self.graph = tf.Graph()
    with self.graph.as_default():
      with tf.name_scope("inputs"):
        # The batch dimension is left open so any batch size can be fed.
        self.x = tf.placeholder(shape=(None, VOCAB_SIZE), dtype=tf.float32)
        self.y = tf.placeholder(shape=(None, VOCAB_SIZE), dtype=tf.float32)

      with tf.name_scope("layer1"):
        self.W1 = tf.Variable(
          tf.random_uniform([VOCAB_SIZE, EMBEDDING_SIZE], -1, 1),
          dtype=tf.float32)
        self.b1 = tf.Variable(tf.random_normal([EMBEDDING_SIZE]),
                              dtype=tf.float32)
      # Purely linear projection: no activation follows the hidden layer.
      hidden = tf.add(self.b1, tf.matmul(self.x, self.W1))

      with tf.name_scope("layer2"):
        self.W2 = tf.Variable(
          tf.random_uniform([EMBEDDING_SIZE, VOCAB_SIZE], -1, 1),
          dtype=tf.float32)
        self.b2 = tf.Variable(tf.random_normal([VOCAB_SIZE]),
                              dtype=tf.float32)

      logits = tf.add(tf.matmul(hidden, self.W2), self.b2)
      self.prediction = tf.nn.softmax(logits)

      # Cross-entropy is computed from log_softmax(logits) rather than
      # log(softmax(logits)): the latter yields -inf (and a NaN loss via
      # 0 * -inf) as soon as any probability underflows to zero.
      log_likelihood = self.y * tf.nn.log_softmax(logits)
      self.loss = tf.reduce_mean(
        -tf.reduce_sum(log_likelihood, axis=1, keepdims=True))

      self.opt = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(
        self.loss)

  @staticmethod
  def _to_one_hot(indices):
    """Return a list with one VOCAB_SIZE-wide one-hot vector per index."""
    vectors = []
    for index in indices:
      vector = np.zeros(VOCAB_SIZE)
      vector[index] = 1
      vectors.append(vector)
    return vectors

  def _one_hot_input(self, dataset):
    """Fetch one batch from `dataset` and one-hot encode it.

    Args:
      dataset: object exposing gen_batch_inputs(batch_size, window_size) that
        returns a (features, labels) pair; each element is used as an index
        into a VOCAB_SIZE-wide vector.

    Returns:
      A (features, labels) tuple of lists of one-hot numpy vectors.
    """
    features, labels = dataset.gen_batch_inputs(BATCH_SIZE, WINDOW_SIZE)
    return self._to_one_hot(features), self._to_one_hot(labels)

  def train(self, dataset, n_iters, ):
    """Run `n_iters` gradient-descent steps, printing the loss of each step.

    Variables are (re)initialized at the start of every call, and the session
    is closed on return.

    Args:
      dataset: batch source, see _one_hot_input().
      n_iters: number of optimization steps to run.
    """
    with tf.Session(graph=self.graph) as sess:
      sess.run(tf.global_variables_initializer())
      for _ in range(n_iters):
        features, labels = self._one_hot_input(dataset)

        # self.opt must be part of the fetches: evaluating only the loss
        # never applies a gradient update, so the weights would not change.
        _, loss = sess.run([self.opt, self.loss],
                           feed_dict={
                             self.x: features,
                             self.y: labels
                           })
        print("loss:%s" % loss)

  def predict(self):
    """Placeholder; not implemented yet (returns None)."""
    pass

  def nearest(self, n):
    """Placeholder; not implemented yet (returns None)."""
    pass

  def similarity(self, a, b):
    """Placeholder; not implemented yet (returns None)."""
    pass


# Build the graph and immediately run TRAIN_STEPS training iterations on the
# module-level dataset; this executes as soon as the cell/module is run.
word2vec = Word2Vec()
word2vec.train(dataset, TRAIN_STEPS)

In [None]:
# 伪代码如下：
'''
import tensorflow as tf

# 假设vocab_size = 1000
VOCAB_SIZE = 1000
# 假设embedding_size = 300
EMBEDDING_SIZE = 300

# 输入单词x是一个[1,vocab_size]大小的矩阵。当然实际上我们一般会用一批单词作为输入，那么就是[N, vocab_size]的矩阵了
x = tf.placeholder(tf.float32, shape=(1,VOCAB_SIZE))
# W1是一个[vocab_size, embedding_size]大小的矩阵
W1 = tf.Variable(tf.random_normal([VOCAB_SIZE, EMBEDDING_SIZE]))
# b1是一个[1，embedding_size]大小的矩阵
b1 = tf.Variable(tf.random_normal([EMBEDDING_SIZE]))
# 简单的矩阵乘法和加法
hidden = tf.add(tf.matmul(x,W1),b1)

W2 = tf.Variable(tf.random_normal([EMBEDDING_SIZE,VOCAB_SIZE]))
b2 = tf.Variable(tf.random_normal([VOCAB_SIZE]))
# 输出是一个vocab_size大小的矩阵，每个值都是一个词的概率值
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden,W2),b2))
'''