# Word2Vec（Continuous Bag-of-Words(CBOW)&Skip-gram(SG)）

In [1]:
from collections import defaultdict
import numpy as np

# Hyper-parameters consumed by word2vec.__init__.
setting = dict(
    window=2,  # context radius: 2 words to the left and 2 to the right
    n=8,  # dimensionality of the word vectors
    epochs=50,  # number of passes over the training data
    learning_rate=0.01,  # step size for the gradient updates
)

class word2vec():
    """Minimal NumPy skip-gram Word2Vec model trained with a full softmax.

    Typical use: call ``generate_train_data`` to build the vocabulary and the
    (target, context) training pairs, then ``train`` to fit the two weight
    matrices, then ``word_vec`` to read an embedding back.
    """

    def __init__(self):
        # Hyper-parameters are read from the module-level ``setting`` dict.
        self.n = setting['n']  # embedding dimension
        self.window = setting['window']  # context radius on each side
        self.epochs = setting['epochs']
        self.learning_rate = setting['learning_rate']

    def generate_train_data(self, setting, corpus_file):
        """Build the vocabulary and the one-hot training pairs from a corpus.

        Args:
            setting: Accepted for backward compatibility; it is not read here
                (the window size comes from ``self.window``).
            corpus_file: Path to a UTF-8 text file with one whitespace-
                tokenised sentence per line. Words are lower-cased.

        Returns:
            An object-dtype ndarray of shape ``(num_samples, 2)``. Column 0
            holds the target word's one-hot vector, column 1 a list of the
            one-hot vectors of its context words (possibly empty).

        Side effects:
            Sets ``count_word``, ``words_list``, ``word_idx`` and ``idx_word``.
        """
        # Read and tokenise the corpus once, counting words as we go.
        word_counts = defaultdict(int)
        sentences = []
        with open(corpus_file, 'r', encoding='utf-8') as file:
            for line in file:
                sentence = [word.lower() for word in line.strip().split()]
                sentences.append(sentence)
                for word in sentence:
                    word_counts[word] += 1

        self.count_word = len(word_counts)  # vocabulary size
        self.words_list = list(word_counts)  # vocabulary in first-seen order
        # Lookup tables in both directions for fast access.
        self.word_idx = {word: i for i, word in enumerate(self.words_list)}
        self.idx_word = {i: word for i, word in enumerate(self.words_list)}

        training_data = []
        for sentence in sentences:
            len_sentence = len(sentence)
            for i, word in enumerate(sentence):  # every word acts as a target
                w_target = self.word2onehot(word)
                # Context = neighbours within the window, clipped to the
                # sentence bounds and excluding the target position itself.
                w_context = [
                    self.word2onehot(sentence[j])
                    for j in range(i - self.window, i + self.window + 1)
                    if j != i and 0 <= j < len_sentence
                ]
                training_data.append((w_target, w_context))

        # The rows are ragged (context length varies per word), so build the
        # object array explicitly: recent NumPy versions refuse to infer an
        # array from ragged nested sequences.
        samples = np.empty((len(training_data), 2), dtype=object)
        for row, (w_target, w_context) in enumerate(training_data):
            samples[row, 0] = w_target
            samples[row, 1] = w_context
        return samples

    def word2onehot(self, word):
        """Return the one-hot vector (length ``count_word``) for ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        word_vec = np.zeros(self.count_word)
        word_vec[self.word_idx[word]] = 1  # flag the word's vocabulary index
        return word_vec

    def train(self, training_data):
        """Fit ``w1`` and ``w2`` with per-sample gradient descent.

        Args:
            training_data: Iterable of ``(target_one_hot, context_one_hots)``
                pairs as produced by ``generate_train_data``.

        Side effects:
            (Re)initialises ``self.w1`` / ``self.w2`` randomly, stores the
            summed loss of the last epoch in ``self.loss`` and prints the loss
            after every epoch.
        """
        # Weight initialisation.
        self.w1 = np.random.uniform(-1, 1, (self.count_word, self.n))  # (count_word, n)
        self.w2 = np.random.uniform(-1, 1, (self.n, self.count_word))  # (n, count_word)

        for i in range(self.epochs):
            self.loss = 0
            for w_t, w_c in training_data:
                # A target without context (e.g. a one-word line) carries no
                # training signal, and an empty error sum would collapse to a
                # scalar that breaks the shapes in backprop.
                if len(w_c) == 0:
                    continue
                # Forward pass.
                y_predict, h, u = self.forward(w_t)
                # Prediction error summed over all context words.
                EI = np.sum([np.subtract(y_predict, word) for word in w_c], axis=0)
                # Backward pass and parameter update.
                self.backprop(EI, h, w_t)
                # Accumulate the loss (computed from the pre-update scores).
                self.loss += self._sample_loss(u, w_c)
            print('EPOCH:', i, 'LOSS:', self.loss)

    def _sample_loss(self, u, w_c):
        """Return the skip-gram softmax loss of one sample.

        Computes ``-sum(u[c] for each context word c) + C * log(sum(exp(u)))``
        where ``C = len(w_c)``. The log-sum-exp term is evaluated after
        subtracting ``max(u)`` so large scores cannot overflow ``exp``.
        """
        peak = np.max(u)
        log_norm = peak + np.log(np.sum(np.exp(u - peak)))
        # argmax of a one-hot vector is the index of its single 1.
        context_scores = np.sum([u[np.argmax(word)] for word in w_c])
        return -context_scores + len(w_c) * log_norm

    def forward(self, x):
        """Run the forward pass for one input vector.

        Args:
            x: Input vector of shape ``(count_word,)``.

        Returns:
            Tuple ``(y_c, h, u)``: softmax probabilities ``(count_word,)``,
            hidden layer ``(n,)`` and raw output scores ``(count_word,)``.
        """
        h = np.dot(self.w1.T, x)  # (n, count_word) . (count_word,) -> (n,)
        u = np.dot(self.w2.T, h)  # (count_word, n) . (n,) -> (count_word,)

        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        """Return the softmax of ``x``, shifted by ``max(x)`` for stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / np.sum(e_x, axis=0)

    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: Summed prediction error, shape ``(count_word,)``.
            h: Hidden-layer activation, shape ``(n,)``.
            x: Input vector, shape ``(count_word,)``.
        """
        # Both gradients are computed before either matrix is updated, so the
        # w1 gradient uses the pre-update w2.
        dl_dw2 = np.outer(h, e)  # gradient w.r.t. w2, shape (n, count_word)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))  # gradient w.r.t. w1, shape (count_word, n)

        self.w1 = self.w1 - (self.learning_rate * dl_dw1)
        self.w2 = self.w2 - (self.learning_rate * dl_dw2)

    def word_vec(self, word):
        """Return the learned embedding (the matching row of ``w1``) for ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        w_index = self.word_idx[word]
        v_w = self.w1[w_index]
        return v_w

# Build the model, derive the training pairs from a small Chinese corpus,
# report how many samples were produced, then train.
corpus_path = r'C:\Users\zhangxiaomi\Desktop\chengyu.txt'
w2v = word2vec()
training_data = w2v.generate_train_data(setting, corpus_path)
print(training_data.shape)

w2v.train(training_data)

(16, 2)
EPOCH: 0 LOSS: 55.224879099445474
EPOCH: 1 LOSS: 54.196646055363274
EPOCH: 2 LOSS: 53.192748181489804
EPOCH: 3 LOSS: 52.21212860245161
EPOCH: 4 LOSS: 51.25379138206123
EPOCH: 5 LOSS: 50.31679803317917
EPOCH: 6 LOSS: 49.400264262857064
EPOCH: 7 LOSS: 48.503356935242216
EPOCH: 8 LOSS: 47.62529123543032
EPOCH: 9 LOSS: 46.76532801827887
EPOCH: 10 LOSS: 45.92277132709312
EPOCH: 11 LOSS: 45.0969660680321
EPOCH: 12 LOSS: 44.28729582703142
EPOCH: 13 LOSS: 43.49318081697999
EPOCH: 14 LOSS: 42.71407594380799
EPOCH: 15 LOSS: 41.94946898103367
EPOCH: 16 LOSS: 41.19887884317104
EPOCH: 17 LOSS: 40.4618539492154
EPOCH: 18 LOSS: 39.73797066819708
EPOCH: 19 LOSS: 39.02683183952553
EPOCH: 20 LOSS: 38.328065361535415
EPOCH: 21 LOSS: 37.641322842294244
EPOCH: 22 LOSS: 36.96627830733808
EPOCH: 23 LOSS: 36.30262695957007
EPOCH: 24 LOSS: 35.650083987085
EPOCH: 25 LOSS: 35.00838341517538
EPOCH: 26 LOSS: 34.37727699923115
EPOCH: 27 LOSS: 33.75653315566698
EPOCH: 28 LOSS: 33.14593592840209
EPOCH: 29 LOS

