# Glove（Continuous Bag-of-Words(CBOW)&Skip-gram(SG)）

In [1]:
from collections import defaultdict
import numpy as np

# 超参数
setting = {
    "window": 2,  # 窗口尺寸（至左边或右边，即左边2个和右边2个）
    "n": 8,  # 词向量维度
    "epochs": 50,
    "learning_rate": 0.01,
    "x_max": 100,
    "alpha": 0.75
}

class GloVe():
    def __init__(self):
        self.n = setting['n']
        self.epochs = setting['epochs']
        self.learning_rate = setting['learning_rate']
        self.x_max = setting['x_max']
        self.alpha = setting['alpha']

    def generate_train_data(self, setting, corpus_file):
        """建立训练用的词向量"""
        # 计算非重复词语的频数和共现矩阵
        word_counts = defaultdict(int)
        co_occurrence_matrix = defaultdict(lambda: defaultdict(int))
        with open(corpus_file, 'r', encoding='utf-8') as file:
            for line in file:
                tokens = line.strip().split()
                for i, target_word in enumerate(tokens):
                    word_counts[target_word] += 1
                    context_words = tokens[max(0, i - setting['window']):i] + tokens[i + 1:i + setting['window'] + 1]
                    for context_word in context_words:
                        co_occurrence_matrix[target_word][context_word] += 1
                        co_occurrence_matrix[context_word][target_word] += 1

        self.count_word = len(word_counts.keys())  # 非重复词语数量
        self.words_list = list(word_counts.keys())  # 非重语料列表
        self.word_idx = dict((word, i) for i, word in enumerate(self.words_list))  # 建立word_idx 字典，加快访问速度
        self.idx_word = dict((i, word) for i, word in enumerate(self.words_list))  # 建立idx_word 字典，加快访问速度

        # 构建共现矩阵和权重矩阵
        X = np.zeros((self.count_word, self.count_word))
        for target_word, context_dict in co_occurrence_matrix.items():
            for context_word, count in context_dict.items():
                i, j = self.word_idx[target_word], self.word_idx[context_word]
                X[i, j] = count

        return X

    def train(self, X):
        """训练"""
        # 初始化权重
        self.W = np.random.uniform(-0.5 / self.n, 0.5 / self.n, (self.count_word, self.n))
        self.b = np.random.uniform(-0.5 / self.n, 0.5 / self.n, self.count_word)
        self.U = np.random.uniform(-0.5 / self.n, 0.5 / self.n, (self.count_word, self.n))
        self.c = np.random.uniform(-0.5 / self.n, 0.5 / self.n, self.count_word)

        # 计算权重
        f_x = np.vectorize(lambda x: (x / self.x_max) ** self.alpha if x < self.x_max else 1)
        X = f_x(X)

        # 遍历每个epoch
        for epoch in range(self.epochs):
            loss = 0
            for i in range(self.count_word):
                for j in range(self.count_word):
                    if X[i, j] > 0:
                        # 计算预测值
                        diff = np.dot(self.W[i], self.U[j]) + self.b[i] + self.c[j] - np.log(X[i, j])
                        # 计算梯度
                        grad_W = 2 * diff * self.U[j]
                        grad_U = 2 * diff * self.W[i]
                        grad_b = 2 * diff
                        grad_c = 2 * diff
                        # 更新参数
                        self.W[i] -= self.learning_rate * grad_W
                        self.U[j] -= self.learning_rate * grad_U
                        self.b[i] -= self.learning_rate * grad_b
                        self.c[j] -= self.learning_rate * grad_c
                        # 计算损失
                        loss += diff ** 2

            print('EPOCH:', epoch, 'LOSS:', loss)

# 实例化模型
glove = GloVe()
X = glove.generate_train_data(setting, r'C:\Users\zhangxiaomi\Desktop\chengyu.txt')  # 中文小语料库
print(X.shape)

glove.train(X)


(16, 16)
EPOCH: 0 LOSS: 139.01980951986334
EPOCH: 1 LOSS: 128.00263892032214
EPOCH: 2 LOSS: 117.85713324639025
EPOCH: 3 LOSS: 108.51319199052521
EPOCH: 4 LOSS: 99.90666134580957
EPOCH: 5 LOSS: 91.97876119545013
EPOCH: 6 LOSS: 84.67558295118782
EPOCH: 7 LOSS: 77.94764589861003
EPOCH: 8 LOSS: 71.74950257517833
EPOCH: 9 LOSS: 66.03938579239045
EPOCH: 10 LOSS: 60.77889144799829
EPOCH: 11 LOSS: 55.93269241691442
EPOCH: 12 LOSS: 51.46827967112346
EPOCH: 13 LOSS: 47.355727437217446
EPOCH: 14 LOSS: 43.56747970995807
EPOCH: 15 LOSS: 40.07815584058814
EPOCH: 16 LOSS: 36.864373237484344
EPOCH: 17 LOSS: 33.904585474393045
EPOCH: 18 LOSS: 31.17893431264145
EPOCH: 19 LOSS: 28.669114319145823
EPOCH: 20 LOSS: 26.35824890970003
EPOCH: 21 LOSS: 24.23077677282664
EPOCH: 22 LOSS: 22.27234773783123
EPOCH: 23 LOSS: 20.46972724495155
EPOCH: 24 LOSS: 18.810708658175418
EPOCH: 25 LOSS: 17.284032734361276
EPOCH: 26 LOSS: 15.879313627247583
EPOCH: 27 LOSS: 14.586970862967055
EPOCH: 28 LOSS: 13.398166775735067
EP