# 使用gensim训练word2vec

本DEMO只使用了大部分数据（前180000条），并且没有使用集群训练——用全部数据训练时我的小服务器会崩溃。使用全部数据预训练的词向量下载地址：  

链接: https://pan.baidu.com/s/1ewlck3zwXVQuAzraZ26Euw 提取码: qbpr 

In [None]:
import logging
import random

import numpy as np
import torch

# Emit INFO-level log records, each prefixed with a timestamp and its level.
logging.basicConfig(format='%(asctime)-15s %(levelname)s: %(message)s', level=logging.INFO)

# Seed every random number generator used in this notebook with the same value
# so that the shuffles below are repeatable between runs.
seed = 666
for seed_fn in (random.seed, np.random.seed, torch.cuda.manual_seed):
    seed_fn(seed)
# Kept as the final statement of the cell, as in the original.
torch.manual_seed(seed)

<torch._C.Generator at 0x7f5955b43970>

In [None]:
# Split the data into 10 folds.
# fold_num matches the 10-fold split requested by the all_data2fold(10) call in this cell.
fold_num = 10
# Tab-separated training file with `text` and `label` columns.
data_file = '../data/train_set.csv'
import pandas as pd


def all_data2fold(fold_num, num=180000, data_path=None):
    """Split the first ``num`` rows of the training CSV into label-stratified folds.

    The rows are shuffled, grouped by label, and each label's rows are dealt
    out to the folds in contiguous, non-overlapping slices. The folds are then
    evened out to exactly ``total // fold_num`` rows each by moving the surplus
    of larger folds into smaller ones, and every fold is shuffled once more.

    Args:
        fold_num: Number of folds to produce.
        num: Maximum number of rows, taken from the top of the file, to use.
        data_path: Path of the tab-separated file with ``text`` and ``label``
            columns. Defaults to the module-level ``data_file``.

    Returns:
        A list of ``fold_num`` dicts of the form
        ``{'label': [...], 'text': [...]}``. When the number of rows is not
        divisible by ``fold_num``, the remainder rows are left out so that
        all folds have the same length.
    """
    path = data_file if data_path is None else data_path
    frame = pd.read_csv(path, sep='\t', encoding='UTF-8')
    texts = frame['text'].tolist()[:num]
    labels = frame['label'].tolist()[:num]

    total = len(labels)

    # Shuffle once so that the per-label groups below are in random order.
    index = list(range(total))
    np.random.shuffle(index)
    all_texts = [texts[i] for i in index]
    all_labels = [labels[i] for i in index]

    # Group the shuffled positions by label.
    label2id = {}
    for position, label in enumerate(all_labels):
        label2id.setdefault(str(label), []).append(position)

    # Deal every label's positions out to the folds. The first `extra` folds
    # receive one more item; a running offset keeps the slices disjoint so no
    # sample is assigned twice and none is left unassigned.
    all_index = [[] for _ in range(fold_num)]
    for positions in label2id.values():
        base, extra = divmod(len(positions), fold_num)
        offset = 0
        for fold in range(fold_num):
            size = base + 1 if fold < extra else base
            all_index[fold].extend(positions[offset:offset + size])
            offset += size

    # Even the folds out to `batch_size` rows each: oversized folds push their
    # surplus into a shared pool, undersized folds take what they lack from it.
    # Fold sizes never increase with the fold number, so the pool is always
    # filled before anything is taken from it.
    batch_size = total // fold_num
    other_texts = []
    other_labels = []
    start = 0
    fold_data = []
    for fold in range(fold_num):
        fold_texts = [all_texts[i] for i in all_index[fold]]
        fold_labels = [all_labels[i] for i in all_index[fold]]
        fold_len = len(fold_labels)

        if fold_len > batch_size:
            other_texts.extend(fold_texts[batch_size:])
            other_labels.extend(fold_labels[batch_size:])
            fold_texts = fold_texts[:batch_size]
            fold_labels = fold_labels[:batch_size]
        elif fold_len < batch_size:
            end = start + batch_size - fold_len
            fold_texts = fold_texts + other_texts[start:end]
            fold_labels = fold_labels + other_labels[start:end]
            start = end

        # Internal invariant: every fold ends up with exactly batch_size rows.
        assert batch_size == len(fold_labels)

        # Shuffle inside the fold so the rows are no longer grouped by label.
        index = list(range(batch_size))
        np.random.shuffle(index)

        fold_data.append({
            'label': [fold_labels[i] for i in index],
            'text': [fold_texts[i] for i in index],
        })

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data


# Build the 10 folds once; the corpus-building cell below reads them from `fold_data`.
fold_data = all_data2fold(10)

2020-07-28 09:11:29,448 INFO: Fold lens [18000, 18000, 18000, 18000, 18000, 18000, 18000, 18000, 18000, 18000]


In [None]:
# Assemble the word2vec training corpus from the texts of the first `fold_id` folds.
fold_id = 10

train_texts = []
for fold_idx in range(fold_id):
    train_texts += fold_data[fold_idx]['text']

logging.info('Total %d docs.', len(train_texts))

2020-07-28 09:11:29,536 INFO: Total 180000 docs.


In [None]:
logging.info('Start training...')
from gensim.models.word2vec import Word2Vec

num_features = 100  # dimensionality of the word vectors
num_workers = 8     # number of worker threads used for training

# Every document is one whitespace-separated string of tokens; Word2Vec is
# given a list of token lists instead.
train_texts = [doc.split() for doc in train_texts]
model = Word2Vec(train_texts, size=num_features, workers=num_workers)
# NOTE(review): per the gensim 3.x docs, replace=True keeps only the
# L2-normalised vectors, so the saved model is not meant for further training.
model.init_sims(replace=True)

# Persist the trained model.
model.save("./word2vec.bin")

2020-07-28 09:11:29,549 INFO: Start training...
2020-07-28 09:11:30,418 INFO: 'pattern' package not found; tag filters are not available for English
2020-07-28 09:11:52,036 INFO: collecting all words and their counts
2020-07-28 09:11:52,038 INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-28 09:11:53,437 INFO: PROGRESS: at sentence #10000, processed 9061087 words, keeping 5300 word types
2020-07-28 09:11:54,819 INFO: PROGRESS: at sentence #20000, processed 18059593 words, keeping 5652 word types
2020-07-28 09:11:56,224 INFO: PROGRESS: at sentence #30000, processed 27118110 words, keeping 5878 word types
2020-07-28 09:11:57,666 INFO: PROGRESS: at sentence #40000, processed 36348121 words, keeping 6024 word types
2020-07-28 09:11:59,095 INFO: PROGRESS: at sentence #50000, processed 45575675 words, keeping 6135 word types
2020-07-28 09:12:00,477 INFO: PROGRESS: at sentence #60000, processed 54592095 words, keeping 6224 word types
2020-07-28 09:12:01,850 INFO

In [None]:
# load model
# Reload the model that the training cell saved to ./word2vec.bin.
model = Word2Vec.load("./word2vec.bin")

# convert format
# Export only the word vectors (model.wv) to ./word2vec.txt;
# binary=False requests the plain-text word2vec format.
model.wv.save_word2vec_format('./word2vec.txt', binary=False)

2020-07-28 09:30:32,729 INFO: loading Word2Vec object from ./word2vec.bin
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-07-28 09:30:32,816 INFO: loading wv recursively from ./word2vec.bin.wv.* with mmap=None
2020-07-28 09:30:32,818 INFO: setting ignored attribute vectors_norm to None
2020-07-28 09:30:32,819 INFO: loading vocabulary recursively from ./word2vec.bin.vocabulary.* with mmap=None
2020-07-28 09:30:32,821 INFO: loading trainables recursively from ./word2vec.bin.trainables.* with mmap=None
2020-07-28 09:30:32,822 INFO: setting ignored attribute cum_table to None
2020-07-28 09:30:32,823 INFO: loaded ./word2vec.bin
2020-07-28 09:30:32,847 INFO: storing 5981x100 projection weights into ./word2vec.txt


原理已经比较熟悉了，这次看看优化方式：

Word2vec 有两种优化方式，一种是负采样，一种是层序Softmax：



一、负采样，以skip-gram model为例。中心词生成背景词可以由两个相互独立事件的联合组成来近似（引自李沐大神的讲解）。



第一个事件是，中心词和背景词同时出现在窗口中。第二个事件是，中心词和K个噪声词不同时出现在窗口数据中，其中噪声词由噪声分布随机生成。



这里我们就可以回答上一篇文章开头提到的问题：负采样是一种等价操作还是近似操作？我们在第二个事件中只使用了K个噪声词。但是实际上呢？噪声词的数量应该远远大于K。



还是那个例子，句子为"我/永远/爱/中国/共产党"，中心词为'爱'，我们在选择噪声词的时候，选择了K个，但是实际上，在词汇表中，排除掉'我'，'永远'，'中国'，'共产党' 这四个词汇的其他词都可以算做我的噪声词，然而为了减少复杂度，我只选择了其中的K个，所以当然应该是近似了。



二、层序Softmax。



层序Softmax 对应的就是在输出层使用一个霍夫曼树，代替了原本在输出层统一进行的softmax。


首先，我们需要了解霍夫曼树在这里是如何构建的。



简单讲，霍夫曼树是一个二叉树，以语料中出现过的词当做叶子节点，以各词在语料中出现的次数当做权值进行构造。其中叶子节点有N个，就是词典的大小，非叶子节点有N-1个（包括根节点）。



比如说在我的所有文章中，“共产党”这个词出现了100次，是出现次数最多的词，那么它对应的叶子节点就会处在离根节点最近的位置；出现次数次多的词，比如“中国”，也会处在比较浅的位置，以此类推。具体的构建过程是自底向上的：每次从所有节点中取出权值最小的两个，合并成一个新的父节点（权值为两者之和），再把这个父节点放回去继续合并，直到只剩下一个根节点，一个霍夫曼树就成功构建。



霍夫曼树中，我们需要注意的是，每个非叶子节点对应一个向量，每个叶子节点对应一个向量。两种向量都会随着模型的训练进行更新。



其中叶子节点的向量就是我们的词向量，而非叶子节点上的向量没有什么实际含义，它的作用是帮助我们计算模型在霍夫曼树上不断进行二分类时的概率。



以上面那句话为例，我们现在中心词为‘爱’，然后，我要预测背景词‘中国’。首先我们要确定的是我的叶子节点是包含所有单词的，也就是包含了我这个简单句子的五个单词（不考虑前期数据清洗低频率词的情况）。



也就是说，在这个霍夫曼树上，有且仅有一条路径，让我从根节点出发，经过多次判断（也就是说走过了多个非叶子节点），最终走到了“中国”这个叶子节点，对应的概率就是每个节点概率的连乘。