In [1]:
# -*- coding: utf-8 -*-

import os
import cjieba
from gensim.corpora import WikiCorpus
from gensim.models import word2vec

In [2]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# Wikimedia corpus download URL:
# https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
fname = './data/zhwiki-latest-pages-articles.xml.bz2'

# Build the corpus reader over the compressed dump. An explicit empty dictionary
# is supplied and lemmatization is switched off; the remaining keyword arguments
# set the article/token length limits and request lower-casing.
wiki = WikiCorpus(
    fname,
    lemmatize=False,
    dictionary={},
    article_min_tokens=10,
    token_min_len=1,
    token_max_len=100,
    lower=True,
)

In [4]:
# Extract the article texts into a plain-text file, one article per line with
# tokens separated by single spaces (the author measured about 25 minutes).
#
# The output is first written to a temporary file and renamed only after the
# loop has finished. Without this, an interrupted run would leave a truncated
# './data/zhwiki.txt' behind, and the existence check below would treat that
# partial file as a completed extraction on every later run.
if not os.path.exists('./data/zhwiki.txt'):
    texts_num = 0
    with open('./data/zhwiki.txt.tmp', 'w', encoding='utf-8') as f:
        for text in wiki.get_texts():
            f.write(' '.join(text) + '\n')
            texts_num += 1
            # Progress report every 10000 articles; arguments are passed
            # separately so formatting is deferred to the logging module.
            if texts_num % 10000 == 0:
                logging.info("已处理 %d 篇文章", texts_num)
    # Publish the finished file under its final name.
    os.replace('./data/zhwiki.txt.tmp', './data/zhwiki.txt')

In [5]:
# Traditional -> Simplified Chinese conversion (the author measured about 2 minutes).
# Skipped when the converted file already exists; otherwise the `opencc`
# command-line tool is run through the IPython shell escape, reading
# ./data/zhwiki.txt and writing ./data/zhwiki_simp.txt with the zht2zhs.ini config.
if not os.path.exists('./data/zhwiki_simp.txt'):
    ! opencc -i ./data/zhwiki.txt -o ./data/zhwiki_simp.txt -c zht2zhs.ini

In [6]:
# Word segmentation of the simplified-Chinese corpus (the author measured about
# 11 minutes). Every input line is stripped, cut into words with cjieba, and
# written back as one line of space-separated words; tokens that are a single
# space are dropped.
#
# Both files are handled by one `with` statement so they are closed even when
# segmentation or writing raises. The result is written to a temporary file
# and renamed only on success, so a failed run cannot leave a partial
# './data/zhwiki_simp_seg.txt' that the existence check would later skip over.
if not os.path.exists('./data/zhwiki_simp_seg.txt'):
    texts_num = 0
    with open('./data/zhwiki_simp.txt', 'r', encoding='utf-8') as f, \
            open('./data/zhwiki_simp_seg.txt.tmp', 'w', encoding='utf-8') as f_simp_seg:
        for text in f:
            words = [word for word in cjieba.cut(text.strip(), cut_all=False) if word != ' ']
            f_simp_seg.write(' '.join(words) + '\n')
            texts_num += 1
            # Progress report every 10000 lines.
            if texts_num % 10000 == 0:
                logging.info("已处理 %d 篇文章", texts_num)
    # Publish the finished file under its final name, then report the total.
    os.replace('./data/zhwiki_simp_seg.txt.tmp', './data/zhwiki_simp_seg.txt')
    logging.info("共处理 %d 篇文章", texts_num)

In [7]:
# Train the word2vec model (the author measured about 50 minutes).
# Training is skipped when a saved model file is already present.
model_dir = './data/model/'
model_path = './data/model/word2vec.model'

# Make sure the output directory exists before anything is saved into it.
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

if not os.path.exists(model_path):
    # Sentences are read from the segmented corpus file.
    sentences = word2vec.LineSentence('./data/zhwiki_simp_seg.txt')
    # size=200 and workers=4 are the only non-default training settings used here.
    w2v_model = word2vec.Word2Vec(sentences, size=200, workers=4)
    w2v_model.save(model_path)

In [8]:
# Load the saved model from disk (same path the training cell saves to), so the
# query cells below work whether or not training ran in this session.
w2v_model = word2vec.Word2Vec.load("./data/model/word2vec.model")

2018-09-06 20:59:36,339 : INFO : loading Word2Vec object from ./data/model/word2vec.model
2018-09-06 20:59:39,088 : INFO : loading wv recursively from ./data/model/word2vec.model.wv.* with mmap=None
2018-09-06 20:59:39,089 : INFO : loading vectors from ./data/model/word2vec.model.wv.vectors.npy with mmap=None
2018-09-06 20:59:39,408 : INFO : setting ignored attribute vectors_norm to None
2018-09-06 20:59:39,409 : INFO : loading vocabulary recursively from ./data/model/word2vec.model.vocabulary.* with mmap=None
2018-09-06 20:59:39,410 : INFO : loading trainables recursively from ./data/model/word2vec.model.trainables.* with mmap=None
2018-09-06 20:59:39,411 : INFO : loading syn1neg from ./data/model/word2vec.model.trainables.syn1neg.npy with mmap=None
2018-09-06 20:59:41,232 : INFO : setting ignored attribute cum_table to None
2018-09-06 20:59:41,234 : INFO : loaded ./data/model/word2vec.model


In [9]:
# Alternative query form, kept for reference:
# w2v_model.wv.most_similar(positive=['女人', '国王'], negative=['男人'], topn=3)
# Top-3 query with '女人' and '国王' as positive words and '男人' as the negative word.
w2v_model.wv.most_similar_cosmul(positive=['女人', '国王'], negative=['男人'], topn=3)

2018-09-06 20:59:43,693 : INFO : precomputing L2-norms of word weight vectors


[('王室', 0.8594831824302673),
 ('王储', 0.8557029962539673),
 ('王后', 0.8546224236488342)]

In [10]:
# Ask the model which of the four space-separated words does not fit with the others.
w2v_model.wv.doesnt_match('早餐 谷物 晚餐 午餐'.split())

'谷物'

In [11]:
# Similarity score between the two words.
w2v_model.wv.similarity('女人', '男人')

0.88024735

In [12]:
# The three vocabulary entries most similar to '猫'.
w2v_model.wv.similar_by_word('猫', topn=3)

[('狗', 0.7673438787460327),
 ('犬', 0.710284948348999),
 ('老鼠', 0.675322413444519)]

In [13]:
# Distance between the two words.
w2v_model.wv.distance('媒体', '介质')

0.7762486040592194

In [14]:
# First average the word vectors of each list, then compute the cosine
# similarity between the two averaged vectors.
w2v_model.wv.n_similarity(['寿司', '商店'], ['日语', '餐厅'])

0.6433907

In [15]:
# Alternative lookup by indexing, kept for reference:
# w2v_model.wv['电脑']
# Fetch the vector for '办公室'; use_norm=True presumably selects the
# L2-normalised vector rather than the raw one (confirm against the gensim docs).
w2v_model.wv.word_vec('办公室', use_norm=True)

array([ 6.83923736e-02,  1.02329269e-01,  9.71030537e-03, -1.61030665e-02,
        2.08849721e-02,  5.80555238e-02,  1.18015781e-01, -2.75380518e-02,
       -8.45042020e-02, -9.69247222e-02, -1.09279506e-01,  9.02274251e-02,
       -3.59832570e-02,  3.35633988e-03, -2.02721311e-03,  1.12083957e-01,
       -1.46189081e-02, -1.16647176e-01,  7.64922723e-02, -3.64502408e-02,
        2.46583018e-02,  5.00709936e-02, -8.83321166e-02,  9.60599184e-02,
        3.13817635e-02, -1.01580143e-01, -6.54485747e-02, -2.66069286e-02,
       -1.54746529e-02,  4.91613820e-02, -2.65230499e-02, -2.19071452e-02,
       -2.41167005e-02, -1.05453776e-02,  8.22560042e-02, -4.12925072e-02,
        3.28649469e-02,  4.58846763e-02, -1.49507686e-01, -3.95058990e-02,
       -3.14263552e-02,  6.07363358e-02,  1.21501267e-01,  9.03314203e-02,
        4.87113036e-02,  2.88334265e-02,  4.36801165e-02, -1.17368706e-01,
       -8.89689997e-02, -3.12791653e-02,  3.64994183e-02,  1.75368637e-02,
        2.99781114e-02, -