## NOTE
在企业中使用Word2Vec一般的方式如下：<br/>
1. 使用gensim这类的第三方框架在原始数据上训练一个Embedding Table(单词和词向量之间的映射表)<br/>
2. 在业务中，需要使用到词向量转换的时候，直接加载这个Embedding Table作为单词向量转换的初始参数<br/>
    在深度学习中我们一般使用tf.nn.embedding_lookup(embedding_table,word_idxs)来获取单词id对应的向量

In [1]:
# import modules & set up logging
import logging # 打印日志的模块，日志主要包括以下几个级别: DEBUG、INFO、WARN、ERROR
import os

import numpy as np

import gensim
from gensim.models import word2vec

import jieba.analyse
import jieba

In [2]:
# set up logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
# 原始文本数据所在的磁盘路径
sentence_file_path = './datas/in_the_name_of_people.txt'
# 进行文本分词之后的数据存储磁盘路径
word_file_path = './datas/cut_words_of_in_the_name_of_people.txt'
# Word2Vec模型持久化保存的磁盘路径
model_file_path1 = './datas/gensim_word2vec1.w2v'
model_file_path2 = './datas/gensim_word2vec2.bin'
model_file_path3 = './datas/gensim_word2vec3_{}.npy'

## 一、分词

In [8]:
# 人民的名义 小说分词
jieba.suggest_freq('沙瑞金',True)
jieba.suggest_freq('侯处长',True)
jieba.suggest_freq('田国富',True)
jieba.suggest_freq('高育良',True)
jieba.suggest_freq('侯亮平',True)
jieba.suggest_freq('钟小艾', True)
jieba.suggest_freq('陈岩石', True)
jieba.suggest_freq('欧阳菁', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('蔡成功', True)
jieba.suggest_freq('孙连城', True)
jieba.suggest_freq('季昌明', True)
jieba.suggest_freq('丁义珍', True)
jieba.suggest_freq('郑西坡', True)
jieba.suggest_freq('赵东来', True)
jieba.suggest_freq('高小琴', True)
jieba.suggest_freq('赵瑞龙', True)
jieba.suggest_freq('林华华', True)
jieba.suggest_freq('陆亦可', True)
jieba.suggest_freq('刘新建', True)
jieba.suggest_freq('刘庆祝', True)
jieba.suggest_freq('京州市', True)
jieba.suggest_freq('副市长', True)
jieba.suggest_freq('赵德汉', True)
jieba.suggest_freq('吴彩霞', True)
jieba.add_word('陈海', 100)

# 自定义词典
jieba.add_word('人民的名义')
jieba.add_word('数字版')
jieba.add_word('中文在线数字出版集团股份有限公司')
jieba.add_word('离婚法')

print("对文件【{}】里面的内容进行分词!".format(sentence_file_path))
print("并且将分词结果数据保存到文件【{}】中!".format(word_file_path))
with open(word_file_path,'w', encoding='utf-8') as writer:
    with open(sentence_file_path, 'r', encoding='utf-8') as reader:
        # 加载所有数据
        content = reader.read()
        
        # 分词
        content = jieba.cut(content)
        
        # 合并结果
        result = ' '.join(content)
        
        # 结果输出
        writer.write(result)
print("Done!!!")

对文件【./datas/in_the_name_of_people.txt】里面的内容进行分词!
并且将分词结果数据保存到文件【./datas/cut_words_of_in_the_name_of_people.txt】中!
Done!!!


## 二、Gensim Word2Vec构建

In [13]:
# 按行数据加载(认为每行是一个文档), 并且会将每一行的前后空格去除，并且按照空格拆分成单词集合
sentences = word2vec.LineSentence(word_file_path) 

k = 0
start = 80
end = 85
for sentence in sentences:
    if k >= start:
        print("=" * 50)
        print(sentence)
        if k >= end:
            break
    k += 1

['五十三']
['五十四']
['一']
['侯亮平', '得知', '航班', '无限期', '延误', '，', '急得', '差点', '跳', '起来', '。', '他本', '打算', '坐', '最后', '一班', '飞机', '赶往', 'H', '省', '，', '协调', '指挥', '抓捕', '京州市', '副市长', '丁义珍', '的', '行动', '，', '这下子', '计划', '全', '落空', '了', '。', '广播', '中', '一遍', '遍', '传来', '女', '播音员', '中英文', '抱歉', '的', '通知', '，', '机场', '上空', '有', '雷暴', '区', '，', '为了', '乘客', '安全', '，', '飞机', '暂时', '无法', '起飞', '。', '侯亮平', '额上', '沁出', '一层', '细细的', '汗珠', '，', '早', '知道', '被困', '机场', '的', '痛苦', '，', '现在', '又', '得', '尝', '一次', '滋味', '了', '。']
['电视', '大', '荧屏', '正', '放映', '气象图', '，', '一', '团团', '浓厚', '的', '白云', '呈', '旋涡', '状', '翻卷', '，', '十分', '凶险', '的', '样子', '。', '字幕', '普及', '着', '航空', '知识', '—', '—', '雷暴', '如何', '危及', '飞行', '安全', '，', '误入', '雷暴', '区', '曾', '如何', '导致', '空难', '。', '但', '这', '一切', '根本', '不能', '平息', '人们', '焦虑', '的', '心情', '，', '整个', '候机', '大厅', '这时', '似乎', '已经', '变', '作', '巨型', '蜂巢', '，', '嗡嗡嘤嘤', '，', '噪声', '四起', '。', '旅客', '们', '分堆', '围住', '各', '值机', '台', '的', '机场', '工作人员', '，', '吵吵嚷嚷', '，', '无非', '是', '打听

#### 训练方式一

In [14]:
# 按行数据加载, 最终形成的数据格式为: list(list(string))
sentences = word2vec.LineSentence(word_file_path) 

# 训练Word2Vec模型
"""
__init__(self, sentences=None, size=100, alpha=0.025, 
        window=5, min_count=5, max_vocab_size=None, 
        sample=0.001, seed=1, workers=3, min_alpha=0.0001, 
        sg=0, hs=0, negative=5, cbow_mean=1, 
        hashfxn=<built-in function hash>, iter=5, null_word=0, 
        trim_rule=None, sorted_vocab=1, batch_words=10000, 
        compute_loss=False, callbacks=())
sentences: 给定文档集合
size:转换之后的特征向量的维度大小
window：窗口大小
min_count：如果某一个单词出现次数小于min_count,那么该单词不计算对应的词向量
max_vocab_size：给定最多计算的词汇数目，None表示不限制。
sg: 1(Skip-gram) 0(CBOW)， 默认为0
hs: 1(hierarchical softmax) 0(negative)， 默认为0
negative: 当hs为0的时候，给定负样本数目，给定为0表示不采用负采样
iter：模型训练的迭代次数
alpha: 梯度下降的学习率，在更新过程会进行线性的递减到min_alpha。
"""
model = word2vec.Word2Vec(sentences, hs = 1,min_count = 1,
                          window = 3,size = 100, compute_loss=True)

2020-03-11 11:44:47,977 : INFO : collecting all words and their counts
2020-03-11 11:44:47,978 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-11 11:44:48,052 : INFO : collected 17856 word types from a corpus of 161191 raw words and 2311 sentences
2020-03-11 11:44:48,054 : INFO : Loading a fresh vocabulary
2020-03-11 11:44:48,091 : INFO : effective_min_count=1 retains 17856 unique words (100% of original 17856, drops 0)
2020-03-11 11:44:48,092 : INFO : effective_min_count=1 leaves 161191 word corpus (100% of original 161191, drops 0)
2020-03-11 11:44:48,143 : INFO : deleting the raw counts dictionary of 17856 items
2020-03-11 11:44:48,144 : INFO : sample=0.001 downsamples 38 most-common words
2020-03-11 11:44:48,145 : INFO : downsampling leaves estimated 120419 word corpus (74.7% of prior 161191)
2020-03-11 11:44:48,162 : INFO : constructing a huffman tree from 17856 words
2020-03-11 11:44:48,616 : INFO : built huffman tree with maximum node depth 17


#### 训练方式二

In [56]:
# 每行数据加载
sentences = word2vec.LineSentence(word_file_path) 

# 创建Word2Vec模型
"""
__init__(self, sentences=None, size=100, alpha=0.025, 
        window=5, min_count=5, max_vocab_size=None, 
        sample=0.001, seed=1, workers=3, min_alpha=0.0001, 
        sg=0, hs=0, negative=5, cbow_mean=1, 
        hashfxn=<built-in function hash>, iter=5, null_word=0, 
        trim_rule=None, sorted_vocab=1, batch_words=10000, 
        compute_loss=False, callbacks=())
sg: 1(Skip-gram) 0(CBOW)
hs: 1(hierarchical softmax) 0(negative)
negative: 当hs为0的时候，给定负样本数目，给定为0表示不采用负采样
"""
model = word2vec.Word2Vec(hs = 1,min_count = 1,window = 3,size = 100)

# 构建词典(单词和id之间的映射关系)
model.build_vocab(sentences)

# 模型训练
print("总文档数目:{}".format(model.corpus_count))

# 可以一次性迭代训练10次(10epoch)
# model.train(sentences, total_examples=model.corpus_count, epochs=10)

# 也可以每次训练1次(1个epoch)，但是每次训练后都可以进行参数修改
for epoch in range(10):
    model.train(sentences, total_examples=model.corpus_count, epochs=1)
    # 可以进行参数修改
    model.alpha -= 0.002
    model.min_alpha = 0.001 * model.alpha

2020-03-11 14:41:26,681 : INFO : collecting all words and their counts
2020-03-11 14:41:26,687 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-11 14:41:26,773 : INFO : collected 17856 word types from a corpus of 161191 raw words and 2311 sentences
2020-03-11 14:41:26,774 : INFO : Loading a fresh vocabulary
2020-03-11 14:41:26,832 : INFO : effective_min_count=1 retains 17856 unique words (100% of original 17856, drops 0)
2020-03-11 14:41:26,833 : INFO : effective_min_count=1 leaves 161191 word corpus (100% of original 161191, drops 0)
2020-03-11 14:41:26,890 : INFO : deleting the raw counts dictionary of 17856 items
2020-03-11 14:41:26,898 : INFO : sample=0.001 downsamples 38 most-common words
2020-03-11 14:41:26,899 : INFO : downsampling leaves estimated 120419 word corpus (74.7% of prior 161191)
2020-03-11 14:41:26,915 : INFO : constructing a huffman tree from 17856 words
2020-03-11 14:41:27,709 : INFO : built huffman tree with maximum node depth 17


总文档数目:2311


2020-03-11 14:41:28,156 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-03-11 14:41:28,158 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-03-11 14:41:28,169 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-03-11 14:41:28,170 : INFO : EPOCH - 1 : training on 161191 raw words (120452 effective words) took 0.2s, 546913 effective words/s
2020-03-11 14:41:28,172 : INFO : training on a 161191 raw words (120452 effective words) took 0.2s, 532718 effective words/s
2020-03-11 14:41:28,176 : INFO : training model with 3 workers on 17856 vocabulary and 100 features, using sg=0 hs=1 sample=0.001 negative=5 window=3
2020-03-11 14:41:28,385 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-03-11 14:41:28,394 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-03-11 14:41:28,401 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-03-11 14:41:28,403 : INFO : EPOCH - 1 :

## 三、Word2Vec应用

### 1. 获取相似度/相关性最高的K个演员
从训练数据中获取最相似的topn个单词

In [57]:
# 获取和"沙瑞金"这个单词向量最相似的topn个单词，以及这些单词的夹角余弦相似度
model.wv.similar_by_word('沙瑞金', topn =10)

2020-03-11 14:41:47,973 : INFO : precomputing L2-norms of word weight vectors


[('田国富', 0.9411921501159668),
 ('高育良', 0.9379141330718994),
 ('季昌明', 0.9015998840332031),
 ('这位', 0.8981113433837891),
 ('祁同伟', 0.896399974822998),
 ('李达康', 0.8698292970657349),
 ('易学习', 0.8689755201339722),
 ('吴彩霞', 0.8689247369766235),
 ('做个', 0.8619071245193481),
 ('陈岩石', 0.8618505001068115)]

In [20]:
import jieba.posseg as pseg

jieba.add_word('沙瑞金',10,'nr')
jieba.add_word('田国富',10,'nr')
jieba.add_word('高育良',10,'nr')
jieba.add_word('侯亮平',10,'nr')
jieba.add_word('钟小艾', 10,'nr')
jieba.add_word('陈岩石', 10,'nr')
jieba.add_word('欧阳菁', 10,'nr')
jieba.add_word('易学习', 10,'nr')
jieba.add_word('王大路', 10,'nr')
jieba.add_word('蔡成功', 10,'nr')
jieba.add_word('孙连城', 10,'nr')
jieba.add_word('季昌明', 10,'nr')
jieba.add_word('丁义珍', 10,'nr')
jieba.add_word('郑西坡', 10,'nr')
jieba.add_word('赵东来', 10,'nr')
jieba.add_word('高小琴', 10,'nr')
jieba.add_word('赵瑞龙', 10,'nr')
jieba.add_word('林华华', 10,'nr')
jieba.add_word('陆亦可', 10,'nr')
jieba.add_word('刘新建', 10,'nr')
jieba.add_word('刘庆祝', 10,'nr')
jieba.add_word('京州市', 10,'nr')
jieba.add_word('副市长', 10,'nr')
jieba.add_word('赵德汉',10,'nr')


tmp01 = model.wv.similar_by_word('沙瑞金', topn=100)
tmp01 = ''.join(map(lambda t:t[0], tmp01))
words = pseg.lcut(tmp01)
result = ' '.join([word for word,flag in words if flag == 'nr'])
print(result)


高育良 季昌明 田国富 李达康 侯亮平 陆亦可 易学习 陈岩石 吴慧芬 赵东来 肖钢玉 祁同伟 钟小艾 孙书记 梁璐 郑西坡 赵德汉 季检 欧阳菁 赵瑞龙 陈清泉 赵立春 吴春林 沙 孙连城 敬畏 刘总 李 达康 谢谢您


In [21]:
# 夹角余弦相似度
req_count = 5
for key in model.wv.similar_by_word('沙瑞金', topn =100):
    if len(key[0])==3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break;

高育良 0.8608838319778442
季昌明 0.8341003656387329
田国富 0.830956757068634
李达康 0.8165519833564758
侯亮平 0.7882125377655029


### 2. 获取单词之间的相似度

In [23]:
# 获取单词向量的夹角余弦相似度/相关性
print(model.wv.similarity('沙瑞金', '高育良'))

0.8608839


### 3. 获取单词的词向量

In [24]:
v1 = model.wv["提拔"]
print(v1.shape)
print(v1)

(100,)
[ 0.25345016 -0.0069958  -0.28571624 -0.1290999  -0.16687526 -0.13535061
 -0.19934966 -0.10971514  0.18869436 -0.33045304  0.11058953  0.15518627
 -0.27602708  0.02290593  0.13005173 -0.05552704  0.31312478 -0.15938525
 -0.02327144 -0.04423758  0.07495979 -0.03580485  0.28716636 -0.14107184
  0.09984712 -0.13862452 -0.15097493 -0.05058463  0.12470543 -0.08889644
 -0.04234942 -0.31579977 -0.03215234  0.12022522 -0.04640454 -0.05165878
 -0.13708968  0.02664147  0.0713142  -0.55306405 -0.36513776  0.06065075
  0.07880416  0.27386206  0.23345795 -0.14853674  0.14477734  0.13385785
 -0.4448452   0.04699555 -0.0059435  -0.01429358 -0.21083947  0.06818633
 -0.4429156   0.29825822 -0.27673772 -0.10676947 -0.04256191  0.04803126
  0.07912153 -0.00751675  0.10841286  0.04414946  0.10697284 -0.08935854
 -0.06691136  0.22935225 -0.00337701 -0.42719752  0.02492289 -0.11838258
 -0.12504445 -0.07779804 -0.2023888  -0.4704439   0.10455754 -0.06818312
 -0.31577206  0.10807444 -0.03278089 -0.5052

In [25]:
v1 = model.wv.get_vector("提拔")
print(v1.shape)
print(v1)

(100,)
[ 0.25345016 -0.0069958  -0.28571624 -0.1290999  -0.16687526 -0.13535061
 -0.19934966 -0.10971514  0.18869436 -0.33045304  0.11058953  0.15518627
 -0.27602708  0.02290593  0.13005173 -0.05552704  0.31312478 -0.15938525
 -0.02327144 -0.04423758  0.07495979 -0.03580485  0.28716636 -0.14107184
  0.09984712 -0.13862452 -0.15097493 -0.05058463  0.12470543 -0.08889644
 -0.04234942 -0.31579977 -0.03215234  0.12022522 -0.04640454 -0.05165878
 -0.13708968  0.02664147  0.0713142  -0.55306405 -0.36513776  0.06065075
  0.07880416  0.27386206  0.23345795 -0.14853674  0.14477734  0.13385785
 -0.4448452   0.04699555 -0.0059435  -0.01429358 -0.21083947  0.06818633
 -0.4429156   0.29825822 -0.27673772 -0.10676947 -0.04256191  0.04803126
  0.07912153 -0.00751675  0.10841286  0.04414946  0.10697284 -0.08935854
 -0.06691136  0.22935225 -0.00337701 -0.42719752  0.02492289 -0.11838258
 -0.12504445 -0.07779804 -0.2023888  -0.4704439   0.10455754 -0.06818312
 -0.31577206  0.10807444 -0.03278089 -0.5052

In [28]:
# 异常：不存在"小明"这个单词
word = "小明"
if word in model.wv:
    print(model.wv[word])
"小明" not in model.wv
# model.wv.get_vector("小明")

True

## 四、模型持久化&模型恢复加载

### 方式一：
直接使用save API进行模型持久化

#### 持久化

In [29]:
model.save(model_file_path1)

2020-03-11 11:57:12,516 : INFO : saving Word2Vec object under ./datas/gensim_word2vec1.w2v, separately None
2020-03-11 11:57:12,518 : INFO : not storing attribute vectors_norm
2020-03-11 11:57:12,519 : INFO : not storing attribute cum_table
2020-03-11 11:57:13,076 : INFO : saved ./datas/gensim_word2vec1.w2v


#### 加载

In [30]:
# 直接基于路径加载
model2 = word2vec.Word2Vec.load(model_file_path1)
print(model2)

v1 = model2.wv.get_vector("提拔")
print(v1.shape)
print(v1)

2020-03-11 11:57:47,280 : INFO : loading Word2Vec object from ./datas/gensim_word2vec1.w2v
2020-03-11 11:57:47,537 : INFO : loading wv recursively from ./datas/gensim_word2vec1.w2v.wv.* with mmap=None
2020-03-11 11:57:47,538 : INFO : setting ignored attribute vectors_norm to None
2020-03-11 11:57:47,539 : INFO : loading vocabulary recursively from ./datas/gensim_word2vec1.w2v.vocabulary.* with mmap=None
2020-03-11 11:57:47,540 : INFO : loading trainables recursively from ./datas/gensim_word2vec1.w2v.trainables.* with mmap=None
2020-03-11 11:57:47,541 : INFO : setting ignored attribute cum_table to None
2020-03-11 11:57:47,543 : INFO : loaded ./datas/gensim_word2vec1.w2v


Word2Vec(vocab=17856, size=100, alpha=0.025)
(100,)
[ 0.25345016 -0.0069958  -0.28571624 -0.1290999  -0.16687526 -0.13535061
 -0.19934966 -0.10971514  0.18869436 -0.33045304  0.11058953  0.15518627
 -0.27602708  0.02290593  0.13005173 -0.05552704  0.31312478 -0.15938525
 -0.02327144 -0.04423758  0.07495979 -0.03580485  0.28716636 -0.14107184
  0.09984712 -0.13862452 -0.15097493 -0.05058463  0.12470543 -0.08889644
 -0.04234942 -0.31579977 -0.03215234  0.12022522 -0.04640454 -0.05165878
 -0.13708968  0.02664147  0.0713142  -0.55306405 -0.36513776  0.06065075
  0.07880416  0.27386206  0.23345795 -0.14853674  0.14477734  0.13385785
 -0.4448452   0.04699555 -0.0059435  -0.01429358 -0.21083947  0.06818633
 -0.4429156   0.29825822 -0.27673772 -0.10676947 -0.04256191  0.04803126
  0.07912153 -0.00751675  0.10841286  0.04414946  0.10697284 -0.08935854
 -0.06691136  0.22935225 -0.00337701 -0.42719752  0.02492289 -0.11838258
 -0.12504445 -0.07779804 -0.2023888  -0.4704439   0.10455754 -0.06818312

### 方式二：
保存为二进制词向量或者文本向量

#### 持久化

In [34]:
# 将数据保存为二进制的格式
model.wv.save_word2vec_format(model_file_path2,binary=True)

# 将数据保存为txt文本格式
# model.wv.save_word2vec_format(model_file_path2,binary=False)

2020-03-11 12:01:41,069 : INFO : storing 17856x100 projection weights into ./datas/gensim_word2vec2.bin


#### 加载

In [35]:
# 加载模型
# 加载二进制格式保存的模型
model2 = gensim.models.KeyedVectors.load_word2vec_format(model_file_path2,binary=True)

# 加载txt文本格式模型数据
# model2 = gensim.models.KeyedVectors.load_word2vec_format(model_file_path2,binary=False)
print(model2)

# 应用模型
v1 = model2.get_vector("提拔")
print(v1.shape)
print(v1)

2020-03-11 12:02:02,389 : INFO : loading projection weights from ./datas/gensim_word2vec2.bin
2020-03-11 12:02:02,861 : INFO : loaded (17856, 100) matrix from ./datas/gensim_word2vec2.bin


<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x000001262FD98240>
(100,)
[ 0.25345016 -0.0069958  -0.28571624 -0.1290999  -0.16687526 -0.13535061
 -0.19934966 -0.10971514  0.18869436 -0.33045304  0.11058953  0.15518627
 -0.27602708  0.02290593  0.13005173 -0.05552704  0.31312478 -0.15938525
 -0.02327144 -0.04423758  0.07495979 -0.03580485  0.28716636 -0.14107184
  0.09984712 -0.13862452 -0.15097493 -0.05058463  0.12470543 -0.08889644
 -0.04234942 -0.31579977 -0.03215234  0.12022522 -0.04640454 -0.05165878
 -0.13708968  0.02664147  0.0713142  -0.55306405 -0.36513776  0.06065075
  0.07880416  0.27386206  0.23345795 -0.14853674  0.14477734  0.13385785
 -0.4448452   0.04699555 -0.0059435  -0.01429358 -0.21083947  0.06818633
 -0.4429156   0.29825822 -0.27673772 -0.10676947 -0.04256191  0.04803126
  0.07912153 -0.00751675  0.10841286  0.04414946  0.10697284 -0.08935854
 -0.06691136  0.22935225 -0.00337701 -0.42719752  0.02492289 -0.11838258
 -0.12504445 -0.07779804 -0.2023888  -

In [39]:
# 加载其它模型(一般用于加载别人训练好的Word2Vec模型参数<开源>)
other_model_file_path = './datas/vectors.bin'
other_model = gensim.models.KeyedVectors.load_word2vec_format(
    other_model_file_path,binary=True)
print(other_model)

# 应用模型
word = '提拔'
word = '酒店'
if word in other_model:
    print("【{}】:\n{}".format(word, other_model[word]))
else:
    print("【{}】不存在！！！".format(word))

# 将模型保存为文本形式
other_model.wv.save_word2vec_format('./datas/vectors.txt',binary=False)

2020-03-11 12:04:32,800 : INFO : loading projection weights from ./datas/vectors.bin
2020-03-11 12:04:32,992 : INFO : loaded (7942, 128) matrix from ./datas/vectors.bin
2020-03-11 12:04:32,997 : INFO : storing 7942x128 projection weights into ./datas/vectors.txt


<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x000001265B3D61D0>
【酒店】:
[ 0.07507085 -0.06233635 -0.08300601 -0.09476656  0.19525695  0.08683676
  0.267884    0.03470918  0.14521684  0.00294149 -0.02735998 -0.09170757
  0.00139181  0.2421832  -0.09511401  0.0341509   0.0283838   0.32073992
  0.172025    0.20465788  0.05929533  0.07462119 -0.23834492  0.0421031
 -0.0448295  -0.02866336  0.05001667 -0.1257836   0.22431172 -0.0807058
 -0.12493779 -0.05265829  0.13126895  0.15061386 -0.19615541 -0.09053446
 -0.05627611 -0.23135136 -0.01231913 -0.23680945 -0.04299964  0.3667591
 -0.06821534 -0.29599202  0.34265348 -0.04311483 -0.21866152 -0.2495054
 -0.43372962  0.0463162   0.11516414  0.07433167  0.09803177 -0.06165684
 -0.1319202   0.04795204  0.2651979   0.09628078 -0.16381025  0.12577897
  0.11097272 -0.15810862 -0.07860032  0.08296187  0.12618895  0.28030068
 -0.00506122  0.05640052  0.11889897 -0.11027029 -0.14830197 -0.00842615
  0.01188814  0.19347009  0.00282715 -0.322

### 方式三：
直接使用NumPy API保存词向量信息

#### 持久化

In [40]:
# 获取词向量
norm_word_embeddings = model.wv.vectors_norm # 经过L2转换后的词向量映射矩阵
word_embeddings = model.wv.vectors # 原始词向量映射矩阵
# 获取词典(词典到idx的映射)
vocab_2_index = list(map(lambda k: (k, model.wv.vocab[k].index), model.wv.vocab))
print(np.shape(norm_word_embeddings), np.shape(word_embeddings), np.shape(vocab_2_index))
# 数据保存
np.save(model_file_path3.format("norm_embedding"), norm_word_embeddings)
np.save(model_file_path3.format("embedding"), word_embeddings)
np.save(model_file_path3.format("vocab_2_index"), vocab_2_index)

(17856, 100) (17856, 100) (17856, 2)


#### 加载

In [41]:
# 加载数据
norm_word_embeddings = np.load(model_file_path3.format("norm_embedding"))
word_embeddings = np.load(model_file_path3.format("embedding"))
vocab_2_index = np.load(model_file_path3.format("vocab_2_index"))

# 字典转换
vocab_2_index = dict(map(lambda t:(t[0], int(t[1])), vocab_2_index))

# 获取数据
word = "提拔"
index = vocab_2_index[word] # 得到单词对应的id索引
v1 = word_embeddings[index] # 根据id索引获取对应的向量
print(v1.shape)
print(v1)

(100,)
[ 0.25345016 -0.0069958  -0.28571624 -0.1290999  -0.16687526 -0.13535061
 -0.19934966 -0.10971514  0.18869436 -0.33045304  0.11058953  0.15518627
 -0.27602708  0.02290593  0.13005173 -0.05552704  0.31312478 -0.15938525
 -0.02327144 -0.04423758  0.07495979 -0.03580485  0.28716636 -0.14107184
  0.09984712 -0.13862452 -0.15097493 -0.05058463  0.12470543 -0.08889644
 -0.04234942 -0.31579977 -0.03215234  0.12022522 -0.04640454 -0.05165878
 -0.13708968  0.02664147  0.0713142  -0.55306405 -0.36513776  0.06065075
  0.07880416  0.27386206  0.23345795 -0.14853674  0.14477734  0.13385785
 -0.4448452   0.04699555 -0.0059435  -0.01429358 -0.21083947  0.06818633
 -0.4429156   0.29825822 -0.27673772 -0.10676947 -0.04256191  0.04803126
  0.07912153 -0.00751675  0.10841286  0.04414946  0.10697284 -0.08935854
 -0.06691136  0.22935225 -0.00337701 -0.42719752  0.02492289 -0.11838258
 -0.12504445 -0.07779804 -0.2023888  -0.4704439   0.10455754 -0.06818312
 -0.31577206  0.10807444 -0.03278089 -0.5052

# 五、扩展：直接从文件中读取数据来进行模型训练

In [42]:
from gensim import utils
from gensim.models.word2vec import Word2Vec


class MyData(object):
    def __iter__(self):
        path = word_file_path
        with open(path, 'r', encoding='utf-8') as reader:
            for line in reader:
                yield list(utils.tokenize(line))

# 模型构建
model = Word2Vec(hs = 1,min_count = 1,window = 3,size = 100, sentences=MyData(), iter=50)

2020-03-11 12:06:41,526 : INFO : collecting all words and their counts
2020-03-11 12:06:41,528 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-11 12:06:41,663 : INFO : collected 17800 word types from a corpus of 131991 raw words and 2422 sentences
2020-03-11 12:06:41,664 : INFO : Loading a fresh vocabulary
2020-03-11 12:06:41,701 : INFO : effective_min_count=1 retains 17800 unique words (100% of original 17800, drops 0)
2020-03-11 12:06:41,702 : INFO : effective_min_count=1 leaves 131991 word corpus (100% of original 131991, drops 0)
2020-03-11 12:06:41,760 : INFO : deleting the raw counts dictionary of 17800 items
2020-03-11 12:06:41,762 : INFO : sample=0.001 downsamples 40 most-common words
2020-03-11 12:06:41,762 : INFO : downsampling leaves estimated 111227 word corpus (84.3% of prior 131991)
2020-03-11 12:06:41,779 : INFO : constructing a huffman tree from 17800 words
2020-03-11 12:06:42,239 : INFO : built huffman tree with maximum node depth 17


In [43]:
# 夹角余弦相似度
req_count = 5
for key in model.wv.similar_by_word('沙瑞金', topn =100):
    if len(key[0])==3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break;

2020-03-11 12:07:03,280 : INFO : precomputing L2-norms of word weight vectors


李达康 0.664504885673523
高育良 0.6642884016036987
季昌明 0.5894726514816284
田国富 0.5871814489364624
易学习 0.553726315498352


## 六、效果可视化

In [44]:
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt

# 解决中文显示问题
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

In [45]:
%matplotlib tk
# %matplotlib inline

In [53]:
# 获取词向量
word_embeddings = model.wv.vectors # 原始词向量映射矩阵
# 获取词典(词典到idx的映射)
index_2_vocab = dict(list(map(lambda k: (model.wv.vocab[k].index, k), model.wv.vocab)))

  app.launch_new_instance()


In [54]:
viz_words = 1000
tsne = TSNE() # 将高维数据映射到低维空间上，主要目的是为了可视化
# 直接可视化前100个单词
# embed_tsne = tsne.fit_transform(word_embeddings[:viz_words, :])

# 可视化比较特殊的一些单词
words = list(filter(lambda word: len(word) == 2, [t[1] for t in index_2_vocab.items()]))
words = words[1500: 1500+viz_words]

# words = ['侯处长','沙瑞金','田国富','高育良','侯亮平','钟小艾',
#         '陈岩石','欧阳菁','易学习','王大路','蔡成功',
#         '孙连城','季昌明','丁义珍','郑西坡','赵东来',
#         '高小琴','赵瑞龙','陆亦可','刘新建',
#         '刘庆祝','京州市','副市长','赵德汉']


viz_words = len(words)
indexs = [model.wv.vocab[word].index for word in words]
embed_tsne = tsne.fit_transform(word_embeddings[indexs, :])




In [55]:
fig, ax = plt.subplots(figsize=(14, 14))
for idx in range(viz_words):
    word_idx = indexs[idx]
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(index_2_vocab[word_idx], 
                 (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
plt.show()