# 实现思路：
将单个中文文字看成一个词，然后使用Word2Vec的API来训练，结果实际上是Char2Vec的向量转换

In [1]:
# import modules & set up logging
import logging # 打印日志的模块，日志主要包括以下几个级别: DEBUG、INFO、WARN、ERROR
import os

import numpy as np

import gensim
from gensim.models import word2vec

import jieba.analyse
import jieba

In [2]:
# set up logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
sentence_file_path = './datas/in_the_name_of_people.txt'
word_file_path = './datas/cut_char_words_of_in_the_name_of_people.txt'
model_file_path1 = './datas/gensim_char_word2vec1.w2v'
model_file_path2 = './datas/gensim_char_word2vec2.bin'
model_file_path3 = './datas/gensim_char_word2vec3_{}.npy'

## 一、分词

In [4]:
# 人民的名义 小说分词
jieba.suggest_freq('沙瑞金',True)
jieba.suggest_freq('田国富',True)
jieba.suggest_freq('高育良',True)
jieba.suggest_freq('侯亮平',True)
jieba.suggest_freq('钟小艾', True)
jieba.suggest_freq('陈岩石', True)
jieba.suggest_freq('欧阳菁', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('蔡成功', True)
jieba.suggest_freq('孙连城', True)
jieba.suggest_freq('季昌明', True)
jieba.suggest_freq('丁义珍', True)
jieba.suggest_freq('郑西坡', True)
jieba.suggest_freq('赵东来', True)
jieba.suggest_freq('高小琴', True)
jieba.suggest_freq('赵瑞龙', True)
jieba.suggest_freq('林华华', True)
jieba.suggest_freq('陆亦可', True)
jieba.suggest_freq('刘新建', True)
jieba.suggest_freq('刘庆祝', True)
jieba.suggest_freq('京州市', True)
jieba.suggest_freq('副市长', True)
jieba.suggest_freq('赵德汉',True)

# 自定义词典
jieba.add_word('人民的名义')

with open(word_file_path,'w', encoding='utf-8') as writer:
    with open(sentence_file_path, 'r', encoding='utf-8') as reader:
        # 加载所有数据
        content = reader.read()
        
        # 分词
        content = list(content)
        print("总单词数目(去重前):{}".format(len(content)))
        print("总单词数目(去重后):{}".format(len(set(content))))
        
        # 合并结果
        result = ' '.join(content)
        
        # 结果输出
        writer.write(result)
print("Done!!!")

Building prefix dict from the default dictionary ...
2020-04-03 14:30:16,318 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
2020-04-03 14:30:16,331 : DEBUG : Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.681 seconds.
2020-04-03 14:30:17,006 : DEBUG : Loading model cost 0.681 seconds.
Prefix dict has been built succesfully.
2020-04-03 14:30:17,008 : DEBUG : Prefix dict has been built succesfully.


总单词数目(去重前):266166
总单词数目(去重后):3272
Done!!!


## 二、Gensim Word2Vec构建

In [5]:
# 按行数据加载
sentences = word2vec.LineSentence(word_file_path) 

k = 0
for sentence in sentences:
    print("=" * 50)
    print(sentence)
    k+=1
    if k >= 10:
        break

['\ufeff']
['人', '民', '的', '名', '义']
['周', '梅', '森']
['©', '中', '文', '在', '线', '数', '字', '出', '版', '集', '团', '股', '份', '有', '限', '公', '司', '，', '2', '0', '1', '6', '-', '2', '0', '1', '7']
['数', '字', '版', '图', '书', '版', '权', '信', '息']
['人', '民', '的', '名', '义', '/', '周', '梅', '森', '著', '.', '北', '京', '：', '中', '文', '在', '线', '数', '字', '出', '版', '集', '团', '股', '份', '有', '限', '公', '司', '，', '2', '0', '1', '7', '.', '2', '.']
['C', 'A', 'E', 'B', 'N', '：', '7', '-', '0', '0', '1', '-', '0', '0', '0', '-', '6', '0', '7', '3', '3', '6', '3', '8', '-', '6']
['分', '类', '号', '：', '长', '篇', '小', '说', '—', '—', '中', '国', '—', '—', '当', '代', 'I', '2', '4', '7', '.', '5', '4']
['互', '联', '网', '出', '版', '许', '可', '证', '：', '新', '出', '网', '证', '（', '京', '）', '字', '0', '4', '5', '号']
['人', '民', '的', '名', '义']


#### 训练方式一

In [6]:
# 按行数据加载
sentences = word2vec.LineSentence(word_file_path) 

# 训练Word2Vec模型
"""
__init__(self, sentences=None, size=100, alpha=0.025, 
        window=5, min_count=5, max_vocab_size=None, 
        sample=0.001, seed=1, workers=3, min_alpha=0.0001, 
        sg=0, hs=0, negative=5, cbow_mean=1, 
        hashfxn=<built-in function hash>, iter=5, null_word=0, 
        trim_rule=None, sorted_vocab=1, batch_words=10000, 
        compute_loss=False, callbacks=())
sentences: 给定文档集合
size:转换之后的特征向量的维度大小
window：窗口大小，如果是Word2Vec一般设置为5~10左右；如果是短文本，那么就小一点；如果长文本，那么就大一点。如果是Char2Vec建议稍微大一点。
min_count：如果某一个单词出现次数小于min_count,那么该单词不计算对应的词向量
max_vocab_size：给定最多计算的词汇数目，None表示不限制。
sg: 1(Skip-gram) 0(CBOW)， 默认为0
hs: 1(hierarchical softmax) 0(negative)， 默认为0
negative: 当hs为0的时候，给定负样本数目，给定为0表示不采用负采样
iter：模型训练的迭代次数
"""
model = word2vec.Word2Vec(sentences, hs = 0,min_count = 2,
                          window = 10,size = 100, compute_loss=True)

2020-04-03 14:30:17,115 : INFO : collecting all words and their counts
2020-04-03 14:30:17,116 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-03 14:30:17,187 : INFO : collected 3269 word types from a corpus of 258973 raw words and 2311 sentences
2020-04-03 14:30:17,188 : INFO : Loading a fresh vocabulary
2020-04-03 14:30:17,197 : INFO : effective_min_count=2 retains 2802 unique words (85% of original 3269, drops 467)
2020-04-03 14:30:17,199 : INFO : effective_min_count=2 leaves 258506 word corpus (99% of original 258973, drops 467)
2020-04-03 14:30:17,207 : INFO : deleting the raw counts dictionary of 3269 items
2020-04-03 14:30:17,209 : INFO : sample=0.001 downsamples 69 most-common words
2020-04-03 14:30:17,210 : INFO : downsampling leaves estimated 203380 word corpus (78.7% of prior 258506)
2020-04-03 14:30:17,219 : INFO : estimated required memory for 2802 words and 100 dimensions: 3642600 bytes
2020-04-03 14:30:17,221 : INFO : resetting layer we

#### 训练方式二

In [7]:
# 每行数据加载
sentences = word2vec.LineSentence(word_file_path) 

# 训练Word2Vec模型
"""
__init__(self, sentences=None, size=100, alpha=0.025, 
        window=5, min_count=5, max_vocab_size=None, 
        sample=0.001, seed=1, workers=3, min_alpha=0.0001, 
        sg=0, hs=0, negative=5, cbow_mean=1, 
        hashfxn=<built-in function hash>, iter=5, null_word=0, 
        trim_rule=None, sorted_vocab=1, batch_words=10000, 
        compute_loss=False, callbacks=())
sg: 1(Skip-gram) 0(CBOW)
hs: 1(hierarchical softmax) 0(negative)
negative: 当hs为0的时候，给定负样本数目，给定为0表示不采用负采样
"""
model = word2vec.Word2Vec(hs = 0,min_count = 1, window = 10,size = 100, compute_loss=True)

# 构建词典
model.build_vocab(sentences)

# 模型训练
print("总文档数目:{}".format(model.corpus_count))
model.train(sentences, total_examples=model.corpus_count, epochs=10)

2020-04-03 14:30:18,203 : INFO : collecting all words and their counts
2020-04-03 14:30:18,205 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-03 14:30:18,276 : INFO : collected 3269 word types from a corpus of 258973 raw words and 2311 sentences
2020-04-03 14:30:18,277 : INFO : Loading a fresh vocabulary
2020-04-03 14:30:18,283 : INFO : effective_min_count=1 retains 3269 unique words (100% of original 3269, drops 0)
2020-04-03 14:30:18,284 : INFO : effective_min_count=1 leaves 258973 word corpus (100% of original 258973, drops 0)
2020-04-03 14:30:18,295 : INFO : deleting the raw counts dictionary of 3269 items
2020-04-03 14:30:18,297 : INFO : sample=0.001 downsamples 69 most-common words
2020-04-03 14:30:18,299 : INFO : downsampling leaves estimated 203917 word corpus (78.7% of prior 258973)
2020-04-03 14:30:18,309 : INFO : estimated required memory for 3269 words and 100 dimensions: 4249700 bytes
2020-04-03 14:30:18,310 : INFO : resetting layer weig

总文档数目:2311


2020-04-03 14:30:18,718 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-03 14:30:18,720 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-03 14:30:18,726 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-03 14:30:18,727 : INFO : EPOCH - 2 : training on 258973 raw words (204018 effective words) took 0.2s, 1089898 effective words/s
2020-04-03 14:30:18,903 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-03 14:30:18,908 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-03 14:30:18,913 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-03 14:30:18,914 : INFO : EPOCH - 3 : training on 258973 raw words (204125 effective words) took 0.2s, 1118259 effective words/s
2020-04-03 14:30:19,074 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-03 14:30:19,078 : INFO : worker thread finished; awaiting finish of 1 more threads


(2039127, 2589730)

## 三、Word2Vec应用

### 1. 获取相似度最高的K个演员

In [8]:
model.wv.similar_by_word('沙', topn =10)

2020-04-03 14:30:20,277 : INFO : precomputing L2-norms of word weight vectors


[('金', 0.9261943101882935),
 ('瑞', 0.7158239483833313),
 ('渣', 0.6757111549377441),
 ('田', 0.6536321640014648),
 ('龙', 0.608456015586853),
 ('富', 0.5885674357414246),
 ('煌', 0.558769941329956),
 ('鞅', 0.5485153198242188),
 ('杜', 0.5318111181259155),
 ('伯', 0.5284663438796997)]

### 2. 获取单词之间的相似度

In [9]:
# 夹角余弦相似度
print(model.wv.similarity('沙', '德'))

0.33464655


In [10]:
# 夹角余弦相似度
print(model.wv.similarity('审', '讯'))
print(model.wv.similarity('审', '反'))
print(model.wv.similarity('审', '局'))
print(model.wv.similarity('局', '反'))

0.95956093
0.38758934
0.30301526
0.77751446


### 3. 获取单词的词向量

In [11]:
v1 = model.wv["提"]
print(v1.shape)
print(v1)

(100,)
[-0.60956335 -0.6471069  -0.26842785  0.6162968  -0.18076341 -0.1703121
  0.36428553  0.1416554  -0.14931701  0.06397284  0.71154076  0.99694115
  0.8789367  -0.59439296  0.08373793  0.22902529 -0.15831263 -0.26903126
  0.1236173  -0.20903477  0.15448995 -0.14840478 -0.4177159   0.10698174
 -0.60944647  0.03953292  0.1956923   0.1513218  -0.07037899 -0.8072933
  0.4908197  -0.323661    0.8417529  -0.3173876   0.1118276   0.00171082
 -0.08394217 -0.16500616  0.74035805 -0.48985454  0.03330582 -0.16678555
 -0.04433866  0.05762612 -0.2563994   0.41124418 -0.99280447  0.04206005
  0.309352    0.7236022  -0.21899389 -0.06944254  0.16188137  0.14498137
  0.89541984 -0.48291838 -0.510195    0.11914092 -0.3826733  -0.14460754
  0.6162793  -0.02993771  0.5329512   0.43782917  0.4966131   0.21352576
 -0.63042563 -0.12275264 -0.21682642 -0.37671444 -0.4216982   0.18426591
  0.06127786  0.09715133 -0.06580494 -0.12927115  0.3276482   0.3567492
 -1.147274   -1.0582179   0.6785206  -0.4227622

In [12]:
v1 = model.wv.get_vector("提")
print(v1.shape)
print(v1)

(100,)
[-0.60956335 -0.6471069  -0.26842785  0.6162968  -0.18076341 -0.1703121
  0.36428553  0.1416554  -0.14931701  0.06397284  0.71154076  0.99694115
  0.8789367  -0.59439296  0.08373793  0.22902529 -0.15831263 -0.26903126
  0.1236173  -0.20903477  0.15448995 -0.14840478 -0.4177159   0.10698174
 -0.60944647  0.03953292  0.1956923   0.1513218  -0.07037899 -0.8072933
  0.4908197  -0.323661    0.8417529  -0.3173876   0.1118276   0.00171082
 -0.08394217 -0.16500616  0.74035805 -0.48985454  0.03330582 -0.16678555
 -0.04433866  0.05762612 -0.2563994   0.41124418 -0.99280447  0.04206005
  0.309352    0.7236022  -0.21899389 -0.06944254  0.16188137  0.14498137
  0.89541984 -0.48291838 -0.510195    0.11914092 -0.3826733  -0.14460754
  0.6162793  -0.02993771  0.5329512   0.43782917  0.4966131   0.21352576
 -0.63042563 -0.12275264 -0.21682642 -0.37671444 -0.4216982   0.18426591
  0.06127786  0.09715133 -0.06580494 -0.12927115  0.3276482   0.3567492
 -1.147274   -1.0582179   0.6785206  -0.4227622

## 四、模型持久化&模型恢复加载

### 方式一：
直接使用save API进行模型持久化

#### 持久化

In [13]:
model.save(model_file_path1)

2020-04-03 14:30:20,358 : INFO : saving Word2Vec object under ./datas/gensim_char_word2vec1.w2v, separately None
2020-04-03 14:30:20,360 : INFO : not storing attribute vectors_norm
2020-04-03 14:30:20,362 : INFO : not storing attribute cum_table
2020-04-03 14:30:20,405 : INFO : saved ./datas/gensim_char_word2vec1.w2v


#### 加载

In [14]:
# 直接基于路径加载
model2 = word2vec.Word2Vec.load(model_file_path1)
print(model2)

v1 = model2.wv.get_vector("提")
print(v1.shape)
print(v1)

2020-04-03 14:30:20,413 : INFO : loading Word2Vec object from ./datas/gensim_char_word2vec1.w2v
2020-04-03 14:30:20,445 : INFO : loading wv recursively from ./datas/gensim_char_word2vec1.w2v.wv.* with mmap=None
2020-04-03 14:30:20,447 : INFO : setting ignored attribute vectors_norm to None
2020-04-03 14:30:20,448 : INFO : loading vocabulary recursively from ./datas/gensim_char_word2vec1.w2v.vocabulary.* with mmap=None
2020-04-03 14:30:20,449 : INFO : loading trainables recursively from ./datas/gensim_char_word2vec1.w2v.trainables.* with mmap=None
2020-04-03 14:30:20,451 : INFO : setting ignored attribute cum_table to None
2020-04-03 14:30:20,452 : INFO : loaded ./datas/gensim_char_word2vec1.w2v


Word2Vec(vocab=3269, size=100, alpha=0.025)
(100,)
[-0.60956335 -0.6471069  -0.26842785  0.6162968  -0.18076341 -0.1703121
  0.36428553  0.1416554  -0.14931701  0.06397284  0.71154076  0.99694115
  0.8789367  -0.59439296  0.08373793  0.22902529 -0.15831263 -0.26903126
  0.1236173  -0.20903477  0.15448995 -0.14840478 -0.4177159   0.10698174
 -0.60944647  0.03953292  0.1956923   0.1513218  -0.07037899 -0.8072933
  0.4908197  -0.323661    0.8417529  -0.3173876   0.1118276   0.00171082
 -0.08394217 -0.16500616  0.74035805 -0.48985454  0.03330582 -0.16678555
 -0.04433866  0.05762612 -0.2563994   0.41124418 -0.99280447  0.04206005
  0.309352    0.7236022  -0.21899389 -0.06944254  0.16188137  0.14498137
  0.89541984 -0.48291838 -0.510195    0.11914092 -0.3826733  -0.14460754
  0.6162793  -0.02993771  0.5329512   0.43782917  0.4966131   0.21352576
 -0.63042563 -0.12275264 -0.21682642 -0.37671444 -0.4216982   0.18426591
  0.06127786  0.09715133 -0.06580494 -0.12927115  0.3276482   0.3567492
 -1

### 方式二：
保存为二进制词向量

#### 持久化

In [15]:
model.wv.save_word2vec_format(model_file_path2,binary=True)

2020-04-03 14:30:20,470 : INFO : storing 3269x100 projection weights into ./datas/gensim_char_word2vec2.bin


#### 加载

In [16]:
# 加载模型
model2 = gensim.models.KeyedVectors.load_word2vec_format(model_file_path2,binary=True)
print(model2)

# 应用模型
v1 = model2.get_vector("提")
print(v1.shape)
print(v1)

2020-04-03 14:30:20,824 : INFO : loading projection weights from ./datas/gensim_char_word2vec2.bin


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbb in position 0: invalid start byte

### 方式三：
直接使用NumPy API保存词向量信息

#### 持久化

In [None]:
# 获取词向量
norm_word_embeddings = model.wv.vectors_norm # 经过L2转换后的词向量映射矩阵
word_embeddings = model.wv.vectors # 原始词向量映射矩阵
# 获取词典(词典到idx的映射)
vocab_2_index = list(map(lambda k: (k, model.wv.vocab[k].index), model.wv.vocab))
print(np.shape(norm_word_embeddings), np.shape(word_embeddings), np.shape(vocab_2_index))
# 数据保存
np.save(model_file_path3.format("norm_embedding"), norm_word_embeddings)
np.save(model_file_path3.format("embedding"), word_embeddings)
np.save(model_file_path3.format("vocab_2_index"), vocab_2_index)

#### 加载

In [None]:
# 加载数据
norm_word_embeddings = np.load(model_file_path3.format("norm_embedding"))
word_embeddings = np.load(model_file_path3.format("embedding"))
vocab_2_index = np.load(model_file_path3.format("vocab_2_index"))

# 字典转换
vocab_2_index = dict(map(lambda t:(t[0], int(t[1])), vocab_2_index))

# 获取数据
word = "提"
index = vocab_2_index[word]
v1 = word_embeddings[index]
print(v1.shape)
print(v1)

## 五、效果可视化

In [None]:
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt

# 解决中文显示问题
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

In [None]:
%matplotlib tk
# %matplotlib inline

In [None]:
# 获取词向量
word_embeddings = model.wv.vectors # 原始词向量映射矩阵
# 获取词典(词典到idx的映射)
index_2_vocab = dict(list(map(lambda k: (model.wv.vocab[k].index, k), model.wv.vocab)))

In [None]:
viz_words = 500
tsne = TSNE() # 将高维数据映射到低维空间上，主要目的是为了可视化
embed_tsne = tsne.fit_transform(word_embeddings[:viz_words, :])

In [None]:
fig, ax = plt.subplots(figsize=(14, 14))
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(index_2_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
plt.show()