In [1]:
from gensim.test.utils import common_texts
# Show the toy corpus bundled with gensim's test utilities: a list of
# documents, each already tokenized into a list of word strings.
print(common_texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


# 一、Doc2Vec

In [2]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Assemble the training data: wrap every tokenized document in a
# TaggedDocument, using its position in the corpus as its (single) tag.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

# Reference: Doc2Vec-specific constructor parameters. Doc2Vec also accepts
# essentially every Word2Vec parameter (vector_size, window, min_count,
# workers, seed, ...).
#
#   documents=None        corpus: iterable of TaggedDocument(list of str, list of tags)
#   corpus_file=None
#   dm_mean=None          PV-DM only: combine context word vectors by sum (0) or mean (1)
#   dm=1                  training algorithm: 1 = PV-DM, 0 = PV-DBOW
#   dbow_words=0          PV-DBOW only: 1 = train word vectors together with the
#                         document vectors, 0 = train document vectors only
#   dm_concat=0           PV-DM only: 1 = concatenate the context vectors,
#                         0 = sum/average them
#   dm_tag_count=1
#   docvecs=None
#   docvecs_mapfile=None
#   comment=None
#   trim_rule=None
#   callbacks=()

# Train the model.
# A single worker thread plus a fixed seed keeps the run repeatable: gensim
# documents that multi-threaded training introduces ordering jitter (and that
# Python 3 additionally needs a fixed PYTHONHASHSEED for full determinism).
# The corpus is only nine short documents, so extra threads gain nothing.
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=1, seed=1)

In [4]:
# Inference: embed a new, already-tokenized document with the trained model.
query_tokens = ["system", "response"]
vector = model.infer_vector(query_tokens)
print("【Doc2Vec结果】:\n{}".format(vector))

【Doc2Vec结果】:
[-0.04091445  0.04344336 -0.08487181  0.07065757 -0.03429239]


In [5]:
# Inference on a document that mixes corpus words with "小明", a token that
# does not occur in the training corpus printed at the top of the notebook.
# NOTE(review): presumably this demonstrates that out-of-vocabulary tokens
# are tolerated by infer_vector - confirm against the gensim docs.
query_tokens = ["interface", "minors", "eps", "小明"]
vector = model.infer_vector(query_tokens)
print("【Doc2Vec结果】:\n{}".format(vector))

【Doc2Vec结果】:
[-0.08389413 -0.06861168 -0.01032796  0.05317792  0.09081002]


# 二、FastText

In [6]:
from gensim.test.utils import common_texts
from gensim.models.fasttext import FastText

In [7]:
# Create the FastText model object.
#
# Reference: FastText constructor parameters and their defaults.
# NOTE(review): `size` / `iter` are the pre-4.0 gensim keyword names (gensim 4
# renamed them to `vector_size` / `epochs`) - confirm the installed version
# before upgrading.
#
#   sentences=None        corpus: iterable of token lists (list of list of str)
#   corpus_file=None
#   sg=0                  same meaning as in Word2Vec
#   hs=0                  same meaning as in Word2Vec
#   size=100              same meaning as in Word2Vec
#   alpha=0.025           same meaning as in Word2Vec
#   window=5              same meaning as in Word2Vec
#   min_count=5           same meaning as in Word2Vec
#   max_vocab_size=None   same meaning as in Word2Vec
#   word_ngrams=1         1 = enrich word vectors with character n-gram
#                         (subword) information, 0 = plain Word2Vec behaviour;
#                         the n-gram lengths themselves are min_n / max_n
#   sample=0.001
#   seed=1
#   workers=3
#   min_alpha=0.0001
#   negative=5
#   ns_exponent=0.75
#   cbow_mean=1
#   hashfxn=<built-in function hash>
#   iter=5
#   null_word=0
#   min_n=3               shortest character n-gram used for subwords
#   max_n=6               longest character n-gram used for subwords
#   sorted_vocab=1
#   bucket=2000000
#   trim_rule=None
#   batch_words=10000
#   callbacks=()
#   compatible_hash=True
#
# A single worker thread plus an explicit seed keeps this tiny demo
# repeatable; gensim documents that multi-threaded training introduces
# ordering jitter (Python 3 additionally needs a fixed PYTHONHASHSEED).
model = FastText(size=4, window=3, min_count=1, workers=1, seed=1)
# Build the vocabulary from the corpus (this also sets model.corpus_count).
model.build_vocab(sentences=common_texts)
# Train for 10 passes over the same corpus.
model.train(sentences=common_texts, total_examples=model.corpus_count, epochs=10)

In [8]:
# Use the model: fetch the learned vectors for two corpus words and print
# them one per line.
word_vectors = model.wv
v1, v2 = (word_vectors.get_vector(term) for term in ('system', 'interface'))
for vec in (v1, v2):
    print(vec)

[-0.03050808 -0.00921003  0.0233555   0.00851412]
[ 0.0185172   0.03048961 -0.00839526  0.02434803]


# 三、扩展
直接读取文件训练FastText模型

In [9]:
import logging
# Configure root logging at INFO level with "timestamp : level : message"
# lines, which is what makes gensim's training progress show up below.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [10]:
from gensim import utils
from gensim.models.fasttext import FastText

# Default corpus file, read one line at a time by MyData.
# NOTE(review): judging by the file name this is pre-segmented text - confirm.
word_file_path = './datas/cut_words_of_in_the_name_of_people.txt'


class MyData(object):
    """Restartable corpus that yields one token list per line of a text file.

    The file is re-opened on every iteration, so the same instance can be
    traversed repeatedly (a plain generator would be exhausted after a single
    pass) and the whole file never has to be held in memory.

    Args:
        path: UTF-8 text file to stream. When omitted (None), the module-level
            ``word_file_path`` is looked up at iteration time, which is the
            original behaviour of ``MyData()``.

    Yields:
        list: the tokens produced by ``utils.tokenize`` for one line.
    """

    def __init__(self, path=None):
        self.path = path

    def __iter__(self):
        # Resolve the fallback lazily so a later reassignment of
        # word_file_path is still honoured, exactly as before.
        path = word_file_path if self.path is None else self.path
        with open(path, 'r', encoding='utf-8') as reader:
            for line in reader:
                yield list(utils.tokenize(line))

# Build the model straight from the streamed file corpus: handing
# `sentences` to the constructor triggers the vocabulary scan and training
# (iter=10 passes) right away, as the log output below shows.
corpus = MyData()
model = FastText(sentences=corpus, size=4, window=3, min_count=1, iter=10)

2020-03-11 15:04:12,416 : INFO : resetting layer weights
2020-03-11 15:04:12,637 : INFO : collecting all words and their counts
2020-03-11 15:04:12,639 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-11 15:04:12,777 : INFO : collected 17800 word types from a corpus of 131991 raw words and 2422 sentences
2020-03-11 15:04:12,778 : INFO : Loading a fresh vocabulary
2020-03-11 15:04:12,807 : INFO : effective_min_count=1 retains 17800 unique words (100% of original 17800, drops 0)
2020-03-11 15:04:12,808 : INFO : effective_min_count=1 leaves 131991 word corpus (100% of original 131991, drops 0)
2020-03-11 15:04:12,866 : INFO : deleting the raw counts dictionary of 17800 items
2020-03-11 15:04:12,868 : INFO : sample=0.001 downsamples 40 most-common words
2020-03-11 15:04:12,869 : INFO : downsampling leaves estimated 111227 word corpus (84.3% of prior 131991)
2020-03-11 15:04:12,976 : INFO : estimated required memory for 17800 words, 67801 buckets and 4 dime

In [11]:
# Cosine similarity: walk the 100 nearest neighbours of '沙瑞金' and print the
# first five whose text is exactly three characters long, with their scores.
req_count = 5
for word, similarity in model.wv.similar_by_word('沙瑞金', topn=100):
    if len(word) != 3:
        continue
    req_count -= 1
    print(word, similarity)
    if req_count == 0:
        break

2020-03-11 15:04:15,768 : INFO : precomputing L2-norms of word weight vectors
2020-03-11 15:04:15,770 : INFO : precomputing L2-norms of ngram weight vectors


凑合着 0.9985167980194092
高育良 0.9978237748146057
里斯本 0.9968358874320984
四五十 0.9964548945426941
请育良 0.9963105320930481
