## 1. Word2vec

In [1]:
# 引入代码包
from gensim.models.word2vec import Word2Vec, LineSentence
import gensim.downloader

In [2]:
# 读取文件
path = 'data/fairytales.txt'
with open(path, encoding='utf-8') as f:
    lines = f.readlines()
print(lines[:10])

['1872\n', 'FAIRY TALES OF HANS CHRISTIAN ANDERSEN\n', 'A CHEERFUL TEMPER\n', 'by Hans Christian Andersen\n', 'FROM my father I received the best inheritance, namely a "good\n', 'temper." "And who was my father?" That has nothing to do with the good\n', 'temper; but I will say he was lively, good-looking round, and fat;\n', 'he was both in appearance and character a complete contradiction to\n', 'his profession. "And pray what was his profession and his standing\n', 'in respectable society?" Well, perhaps, if in the beginning of a\n']


In [3]:
context = ' '.join(lines).replace('\n', ' ')

In [4]:
# 改为以？！.换行
context = context.replace('."','."\n')
context = context.replace('?"','?"\n')
context = context.replace('!"','!"\n')
context = context.replace('. ','."\n')
context = context.replace('? ','?"\n')
context = context.replace('! ','!"\n')
context



In [5]:
# 找出特殊字符
import re
import string

data = context.split('\n')
content = ''.join(data)
special_char = re.sub(r'[\u4e00-\u9fa5]', ' ', content)  # 匹配中文，将中文替换掉

print(set(special_char) - set(string.ascii_letters) - set(string.digits))

{'?', '"', '-', '】', ',', ':', '.', ')', '，', ';', "'", '\ufffe', '*', '！', ' ', '【', '(', '。', '!', '&'}


In [6]:
# 数据清洗
def cleaning(data):
    for i in range(len(data)):
        # 替换特殊字符
        data[i] = data[i].replace('ufffe', '')
        data[i] = data[i].replace('\n', '')
        eng_mark = [',', '.', '!', '?', ';','"'] # 因为标点前加空格
        for mark in eng_mark:
            data[i] = data[i].replace(mark, ' '+mark+' ')
            data[i] = data[i].replace('  ', ' ')
        data[i] = data[i].lower()  # 统一替换为小写
    return data
cleaning(data)

['1872 fairy tales of hans christian andersen a cheerful temper by hans christian andersen from my father i received the best inheritance , namely a " good temper . " ',
 ' " and who was my father ? " ',
 ' that has nothing to do with the good temper ; but i will say he was lively , good-looking round , and fat ; he was both in appearance and character a complete contradiction to his profession . " ',
 ' " and pray what was his profession and his standing in respectable society ? " ',
 ' well , perhaps , if in the beginning of a book these were written and printed , many , when they read it , would lay the book down and say , " it seems to me a very miserable title , i don\'t like things of this sort . " ',
 ' and yet my father was not a skin-dresser nor an executioner ; on the contrary , his employment placed him at the head of the grandest people of the town , and it was his place by right . " ',
 'he had to precede the bishop , and even the princes of the blood ; he always went firs

In [7]:
# 转换成词元形式
def tokenize(data):
    tokens = []
    for line in data:
        pair = line.split('\t')
        src = pair[0].split(' ')
        tokens.append(src)
    return tokens
tokens = tokenize(data)
print("tokens:", tokens[:6])

tokens: [['1872', 'fairy', 'tales', 'of', 'hans', 'christian', 'andersen', 'a', 'cheerful', 'temper', 'by', 'hans', 'christian', 'andersen', 'from', 'my', 'father', 'i', 'received', 'the', 'best', 'inheritance', ',', 'namely', 'a', '"', 'good', 'temper', '.', '"', ''], ['', '"', 'and', 'who', 'was', 'my', 'father', '?', '"', ''], ['', 'that', 'has', 'nothing', 'to', 'do', 'with', 'the', 'good', 'temper', ';', 'but', 'i', 'will', 'say', 'he', 'was', 'lively', ',', 'good-looking', 'round', ',', 'and', 'fat', ';', 'he', 'was', 'both', 'in', 'appearance', 'and', 'character', 'a', 'complete', 'contradiction', 'to', 'his', 'profession', '.', '"', ''], ['', '"', 'and', 'pray', 'what', 'was', 'his', 'profession', 'and', 'his', 'standing', 'in', 'respectable', 'society', '?', '"', ''], ['', 'well', ',', 'perhaps', ',', 'if', 'in', 'the', 'beginning', 'of', 'a', 'book', 'these', 'were', 'written', 'and', 'printed', ',', 'many', ',', 'when', 'they', 'read', 'it', ',', 'would', 'lay', 'the', 'book

In [8]:
w2v_model = Word2Vec(min_count=1, sg=1)
w2v_model.build_vocab(tokens) # 构建词表
w2v_model.train(tokens, total_examples=w2v_model.corpus_count, epochs=10) # 模型训练
w2v_model.save('model/w2v.model') # 模型保存

In [9]:
w2v_model.wv.distance('king', 'fruits')

0.380307674407959

In [10]:
w2v_model.wv.distance('king', 'queen')

0.22094666957855225

In [11]:
w2v_model.wv.distance('husband', 'wife')

0.23085010051727295

In [12]:
w2v_model.wv.distance('prince', 'princess')

0.12856191396713257

In [13]:
w2v_model.wv.most_similar('man')

[('poet', 0.7960713505744934),
 ('pious', 0.7199793457984924),
 ('student', 0.7118924260139465),
 ('emperor', 0.7057666182518005),
 ('shadow', 0.6983033418655396),
 ('king', 0.6958541870117188),
 ('clerk', 0.6886823773384094),
 ('bachelor', 0.6857923269271851),
 ('artist', 0.6828265190124512),
 ('field-mouse', 0.6788650751113892)]

In [14]:
w2v_model.wv.most_similar('is')

[("it's", 0.7663909196853638),
 ("that's", 0.7144259810447693),
 ('isthe', 0.6753060221672058),
 ('delightful', 0.6461496353149414),
 ('says', 0.6429592370986938),
 ('seems', 0.6414008736610413),
 ("i'm", 0.6407366991043091),
 ('was', 0.6394022703170776),
 ('does', 0.6341639161109924),
 ('itis', 0.6321037411689758)]

## 2. Doc2vec

In [15]:
# 引入代码包
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [16]:
# 构建documents
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]

In [17]:
d2v_model = Doc2Vec(min_count=1, dm=1) #设置模型参数
d2v_model.build_vocab(documents) # 构建词表
d2v_model.train(documents, total_examples=d2v_model.corpus_count, epochs=10) # 模型训练
d2v_model.save("model/d2v.model")

In [18]:
vector = d2v_model.infer_vector(["i", "love", 'you']) 
vector # 句向量

array([ 5.39252535e-02, -3.83157767e-02, -6.38218044e-05, -1.74791366e-02,
        2.11739931e-02, -7.96512961e-02, -1.54934358e-02,  4.61201891e-02,
       -1.34458756e-02, -5.57313673e-02,  6.52607996e-03, -1.82365924e-02,
       -3.13697942e-02, -1.00150118e-02, -4.45697596e-03,  3.22285593e-02,
       -3.13435458e-02, -5.38503658e-03,  6.81202263e-02, -6.33961931e-02,
       -1.99877024e-02, -5.40556014e-02,  7.39102811e-03,  1.48547441e-03,
        2.52026729e-02, -2.54591219e-02,  4.82451580e-02, -1.42155951e-02,
        1.44875199e-02, -1.11780297e-02,  2.53157807e-03,  3.34508941e-02,
       -4.07026708e-02, -2.80122235e-02, -9.15816650e-02, -6.11062087e-02,
        3.06912642e-02,  4.36644182e-02,  5.15675955e-02, -4.74357978e-02,
       -1.84462534e-03,  3.81492972e-02,  5.72027219e-03, -4.51108553e-02,
       -4.00553122e-02, -1.90876722e-02, -2.56116129e-02,  1.56621076e-02,
        1.96417756e-02,  2.33999733e-02,  1.38739143e-02, -3.07967458e-02,
        1.03941085e-02, -

In [19]:
d2v_model.similarity_unseen_docs(['i','like', 'you'], ['i', 'love', 'you'])

0.8187965

In [20]:
d2v_model.similarity_unseen_docs(['i', 'love', 'you'], ['go', 'away'])

-0.12075944