# 实例1 word2vec的基本使用

## 1.语料加工

In [7]:
import os
import jieba

# Build the training corpus: segment every regular file under root_path with
# jieba and write all tokens, separated by single spaces, to result.txt.
root_path = './data/家居'
# os.listdir returns entries in arbitrary order; sorting keeps the token order
# in result.txt identical from run to run.
file_list = sorted(os.listdir(root_path))
all_word_list = []
for name in file_list:
    file_path = os.path.join(root_path, name)
    # Skip sub-directories and other non-regular entries that open() cannot read.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as f:
        txt = f.read()
    word_list = jieba.lcut(txt, cut_all=False)  # accurate mode (not full mode)
    all_word_list.extend(word_list)
result = " ".join(all_word_list)
with open("result.txt", "w", encoding="utf-8") as f:
    f.write(result)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\echoe\AppData\Local\Temp\jieba.cache
Loading model cost 0.506 seconds.
Prefix dict has been built successfully.


## 2.打开命令提示符，导入word2vec

In [8]:
from gensim.models import word2vec
import logging

# Show INFO-level log messages, each prefixed with a timestamp and its level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Read the space-separated corpus from result.txt via Text8Corpus.
sentences = word2vec.Text8Corpus("result.txt")

# Train 200-dimensional vectors; min_count=1 keeps every word in the vocabulary.
model = word2vec.Word2Vec(
    sentences=sentences,
    vector_size=200,
    min_count=1,
)

# Save the trained model to my_model.model so later cells can load it.
model.save("my_model.model")

2023-03-20 14:51:15,602 : INFO : collecting all words and their counts
2023-03-20 14:51:15,602 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-03-20 14:51:15,618 : INFO : collected 16130 word types from a corpus of 133694 raw words and 14 sentences
2023-03-20 14:51:15,618 : INFO : Creating a fresh vocabulary
2023-03-20 14:51:15,649 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 16130 unique words (100.00% of original 16130, drops 0)', 'datetime': '2023-03-20T14:51:15.649922', 'gensim': '4.3.0', 'python': '3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-03-20 14:51:15,665 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 133694 word corpus (100.00% of original 133694, drops 0)', 'datetime': '2023-03-20T14:51:15.665581', 'gensim': '4.3.0', 'python': '3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]

## 3.读取保存的模型文件

In [9]:
# Load the model stored in "my_model.model" into a separate variable, model_2.
model_2 = word2vec.Word2Vec.load("my_model.model")

2023-03-20 14:51:40,137 : INFO : loading Word2Vec object from my_model.model
2023-03-20 14:51:40,149 : INFO : loading wv recursively from my_model.model.wv.* with mmap=None
2023-03-20 14:51:40,149 : INFO : setting ignored attribute cum_table to None
2023-03-20 14:51:40,206 : INFO : Word2Vec lifecycle event {'fname': 'my_model.model', 'datetime': '2023-03-20T14:51:40.206796', 'gensim': '4.3.0', 'python': '3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'loaded'}


## 4.输出：“秋雨”的向量

In [10]:
# Print the vector stored for the word '秋雨' in the model's word vectors (model.wv).
# NOTE(review): this reads from `model`, not from the `model_2` loaded in the
# previous cell — confirm which one was intended.
print(model.wv['秋雨'])

[-0.00019699 -0.01525664 -0.00544756  0.01589671  0.02762173 -0.01976035
  0.02191073  0.07053912 -0.01587785  0.00102091  0.00156754 -0.00603007
  0.02241345  0.01668857 -0.01211019 -0.02073513 -0.00963482  0.01521658
 -0.01546223 -0.05507956  0.00649465 -0.01962992 -0.0108421   0.00916005
  0.00879236 -0.02255019  0.0073317  -0.02147668 -0.02621424  0.00517545
  0.02662977 -0.00034586  0.0255306  -0.01872779 -0.00595741  0.01620557
  0.02231165 -0.01866699 -0.01346172 -0.02531746 -0.0147829  -0.00228447
 -0.01748663  0.01068981  0.03811073 -0.02194946 -0.01627542 -0.0230178
  0.02131451  0.02589334  0.02077539 -0.00021713 -0.02416469 -0.02944181
  0.00742791 -0.03311317  0.01163418 -0.02014735 -0.03828868  0.02002272
 -0.01235592 -0.00921937 -0.00793731 -0.00062808 -0.05089124  0.02123314
  0.00301713  0.04890329 -0.03395978  0.02934017  0.00453265 -0.00147408
  0.03708081 -0.00152298 -0.01183118  0.01793184  0.04130481 -0.00916984
 -0.06297085 -0.01445791 -0.01302207 -0.01953019 -0.

## 5.计算两个词的相关程度

In [11]:
# Rebind `model` to the model loaded from "my_model.model", then compute and
# print the similarity score between the words "秋雨" and "落叶".
model = word2vec.Word2Vec.load("my_model.model")
y1 = model.wv.similarity("秋雨", "落叶")
print(y1)

2023-03-20 14:51:47,505 : INFO : loading Word2Vec object from my_model.model
2023-03-20 14:51:47,516 : INFO : loading wv recursively from my_model.model.wv.* with mmap=None
2023-03-20 14:51:47,516 : INFO : setting ignored attribute cum_table to None
2023-03-20 14:51:47,578 : INFO : Word2Vec lifecycle event {'fname': 'my_model.model', 'datetime': '2023-03-20T14:51:47.578497', 'gensim': '4.3.0', 'python': '3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'loaded'}


0.9277958


## 6.输出与“落叶”相关度最高的20个词

In [12]:
# Fetch the 20 entries most similar to "落叶" and print them one per line;
# each entry is a (word, similarity score) pair.
y2 = model.wv.most_similar("落叶", topn=20)
for y in y2:
    print(y)

('编用', 0.9352712631225586)
('预科班', 0.935040295124054)
('征集', 0.9349719285964966)
('配合', 0.9343264102935791)
('森美', 0.9341440796852112)
('观', 0.9336857199668884)
('所在', 0.9336254596710205)
('接下来', 0.933625340461731)
('上市公司', 0.933603048324585)
('不俗', 0.9333636164665222)
('机', 0.9333564639091492)
('21', 0.9332705736160278)
('水泥砂浆', 0.9332515001296997)
('挥手', 0.9332339763641357)
('负温', 0.9331874251365662)
('位于', 0.9330872297286987)
('锁定', 0.9330846667289734)
('顶级', 0.9330658912658691)
('采访', 0.9330472946166992)
('线条', 0.932976245880127)


## 7.选择“最不合群”的词

In [13]:
# Split the space-separated string into a list of four words, ask the model
# which one does not match the others, and print that word.
y4 = model.wv.doesnt_match("劳动节 国庆节 中秋节 消费者".split(" "))
print(y4)



消费者
