# 训练词向量

In [1]:
from gensim.models import Word2Vec
import pandas as pd
import jieba
import pickle

## Char级别Embedding

In [2]:
# Load the shuffled train/dev splits. dev_data is not referenced by the cells
# visible in this notebook; it is loaded here for parity with the original.
train_data = pd.read_csv("../jupyter/shuffle-data/train_data.csv")
dev_data = pd.read_csv("../jupyter/shuffle-data/dev_data.csv")

# Build the training corpus for the character-level w2v: every distinct query
# found in columns 1 and 2 of train_data, split into single characters.
# Both columns are addressed purely by position, so the loop no longer assumes
# that the index labels of train_data coincide with row positions, and a set of
# already-seen queries replaces the quadratic `line not in list` scan.
char_train_list = []
seen_queries = set()
for query_1, query_2 in zip(train_data.iloc[:, 1], train_data.iloc[:, 2]):
    for q in (query_1, query_2):
        if q in seen_queries:
            continue
        seen_queries.add(q)
        # Iterating a str yields its characters; first-seen order is preserved.
        char_train_list.append(list(q))

In [3]:
# Preview the first five character-level token lists.
char_train_list[:5]

[['剧', '烈', '运', '动', '后', '咯', '血', '，', '是', '怎', '么', '了', '？'],
 ['剧', '烈', '运', '动', '后', '咯', '血', '是', '什', '么', '原', '因', '？'],
 ['剧', '烈', '运', '动', '后', '为', '什', '么', '会', '咯', '血', '？'],
 ['剧', '烈', '运', '动', '后', '咯', '血', '，', '应', '该', '怎', '么', '处', '理', '？'],
 ['剧', '烈', '运', '动', '后', '咯', '血', '，', '需', '要', '就', '医', '吗', '？']]

In [4]:
# Fit a character-level Word2Vec on char_train_list, then serialise the whole
# model object with pickle. The file name embeds the vector dimensionality.
embed_size = 300
model = Word2Vec(
    sentences=char_train_list,
    size=embed_size,
    window=5,
    min_count=1,  # keep every token, even those that occur only once
    workers=4,
)
char_model_path = "w2v_char_" + str(embed_size) + ".pkl"
with open(char_model_path, "wb") as out_file:
    pickle.dump(model, out_file)

## Word级别Embedding

In [5]:
# Build the training corpus for the word-level w2v: every distinct query found
# in columns 1 and 2 of train_data, segmented into words with jieba.
# Both columns are addressed purely by position, so the loop no longer assumes
# that the index labels of train_data coincide with row positions, and a set of
# already-seen token sequences replaces the quadratic `line not in list` scan.
word_train_list = []
seen_token_seqs = set()
for query_1, query_2 in zip(train_data.iloc[:, 1], train_data.iloc[:, 2]):
    for q in (query_1, query_2):
        line = jieba.lcut(q)
        # Lists are unhashable, so the tuple form is the dedup key.
        key = tuple(line)
        if key in seen_token_seqs:
            continue
        seen_token_seqs.add(key)
        word_train_list.append(line)

# Preview the first ten segmented queries.
word_train_list[:10]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/j1/ls86yccj7l5dyscbpmp85ngw0000gn/T/jieba.cache
Loading model cost 0.877 seconds.
Prefix dict has been built successfully.


[['剧烈运动', '后', '咯血', '，', '是', '怎么', '了', '？'],
 ['剧烈运动', '后', '咯血', '是', '什么', '原因', '？'],
 ['剧烈运动', '后', '为什么', '会', '咯血', '？'],
 ['剧烈运动', '后', '咯血', '，', '应该', '怎么', '处理', '？'],
 ['剧烈运动', '后', '咯血', '，', '需要', '就医', '吗', '？'],
 ['剧烈运动', '后', '咯血', '，', '是否', '很', '严重', '？'],
 ['百令', '胶囊', '需要', '注意', '什么', '？'],
 ['百令', '胶囊', '有', '什么', '注意事项', '？'],
 ['服用', '百令', '胶囊', '有', '什么', '需要', '特别', '注意', '的', '吗', '？'],
 ['百令', '胶囊', '如何', '服用', '？']]

In [6]:
# Fit a word-level Word2Vec on word_train_list, then serialise the whole model
# object with pickle. This rebinds `model`, so later cells see the word model.
embed_size = 300
model = Word2Vec(
    sentences=word_train_list,
    size=embed_size,
    window=5,
    min_count=1,  # keep every token, even those that occur only once
    workers=4,
)
word_model_path = "w2v_word_" + str(embed_size) + ".pkl"
with open(word_model_path, "wb") as out_file:
    pickle.dump(model, out_file)

In [7]:
import numpy as np
from gensim.models import Word2Vec

# Optional: reload the persisted models instead of reusing the in-memory `model`.
# NOTE(review): the .pkl files in this notebook are written with pickle.dump(),
# not model.save() -- confirm Word2Vec.load() reads them, or use pickle.load().
# char_model = Word2Vec.load("w2v_char_300.pkl")
# word_model = Word2Vec.load("w2v_word_300.pkl")

# Inspect the learned vocabularies.
# word_model.wv.vocab
# char_model.wv.vocab

In [8]:
# Build the token -> row-index lookup and the matching embedding matrix.
# `model` is whichever Word2Vec was trained last -- the word-level one when the
# cells are run top to bottom.
# Row 0 (_PAD) stays all-zero; row 1 (_UNK) is a zero-mean random vector.
word2idx = {"_PAD": 0, "_UNK": 1}

embedding_matrix = np.zeros((len(model.wv.vocab) + 2, model.vector_size))

# Draw the _UNK vector from a dedicated, seeded generator so the matrix is
# identical on every Restart & Run All (the global NumPy RNG is never seeded
# in this notebook).
unk_rng = np.random.RandomState(42)
unk = unk_rng.random_sample(size=model.vector_size)
unk = unk - unk.mean()
embedding_matrix[1] = unk

# Vocabulary tokens take rows 2, 3, ... in the iteration order of model.wv.vocab.
for idx, word in enumerate(model.wv.vocab.keys(), start=len(word2idx)):
    word2idx[word] = idx
    embedding_matrix[idx] = model.wv[word]

# Every entry of the lookup must own exactly one row; this fails if a corpus
# token collides with the reserved "_PAD" / "_UNK" keys.
assert len(word2idx) == len(embedding_matrix), "word2idx and embedding_matrix are misaligned"

In [9]:
# Size of the lookup table, counting the _PAD and _UNK entries.
len(word2idx)

3097

In [10]:
# Row count of the embedding matrix; should match len(word2idx).
len(embedding_matrix)

3097