In [2]:
import sys
import os
import jieba
import config # 自定义配置文件
import numpy as np
import pandas as pd
from datetime import datetime

# 导入自定义模块
sys.path.append(config.Py_path) # 添加路径
from SaveAndLoad import save_pkl # 数据文件持久化

In [3]:
from collections import namedtuple # 创建命名元组，即创建和tuple类似的对象
import subprocess # 子进程
import codecs # 编码转换
from gensim.models import Doc2Vec

### 构建Doc2Vec模型

In [4]:
# Lightweight record for one training document: `words` is the token list and
# `tags` is the list of tags identifying the document.
SentimentDocument = namedtuple('SentimentDocument', ['words', 'tags'])

In [5]:
# Re-iterable document stream used as the training corpus
class Doc_list(object):
    """Re-iterable stream of tagged documents read from an Excel sheet.

    The sheet must contain a ``token`` column whose cells hold the string form
    of a Python list of tokens, e.g. ``"['a', 'b']"``.  Each call to
    ``__iter__`` re-reads the sheet, so one instance can be iterated several
    times.

    Args:
        f: Path (or any other source accepted by ``pd.read_excel``) of the sheet.
    """

    def __init__(self, f):
        self.f = f

    @staticmethod
    def _parse_tokens(cell):
        """Convert one ``token`` cell into a list of token strings."""
        import ast  # local import so the class does not depend on another cell

        if not isinstance(cell, str):
            # e.g. NaN from an empty cell: keep the row as an empty document so
            # the positional tags of the following rows are not shifted.
            return []
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            # Handles "[]" (-> no tokens) and tokens containing quotes/escapes,
            # which plain slicing and splitting gets wrong.
            return [str(tok) for tok in parsed]
        # Not a valid list literal: fall back to the legacy slicing rule.
        return cell[2:-2].split("', '")

    def __iter__(self):
        """Yield one ``SentimentDocument`` per row, tagged with its row position."""
        token_column = pd.read_excel(self.f)["token"]
        for i, cell in enumerate(token_column):
            # Generator: only one parsed document is materialised at a time.
            yield SentimentDocument(self._parse_tokens(cell), [i])

In [6]:
# Instantiate the two Doc2Vec models. Settings common to both live in one dict;
# only negative-sampling size, min_count and window differ between them.
# NOTE(review): alpha equals min_alpha, so no learning-rate decay appears to be
# configured — confirm this is intended.
_shared_d2v_params = dict(
    dm=0,
    vector_size=300,
    hs=0,
    sample=1e-5,
    workers=8,
    alpha=0.025,
    min_alpha=0.025,
)
D2V = Doc2Vec(negative=5, min_count=3, window=30, **_shared_d2v_params)
D2V_HW = Doc2Vec(negative=2, min_count=1, window=5, **_shared_d2v_params)

In [7]:
# Build one re-iterable document stream per tokenised Excel sheet.
_jieba_sheet = config.WordData_path + 'Participle-Jieba.xlsx'
_hw_sheet = config.WordData_path + "Participle-FoolNLTK-HW.xlsx"
doc_list = Doc_list(_jieba_sheet)
HW_doc_list = Doc_list(_hw_sheet)

In [8]:
# Build each model's vocabulary from its own corpus.
for _model, _corpus in ((D2V, doc_list), (D2V_HW, HW_doc_list)):
    _model.build_vocab(_corpus)

In [9]:
# Train both Doc2Vec models for 5 epochs over their own corpus.
# total_examples comes from each model's corpus_count rather than a hard-coded
# 17651, so the cell stays correct if the input sheet changes size and both
# calls follow the same convention.
D2V.train(doc_list, total_examples=D2V.corpus_count, epochs=5)
D2V_HW.train(HW_doc_list, total_examples=D2V_HW.corpus_count, epochs=5)

### 文档向量持久化

In [10]:
# Collect the learned document vectors (one per document, not word vectors).
# Doc_list tags documents 0..n-1 by row position, so each model's own
# corpus_count gives the valid index range; the previous hard-coded 17651 would
# break or silently truncate if either sheet had a different number of rows.
# NOTE(review): `docvecs` is the gensim 3.x attribute name (renamed `dv` in
# gensim 4) — confirm against the installed version before upgrading.
X_sp = np.array([D2V.docvecs[i] for i in range(D2V.corpus_count)])
X_HW_sp = np.array([D2V_HW.docvecs[i] for i in range(D2V_HW.corpus_count)])

In [13]:
# Sanity check: expect one row per document and one column per vector dimension.
X_sp.shape

(17651, 300)

In [14]:
# Same sanity check for the vectors taken from the D2V_HW model.
X_HW_sp.shape

(17651, 300)

In [15]:
# Persist the document vectors of the Jieba-tokenised corpus under the name
# "D2V_X_sp" using the project's save_pkl helper.
save_pkl(X_sp,"D2V_X_sp")

持久化存储路径：./data/WordVectorData/D2V_X_sp.feat


In [16]:
# Persist the document vectors produced by the D2V_HW model under the name
# "D2V_HW_X_sp" using the project's save_pkl helper.
save_pkl(X_HW_sp,"D2V_HW_X_sp")

持久化存储路径：./data/WordVectorData/D2V_HW_X_sp.feat
