# 文本预处理工具
- 生成用于训练 TfidfVectorizer 的数据集的方法
- 生成用于训练 word2vec 的数据集的方法

In [50]:
import sys
sys.path.append('..')
import esProxy
from analyzer import Analyzer
import pickle, random
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec



In [6]:
# Fetch the raw documents through the project helper (presumably the Sogou
# news corpus held in Elasticsearch -- confirm against esProxy) and cache
# them on disk; the later cells load 'sougouNews.pk' instead of re-querying.
sougouNews = esProxy.getDataFromEs()

# Mode 'wb' already creates or empties the target file, so the explicit
# truncate() call that used to follow the open() was a no-op.
with open('sougouNews.pk', 'wb') as f:
    pickle.dump(sougouNews, f)

## 生成用于训练 TfidfVectorizer 的数据集的方法

In [48]:
def getTrainData4Tfidf(file='sougouNews.pk', field='doc_title'):
    """
    Build the training corpus for a TfidfVectorizer.

    Loads the pickled documents from ``file``, tokenises the ``field`` value
    of each document with ``Analyzer.cutAndFilter(..., forSearch=False)`` and
    returns a list holding one string per document, with the terms separated
    by a single space.
    """
    with open(file, 'rb') as src:
        documents = pickle.load(src)

    # Tokenizer shared by every document.
    tokenizer = Analyzer()

    return [
        ' '.join(tokenizer.cutAndFilter(doc[field], forSearch=False))
        for doc in documents
    ]

In [49]:
# Tokenise every document title, then cache the resulting strings on disk
# for the vectorizer training step.
data4tfidf = getTrainData4Tfidf('sougouNews.pk', 'doc_title')

with open('trainData4Tfidf.pk', mode='wb') as sink:
    pickle.dump(data4tfidf, sink)

### 训练 TfidfVectorizer

In [51]:
# Fit a TF-IDF model on the space-separated title strings; min_df=2 drops
# terms that occur in fewer than two documents.
# NOTE(review): the default token_pattern (shown in the repr printed by this
# cell) only matches tokens of two or more word characters, so
# single-character terms are ignored -- confirm this is intended for
# Chinese text.
tfidfVectorizor = TfidfVectorizer(min_df=2)
tfidfVectorizor.fit(data4tfidf)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## 生成用于训练 word2vec 的数据集的方法

In [54]:
import re

def getTrainData4Wordvec(file='sougouNews.pk'):
    """
    Build the training corpus for word2vec.

    Loads the pickled documents from ``file`` and returns a list in which
    every entry is the list of terms of one sentence. The body
    (``doc_content``) is split into sentences on the full stop and question
    marks; the title (``doc_title``) counts as one sentence of its own.
    Sentences that yield no terms are left out.
    """
    with open(file, 'rb') as src:
        documents = pickle.load(src)

    # Tokenizer shared by every sentence.
    tokenizer = Analyzer()
    sentenceBreak = re.compile('[。?？]')

    corpus = []
    for doc in documents:
        # Body first: one token list per sentence.
        for sentence in sentenceBreak.split(doc['doc_content']):
            corpus.append(tokenizer.cutAndFilter(sentence))
        # Then the title as a single sentence.
        corpus.append(tokenizer.cutAndFilter(doc['doc_title']))

    # Drop the empty token lists.
    return [terms for terms in corpus if len(terms) > 0]

In [55]:
# Build the per-sentence token lists for word2vec and cache them on disk.
data4wordvec = getTrainData4Wordvec()

# Mode 'wb' already creates or empties the target file, so the explicit
# truncate() call that used to follow the open() was a no-op.
with open('trainData4Word2vec.pk', 'wb') as f:
    pickle.dump(data4wordvec, f)

### 训练word2vec

In [56]:
# Train 100-dimensional word vectors over the tokenised sentences with a
# context window of 3; min_count=1 keeps every term, however rare.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) -- update it when the dependency is upgraded.
word2vec = Word2Vec(size=100, window=3, min_count=1)
word2vec.build_vocab(data4wordvec)
# Read the epoch count from `epochs`: the `iter` attribute is a deprecated
# alias of it and emits a DeprecationWarning when accessed.
word2vec.train(data4wordvec, total_examples=word2vec.corpus_count, epochs=word2vec.epochs)

  app.launch_new_instance()


(128849765, 140245570)

In [57]:
# Spot-check the trained embeddings: display the terms most similar to the
# query word together with their similarity scores.
word2vec.wv.similar_by_word('女性')

[('男性', 0.8309999704360962),
 ('军人', 0.6905989050865173),
 ('年轻人', 0.6876094341278076),
 ('儿童', 0.6672907471656799),
 ('男人', 0.6606389880180359),
 ('普通人', 0.6577602624893188),
 ('妇女', 0.648067831993103),
 ('职业女性', 0.6274821758270264),
 ('国人', 0.6245989799499512),
 ('白领', 0.6231762766838074)]

In [58]:
# Persist the trained model with pickle. Mode 'wb' already creates or
# empties the target file, so the explicit truncate() call that used to
# follow the open() was a no-op.
with open('word2vec.model', 'wb') as f:
    pickle.dump(word2vec, f)