### 训练向量化模型
- TfidfVectorizer
- Word2Vec

In [27]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import datetime
import numpy as np

In [2]:
# Current timestamp, used as a prefix for log messages.
def timenow():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    current = datetime.datetime.now()
    return current.strftime('%Y-%m-%d %H:%M:%S')
print(timenow())

2018-05-29 23:35:34


In [3]:
# Load the tokenized training set.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('tokenSougouNews-train.pk', 'rb') as f:
    trainData = pickle.load(f)

# One whitespace-joined string per document: the title tokens followed by the
# content tokens (both fields are concatenated before joining).
trainData4Tfidf = [
    ' '.join(d['doc_title'] + d['doc_content'])
    for d in trainData
]
print('%s-train size=%d.' % (timenow(), len(trainData4Tfidf)))

# Fit the TF-IDF vectorizer on the joined documents (fit() returns the
# estimator itself, so the fitted instance is bound directly).
tfidfVectorizor = TfidfVectorizer().fit(trainData4Tfidf)
print('%s-train TfidfVectorizor finished.' % (timenow()))

2018-05-29 23:35:44-train size=82656.
2018-05-29 23:36:04-train TfidfVectorizor finished.


In [4]:
# Train word2vec on the same documents, re-split into token lists.
# Empty documents are skipped.
sentences = [doc.split() for doc in trainData4Tfidf if doc]
print('%s-生成训练数据: size=%d' % (timenow(), len(sentences)))

# NOTE(review): `size=` and `.iter` are the gensim 3.x names; gensim 4 renamed
# them to `vector_size=` and `.epochs` -- confirm against the installed version.
word2vec = Word2Vec(size=100, window=3, min_count=1)
word2vec.build_vocab(sentences)
word2vec.train(
    sentences,
    total_examples=word2vec.corpus_count,
    epochs=word2vec.iter,
)
print('%s-train word2vec finished.' % (timenow()))

2018-05-29 23:36:08-生成训练数据: size=82656




2018-05-29 23:38:40-train word2vec finished.


In [14]:
# Sanity check of the trained embedding: show the words most similar to '女性'
# together with their similarity scores.
word2vec.wv.similar_by_word('女性')

[('男性', 0.8334938883781433),
 ('年轻人', 0.6696390509605408),
 ('军人', 0.6621428728103638),
 ('白领', 0.6518714427947998),
 ('儿童', 0.6363332271575928),
 ('男人', 0.6311925649642944),
 ('国人', 0.6270802021026611),
 ('成人', 0.6261032819747925),
 ('审美', 0.6210224628448486),
 ('妇女', 0.6197933554649353)]

### 构建分类器

In [5]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import sys
sys.path.append('..')

import vectorizor

In [7]:
# Load the tokenized test and training sets.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('tokenSougouNews-test.pk', 'rb') as f:
    testData = pickle.load(f)
with open('tokenSougouNews-train.pk', 'rb') as f:
    trainData = pickle.load(f)

# Features keep only the title, re-joined into one whitespace-separated
# string; the label is the document type.
trainX = [{'doc_title': ' '.join(sample['doc_title'])} for sample in trainData]
trainY = [sample['doc_type'] for sample in trainData]
print('%s-train size=%d' % (timenow(), len(trainX)))

testX = [{'doc_title': ' '.join(sample['doc_title'])} for sample in testData]
testY = [sample['doc_type'] for sample in testData]
print('%s-test size=%d' % (timenow(), len(testX)))

2018-05-29 23:40:28-train size=82656
2018-05-29 23:40:28-test size=20664


In [9]:
# TF-IDF title features + logistic regression.
tfidfLrClf = Pipeline(steps=[
    ('tfidfVectorizor', vectorizor.TfidfVectorizor(tfidfVectorizor, ['doc_title'])),
    ('lr', LogisticRegression()),
])
tfidfLrClf.fit(trainX, trainY)

# Accuracy on the data the model was fit on, and on the test split.
trainAcc = accuracy_score(y_true=trainY, y_pred=tfidfLrClf.predict(trainX))
testAcc = accuracy_score(y_true=testY, y_pred=tfidfLrClf.predict(testX))
print(
    '%s-tfidf+lr: trainAcc=%f, testAcc=%f'
    % (timenow(), trainAcc, testAcc)
)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


2018-05-29 23:41:06-tfidf+lr: trainAcc=0.918518, testAcc=0.876597


In [10]:
# word2vec title features + logistic regression.
# NOTE(review): the first step is still named 'tfidfVectorizor' although it
# wraps the word2vec-based vectorizer; the name is kept as-is because step
# names are part of the pipeline's parameter keys.
w2cLrClf = Pipeline(steps=[
    ('tfidfVectorizor', vectorizor.Doc2VecVectorizor(word2vec, ['doc_title'])),
    ('lr', LogisticRegression()),
])
w2cLrClf.fit(trainX, trainY)

# Accuracy on the data the model was fit on, and on the test split.
trainAcc = accuracy_score(y_true=trainY, y_pred=w2cLrClf.predict(trainX))
testAcc = accuracy_score(y_true=testY, y_pred=w2cLrClf.predict(testX))
print(
    '%s-word2vec+lr: trainAcc=%f, testAcc=%f'
    % (timenow(), trainAcc, testAcc)
)

2018-05-29 23:43:37-word2vec+lr: trainAcc=0.844343, testAcc=0.847464


In [15]:
# TF-IDF title features + linear SVM.
tfidfSvmClf = Pipeline(steps=[
    ('tfidfVectorizor', vectorizor.TfidfVectorizor(tfidfVectorizor, ['doc_title'])),
    ('svm', LinearSVC()),
])
tfidfSvmClf.fit(trainX, trainY)

# Accuracy on the data the model was fit on, and on the test split.
trainAcc = accuracy_score(y_true=trainY, y_pred=tfidfSvmClf.predict(trainX))
testAcc = accuracy_score(y_true=testY, y_pred=tfidfSvmClf.predict(testX))
print(
    '%s-tfidf+svm: trainAcc=%f, testAcc=%f'
    % (timenow(), trainAcc, testAcc)
)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


2018-05-29 23:46:46-tfidf+svm: trainAcc=0.982639, testAcc=0.900019


In [16]:
# word2vec title features + linear SVM.
# NOTE(review): the first step is still named 'tfidfVectorizor' although it
# wraps the word2vec-based vectorizer; the name is kept as-is because step
# names are part of the pipeline's parameter keys.
w2cSvmClf = Pipeline(steps=[
    ('tfidfVectorizor', vectorizor.Doc2VecVectorizor(word2vec, ['doc_title'])),
    ('svm', LinearSVC()),
])
w2cSvmClf.fit(trainX, trainY)

# Accuracy on the data the model was fit on, and on the test split.
trainAcc = accuracy_score(y_true=trainY, y_pred=w2cSvmClf.predict(trainX))
testAcc = accuracy_score(y_true=testY, y_pred=w2cSvmClf.predict(testX))
print(
    '%s-word2vec+svm: trainAcc=%f, testAcc=%f'
    % (timenow(), trainAcc, testAcc)
)

2018-05-29 23:50:04-word2vec+svm: trainAcc=0.841355, testAcc=0.845528


### word2vec + LSTM

In [17]:
# Load the tokenized test and training sets.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('tokenSougouNews-test.pk', 'rb') as f:
    testData = pickle.load(f)
with open('tokenSougouNews-train.pk', 'rb') as f:
    trainData = pickle.load(f)

# The LSTM needs a fixed input_length, so find the largest number of tokens in
# any title. Both splits are scanned: the previous version only looked at the
# test set, so longer training titles would have been silently truncated when
# building the LSTM inputs. `default=0` keeps the old result for empty data.
wordsMax = max(
    (len(d['doc_title']) for dataset in (trainData, testData) for d in dataset),
    default=0,
)
print('%s-最大单词个数：%d' % (timenow(), wordsMax))

In [30]:
# Convert tokenized titles into fixed-length word-vector sequences for LSTM training.
def toLstmInputs(data,word2vec, wordsMax=25, maskValue=0):
    """Turn each row's title tokens into exactly ``wordsMax`` vectors.

    Args:
        data: iterable of rows; each row is indexed with ``'doc_title'`` to get
            its sequence of tokens.
        word2vec: object exposing ``vector_size``. Vectors are looked up on its
            ``wv`` attribute when present (a trained Word2Vec model), otherwise
            on the object itself (a KeyedVectors-like mapping supporting
            ``in`` and ``[]``).
        wordsMax: number of time steps per output sequence. Longer titles are
            truncated, shorter ones are padded at the end.
        maskValue: fill value of the vector used for padding positions and for
            tokens that are not in the vocabulary.

    Returns:
        A list with one entry per row; each entry is a list of ``wordsMax``
        arrays of length ``word2vec.vector_size``.
    """
    inputDim = word2vec.vector_size
    # Indexing the model directly is deprecated in gensim 3.x and removed in
    # 4.x; go through `.wv` when it exists, but still accept bare keyed vectors.
    vectors = getattr(word2vec, 'wv', word2vec)
    # A negative length yields empty sequences, same as range(wordsMax) would.
    steps = max(wordsMax, 0)

    lstmInputs = []
    for row in data:
        words = row['doc_title'][:steps]
        # Known tokens map to their embedding, unknown ones to the mask vector.
        sequence = [
            vectors[word] if word in vectors else maskValue * np.ones(inputDim)
            for word in words
        ]
        # Pad the tail so every sequence has exactly `steps` entries.
        sequence.extend(
            maskValue * np.ones(inputDim) for _ in range(steps - len(words))
        )
        lstmInputs.append(sequence)
    return lstmInputs