## 文本分类步骤
- 划分数据集
- 对标题和正文分词和去停用词
- 计算tf-idf等特征
- 构建分类器

### 从ES取出带标签的数据，分词，并dump到本地

In [23]:
import sys
sys.path.append('..')
import esProxy
from analyzer import Analyzer
import pickle, random

# Export the labelled news documents from ES.
sougouNews = esProxy.getDataFromEs()
print('成功导出新闻数据：size=%d' % (len(sougouNews)))

def featurelize(sougouNews, fields=('doc_title',), analyzer=None):
    """Tokenize the selected fields of every document and keep its label.

    Args:
        sougouNews: iterable of dict-like documents; each must contain every
            name in ``fields`` plus a ``'doc_type'`` entry.
        fields: names of the text fields to tokenize.
        analyzer: object exposing ``cutAndFilter(text)``. When omitted, a new
            ``Analyzer()`` is created per call rather than once at definition
            time, so defining the function no longer builds an analyzer and
            calls do not share a hidden default instance.

    Returns:
        A list with one dict per document: each requested field mapped to the
        result of ``analyzer.cutAndFilter`` and ``'doc_type'`` copied through.
    """
    if analyzer is None:
        analyzer = Analyzer()

    tokens = []
    for doc in sougouNews:
        dic = {field: analyzer.cutAndFilter(doc[field]) for field in fields}
        # Carry the news category along as the label.
        dic['doc_type'] = doc['doc_type']
        tokens.append(dic)
    return tokens

# Tokenize both the title and the body of every document.
tokenSougouNews = featurelize(sougouNews, fields=['doc_title', 'doc_content'], analyzer=Analyzer())
print('完成对新闻标题的分词')

# Dump the full tokenized corpus to disk. Opening with 'wb' already truncates
# an existing file, so no explicit truncate() is needed.
with open('tokenSougouNews.pk', 'wb') as f:
    pickle.dump(tokenSougouNews, f)
print('成功将分词后的数据dump到本地')

# Shuffle in place, then split into training and test sets.
random.shuffle(tokenSougouNews)
trainPercent = 0.8
# Compute the boundary once so both slices and both size reports agree.
splitIndex = int(trainPercent * len(tokenSougouNews))
trainSet = tokenSougouNews[:splitIndex]
testSet = tokenSougouNews[splitIndex:]

# Dump the training set.
with open('tokenSougouNews-train.pk', 'wb') as f:
    pickle.dump(trainSet, f)
print('成功dump训练集到本地：size=%d' % len(trainSet))

# Dump the test set.
with open('tokenSougouNews-test.pk', 'wb') as f:
    pickle.dump(testSet, f)
print('成功dump测试集到本地：size=%d' % len(testSet))

# Notebook display: show the first tokenized document.
tokenSougouNews[0]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\YDQing\AppData\Local\Temp\jieba.cache


成功导出新闻数据：size=103320


Loading model cost 1.214 seconds.
Prefix dict has been built succesfully.


完成对新闻标题的分词
成功将分词后的数据dump到本地
成功dump训练集到本地：size=82656
成功dump测试集到本地：size=20664


{'doc_content': ['本期',
  '节目',
  '内容',
  '介绍',
  '关注',
  '机动车',
  '驾驶证',
  '申领',
  '和',
  '使用',
  '规定',
  '搜狐',
  '汽车',
  '广播',
  '诚邀',
  '全国',
  '各地',
  '强势',
  '电台',
  '真情',
  '加盟',
  '携手',
  '打造',
  '中国',
  '汽车',
  '广播',
  '最强',
  '容',
  '把脉',
  '全球',
  '汽车产业',
  '风向标',
  '引领',
  '时尚',
  '汽车',
  '消费',
  '的',
  '参考书',
  '搜狐',
  '汽车',
  '广播',
  '车旅',
  '杂志',
  '服务',
  '我们',
  '的',
  '汽车',
  '生活',
  '加盟',
  '热线',
  '13381202220',
  '010',
  '62729907',
  '独家',
  '出品',
  '搜狐',
  '汽车',
  '事业部'],
 'doc_title': ['搜狐', '汽车', '广播', '车旅', '杂志', '2012', '06', '20', '期'],
 'doc_type': '汽车'}

### tf-idf + 分类器

#### 加载训练数据和测试数据

In [1]:
import pickle

    
# Load the pickled test and training splits from the working directory.
with open('tokenSougouNews-test.pk', 'rb') as f:
    # Was `testData = pickle`, which bound the module instead of the data.
    testData = pickle.load(f)
with open('tokenSougouNews-train.pk', 'rb') as f:
    trainData = pickle.load(f)

# Titles are stored as token lists; re-join them into space-separated strings.
trainX = [dict(doc_title=' '.join(d['doc_title'])) for d in trainData]
trainY = [d['doc_type'] for d in trainData]
# A stray `.load(f)` used to trail this print and raised AttributeError.
print('train size=%d' % (len(trainX)))
testX = [dict(doc_title=' '.join(d['doc_title'])) for d in testData]
testY = [d['doc_type'] for d in testData]
print('test size=%d' % (len(testX)))

train size=82656
test size=20664


#### 将文本tf-idf向量化

In [3]:
import sklearn.feature_extraction.text as text
import numpy as np

class TfidfVectorizor(object):
    """Tf-idf vectorizer for dict-shaped documents, one model per field.

    A separate ``text.TfidfVectorizer`` is kept for every name in ``fields``.
    ``transform`` vectorizes each field independently and joins the per-field
    matrices column-wise, in the order given by ``fields``.
    """

    def __init__(self, fields):
        """
        Args:
            fields: names of the document attributes to vectorize.
        """
        self.fields = fields
        self.tfidfVectorizors = {field: text.TfidfVectorizer() for field in fields}

    def fit(self, X, y=None):
        """Fit each per-field vectorizer on that field's text; returns self."""
        for field in self.fields:
            docs = [x[field] for x in X]
            self.tfidfVectorizors[field].fit(docs, y)
        return self

    def transform(self, X):
        """Vectorize every field and concatenate the results column-wise.

        Returns:
            The single field's matrix when there is one field, a CSR matrix of
            horizontally stacked blocks when there are several, or ``None``
            when ``fields`` is empty.
        """
        # Imported here because this cell's imports do not include scipy.
        from scipy import sparse

        blocks = [
            self.tfidfVectorizors[field].transform([x[field] for x in X])
            for field in self.fields
        ]
        if not blocks:
            return None
        if len(blocks) == 1:
            return blocks[0]
        # The per-field outputs are scipy sparse matrices. np.hstack cannot
        # stack those, and the old call also passed two positional arguments
        # instead of one sequence.
        return sparse.hstack(blocks, format='csr')

#### 构建分类器

In [16]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Three title classifiers sharing the same tf-idf front end:
# logistic regression, multinomial naive Bayes and a linear SVM.
lrClf = Pipeline([
    ('tfidfVectorizor', TfidfVectorizor(['doc_title'])),
    ('lr', LogisticRegression()),
])
nbClf = Pipeline([
    ('tfidfVectorizor', TfidfVectorizor(['doc_title'])),
    ('multinomialNB', MultinomialNB()),
])
svmClf = Pipeline([
    ('tfidfVectorizor', TfidfVectorizor(['doc_title'])),
    ('svm', LinearSVC()),
])

# Fit each pipeline in turn and report accuracy on both splits.
for clf, reportFormat in (
    (lrClf, 'tfidf+lr: trainAcc=%f, testAcc=%f'),
    (nbClf, 'tfidf+multiNB: trainAcc=%f, testAcc=%f'),
    (svmClf, 'tfidf+svm: trainAcc=%f, testAcc=%f'),
):
    clf.fit(trainX, trainY)
    trainAcc = accuracy_score(trainY, clf.predict(trainX))
    testAcc = accuracy_score(testY, clf.predict(testX))
    print(reportFormat % (trainAcc, testAcc))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


tfidf+lr: trainAcc=0.913848, testAcc=0.869774
tfidf+multiNB: trainAcc=0.867886, testAcc=0.821235
tfidf+svm: trainAcc=0.981018, testAcc=0.895906


#### word2vec向量化

In [17]:
from gensim.models import Word2Vec

class Doc2VecVectorizor(object):
    """Dense document vectors built by averaging word2vec embeddings.

    Each field of a document is a whitespace-separated token string. A field's
    vector is the mean of its tokens' embeddings (zeros for unknown tokens and
    for empty fields); the per-field vectors are concatenated into one row.
    """

    def __init__(self, fields, size=100, window=3, min_count=1):
        self.fields = fields
        self.size = size
        self.window = window
        self.min_count = min_count
        self.word2vec = Word2Vec(size=size, window=window, min_count=min_count)

    def fit(self, X, y=None):
        """Train the word2vec model on every field of every document."""
        corpus = [x[field].split() for x in X for field in self.fields]
        model = self.word2vec
        model.build_vocab(corpus)
        model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
        return self

    def transform(self, X):
        """Return an array with one concatenated field-vector row per document."""
        rows = []
        for doc in X:
            rows.append(self.__embed_document(doc))
        return np.array(rows)

    def __embed_sentence(self, sentence):
        # A blank field has no tokens to average, so it maps to the zero vector.
        if not sentence.strip():
            return np.zeros(self.size)
        word_vectors = []
        for word in sentence.split():
            if word in self.word2vec:
                word_vectors.append(self.word2vec[word])
            else:
                # Unknown words still count towards the mean, as zeros.
                word_vectors.append(np.zeros(self.size))
        return np.mean(word_vectors, axis=0)

    def __embed_document(self, doc):
        per_field = [self.__embed_sentence(doc[field]) for field in self.fields]
        return np.array(per_field).flatten()
    
# Fit a standalone vectorizer on the training titles.
doc2vec = Doc2VecVectorizor(fields=['doc_title'])
doc2vec.fit(trainX)



<__main__.Doc2VecVectorizor at 0x1fd5007cf98>

In [18]:
# Query the 10 words most similar to '体育' in the fitted word2vec model.
doc2vec.word2vec.wv.similar_by_word(word='体育', topn=10)

[('舞蹈节', 0.9734185934066772),
 ('专访', 0.9699808955192566),
 ('老年人', 0.9686485528945923),
 ('日内瓦', 0.9671200513839722),
 ('搜狐', 0.9666953086853027),
 ('看车', 0.963032603263855),
 ('国际足球', 0.9596318006515503),
 ('广汽传祺', 0.9582968950271606),
 ('篮联', 0.9582201242446899),
 ('海河', 0.9577779173851013)]

In [20]:
# Show the vector size of the word2vec model.
doc2vec.word2vec.vector_size

100

#### word2vec + svm

In [19]:
from sklearn.linear_model import LogisticRegression

# Averaged word2vec title vectors classified by a linear SVM.
svmClf = Pipeline([
    ('doc2vec', Doc2VecVectorizor(['doc_title'])),
    ('svm', LinearSVC()),
])
svmClf.fit(trainX, trainY)

# Accuracy on the training and test splits.
trainPred = svmClf.predict(trainX)
testPred = svmClf.predict(testX)
trainAcc = accuracy_score(trainY, trainPred)
testAcc = accuracy_score(testY, testPred)
print('doc2vec+svm: trainAcc=%f, testAcc=%f' % (trainAcc, testAcc))



doc2vec+svm: trainAcc=0.705841, testAcc=0.708672


### tf-idf加权的word2vec + classification
#### tf-idf加权的word2vec

In [None]:
from gensim.models import Word2Vec

class Doc2VecVectorizor(object):
    """Document vectorizer that scales word embeddings by tf-idf weights.

    For each field, every token's embedding is multiplied by that token's
    tf-idf weight within the field's text, and the scaled embeddings are
    averaged. The per-field vectors are concatenated into one row.

    NOTE(review): the expected argument types are not shown in this notebook;
    confirm them against the caller. This implementation assumes
    ``tfidfVectorizor`` is already fitted and exposes ``transform([text])``
    and ``vocabulary_`` (term -> column index), and that
    ``word2vecVectorizor`` supports ``word in model``, ``model[word]`` and
    ``vector_size``.
    """

    def __init__(self, tfidfVectorizor, word2vecVectorizor, fields):
        self.tfidfVectorizor = tfidfVectorizor
        self.word2vecVectorizor = word2vecVectorizor
        self.fields = fields
        # Previously never set, so every zero-vector fallback raised
        # AttributeError.
        self.size = word2vecVectorizor.vector_size

    def fit(self, X, y=None):
        """No-op: both underlying models are supplied already fitted."""
        return self

    def transform(self, X):
        """Return an array with one concatenated field-vector row per document."""
        return np.array([self.__doc2vec(x) for x in X])

    def __sentence2vec(self, sentence):
        words = sentence.split()
        if not words:
            return np.zeros(self.size)

        # Vectorize the sentence once and read each word's weight from its
        # column. The old code called transform() with no arguments, which
        # raised TypeError.
        weights = self.tfidfVectorizor.transform([sentence])
        vocabulary = self.tfidfVectorizor.vocabulary_

        vectors = []
        for word in words:
            # Fall back to the lowercased form in case the tf-idf vocabulary
            # was built from lowercased terms.
            column = vocabulary.get(word, vocabulary.get(word.lower()))
            if column is None or word not in self.word2vecVectorizor:
                # A word unknown to either model contributes a zero vector.
                vectors.append(np.zeros(self.size))
            else:
                vectors.append(self.word2vecVectorizor[word] * weights[0, column])
        return np.mean(vectors, axis=0)

    def __doc2vec(self, doc):
        vectors = np.array([self.__sentence2vec(doc[field]) for field in self.fields])
        return vectors.flatten()
    
# The tf-idf weighted vectorizer requires a tf-idf model and a word2vec model;
# the previous call passed only `fields` and raised TypeError. Reuse the models
# already fitted on the training titles by the earlier pipelines.
titleTfidf = lrClf.named_steps['tfidfVectorizor'].tfidfVectorizors['doc_title']
titleWord2vec = svmClf.named_steps['doc2vec'].word2vec
doc2vec = Doc2VecVectorizor(tfidfVectorizor=titleTfidf,
                            word2vecVectorizor=titleWord2vec,
                            fields=['doc_title'])
doc2vec.fit(trainX)