In [2]:
import nltk
import gensim
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn import preprocessing, metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from collections import Counter
from string import punctuation, digits
from time import time

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim import corpora, models

%matplotlib inline
# nltk.download('wordnet')



In [3]:
# 定义衡量标准
def performance_metric(y_true, y_predict):
    # 使用全局的预测正确率作为衡量标准
    accuracy = f1_score( y_true, y_predict, average='micro' )
    return accuracy

# 词袋子模型

## 1. 朴素贝叶斯-MultinomialNB

In [4]:
# 训练最优模型 - 朴素贝叶斯
def fit_NB(X, y, nplits=5):
    """ 基于输入数据 [X,y]，利于网格搜索找到最优的朴素贝叶斯模型"""
    
    cross_validator = KFold( n_splits=nplits )
    
    clf = MultinomialNB()

    params = { 'alpha': np.array([0.001, 0.01, 0.1, 1.0, 10]) }

    scoring_fnc = make_scorer( performance_metric )

    grid = GridSearchCV( clf, params, scoring=scoring_fnc, cv=cross_validator )

    # 基于输入数据 [X,y]，进行网格搜索
    grid = grid.fit(X, y)

    return grid.best_estimator_

In [5]:
# 训练集和测试集
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [5]:
# 词向量获取方式： 词袋子模型 - 简单计数
vectorizer_c = CountVectorizer(stop_words='english', max_features=20000)
vectors_train = vectorizer_c.fit_transform(newsgroups_train.data)
print(vectors_train.shape, '\n')

vectors_test = vectorizer_c.transform(newsgroups_test.data)

optimal_clf = fit_NB(vectors_train, newsgroups_train.target)
print(optimal_clf, '\n')

pred = optimal_clf.predict(vectors_test)

accuracy_c = f1_score(newsgroups_test.target, pred, average='micro')
print(accuracy_c)

(11314, 20000) 

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True) 

0.794344131705


In [6]:
# 词向量获取方式： 词袋子模型 - tf-idf
vectorizer_ti = TfidfVectorizer(stop_words='english', max_features=20000)
vectors_train = vectorizer_ti.fit_transform(newsgroups_train.data)
print(vectors_train.shape, '\n')

vectors_test = vectorizer_ti.transform(newsgroups_test.data)

optimal_clf = fit_NB(vectors_train, newsgroups_train.target)
print(optimal_clf, '\n')

pred = optimal_clf.predict(vectors_test)

accuracy_ti = f1_score(newsgroups_test.target, pred, average='micro')
print(accuracy_ti)

(11314, 20000) 

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True) 

0.829394583112


In [7]:
# 词向量获取方式： 词袋子模型 - HashingVectorizer
vectorizer_h = HashingVectorizer(stop_words='english', n_features=20000, non_negative=True)
vectors_train = vectorizer_h.fit_transform(newsgroups_train.data)
print(vectors_train.shape, '\n')

vectors_test = vectorizer_h.transform(newsgroups_test.data)

optimal_clf = fit_NB(vectors_train, newsgroups_train.target)
print(optimal_clf, '\n')

pred = optimal_clf.predict(vectors_test)

accuracy_h = f1_score(newsgroups_test.target, pred, average='micro')
print(accuracy_h)



(11314, 20000) 





MultinomialNB(alpha=0.10000000000000001, class_prior=None, fit_prior=True) 

0.824880509825


## 2. 支持向量机-SGDClassifier

In [8]:
# 训练最优模型 - linear SVM (SGDClassifier)
def fit_svm(X, y, nplits=5):
    """ 基于输入数据 [X,y]，利于网格搜索找到最优的支持向量机模型"""
    
    cross_validator = KFold( n_splits=nplits )
    
    clf = SGDClassifier(max_iter=5, tol=None)

    params = {'alpha': (0.0001, 0.00001, 0.000001), 
              'penalty': ('l2', 'elasticnet')}

    scoring_fnc = make_scorer( performance_metric )

    grid = GridSearchCV( clf, params, scoring=scoring_fnc, cv=cross_validator )

    # 基于输入数据 [X,y]，进行网格搜索
    grid = grid.fit(X, y)

    return grid.best_estimator_

In [9]:
# 词向量获取方式： 词袋子模型 - tf-idf
vectorizer_ti = TfidfVectorizer(stop_words='english', max_features=20000)
vectors_train = vectorizer_ti.fit_transform(newsgroups_train.data)
print(vectors_train.shape, '\n')

vectors_test = vectorizer_ti.transform(newsgroups_test.data)

t0 = time()
optimal_clf = fit_svm(vectors_train, newsgroups_train.target)
print("Time: %.2fs" % (time()-t0))
print(optimal_clf, '\n')

pred = optimal_clf.predict(vectors_test)

accuracy_ti = f1_score(newsgroups_test.target, pred, average='micro')
print(accuracy_ti)

(11314, 20000) 

Time: 22.00s
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False) 

0.842272968667


## 3. 多层感知机-MLPClassifier

In [37]:
# 训练最优模型 - 神经网络 MLP
from sklearn.neural_network import MLPClassifier

# def fit_mlp(X, y, nplits=5):
#     """基于输入数据 [X,y]，利于随机搜索找到最优的多层感知机模型"""
    
#     cross_validator = KFold( n_splits=nplits )
    
#     clf = MLPClassifier()
    
#     # alpha: L2正则化
#     params = {'alpha': (0.01, 0.001, 0.0001), 
#               'hidden_layer_sizes': ((128,), (256,), (512,))}

#     scoring_fnc = make_scorer( performance_metric )

#     grid = RandomizedSearchCV(clf, params, n_iter=6, scoring=scoring_fnc, cv=cross_validator, n_jobs=-1) # 6个随机参数集合

#     # 基于输入数据 [X,y]，进行网格搜索
#     grid = grid.fit(X, y)

#     return grid.best_estimator_

In [8]:
# 训练模型 - 神经网络 多层感知机
# 词向量获取方式： 词袋子模型 - tf-idf
from sklearn.neural_network import MLPClassifier

vectorizer_ti = TfidfVectorizer(stop_words='english', max_features=20000)
vectors_train = vectorizer_ti.fit_transform(newsgroups_train.data)
print(vectors_train.shape, '\n')

vectors_test = vectorizer_ti.transform(newsgroups_test.data)

# alpha: L2正则化
clf = MLPClassifier(hidden_layer_sizes=(512,), alpha=0.01)

t0 = time()
clf.fit(vectors_train, newsgroups_train.target)
print("Time: %.2fs" % (time()-t0))

pred = clf.predict(vectors_test)

accuracy = f1_score(newsgroups_test.target, pred, average='micro')
print(accuracy)

(11314, 20000) 

Time: 1039.52s
0.836563993627


## 4. 梯度提升决策树-GBDT

In [None]:
# # 训练最优模型 - GBDT - 随机搜索
# def fit_GBDT(X, y, nplits=5):
#     """ 基于输入数据 [X,y]，利于随机搜索找到最优的支持向量机模型"""
    
#     cross_validator = KFold( n_splits=nplits )
    
#     clf = GradientBoostingClassifier()

#     params = {'n_estimators': (50, 100, 200, 300), 
#               'max_depth': (1, 3, 5, 7, 9)}

#     scoring_fnc = make_scorer( performance_metric )

#     grid = RandomizedSearchCV(clf, params, n_iter=6, scoring=scoring_fnc, cv=cross_validator, n_jobs=-1) # 6个随机参数集合

#     # 基于输入数据 [X,y]，进行随机搜索
#     grid = grid.fit(X, y)

#     return grid.best_estimator_

In [9]:
# 训练模型 - GBDT
# 词向量获取方式： 词袋子模型 - tf-idf
vectorizer_ti = TfidfVectorizer(stop_words='english', max_features=20000)
vectors_train = vectorizer_ti.fit_transform(newsgroups_train.data)
print(vectors_train.shape, '\n')

vectors_test = vectorizer_ti.transform(newsgroups_test.data)

clf = GradientBoostingClassifier()

t0 = time()
clf.fit(vectors_train, newsgroups_train.target)
print("Time: %.2fs" % (time()-t0))

pred = clf.predict(vectors_test)

accuracy = f1_score(newsgroups_test.target, pred, average='micro')
print(accuracy)

(11314, 20000) 

Time: 522.94s
0.74614976102


## 5. 随机森林-Random Forest

In [None]:
# # 训练最优模型 - Random Forest
# def fit_rf(X, y, nplits=5):
#     """ 基于输入数据 [X,y]，利于随机搜索找到最优的支持向量机模型"""
    
#     cross_validator = KFold( n_splits=nplits )
    
#     clf = RandomForestClassifier()

#     params = {'n_estimators': (5, 10, 15, 20), 
#               'max_depth': (None, 3, 5, 7, 9)}

#     scoring_fnc = make_scorer( performance_metric )

#     grid = RandomizedSearchCV(clf, params, n_iter=6, scoring=scoring_fnc, cv=cross_validator)

#     # 基于输入数据 [X,y]，进行随机搜索
#     grid = grid.fit(X, y)

#     return grid.best_estimator_

In [10]:
# 训练模型 - Random Forest
# 词向量获取方式： 词袋子模型 - tf-idf
vectorizer_ti = TfidfVectorizer(stop_words='english', max_features=20000)
vectors_train = vectorizer_ti.fit_transform(newsgroups_train.data)
print(vectors_train.shape, '\n')

vectors_test = vectorizer_ti.transform(newsgroups_test.data)

clf = RandomForestClassifier()

t0 = time()
clf.fit(vectors_train, newsgroups_train.target)
print("Time: %.2fs" % (time()-t0))

pred = clf.predict(vectors_test)

accuracy = f1_score(newsgroups_test.target, pred, average='micro')
print(accuracy)

(11314, 20000) 

Time: 2.45s
0.681492299522


# 主题模型-LDA

### 通过主题模型获取主题向量来表示文本

In [10]:
# 读取全集
newsgroups_all = fetch_20newsgroups(subset='all').data
newsgroups_train = fetch_20newsgroups(subset='train').data
newsgroups_test = fetch_20newsgroups(subset='test').data
print(len(newsgroups_train), len(newsgroups_test))

11314 7532


In [11]:
# 从NLTK中导入WordNetLemmatizer 对词进行还原： is - be； dogs - dog
wordnet_lemmatizer = WordNetLemmatizer()

wordnet_lemmatizer.lemmatize('passed', pos='v')

'pass'

In [12]:
# 统计全集词列表
words = []
for text in newsgroups_all:
    # 去除文档中的标点和数字，并全部转换成小写
    text = ''.join([c for c in text.lower() if (c not in punctuation) and (c not in digits)])
    # 将文档拆成单词，并进行词还原
    words += [wordnet_lemmatizer.lemmatize(word, pos='v') for word in text.split()]

print(len(words))

4990284


In [13]:
# 组成全集集词汇表
counts = Counter(words)
counts.most_common(10)

[('the', 240506),
 ('be', 180873),
 ('to', 120304),
 ('of', 113840),
 ('a', 101126),
 ('and', 94986),
 ('in', 80337),
 ('i', 69387),
 ('that', 62564),
 ('have', 51823)]

In [14]:
# 从NLTK中引入停用词，从全集词汇表中删除
stoplist = stopwords.words('english')

# 从词汇表中删除计数少于10次，或者字母长度大于20的单词 like 'maxaxaxaxaxaxaxaxaxaxaxaxaxaxax'
vocab = {word: num for word, num in counts.items() if (word not in stoplist) and (num > 10) and (1 < len(word) < 20)}

# 按照计数排序
vocab_list = sorted(vocab, key=vocab.get, reverse=True)

In [15]:
# 将单词映射到数字，从1开始
vocab_to_int = {word: i for i ,word in enumerate(vocab_list, 1)}
print(len(vocab_to_int))

17504


In [16]:
# 定义训练集和测试集的预处理函数
def pre_process_data(raw_data):
    # 预处理数据特征
    # raw data: 如 newsgroups_train.data = fetch_20newsgroups(subset='train').data 
    # seq_len: 序列长度
    
    # 将文档组成数字列表
    all_texts = []
    # length = []
    for text in raw_data:
        text = ''.join([c for c in text.lower() if c not in punctuation and c not in digits])
        text = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in text.split()] # 分词和词还原
        text = [word for word in text if word in vocab_to_int]
        all_texts.append(text)
        # length.append(len(text))  
    return all_texts

def pre_process_targets(raw_target, classes=20):
    # 预处理数据标签 独热编码
    lb = preprocessing.LabelBinarizer()
    lb.fit(list(range(classes)))
    return lb.transform(raw_target)

In [17]:
texts_all = pre_process_data(newsgroups_all)
print(texts_all[:2])

[['mamatha', 'devineni', 'ratnam', 'mrandrewcmuedu', 'subject', 'pen', 'fan', 'reactions', 'organization', 'post', 'office', 'carnegie', 'mellon', 'pittsburgh', 'pa', 'line', 'nntppostinghost', 'poandrewcmuedu', 'sure', 'pen', 'fan', 'pretty', 'confuse', 'lack', 'kind', 'post', 'recent', 'pen', 'massacre', 'devil', 'actually', 'bite', 'puzzle', 'bite', 'relieve', 'however', 'go', 'put', 'end', 'relief', 'bite', 'praise', 'pen', 'man', 'kill', 'devil', 'worse', 'think', 'jagr', 'show', 'much', 'better', 'regular', 'season', 'stats', 'also', 'lot', 'fo', 'fun', 'watch', 'playoffs', 'bowman', 'let', 'jagr', 'lot', 'fun', 'next', 'couple', 'game', 'since', 'pen', 'go', 'beat', 'jersey', 'anyway', 'disappoint', 'see', 'islanders', 'lose', 'final', 'regular', 'season', 'game', 'pen', 'rule'], ['matthew', 'lawson', 'subject', 'highperformance', 'vlb', 'video', 'card', 'summary', 'seek', 'recommendations', 'vlb', 'video', 'card', 'nntppostinghost', 'organization', 'engineer', 'computer', 'netw

In [18]:
# 按照texts_all中的单词创建词汇表
dictionary = corpora.Dictionary(texts_all)

In [19]:
len(dictionary)

17504

In [20]:
# 按照dictionary创建每篇文档的词袋子模型，每篇文档由（单词对应的键，词频）组成
corpus = [dictionary.doc2bow(text) for text in texts_all]
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 6), (6, 2), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 3), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 2), (45, 2), (46, 1), (47, 1), (48, 2), (49, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 2), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1)]


In [21]:
len(corpus)

18846

In [22]:
# 生成 LDA 模型
t0 = time()
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=100, id2word=dictionary, passes=20)
print('TIme:', time()-t0, 's')

TIme: 1076.5901086330414 s


In [None]:
# 保存 LDA 模型
ldamodel.save(lda_model)

In [23]:
ldamodel[corpus[0]]

[(10, 0.034659902989598777),
 (29, 0.12140788217968064),
 (34, 0.012380261309099878),
 (37, 0.023731000663055946),
 (46, 0.032988322354522856),
 (56, 0.037178435603637684),
 (62, 0.014724935992155117),
 (83, 0.13375410752716363),
 (86, 0.30004839308932313),
 (93, 0.059263435018506672),
 (97, 0.2195144860639526)]

In [24]:
ldamodel[corpus[1]]

[(4, 0.049311042569805758),
 (15, 0.12525604748038291),
 (16, 0.012551208473178406),
 (19, 0.22055373840238288),
 (48, 0.056972566440368216),
 (58, 0.088125372481612216),
 (68, 0.031962812940070189),
 (70, 0.050376290410712517),
 (87, 0.35225203191259746)]

In [25]:
ldamodel[corpus[888]]

[(14, 0.05099906556204923),
 (17, 0.013569792262001508),
 (37, 0.017703232392275985),
 (51, 0.014130242283191543),
 (59, 0.11075204678035798),
 (67, 0.052812930369317557),
 (68, 0.028664075030782572),
 (74, 0.3022055918047698),
 (82, 0.028401634730905217),
 (84, 0.072508114576306334),
 (89, 0.023222043769478905),
 (93, 0.02362039255569514),
 (97, 0.22766568597301823)]

In [26]:
import numpy as np
x_all = np.zeros((len(corpus), 100)) # 每一行代表一篇文本，每一列代表一个主题
x_all.shape

(18846, 100)

In [27]:
for i in range(len(corpus)):
    for tup in ldamodel[corpus[i]]:
        idx = tup[0]
        value = tup[1]
        x_all[i, idx] = value


In [28]:
print(x_all[0])

[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.03520696  0.          0.          0.
  0.02663502  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.12098029  0.          0.          0.          0.
  0.01238246  0.          0.          0.02365746  0.          0.          0.
  0.          0.          0.          0.          0.          0.03222063
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.0371705   0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.14060797  0.          0.          0.29660579  0.          0.
  0.          0

In [29]:
# 和原始数据集大小保持基本一致
x_train = x_all[:11314]
x_test = x_all[11314:]
print(x_train.shape, x_test.shape)

(11314, 100) (7532, 100)


In [30]:
y_all_20 = fetch_20newsgroups(subset='all').target
y_all_20

array([10,  3, 17, ...,  3,  1,  7])

In [31]:
# 和原始数据集大小保持基本一致
y_train_20 = y_all_20[:11314]
y_test_20 = y_all_20[11314:]
print(y_train_20.shape, y_test_20.shape)

(11314,) (7532,)


## 6. LDA - Naive Bayes， SVM

In [32]:
# 朴素贝叶斯
optimal_clf = fit_NB(x_train, y_train_20)
print(optimal_clf, '\n')

pred = optimal_clf.predict(x_test)

accuracy_c = f1_score(y_test_20, pred, average='micro')
print(accuracy_c)

MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True) 

0.666356877323


In [33]:
# 支持向量机

t0 = time()
optimal_clf = fit_svm(x_train, y_train_20)
print("Time: %.2fs" % (time()-t0))
print(optimal_clf, '\n')

pred = optimal_clf.predict(x_test)

accuracy_ti = f1_score(y_test_20, pred, average='micro')
print(accuracy_ti)

Time: 12.36s
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False) 

0.646309081253
