***使用CountVectorizer和TfidfVectorizer进行特征提取***

In [1]:
# Download the 20 Newsgroups corpus (subset='all') and hold out 25% of the
# documents as a test set, with a fixed random_state so the split is repeatable.
from sklearn.datasets import fetch_20newsgroups

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the cell runnable on installs that predate the new module.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

news = fetch_20newsgroups(subset='all')

# news.data holds the raw documents, news.target the matching label indices.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

In [9]:
# Initialize CountVectorizer with its default configuration, then train and
# evaluate a multinomial naive Bayes classifier on the resulting features.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

count_vec = CountVectorizer()
# fit_transform() is applied to the training documents; the test documents
# only go through transform(), reusing the vectorizer fitted on the training set.
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)

mnb_count = MultinomialNB()
mnb_count.fit(X_count_train, y_train)
count_y_pred = mnb_count.predict(X_count_test)

# Single-argument print(...) with %-formatting emits the same text on
# Python 2 and Python 3; the bare `print x, y` statement is a SyntaxError on 3.
print('The accuracy of count_naive_bayes is: %s' % mnb_count.score(X_count_test, y_test))

# Per-class precision / recall / f1, labelled with the newsgroup names.
print(classification_report(y_test, count_y_pred, target_names=news.target_names))

The accuracy of count_naive_bayes is: 0.8397707979626485
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 s

In [11]:
# Initialize TfidfVectorizer with its default configuration, then train and
# evaluate a multinomial naive Bayes classifier on the resulting features.
# NOTE: MultinomialNB and classification_report are not imported in this cell;
# it relies on an earlier cell having imported them.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
# fit_transform() is applied to the training documents; the test documents
# only go through transform(), reusing the vectorizer fitted on the training set.
X_tfidf_train = tfidf_vec.fit_transform(X_train)
X_tfidf_test = tfidf_vec.transform(X_test)

mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_tfidf_train, y_train)
tfidf_y_pred = mnb_tfidf.predict(X_tfidf_test)

# print(...) with a single argument behaves identically on Python 2 and 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(mnb_tfidf.score(X_tfidf_test, y_test))
print(classification_report(y_test, tfidf_y_pred, target_names=news.target_names))

0.8463497453310697
                          precision    recall  f1-score   support

             alt.atheism       0.84      0.67      0.75       201
           comp.graphics       0.85      0.74      0.79       250
 comp.os.ms-windows.misc       0.82      0.85      0.83       248
comp.sys.ibm.pc.hardware       0.76      0.88      0.82       240
   comp.sys.mac.hardware       0.94      0.84      0.89       242
          comp.windows.x       0.96      0.84      0.89       263
            misc.forsale       0.93      0.69      0.79       257
               rec.autos       0.84      0.92      0.88       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.96      0.91      0.94       251
        rec.sport.hockey       0.88      0.99      0.93       233
               sci.crypt       0.73      0.98      0.83       238
         sci.electronics       0.91      0.83      0.87       249
                 sci.med       0.97      0.92      0.95 

In [14]:
# Initialize CountVectorizer and TfidfVectorizer configured with English
# stop-word filtering, then train and evaluate one multinomial naive Bayes
# classifier per feature set.
# NOTE: CountVectorizer, TfidfVectorizer, MultinomialNB and
# classification_report are not imported in this cell; it relies on earlier
# cells having imported them.
count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
tfidf_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')

# Each vectorizer is fitted on the training documents; the test documents
# only go through transform().
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)

X_tfidf_filter_train = tfidf_filter_vec.fit_transform(X_train)
X_tfidf_filter_test = tfidf_filter_vec.transform(X_test)

# --- Stop-word-filtered term counts ---
mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train, y_train)
count_filter_y_pred = mnb_count_filter.predict(X_count_filter_test)

# print(...) with a single argument behaves identically on Python 2 and 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(mnb_count_filter.score(X_count_filter_test, y_test))
print(classification_report(y_test, count_filter_y_pred, target_names=news.target_names))

# --- Stop-word-filtered TF-IDF weights ---
mnb_tfidf_filter = MultinomialNB()
mnb_tfidf_filter.fit(X_tfidf_filter_train, y_train)
tfidf_filter_y_pred = mnb_tfidf_filter.predict(X_tfidf_filter_test)
print(mnb_tfidf_filter.score(X_tfidf_filter_test, y_test))
print(classification_report(y_test, tfidf_filter_y_pred, target_names=news.target_names))

0.8637521222410866
                          precision    recall  f1-score   support

             alt.atheism       0.85      0.89      0.87       201
           comp.graphics       0.62      0.88      0.73       250
 comp.os.ms-windows.misc       0.93      0.22      0.36       248
comp.sys.ibm.pc.hardware       0.62      0.88      0.73       240
   comp.sys.mac.hardware       0.93      0.85      0.89       242
          comp.windows.x       0.82      0.85      0.84       263
            misc.forsale       0.90      0.79      0.84       257
               rec.autos       0.91      0.91      0.91       238
         rec.motorcycles       0.98      0.94      0.96       276
      rec.sport.baseball       0.98      0.92      0.95       251
        rec.sport.hockey       0.92      0.99      0.95       233
               sci.crypt       0.91      0.97      0.93       238
         sci.electronics       0.87      0.89      0.88       249
                 sci.med       0.94      0.95      0.95 