In [3]:
from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

In [10]:
# Newsgroups used for this experiment; passed to fetch_20newsgroups below.
categories = [
    'rec.sport.hockey',
    'sci.med',
    'soc.religion.christian',
    'talk.religion.misc',
]

In [11]:
# Parts of each post that fetch_20newsgroups is asked to remove.
remove = ('headers', 'footers', 'quotes')

# Both splits are fetched with identical settings; only `subset` differs.
# The generator is consumed in order, so 'train' is fetched before 'test'.
data_train, data_test = (
    fetch_20newsgroups(subset=subset_name, categories=categories,
                       shuffle=True, random_state=42, remove=remove)
    for subset_name in ('train', 'test')
)

# Label names, as reported by the training split.
target_names = data_train.target_names

In [15]:
def size_mb(docs):
    """Return the combined UTF-8 encoded size of ``docs`` in megabytes.

    Parameters
    ----------
    docs : iterable of text strings
        Each item must support ``.encode('utf-8')``.

    Returns
    -------
    float
        Total number of encoded bytes divided by 1e6.
    """
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode('utf-8'))
    return total_bytes / 1e6

# Size of each split's raw text, in megabytes.
data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)

# Summary of the loaded corpus: document count and size per split, then the
# number of requested categories.
print("%d documents - %0.3fMB (training set)"
      % (len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)"
      % (len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
print()

2170 documents - 2.918MB (training set)
1444 documents - 1.954MB (test set)
4 categories



In [16]:
# Labels for each split, taken from the fetched datasets' `target` attribute.
y_train = data_train.target
y_test = data_test.target

In [18]:
# The vectorizer is fitted on the training documents only; the test documents
# are then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

# Feature names as reported by the fitted vectorizer.
feature_names = vectorizer.get_feature_names()

In [42]:
# NOTE(review): `benchmark` is defined in a later cell of this notebook; that
# cell must have been executed before this one.
#
# The random forest and SGD classifiers are randomized, so their seeds are
# pinned (42, the same seed used for the dataset shuffle) to make the reported
# scores repeatable from run to run.
#
# NOTE(review): `n_iter` and loss='log' are kept as written for the
# scikit-learn version this notebook was run with; later releases rename them
# (`max_iter`, 'log_loss') -- confirm when upgrading.
classifiers = [
    MultinomialNB(alpha=.01),
    BernoulliNB(alpha=.01),
    RandomForestClassifier(n_estimators=100, random_state=42),
    SGDClassifier(loss='log', alpha=.0001, n_iter=50, penalty="l2",
                  random_state=42),
    SGDClassifier(loss='hinge', alpha=.0001, n_iter=50, penalty="l2",
                  random_state=42),
]

# Benchmark each classifier in turn and show its (name, accuracy) pair.
for clf in classifiers:
    print(benchmark(clf))

________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
accuracy:   0.822
dimensionality: 28289
density: 1.000000
classification report:
                        precision    recall  f1-score   support

      rec.sport.hockey       0.89      0.96      0.92       399
               sci.med       0.92      0.87      0.89       396
soc.religion.christian       0.71      0.89      0.79       398
    talk.religion.misc       0.76      0.42      0.54       251

           avg / total       0.83      0.82      0.81      1444

confusion matrix:
[[382   5  10   2]
 [ 17 344  27   8]
 [ 15   4 356  23]
 [ 14  21 111 105]]

('MultinomialNB', 0.82202216066481992)
________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
accuracy:   0.801
dimensionality: 28289
density: 1.000000
classification report:


In [33]:
def benchmark(clf):
    """Fit ``clf`` on the training split and report its test-set performance.

    Relies on the notebook-level names ``X_train``, ``y_train``, ``X_test``,
    ``y_test`` and ``target_names``. ``clf`` is fitted in place.

    Printed, in order: a separator line, the estimator, its accuracy, the
    dimensionality and density of ``coef_`` (only when the fitted estimator
    has that attribute), the classification report and the confusion matrix.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(name, accuracy)`` where ``name`` is the text of ``str(clf)`` that
        precedes the first ``'('``.
    """
    separator = '_' * 80
    print(separator)
    print("Training: ")
    print(clf)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("accuracy:   %0.3f" % accuracy)

    # Only estimators that expose coefficients get the extra two lines.
    if hasattr(clf, 'coef_'):
        coefficients = clf.coef_
        print("dimensionality: %d" % coefficients.shape[1])
        print("density: %f" % density(coefficients))

    print("classification report:")
    report = metrics.classification_report(y_test, y_pred,
                                           target_names=target_names)
    print(report)

    print("confusion matrix:")
    confusion = metrics.confusion_matrix(y_test, y_pred)
    print(confusion)

    print()
    # Short label: everything in the estimator's string form before the '('.
    clf_descr = str(clf).partition('(')[0]
    return clf_descr, accuracy