In [0]:
%matplotlib inline


# Classification of text documents using sparse features


This is an example showing how scikit-learn can be used to classify documents
by topic using a bag-of-words approach. This example uses a scipy.sparse
matrix to store the features and demonstrates various classifiers that can
efficiently handle sparse matrices.

The dataset used in this example is the 20 newsgroups dataset. It will be
automatically downloaded, then cached.

The bar plot indicates the accuracy, training time (normalized) and test time
(normalized) of each classifier.

### Modified by Katy Gero


In [0]:
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Mathieu Blondel <mathieu@mblondel.org>
#         Lars Buitinck
# License: BSD 3 clause
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

In [3]:
# Newsgroups requested from the loader through `categories=`.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Forwarded unchanged as `remove=` for both subsets.
remove = ('headers', 'footers', 'quotes')

# Load the 'train' subset first and the 'test' subset second, using the same
# options (fixed shuffle seed included) for both.
data_train, data_test = (
    fetch_20newsgroups(subset=subset, categories=categories, shuffle=True,
                       random_state=42, remove=remove)
    for subset in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
# Label names as reported by the loaded training set. Their order can differ
# from the order of `categories`, so this list (not `categories`) is the one
# to use when turning label indices into names.
target_names = data_train.target_names

In [5]:
def size_mb(docs):
    """Return the total UTF-8 encoded size of the strings in ``docs``, in units of 1e6 bytes."""
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode('utf-8'))
    return total_bytes / 1e6

# Measure both splits up front; the sizes are reused later to report
# vectorizer throughput in MB/s.
data_train_size_mb, data_test_size_mb = (
    size_mb(split.data) for split in (data_train, data_test)
)

print("%d documents - %0.3fMB (training set)"
      % (len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)"
      % (len(data_test.data), data_test_size_mb))
print("%d categories" % len(target_names))
print()

# Last expression of the cell: display the first raw training document.
data_train.data[0]

2034 documents - 2.428MB (training set)
1353 documents - 1.800MB (test set)
4 categories



"Hi,\n\nI've noticed that if you only save a model (with all your mapping planes\npositioned carefully) to a .3DS file that when you reload it after restarting\n3DS, they are given a default position and orientation.  But if you save\nto a .PRJ file their positions/orientation are preserved.  Does anyone\nknow why this information is not stored in the .3DS file?  Nothing is\nexplicitly said in the manual about saving texture rules in the .PRJ file. \nI'd like to be able to read the texture rule information, does anyone have \nthe format for the .PRJ file?\n\nIs the .CEL file format available from somewhere?\n\nRych"

In [0]:
# Class labels for the two splits. Nothing is split here: the train/test
# partition comes from the `subset=` argument used when the data was loaded.
y_train, y_test = data_train.target, data_test.target

In [7]:


print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
# Upper bound on the vocabulary size, passed to the vectorizer as `max_features`.
num_features = 100

vectorizer = CountVectorizer(stop_words='english', max_features=num_features)
# Fit the vocabulary on the training documents only.
X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
# `transform` (not `fit_transform`): reuse the vocabulary learned above.
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`. Support both, and keep `feature_names` a
# plain list so downstream indexing and display behave the same either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

Extracting features from the training data using a sparse vectorizer
done in 0.298565s at 8.132MB/s
n_samples: 2034, n_features: 100

Extracting features from the test data using the same vectorizer
done in 0.154824s at 11.624MB/s
n_samples: 1353, n_features: 100



In [8]:
# Show the first ten entries of the vectorizer's feature-name list.
feature_names[:10]

['10',
 'actually',
 'argument',
 'atheism',
 'available',
 'based',
 'believe',
 'better',
 'bible',
 'bit']

In [9]:
# Display the first training document next to its feature row, densified so
# the individual counts are visible.
data_train.data[0], X_train[0].todense()

("Hi,\n\nI've noticed that if you only save a model (with all your mapping planes\npositioned carefully) to a .3DS file that when you reload it after restarting\n3DS, they are given a default position and orientation.  But if you save\nto a .PRJ file their positions/orientation are preserved.  Does anyone\nknow why this information is not stored in the .3DS file?  Nothing is\nexplicitly said in the manual about saving texture rules in the .PRJ file. \nI'd like to be able to read the texture rule information, does anyone have \nthe format for the .PRJ file?\n\nIs the .CEL file format available from somewhere?\n\nRych",
 matrix([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
          0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
          0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]]))

In [0]:
def trim(s):
    """Shorten ``s`` to at most 80 characters, ending a cut string with "..."."""
    if len(s) > 80:
        # Keep 77 characters so that the ellipsis brings the total to 80.
        return s[:77] + "..."
    return s


# #############################################################################
# Benchmark classifiers
def benchmark(clf, print_report=True, print_cm=True, print_top10=True):
    """Fit ``clf`` on the training data, score it on the test data, and print a report.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``feature_names`` and ``target_names``; they must be defined
    before this function is called.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. If it has a ``coef_``
        attribute after fitting, its dimensionality and density are printed.
    print_report : bool, default True
        Print ``metrics.classification_report`` for the test predictions.
    print_cm : bool, default True
        Print ``metrics.confusion_matrix`` for the test predictions.
    print_top10 : bool, default True
        Print the ten highest-weighted features per class. Only applies when
        ``clf`` has ``coef_`` and ``feature_names`` is not None.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)``: the text of ``str(clf)``
        before the first ``'('``, the test accuracy, and the wall-clock fit
        and predict durations in seconds.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        # Read the attribute once; on some estimators it is a computed property.
        coef = clf.coef_
        print("dimensionality: %d" % coef.shape[1])
        print("density: %f" % density(coef))

        if print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            if coef.shape[0] == 1 and len(target_names) == 2:
                # Two-class problem: scikit-learn linear models store a single
                # weight row whose large values favour the second class, so
                # indexing one row per label would raise IndexError.
                rows = [(0, target_names[1])]
            else:
                rows = enumerate(target_names)
            for row, label in rows:
                # argsort is ascending, so the last ten indices are the
                # largest weights, strongest last.
                top10_idx = np.argsort(coef[row])[-10:]
                top10_wrd = [feature_names[j] for j in top10_idx]
                print(trim("%s: %s" % (label, " ".join(top10_wrd))))
        print()

    if print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    # Short display name: everything before the constructor arguments.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [11]:
# Fit and evaluate Bernoulli naive Bayes with alpha=0.01. The returned
# (name, accuracy, train time, test time) tuple is shown as the cell output.
benchmark(BernoulliNB(alpha=.01))

________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
train time: 0.005s
test time:  0.001s
accuracy:   0.588
dimensionality: 100
density: 1.000000
top 10 keywords per class:
alt.atheism: time does know god like say just think people don
comp.graphics: just don program need does use like know graphics thanks
sci.space: earth use time know think nasa don just like space
talk.religion.misc: say way like does know think god don just people

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.46      0.47      0.46       319
     comp.graphics       0.60      0.87      0.71       389
         sci.space       0.80      0.58      0.67       394
talk.religion.misc       0.44      0.31      0.37       251

          accuracy                           0.59      1353
         macro avg       0.57      0.56      0.55      1353
   

('BernoulliNB', 0.5875831485587583, 0.004828929901123047, 0.000988006591796875)

In [12]:
# Same benchmark with multinomial naive Bayes (alpha=0.01), for comparison
# with the Bernoulli model on the same count features.
benchmark(MultinomialNB(alpha=.01))

________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.003s
test time:  0.000s
accuracy:   0.621
dimensionality: 100
density: 1.000000
top 10 keywords per class:
alt.atheism: like believe say atheism does just think don people god
comp.graphics: images software files data use file jpeg edu graphics image
sci.space: just shuttle time orbit data like earth launch nasa space
talk.religion.misc: say know christian think just bible don jesus people god

classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.48      0.55      0.51       319
     comp.graphics       0.84      0.77      0.80       389
         sci.space       0.68      0.70      0.69       394
talk.religion.misc       0.39      0.37      0.38       251

          accuracy                           0.62      1353
         macro avg       0.60      0.60      0.60  

('MultinomialNB',
 0.6208425720620843,
 0.0033359527587890625,
 0.0003676414489746094)