In [1]:
# Using a bag-of-words approach to classify documents by topics

In [4]:
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

In [8]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

from tqdm import tqdm

In [6]:
# Emit INFO-and-above log records, each prefixed with a timestamp and the
# level name. No `stream` is passed, so the handler installed by basicConfig
# writes to stderr (the logging module's default), not stdout.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

In [10]:
# Fetch the train and test splits of the 20 newsgroups corpus. No `categories`
# filter is passed, so every newsgroup is kept; the fixed random_state makes
# the shuffled document order repeatable.
newsgroups_options = dict(data_home='./', shuffle=True, random_state=42)
data_train = fetch_20newsgroups(subset='train', **newsgroups_options)
data_test = fetch_20newsgroups(subset='test', **newsgroups_options)





In [17]:
# Build a hashing vectorizer: English stop words are dropped and every
# remaining token is hashed into one of 500 feature columns. Note that this
# does NOT keep the 500 most frequent words -- distinct words may collide in
# the same column.
vectorizer = HashingVectorizer(
    n_features=500,
    stop_words='english',
    alternate_sign=False,
)

In [18]:
# Turn the raw training documents into the hashed feature matrix.
train_documents = data_train.data
X_train = vectorizer.fit_transform(train_documents)

In [19]:
# Reuse the very same vectorizer object that produced X_train so that both
# matrices share one feature space.
test_documents = data_test.data
X_test = vectorizer.transform(test_documents)

In [20]:
# Class labels for each split, taken from the loaded datasets.
y_train = data_train.target
y_test = data_test.target

In [23]:
# Report the shape of every feature matrix and label vector, one per line.
shape_report = (
    ('X_train shape: {}', X_train),
    ('X_test shape: {}', X_test),
    ('y_train shape: {}', y_train),
    ('y_test shape: {}', y_test),
)
for template, array in shape_report:
    print(template.format(array.shape))

X_train shape: (11314, 500)
X_test shape: (7532, 500)
y_train shape: (11314,)
y_test shape: (7532,)


In [40]:
# benchmark classifiers
def benchmark(clf):
    """Fit ``clf`` on the training split and evaluate it on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` and prints timings, accuracy, a per-class report and the
    confusion matrix as it goes.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)``: the class name of
        ``clf``, the test-set accuracy, and the wall-clock seconds spent in
        ``fit`` and ``predict`` respectively.
    """
    print('-'*80)
    print('Training:')
    print(clf)
    # training
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    
    # predicting
    t0 = time()
    y_pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)
    
    score = metrics.accuracy_score(y_test, y_pred)
    print("accuracy: %0.3f" % score)
    
    # Only models that expose a coefficient matrix (the linear ones here)
    # have a dimensionality/density worth reporting.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        
    print("classification report: ")
    print(metrics.classification_report(y_test, y_pred))
    
    print("confusion matrix")
    print(metrics.confusion_matrix(y_test, y_pred))
    
    # Describe the estimator that was actually benchmarked. The previous code
    # derived this from the SGDClassifier class itself, so every result row
    # carried the same (wrong) label regardless of `clf`.
    clf_descr = type(clf).__name__
    return clf_descr, score, train_time, test_time

In [41]:
# learn models
# Benchmark each estimator in turn and collect the
# (description, accuracy, train time, test time) tuple returned by benchmark().
#
# - `max_iter` replaces `n_iter`, which was deprecated in scikit-learn 0.19
#   and removed in 0.21 (passing it raises TypeError on current releases).
# - `random_state` is pinned on the estimators that shuffle or sample
#   internally so that re-running the notebook gives the same scores.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(max_iter=50, random_state=42), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50, random_state=42),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100, random_state=42),
         "Random Forest")):
    print('='*80)
    print(name)
    results.append(benchmark(clf))

Ridge Classifier
--------------------------------------------------------------------------------
Training:
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='lsqr',
        tol=0.01)




train time: 4.006s
test time: 0.009s
accuracy: 0.539
dimensionality: 500
density: 1.000000
classification report: 
             precision    recall  f1-score   support

          0       0.56      0.47      0.51       319
          1       0.46      0.43      0.44       389
          2       0.57      0.60      0.58       394
          3       0.47      0.44      0.46       392
          4       0.49      0.49      0.49       385
          5       0.49      0.44      0.46       395
          6       0.57      0.64      0.60       390
          7       0.55      0.55      0.55       396
          8       0.58      0.69      0.63       398
          9       0.56      0.63      0.59       397
         10       0.59      0.73      0.65       399
         11       0.65      0.70      0.68       396
         12       0.42      0.23      0.29       393
         13       0.48      0.38      0.43       396
         14       0.52      0.66      0.58       394
         15       0.57      0.73    

   11  72]]
kNN
--------------------------------------------------------------------------------
Training:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')
train time: 0.007s
test time: 8.463s
accuracy: 0.493
classification report: 
             precision    recall  f1-score   support

          0       0.45      0.65      0.53       319
          1       0.38      0.40      0.39       389
          2       0.20      0.51      0.29       394
          3       0.44      0.42      0.43       392
          4       0.39      0.32      0.35       385
          5       0.46      0.42      0.44       395
          6       0.46      0.36      0.40       390
          7       0.53      0.45      0.49       396
          8       0.66      0.67      0.66       398
          9       0.52      0.51      0.51       397
         10       0.67      0.67      0.67       399
         11   

4