In [4]:
from sklearn.datasets import fetch_20newsgroups

# Meeting data

In [5]:
news = fetch_20newsgroups(subset='all')

In [6]:
print type(news.data)

<type 'list'>


In [7]:
print type(news.target)

<type 'numpy.ndarray'>


In [8]:
print type(news.target_names)

<type 'list'>


In [9]:
print len(news.data), len(news.target)

18846 18846


In [10]:
print news.data[0]

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [11]:
print news.target[0]

10


In [12]:
print news.target_names[news.target[0]]

rec.sport.hockey


# Preprocessing Data

`fetch_20newsgroups` shuffles the posts by default, so the data is already in random order and we only need to split it: for example, 75 percent for training and the remaining 25 percent for testing.

In [13]:
# Fraction of the samples used for training; the remainder is held out for testing.
SPLT_RATIO = 0.75
# Index of the first test sample (truncated to a whole number of samples).
split_size = int(SPLT_RATIO * len(news.data))

In [14]:
# Positional split: the first `split_size` samples train, everything after tests.
X_train, X_test = news.data[:split_size], news.data[split_size:]
y_train, y_test = news.target[:split_size], news.target[split_size:]

## TF-IDF

Term Frequency * Inverse Document Frequency

**Term Frequency** = number of times the term occurs in this document

**Inverse Document Frequency** = log(#docs / (1 + #docs containing this term))

# Training a Naive Bayes classifier

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

In [32]:
# Baseline pipeline: raw term counts fed to a multinomial Naive Bayes classifier.
clf_countVect = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [40]:
# Hashed-feature pipeline. non_negative=True keeps the hashed features
# non-negative, as MultinomialNB expects.
clf_hashingVect = Pipeline(steps=[
    ('vect', HashingVectorizer(non_negative=True)),
    ('clf', MultinomialNB()),
])

In [41]:
# TF-IDF pipeline with the vectorizer's default settings.
clf_tfidfVect = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [42]:
from sklearn.cross_validation import cross_val_score, KFold

In [43]:
from scipy.stats import sem
import numpy as np

In [44]:
def evaluate_cross_validation(clf, X, y, K):
    """Print the K-fold cross-validation scores of ``clf`` on ``(X, y)``.

    The samples are divided into ``K`` shuffled folds using a fixed
    ``random_state=0``. The array of per-fold scores is printed first,
    followed by their mean and the standard error of the mean.
    Nothing is returned.
    """
    folds = KFold(len(y), K, shuffle=True, random_state=0)
    fold_scores = cross_val_score(clf, X, y, cv=folds)
    print(fold_scores)
    summary = "Mean score:{0:.3f} (+/- {1:.3f})".format(
        np.mean(fold_scores), sem(fold_scores))
    print(summary)

In [45]:
clfs = [clf_countVect, clf_hashingVect, clf_tfidfVect]

In [46]:
for clf in clfs:
    evaluate_cross_validation(clf, news.data, news.target, 5)

[ 0.85782493  0.85725657  0.84664367  0.85911382  0.8458477 ]
Mean score:0.853 (+/- 0.003)
[ 0.75543767  0.77659857  0.77049615  0.78508888  0.76200584]
Mean score:0.770 (+/- 0.005)
[ 0.84482759  0.85990979  0.84558238  0.85990979  0.84213319]
Mean score:0.850 (+/- 0.004)


**Restrict tokens with a regular expression**

In [47]:
clf_tfidfVect_new = Pipeline([
        ('vect', TfidfVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
        ('clf', MultinomialNB())
    ])

In [48]:
evaluate_cross_validation(clf_tfidfVect_new, news.data, news.target, 5)

[ 0.86100796  0.8718493   0.86203237  0.87291059  0.8588485 ]
Mean score:0.865 (+/- 0.003)


**Remove stop words (uninformative words)**

In [49]:
def get_stopword(path='./stopwords_en.txt'):
    """Load a stop-word list from a text file containing one word per line.

    Parameters
    ----------
    path : str, optional
        Location of the stop-word file. Defaults to ``./stopwords_en.txt``,
        resolved against the current working directory.

    Returns
    -------
    set
        The distinct words in the file with surrounding whitespace removed.
        Blank lines are ignored.

    Raises
    ------
    IOError
        If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails, instead of
    # leaving the file open until garbage collection.
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        # Drop empty entries so a blank line does not add '' to the set.
        return set(word for word in stripped if word)

In [51]:
clf_tfidfVect_new_sw = Pipeline([
        ('vect', TfidfVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",stop_words=get_stopword())),
        ('clf', MultinomialNB())
    ])

In [52]:
evaluate_cross_validation(clf_tfidfVect_new_sw, news.data, news.target, 5)

[ 0.88116711  0.89519767  0.88325816  0.89227912  0.88113558]
Mean score:0.887 (+/- 0.003)


**Adjust the Naive Bayes Algorithm**

In [53]:
clf_tfidfVect_final = Pipeline([
        ('vect', TfidfVectorizer(token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",stop_words=get_stopword())),
        ('clf', MultinomialNB(alpha=0.01))
    ])

In [54]:
evaluate_cross_validation(clf_tfidfVect_final, news.data, news.target, 5)

[ 0.9204244   0.91960732  0.91828071  0.92677103  0.91854603]
Mean score:0.921 (+/- 0.002)


# Evaluating the performance

In [55]:
from sklearn import metrics

In [56]:
def train_evaluate(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training data and print an evaluation summary.

    After fitting, the following are printed in order: ``clf.score`` on the
    training set, ``clf.score`` on the test set, the classification report
    for the test-set predictions, and their confusion matrix.
    ``clf`` is fitted in place; nothing is returned.
    """
    clf.fit(X_train, y_train)

    accuracy_sections = (
        ("Accuracy on training set:", X_train, y_train),
        ("Accuracy on testing set:", X_test, y_test),
    )
    for heading, features, labels in accuracy_sections:
        print(heading)
        print(clf.score(features, labels))

    predictions = clf.predict(X_test)

    print("Classfication Report:")
    print(metrics.classification_report(y_test, predictions))

    print("Confusion matrix:")
    print(metrics.confusion_matrix(y_test, predictions))

In [57]:
train_evaluate(clf_tfidfVect_final, X_train, y_train, X_test, y_test)

Accuracy on training set:
0.996957690675
Accuracy on testing set:
0.917869269949
Classfication Report:
             precision    recall  f1-score   support

          0       0.95      0.88      0.91       216
          1       0.85      0.85      0.85       246
          2       0.91      0.84      0.87       274
          3       0.81      0.86      0.83       235
          4       0.88      0.90      0.89       231
          5       0.89      0.91      0.90       225
          6       0.88      0.80      0.84       248
          7       0.92      0.93      0.93       275
          8       0.96      0.98      0.97       226
          9       0.97      0.94      0.96       250
         10       0.97      1.00      0.98       257
         11       0.97      0.97      0.97       261
         12       0.90      0.91      0.91       216
         13       0.94      0.95      0.95       257
         14       0.94      0.97      0.95       246
         15       0.90      0.96      0.93      