# classify documents on 20 topics
## via scikit-learn
1. The raw data are newsgroup posts from 20 topics.
2. Features extracted from the documents include word counts, term frequency (tf), and term frequency times inverse document frequency (tf-idf).
3. Classifiers include Naive Bayes, Support Vector Machine, and Random Forest.
4. Classifier performance is measured by precision, recall, and F1-score.

## note:
1. Word counts (the bag-of-words matrix) must be the input to the tf / tf-idf transformer.

In [59]:
import warnings
# Ignore every warning from here on so that library notices do not clutter the
# cell outputs. NOTE(review): this blanket filter also hides deprecation
# notices; narrow it if a specific warning needs to be seen.
warnings.filterwarnings('ignore')

In [60]:
# get data
from sklearn.datasets import fetch_20newsgroups
# get functions used for pre-processing raw data
from sklearn.feature_extraction.text import CountVectorizer # for count of words
from sklearn.feature_extraction.text import TfidfTransformer # for tf or tf-idf
# get classifiers
from sklearn.naive_bayes import MultinomialNB # Naive Bayes
from sklearn.linear_model import SGDClassifier # SVM
from sklearn.ensemble import RandomForestClassifier # random forest
# get measurement
from sklearn import metrics

In [61]:
## 1. get all data, and list 20 topics
# Pin the shuffle seed explicitly (as the test-set load does) so the document
# order is reproducible and does not depend on the library default.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

In [62]:
# number of documents in the training split
len(twenty_train.data)

11314

In [63]:
# topic names; an integer label from twenty_train.target indexes into this list
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [64]:
# one sample of the text: the first training document, shown as its raw string
twenty_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [65]:
# look up the topic name of the first training document via its integer label
first_label = twenty_train.target[0]
twenty_train.target_names[first_label]

'rec.autos'

In [66]:
# distinct integer labels that occur in the training targets
set(twenty_train.target)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

In [67]:
## 2. pre-process raw text
### count of words / bag of words
# the vectorizer is fitted on the training documents only and reused later
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
### term frequency (idf weighting switched off)
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
### term frequency times inverse document frequency
# this fitted transformer is the one applied to new and test documents
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

## A. Naive Bayes

In [68]:
## 3. deploy classifiers
### NB
# train multinomial Naive Bayes on the tf-idf features of the training split
nb_estimator = MultinomialNB()
clf_NB = nb_estimator.fit(X_train_tfidf, twenty_train.target)

In [69]:
## 4. example:
##           predict the topic of two new documents
##           with the Naive Bayes classifier on tf-idf features
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# new text goes through the already-fitted vectorizer and tf-idf transformer
# (transform only, nothing is refitted)
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

ex_predicted = clf_NB.predict(X_new_tfidf)

for idx, doc in enumerate(docs_new):
    # map the predicted integer label back to its topic name
    topic = twenty_train.target_names[ex_predicted[idx]]
    print('%r => %s' % (doc, topic))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos


In [70]:
## 5. measure the performance of the selected classifier
# load the test split and push it through the vectorizer / tf-idf transformer
# fitted on the training data (transform only, nothing is refitted)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
twenty_test_counts = count_vect.transform(twenty_test.data)
twenty_test_tfidf = tfidf_transformer.transform(twenty_test_counts)

predicted = clf_NB.predict(twenty_test_tfidf)

# per-topic precision / recall / F1 for Naive Bayes
report_nb = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report_nb)

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.52      0.63       319
           comp.graphics       0.81      0.65      0.72       389
 comp.os.ms-windows.misc       0.82      0.65      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       392
   comp.sys.mac.hardware       0.86      0.77      0.81       385
          comp.windows.x       0.89      0.75      0.82       395
            misc.forsale       0.93      0.69      0.80       390
               rec.autos       0.85      0.92      0.88       396
         rec.motorcycles       0.94      0.93      0.93       398
      rec.sport.baseball       0.92      0.90      0.91       397
        rec.sport.hockey       0.89      0.97      0.93       399
               sci.crypt       0.59      0.97      0.74       396
         sci.electronics       0.84      0.60      0.70       393
                 sci.med       0.92      0.74      0.82       396
         

In [58]:
#metrics.confusion_matrix(twenty_test.target, predicted)

## B. Support Vector Machine

In [72]:

# Linear SVM objective (hinge loss + L2 penalty) optimized with SGD.
# NOTE(review): presumably tol=None means no early stopping, so max_iter=5
# fixes the number of passes over the data; confirm against the library docs.
svm_estimator = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf_SVM = svm_estimator.fit(X_train_tfidf, twenty_train.target)

predicted_svm = clf_SVM.predict(twenty_test_tfidf)

# per-topic precision / recall / F1 for the SVM
report_svm = metrics.classification_report(
    twenty_test.target,
    predicted_svm,
    target_names=twenty_test.target_names,
)
print(report_svm)

                          precision    recall  f1-score   support

             alt.atheism       0.73      0.71      0.72       319
           comp.graphics       0.78      0.72      0.75       389
 comp.os.ms-windows.misc       0.73      0.78      0.75       394
comp.sys.ibm.pc.hardware       0.74      0.67      0.70       392
   comp.sys.mac.hardware       0.81      0.83      0.82       385
          comp.windows.x       0.84      0.76      0.80       395
            misc.forsale       0.84      0.90      0.87       390
               rec.autos       0.91      0.90      0.90       396
         rec.motorcycles       0.93      0.96      0.95       398
      rec.sport.baseball       0.88      0.90      0.89       397
        rec.sport.hockey       0.88      0.99      0.93       399
               sci.crypt       0.84      0.96      0.90       396
         sci.electronics       0.83      0.62      0.71       393
                 sci.med       0.87      0.86      0.87       396
         

## C. Random Forest

In [74]:
# Seed the forest so the fitted model, and therefore the printed report, is the
# same on every run (the SGD classifier and the data loads are seeded too).
# NOTE: an unseeded run prints slightly different scores each time, so numbers
# recorded from earlier runs need not match exactly.
rf = RandomForestClassifier(random_state=42).fit(X_train_tfidf, twenty_train.target)
predicted_rf = rf.predict(twenty_test_tfidf)

# per-topic precision / recall / F1 for the random forest
print(metrics.classification_report(twenty_test.target, predicted_rf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.71      0.62      0.66       319
           comp.graphics       0.58      0.70      0.64       389
 comp.os.ms-windows.misc       0.63      0.74      0.68       394
comp.sys.ibm.pc.hardware       0.66      0.65      0.65       392
   comp.sys.mac.hardware       0.70      0.78      0.74       385
          comp.windows.x       0.74      0.69      0.71       395
            misc.forsale       0.75      0.89      0.81       390
               rec.autos       0.81      0.79      0.80       396
         rec.motorcycles       0.89      0.90      0.90       398
      rec.sport.baseball       0.80      0.90      0.85       397
        rec.sport.hockey       0.89      0.92      0.91       399
               sci.crypt       0.86      0.91      0.89       396
         sci.electronics       0.66      0.46      0.54       393
                 sci.med       0.83      0.68      0.75       396
         