In [23]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
import numpy as np

In [24]:
# Newsgroup labels to load, one per line, in the order they are passed to the loader.
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'misc.forsale',
    'talk.politics.misc',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.religion.misc',
    'alt.atheism',
    'soc.religion.christian',
]

# Fetch the 'train' and 'test' subsets with the same category filter,
# shuffling and seed so both splits are loaded consistently.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [25]:
def printResults(classifier_name, predicted_data, test_data):
    """Print a classification report and the overall accuracy for one classifier.

    Parameters
    ----------
    classifier_name : str
        Label used in the printed header and in the accuracy line.
    predicted_data : array-like
        Predicted class labels, one per test sample.
    test_data : object
        Test split exposing ``target`` (the true labels) and ``target_names``
        (the class names handed to the classification report).

    Returns
    -------
    None
        The results are printed, not returned.
    """
    # Coerce both label sequences to arrays so the comparison is always
    # element-wise. Comparing two plain lists with == yields a single bool,
    # which would make the reported accuracy collapse to 0.0 or 1.0.
    predicted_labels = np.asarray(predicted_data)
    true_labels = np.asarray(test_data.target)
    accuracy = np.mean(predicted_labels == true_labels)

    print(f"Metrics Results for {classifier_name}")
    print(metrics.classification_report(test_data.target, predicted_data,
                                        target_names=test_data.target_names))
    print(f"The {classifier_name} classifier was able to recognize the test set "
          f"with this accuracy: {accuracy}")

In [26]:
def comparePlainClassifiers(train_data, test_data):
    """Fit three text classifiers with default settings and print their test metrics.

    Each classifier is wrapped in the same bag-of-words pipeline
    (``CountVectorizer`` -> ``TfidfTransformer`` -> classifier), fitted on the
    training split, and evaluated on the test split through ``printResults``.
    The classifiers are reported in this order: MultinomialNB, SGDClassifier,
    RidgeClassifier.

    Parameters
    ----------
    train_data : object
        Training split exposing ``data`` (raw documents) and ``target`` (labels).
    test_data : object
        Test split exposing ``data`` and whatever ``printResults`` reads from it.

    Returns
    -------
    None
    """
    classifiers = [
        ("MultinomialNB", MultinomialNB()),
        # Seeded so repeated runs print the same metrics; 42 matches the seed
        # used when the dataset is loaded.
        ("SGDClassifier", SGDClassifier(random_state=42)),
        ("RidgeClassifier", RidgeClassifier()),
    ]

    for classifier_name, classifier in classifiers:
        text_pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])
        text_pipeline.fit(train_data.data, train_data.target)
        printResults(classifier_name, text_pipeline.predict(test_data.data), test_data)

In [27]:
def compareFeaturesClassifier(train_data, test_data):
    """Fit the three text classifiers with tuned hyperparameters and print their metrics.

    Uses the same pipeline shape as the default-settings comparison
    (``CountVectorizer`` -> ``TfidfTransformer`` -> classifier), but passes
    explicit hyperparameters to MultinomialNB and SGDClassifier so the output
    is not a repeat of the default run. RidgeClassifier keeps its defaults.
    Results are printed through ``printResults`` in this order:
    MultinomialNB, SGDClassifier, RidgeClassifier.

    Parameters
    ----------
    train_data : object
        Training split exposing ``data`` (raw documents) and ``target`` (labels).
    test_data : object
        Test split exposing ``data`` and whatever ``printResults`` reads from it.

    Returns
    -------
    None
    """
    # NOTE(review): these values were previously present only as comments in
    # this function and never applied, which made it an exact duplicate of
    # comparePlainClassifiers. Confirm they are the intended settings.
    classifiers = [
        ("MultinomialNB", MultinomialNB(alpha=1e-3, fit_prior=False)),
        ("SGDClassifier", SGDClassifier(loss='hinge', penalty='l2',
                                        alpha=1e-3, random_state=42,
                                        max_iter=5, tol=None)),
        ("RidgeClassifier", RidgeClassifier()),
    ]

    for classifier_name, classifier in classifiers:
        text_pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])
        text_pipeline.fit(train_data.data, train_data.target)
        printResults(classifier_name, text_pipeline.predict(test_data.data), test_data)

In [28]:
# Run the first comparison on the loaded train/test splits; each classifier's
# report and accuracy are printed as cell output.
comparePlainClassifiers(twenty_train,twenty_test)

# Run the second comparison function on the same splits.
compareFeaturesClassifier(twenty_train,twenty_test)

# --- Disabled scratch code below (kept for reference, never executed) ---
# NOTE(review): it refers to docs_new, predicted and text_clf, none of which
# are defined in the cells shown here, so it will not run if uncommented as-is.

# for doc, category in zip(docs_new, predicted):
#     print('%r => %s' % (doc, twenty_train.target_names[category]))

# from sklearn import metrics
# print(metrics.classification_report(twenty_test.target, predicted,
#     target_names=twenty_test.target_names))

# print(metrics.confusion_matrix(twenty_test.target, predicted))

# from sklearn.model_selection import GridSearchCV
# parameters = {
#     'vect__ngram_range': [(1, 1), (1, 2)],
#     'tfidf__use_idf': (True, False),
#     'clf__alpha': (1e-2, 1e-3),
# }

# gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

# gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

# twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

# print(gs_clf.best_score_)

# for param_name in sorted(parameters.keys()):
#     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

# print(gs_clf.cv_results_)


Metrics Results for MultinomialNB
                          precision    recall  f1-score   support

             alt.atheism       0.80      0.52      0.63       319
           comp.graphics       0.81      0.65      0.72       389
 comp.os.ms-windows.misc       0.82      0.65      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       392
   comp.sys.mac.hardware       0.86      0.77      0.81       385
          comp.windows.x       0.89      0.75      0.82       395
            misc.forsale       0.93      0.69      0.80       390
               rec.autos       0.85      0.92      0.88       396
         rec.motorcycles       0.94      0.93      0.93       398
      rec.sport.baseball       0.92      0.90      0.91       397
        rec.sport.hockey       0.89      0.97      0.93       399
               sci.crypt       0.59      0.97      0.74       396
         sci.electronics       0.84      0.60      0.70       393
                 sci.med       0.92      