In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
news = fetch_20newsgroups(subset='all')

In [4]:
print(len(news.data))

18846


In [5]:
print(len(news.target_names))

20


In [6]:
print(news.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [10]:
from sklearn.model_selection import train_test_split

In [17]:
def train_test(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=48)
    classifier.fit(X_train, y_train)
    print("classifier accuracy is {}".format(classifier.score(X_test, y_test)))
    return classifier

In [18]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
trial1 = Pipeline([("vectorizer", TfidfVectorizer()),
                 ("classifier", MultinomialNB())])

In [20]:
train_test(trial1, news.data, news.target)

classifier accuracy is 0.8535653650254669


Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', MultinomialNB())])

In [21]:
from nltk.corpus import stopwords

In [24]:
trial2 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words('english'))),
                 ("classifier", MultinomialNB())])

In [25]:
train_test(trial2, news.data, news.target)

classifier accuracy is 0.8828522920203735


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', MultinomialNB())])

In [29]:
trial3 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words('english'))),
                 ("classifier", MultinomialNB(alpha=0.05))])

In [30]:
train_test(trial3, news.data, news.target)

classifier accuracy is 0.91553480475382


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', MultinomialNB(alpha=0.05))])

In [31]:
trial4 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words('english'), min_df=5)),
                 ("classifier", MultinomialNB(alpha=0.05))])

In [32]:
train_test(trial4, news.data, news.target)

classifier accuracy is 0.9100169779286927


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(min_df=5,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', MultinomialNB(alpha=0.05))])

In [36]:
import string
from nltk import word_tokenize
from nltk.stem import PorterStemmer

In [63]:
def stemming_tokenizer(doc):
    tokens = word_tokenize(doc)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

In [64]:
trial5 = Pipeline([("vectorizer", TfidfVectorizer(tokenizer=stemming_tokenizer,
                                                  stop_words=stopwords.words('english')+list(string.punctuation), min_df=5)),
                 ("classifier", MultinomialNB(alpha=0.05))])

In [65]:
train_test(trial5, news.data, news.target)

  'stop_words.' % sorted(inconsistent))


KeyboardInterrupt: 

In [66]:
from sklearn import svm

In [71]:
trial5 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words('english')+list(string.punctuation), min_df=5)),
                 ("classifier", svm.LinearSVC())])

In [72]:
train_test(trial5, news.data, news.target)

classifier accuracy is 0.9276315789473685


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(min_df=5,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', LinearSVC())])

In [74]:
from sklearn.ensemble import RandomForestClassifier

In [77]:
trial6 = Pipeline([("vectorizer", TfidfVectorizer(stop_words=stopwords.words('english')+list(string.punctuation), min_df=5)),
                 ("classifier", RandomForestClassifier())])

In [78]:
train_test(trial6, news.data, news.target)

classifier accuracy is 0.8438030560271647


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(min_df=5,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('classifier', RandomForestClassifier())])