In [1]:
from sklearn.datasets import fetch_20newsgroups


classes = ['sci.med', 'sci.space', 'alt.atheism']
D = fetch_20newsgroups(categories=classes)

print('Text:')
print(20 * '-')
print(D.data[0])
print()
print('Has the class:')
print(20 * '-')
print(classes[D.target[0] - 1])

Text:
--------------------
From: matthew@phantom.gatech.edu (Matthew DeLuca)
Subject: Re: Boom!  Whoosh......
Organization: The Dorsai Grey Captains
Lines: 13
NNTP-Posting-Host: oit.gatech.edu

In article <1993Apr21.024423.29182@mnemosyne.cs.du.edu> wdwells@nyx.cs.du.edu (David "Fuzzy" Wells) writes:

>I hear <insert favorite rumor here> that it will supposedly coincide
>with the Atlanta Olympics. 

Even worse, the city of Atlanta has a proposal before it to rent space on this
orbiting billboard.  Considering the caliber of people running this city, 
there's no telling what we're going to have leering down at us from orbit.
-- 
Matthew DeLuca
Georgia Institute of Technology, Atlanta Georgia, 30332
uucp:	  ...!{decvax,hplabs,ncar,purdue,rutgers}!gatech!prism!matthew
Internet: matthew@phantom.gatech.edu


Has the class:
--------------------
sci.space


In [2]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(D.data, D.target, test_size=.1, random_state=1)

### Preprocessing inputs

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

### Testing model

In [4]:
from sklearn.naive_bayes import MultinomialNB


model = MultinomialNB(alpha=0.1)
model.fit(X_train, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [5]:
from sklearn.model_selection import cross_val_score


print('Score on train set:', cross_val_score(model, X_train, y_train, cv=3).mean())
print('Score on test set:', cross_val_score(model, X_test, y_test, cv=3).mean())

Score on train set: 0.9813279466451199
Score on test set: 0.9218500797448165


### Testing by hand

In [6]:
def predict(text):
    return classes[model.predict(vectorizer.transform([
        text,
    ]).toarray())[0] - 1]

In [7]:
dd = [
    'Are antioxidants healthy?',
    'I want to find double stars.',
    'It\'s proving that god doesn\'t exists!'
]

for d in dd:
    print(d, '--->', predict(d))

Are antioxidants healthy? ---> sci.med
I want to find double stars. ---> sci.space
It's proving that god doesn't exists! ---> alt.atheism
