In [1]:
from sklearn.datasets import fetch_20newsgroups

  return f(*args, **kwds)
  return f(*args, **kwds)


In [4]:
# Work with four of the newsgroup categories only.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Fetch the training subset for those categories, shuffled with a fixed random_state.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [7]:
# A notebook cell only echoes its *last* bare expression, so the category
# names must be printed explicitly; otherwise their value is silently dropped.
print(twenty_train.target_names)
# Number of training documents loaded for the selected categories.
len(twenty_train.data)



2257

In [6]:
# Number of entries in the `filenames` attribute of the loaded training set.
len(twenty_train.filenames)

2257

In [26]:
# Show the first five lines of the first loaded training document.
print(*twenty_train.data[0].split("\n")[:5], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14


In [9]:
# Supervised learning algorithms require a category label for each document
# in the training set. In this case the category is the name of the newsgroup.

# For speed and space efficiency reasons scikit-learn loads the `target`
# attribute as an array of integers, each one being the index of the category
# name in the `target_names` list.
# Show the integer category ids of the first ten samples:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [10]:
# Translate the first ten integer labels back into their newsgroup names.
for label_id in twenty_train.target[:10]:
    category_name = twenty_train.target_names[label_id]
    print(category_name)

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [11]:
# Extracting features from text files
# -----------------------------------
# To perform machine learning on text documents, the text content first has
# to be turned into numerical feature vectors.
#
# Bag of words
# The most intuitive way to do so is the bag-of-words representation:
#   1. assign a fixed integer id to each word occurring in any document of the
#      training set (for instance by building a dictionary from words to
#      integer indices);
#   2. for each document i, count the number of occurrences of each word w and
#      store it in X[i, j] as the value of feature j, where j is the index of
#      word w in the dictionary.
# With this representation, n_features is the number of distinct words in the
# corpus.
#
# Tokenizing text with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# Fit the vectorizer on the raw training documents and keep the resulting matrix.
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Display the shape of that matrix.
X_train_counts.shape

(2257, 35788)

In [44]:
# Look up the entry stored for the word 'university' in the fitted vocabulary.
count_vect.vocabulary_.get('university')

33597

In [45]:
# From occurrences to frequencies
# Occurrence counts are a good start, but longer documents will have higher
# average count values than shorter ones even when they cover the same topics.
# To avoid this discrepancy, the number of occurrences of each word in a
# document can be divided by the total number of words in that document;
# the resulting features are called tf, for Term Frequencies.
# NOTE(review): the scaling actually applied here is whatever TfidfTransformer
# does with use_idf=False and its default normalization -- confirm against the
# scikit-learn docs that it matches the plain "count / total words" description.
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False)
# Fit on the count matrix and transform it in a single call.
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [18]:
# Build a transformer with its default arguments, then fit it on the count
# matrix and transform that same matrix in a single fit_transform call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Display the shape of the transformed matrix.
X_train_tfidf.shape

(2257, 35788)

In [16]:
#Training a classifier
#Now that we have our features, we can train a classifier to try to predict the category of a post

In [19]:
# Let's start with a naïve Bayes classifier, which provides a nice baseline for this task.
from sklearn.naive_bayes import MultinomialNB
# Train on the tf-idf features with the integer category ids as labels; the
# result of .fit(...) is what gets bound to `clf`.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [20]:
# Classify two unseen snippets. They go through the vectorizer and tf-idf
# transformer fitted earlier, so only transform (not fit_transform) is called.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Report each snippet next to the name of its predicted newsgroup.
for idx, doc in enumerate(docs_new):
    category = predicted[idx]
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [21]:
# Building a pipeline
# Chain the vectorizer, the tf-idf transformer and the classifier into a single
# object; 'vect', 'tfidf' and 'clf' are the labels given to the three steps.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [22]:
# Fit the pipeline on the raw training documents and their integer category ids.
text_clf.fit(twenty_train.data, twenty_train.target) 

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [23]:
# Evaluation of the performance on the test set
import numpy as np
# Fetch the test subset with the same categories and shuffle settings as the
# training subset.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Accuracy: the mean of the element-wise "prediction equals true label" mask.
hits = predicted == twenty_test.target
np.mean(hits)

0.8348868175765646

In [24]:
# Rebuild the pipeline with an SGD-trained classifier as the final step.
# This rebinds `text_clf`, replacing the pipeline defined earlier.
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Score the new pipeline on the same test documents.
predicted = text_clf.predict(docs_test)
hits = predicted == twenty_test.target
np.mean(hits)

  return f(*args, **kwds)


0.9127829560585885