Working with text data: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

## Loading Data

In [1]:
import sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

In [2]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [3]:
# the data bunch
#twenty_train = sklearn.datasets.load_files('20news-bydate-train', categories=categories, encoding='latin1')
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

In [4]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
len(twenty_train.data), len(twenty_train.filenames)

(2257, 2257)

In [6]:
print("\n".join(twenty_train.data[0].split("\n")[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [7]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [8]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


## Extracting Features

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [10]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [11]:
CountVectorizer??

In [12]:
X_train_counts.shape

(2257, 35788)

In [13]:
count_vect.vocabulary_.get(u'algorithm')

4690

In [14]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [15]:
type(X_train_tfidf[0])

scipy.sparse.csr.csr_matrix

In [16]:
X_train_tfidf[0].count_nonzero()

73

## Training a classifier

In [17]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)


In [18]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))


'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


## Building a pipeline

In [19]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])


In [20]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Evaluation of the performance on the test set

In [21]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8348868175765646

In [22]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
        alpha=1e-3, random_state=42,
        max_iter=5, tol=None)),
])
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.9127829560585885

In [23]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [24]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

## Parameter tuning using grid search

In [25]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [26]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)


In [27]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])


In [28]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]


'soc.religion.christian'

In [29]:
print(gs_clf.best_score_)

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.9
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [30]:
import spacy
nlp = spacy.load('en')

In [31]:
print(twenty_train.data[1])
    

From: ani@ms.uky.edu (Aniruddha B. Deglurkar)
Subject: help: Splitting a trimming region along a mesh 
Organization: University Of Kentucky, Dept. of Math Sciences
Lines: 28



	Hi,

	I have a problem, I hope some of the 'gurus' can help me solve.

	Background of the problem:
	I have a rectangular mesh in the uv domain, i.e  the mesh is a 
	mapping of a 3d Bezier patch into 2d. The area in this domain
	which is inside a trimming loop had to be rendered. The trimming
	loop is a set of 2d Bezier curve segments.
	For the sake of notation: the mesh is made up of cells.

	My problem is this :
	The trimming area has to be split up into individual smaller
	cells bounded by the trimming curve segments. If a cell
	is wholly inside the area...then it is output as a whole ,
	else it is trivially rejected. 

	Does any body know how thiss can be done, or is there any algo. 
	somewhere for doing this.

	Any help would be appreciated.

	Thanks, 
	Ani.
-- 
To get irritated is human, to stay cool, divi

In [32]:
text = """The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. To the best of our knowledge, it was originally collected by Ken Lang, probably for his paper “Newsweeder: Learning to filter netnews,” though he does not explicitly mention this collection. The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering."""
print(text)

The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. To the best of our knowledge, it was originally collected by Ken Lang, probably for his paper “Newsweeder: Learning to filter netnews,” though he does not explicitly mention this collection. The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.


In [33]:
doc = nlp(text)

In [34]:
for sent in doc.sents:
    print(sent, '==')

The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. ==
To the best of our knowledge, it was originally collected by Ken Lang, probably for his paper “Newsweeder: ==
Learning to filter netnews,” though he does not explicitly mention this collection. ==
The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering. ==


In [35]:
for token in next(doc.sents):
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

The the DET DT det Xxx True False
20 20 NUM CD compound dd False False
Newsgroups newsgroups PROPN NNP compound Xxxxx True False
data datum NOUN NNS compound xxxx True False
set set VERB VBN nsubj xxx True False
is be VERB VBZ ROOT xx True True
a a DET DT det x True True
collection collection NOUN NN attr xxxx True False
of of ADP IN prep xx True True
approximately approximately ADV RB advmod xxxx True False
20,000 20,000 NUM CD nummod dd,ddd False False
newsgroup newsgroup NOUN NN compound xxxx True False
documents document NOUN NNS pobj xxxx True False
, , PUNCT , punct , False False
partitioned partition VERB VBN acl xxxx True False
( ( PUNCT -LRB- punct ( False False
nearly nearly ADV RB advmod xxxx True False
) ) PUNCT -RRB- punct ) False False
evenly evenly ADV RB advmod xxxx True False
across across ADP IN prep xxxx True True
20 20 NUM CD nummod dd False False
different different ADJ JJ amod xxxx True False
newsgroups newsgroup NOUN NNS pobj xxxx True False
. . PUNCT . punct . F

In [36]:
from io import StringIO

class RenderTokens:
    def __init__(self, sent):
        self.sent = sent
    def _repr_html_(self):
        builder = StringIO()
        builder.write('<table>')
        for token in sent:
            builder.write('<tr><td>')
            builder.write(
                '</td><td>'.join(
                    (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                     token.shape_, str(token.is_alpha), str(token.is_stop))
                )
            )
            builder.write('</td></tr>')
        builder.write('</table>')
        return builder.getvalue()

RenderTokens(next(doc.sents))

0,1,2,3,4,5,6,7
The,the,DET,DT,det,Xxx,True,False
20,20,NUM,CD,nummod,dd,False,False
newsgroups,newsgroup,NOUN,NNS,compound,xxxx,True,False
collection,collection,NOUN,NN,nsubj,xxxx,True,False
has,have,VERB,VBZ,aux,xxx,True,True
become,become,VERB,VBN,ROOT,xxxx,True,True
a,a,DET,DT,det,x,True,True
popular,popular,ADJ,JJ,amod,xxxx,True,False
data,data,NOUN,NN,attr,xxxx,True,False
set,set,VERB,VBN,acl,xxx,True,False


### Домашнее задание

Результат тестирования, который мы получили выше был основан на работе класса CountVectorizer с настройками по умолчанию, где он использовал в качестве токенов слова в той форме, в которой они встречались в тексте. То есть несколько форм одного слова - talk, talks, talked - являются разными словами.

Задание: написато скрипт, чтобы посмотреть, как изменится качество, если для всех слов использовать лемматизацию (приведение к начальной форме слова). В качестве лемматизатора можно использовать библиотеку [spacy][1], как в примере выше (см. поля lemma и lemma_ токенов).

[1]: https://spacy.io/