# Классификация текстов с помощью sklearn

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics
import numpy as np

## Загрузка датасета 20 newsgroups

In [2]:
# Only these four newsgroups are loaded.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Ask the loader to drop headers, footers and quoted replies from each post.
remove = ('headers', 'footers', 'quotes')

print("Loading 20 newsgroups dataset for categories:")
print(categories or "all")

# Both splits are fetched with identical settings; only `subset` differs.
data_train, data_test = (
    fetch_20newsgroups(subset=subset_name, categories=categories,
                       shuffle=True, random_state=42, remove=remove)
    for subset_name in ('train', 'test')
)
print('data loaded')

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
data loaded


In [3]:
# Number of documents in the training split.
len(data_train.data)

2257

In [4]:
# Show only the first three lines of the first training document.
first_doc_lines = data_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.


In [5]:
# Targets are integer indices into target_names; print the name for document 0.
first_target = data_train.target[0]
print(data_train.target_names[first_target])

comp.graphics


## Извлечение признаков

In [6]:
# Learn the vocabulary from the training texts and encode every document as a
# row of token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data_train.data)
# Displayed as (number of documents, number of vocabulary terms).
X_train_counts.shape

(2257, 28865)

In [7]:
# Re-weight the raw token counts with tf-idf; the resulting matrix has the
# same shape as the count matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 28865)

## Обучение классификатора

In [8]:
# Fit a multinomial naive Bayes classifier on the tf-idf features and the
# integer class labels of the training split.
clf = MultinomialNB().fit(X_train_tfidf, data_train.target)

In [9]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# New texts go through the already-fitted vectorizer and tf-idf transformer
# (transform only, no refit) before being classified.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Map each predicted integer label back to its newsgroup name.
predicted_names = (data_train.target_names[label] for label in predicted)
for text, group_name in zip(docs_new, predicted_names):
    print('%r => %s' % (text, group_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


## Построение pipeline

In [10]:
# Chain vectorizer -> tf-idf -> classifier so raw text can be fed in directly.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [11]:
# Fit every pipeline step on the raw training texts in a single call; the
# fitted pipeline is the value displayed by this cell.
text_clf.fit(data_train.data, data_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Оценка качества классификатора

In [12]:
docs_test = data_test.data
predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted label equals the true label,
# reported as a percentage rounded to two decimals.
hit_rate = np.mean(predicted == data_test.target)
acc = round(hit_rate * 100, 2)
print(f'Accuracy is {acc}%.')


Accuracy is 66.91%.


In [13]:
# Per-class precision / recall / F1 on the test split, labelled by group name.
report_text = metrics.classification_report(
    data_test.target,
    predicted,
    target_names=data_test.target_names,
)
print(report_text)

                        precision    recall  f1-score   support

           alt.atheism       0.96      0.08      0.16       319
         comp.graphics       0.94      0.83      0.88       389
               sci.med       0.96      0.67      0.79       396
soc.religion.christian       0.46      0.98      0.63       398

           avg / total       0.82      0.67      0.63      1502



In [14]:
# Confusion matrix on the test split: each row is a true class (row sums equal
# the per-class support in the report), each column a predicted class.
metrics.confusion_matrix(data_test.target, predicted)

array([[ 27,   5,   5, 282],
       [  0, 321,   5,  63],
       [  1,  13, 265, 117],
       [  0,   4,   2, 392]])

## SGDClassifier

In [15]:
# Same vectorizer and tf-idf steps, but with a linear classifier trained by SGD.
# NOTE: this rebinds `text_clf`, replacing the naive Bayes pipeline.
sgd_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
]
text_clf = Pipeline(sgd_steps)

In [16]:
# Train on the full training split, then label the test documents.
# fit() returns the pipeline itself, so the two calls can be chained.
predicted = text_clf.fit(data_train.data, data_train.target).predict(docs_test)

In [17]:
# Percentage of test documents classified correctly, rounded to two decimals.
matches = predicted == data_test.target
acc = round(np.mean(matches) * 100, 2)
print(f'Accuracy is {acc}%.')

Accuracy is 79.56%.


## Поиск лучших параметров классификатора

In [18]:
# Hyper-parameter grid, keyed as <pipeline step>__<parameter>:
#   vect__ngram_range - unigrams only vs. unigrams plus bigrams
#   tfidf__use_idf    - with or without inverse-document-frequency weighting
#   clf__alpha        - regularization strength passed to the 'clf' step
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Exhaustive 5-fold cross-validated search over the grid on all CPU cores.
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError. Since 0.22 the default
# behaviour already matches what iid=False requested.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [19]:
# Run the search on the first 800 training documents only (hard-coded subset,
# presumably to keep the search fast — confirm before relying on the scores).
gs_clf = gs_clf.fit(data_train.data[:800], data_train.target[:800])

In [20]:
# Mean cross-validated score of the best parameter combination found.
gs_clf.best_score_ 

0.82629697696359761

In [21]:
# List the winning value for every searched parameter, in alphabetical order.
best_params = gs_clf.best_params_
for name in sorted(parameters):
    print("%s: %r" % (name, best_params[name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


# Задание

Обучить и протестировать классификаторы из лекции. Найти классификатор с лучшими показателями качества.