# Классификация текстов: предобработка

In [1]:
import warnings

# Silence every warning category for the whole notebook. A blanket "ignore"
# filter already covers FutureWarning, so a separate FutureWarning-only filter
# adds nothing. NOTE: this also hides convergence warnings from the models below.
warnings.filterwarnings("ignore")

## 20Newsgroups

In [2]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Both splits are read from the same local cache directory. With
# download_if_missing=False the loader does not try to download the corpus
# when it is absent from that directory.
DATA_HOME = '/home/jovyan/work/seminar_6/'

newsgroups_train = fetch_20newsgroups(data_home=DATA_HOME,
                                      subset='train',
                                      download_if_missing=False)
newsgroups_test = fetch_20newsgroups(data_home=DATA_HOME,
                                     subset='test',
                                     download_if_missing=False)

In [4]:
newsgroups_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
print (newsgroups_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







### 1. Предварительная обработка текста

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import tqdm


def preprocess_text(texts):
    """Lowercase, strip non-letters, tokenize and drop English stop words.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        list[str]: One cleaned string per input document, in input order,
        with the surviving tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    # Anything that is not an ASCII letter or a space is replaced by a space.
    # (The text is lowercased first, so the A-Z range never actually matches.)
    regex = re.compile('[^a-z A-Z]')
    preprocess_texts = []
    # Iterate over the documents directly instead of indexing through
    # range(len(texts)); the progress bar still gets its total from len().
    for raw_text in tqdm.tqdm(texts):
        text = regex.sub(' ', raw_text.lower())
        kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]
        preprocess_texts.append(' '.join(kept_tokens))

    return preprocess_texts

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
newsgroups_train['preprocess_data'] = preprocess_text(newsgroups_train.data)

100%|██████████| 11314/11314 [00:14<00:00, 797.22it/s]


In [10]:
newsgroups_test['preprocess_data'] = preprocess_text(newsgroups_test.data)

100%|██████████| 7532/7532 [00:08<00:00, 860.78it/s]


In [11]:
print(newsgroups_train['preprocess_data'][0])

lerxst wam umd edu thing subject car nntp posting host rac wam umd edu organization university maryland college park lines wondering anyone could enlighten car saw day door sports car looked late early called bricklin doors really small addition front bumper separate rest body know anyone tellme model name engine specs years production car made history whatever info funky looking car please e mail thanks il brought neighborhood lerxst


In [12]:
from nltk.stem.lancaster import LancasterStemmer

In [13]:
def stemming_texts(texts):
    """Stem every token of each text with the Lancaster stemmer.

    Args:
        texts: Iterable of strings; each one is split with ``word_tokenize``.

    Returns:
        list[str]: For each input text, its stemmed tokens joined by spaces.
    """
    stemmer = LancasterStemmer()

    def _stem_one(text):
        # Tokenize, stem token by token, and glue the stems back together.
        return ' '.join(map(stemmer.stem, word_tokenize(text)))

    return [_stem_one(text) for text in tqdm.tqdm(texts)]

In [14]:
newsgroups_train['data_stemming'] = \
                           stemming_texts(newsgroups_train.preprocess_data)

100%|██████████| 11314/11314 [00:35<00:00, 317.55it/s]


In [None]:
newsgroups_test['data_stemming'] = \
                            stemming_texts(newsgroups_test.preprocess_data)

In [None]:
print(newsgroups_train.data_stemming[0])

In [None]:
print(newsgroups_train.preprocess_data[0])

### 2. Перевод текста в вещественное пространство признаков

#### 2.1 Bag of Words (мешок слов)

Основное предположение данного метода — порядок слов в документе не важен, 

а все документы представляются в виде матрицы $T = (t_{d,w})$,

каждая строка в которой соответствует отдельному документу или тексту, 

а каждый столбец — определенному слову. 

Элемент $t_{d,w}$ соответствует количеству вхождений слова $w$ в документ $d$.







In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()

In [None]:
vectorizer.fit(['порядок слов в документе не важен', 'мешок слов'])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
(vectorizer.get_feature_names_out()
 if hasattr(vectorizer, 'get_feature_names_out')
 else vectorizer.get_feature_names())

In [None]:
vectorizer.transform(['важен порядок', 'не мешок не порядок']).toarray()

In [None]:
def bow(vectorizer, train, test):
    """Vectorize a train/test pair of text collections.

    The vectorizer is fitted on ``train`` only (via ``fit_transform``) and is
    then applied, already fitted, to ``test``. The passed-in vectorizer object
    is modified by the fit.

    Args:
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
        train: Training documents.
        test: Test documents.

    Returns:
        tuple: ``(train_features, test_features)``.
    """
    # Tuple elements are evaluated left to right, so fitting happens first.
    return vectorizer.fit_transform(train), vectorizer.transform(test)

In [None]:
X_train_bow, X_test_bow = bow(vectorizer, 
                              newsgroups_train.data, 
                              newsgroups_test.data)

In [None]:
X_train_bow.shape

In [None]:
X_test_bow.shape

In [None]:
X_train_bow_preprocess, X_test_bow_preprocess = bow(vectorizer, 
                                                    newsgroups_train.preprocess_data,
                                                    newsgroups_test.preprocess_data)

In [None]:
X_train_bow_preprocess.shape

In [None]:
X_test_bow_preprocess.shape

In [None]:
X_train_bow_stem, X_test_bow_stem = bow(vectorizer, 
                                        newsgroups_train.data_stemming,
                                        newsgroups_test.data_stemming)

In [None]:
X_train_bow_stem.shape

In [None]:
X_test_bow_stem.shape

#### 2.2 Bag of Words & TF IDF

$\text{TF-IDF}$ — это статистическая мера, используемая для оценки

важности слова в контексте документа. Вычисляется по формуле:

$$\text{TF-IDF}(w, d, D) = TF(w, d) \times IDF(w, D)$$

$TF$ — частота слова, оценивает важность слова $w_i$ в пределах отдельного документа $d$.

$$TF(w_i, d) = \frac{n_i}{\sum_k n_k}$$

$n_i$ — число вхождений слова $w_i$ в документ $d$.

$\sum_k n_k$ — общее число слов в данном документе.

$IDF$ — обратная частота документа.

Учёт $IDF$ уменьшает вес широко употребляемых слов.

$$IDF(w_i, D) = \log \frac{|D|}{|\{d \in D : w_i \in d\}|}, \text{где}$$

$|D|$ — количество документов в корпусе.

$|\{d \in D : w_i \in d\}|$ — количество документов,

в которых встречается слово $w_i$.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer_tf_idf = TfidfVectorizer()

In [None]:
X_train_tfidf, X_test_tfidf = bow(vectorizer_tf_idf, 
                                  newsgroups_train.data, 
                                  newsgroups_test.data)

In [None]:
X_train_tfidf_preprocess, X_test_tfidf_preprocess = bow(vectorizer_tf_idf,
                                                        newsgroups_train.preprocess_data,
                                                        newsgroups_test.preprocess_data)

In [None]:
X_train_tfidf_stem, X_test_tfidf_stem = bow(vectorizer_tf_idf,
                                            newsgroups_train.data_stemming,
                                            newsgroups_test.data_stemming)

In [None]:
vectorizer_ngram = TfidfVectorizer(ngram_range=(1,2))

In [None]:
X_train_ngram_stem, X_test_ngram_stem = bow(vectorizer_ngram, 
                                            newsgroups_train.data_stemming,
                                            newsgroups_test.data_stemming)

In [None]:
X_train_ngram_stem.shape

In [None]:
X_test_ngram_stem.shape

### 3. Выбор алгоритма машинного обучения для классификации.

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [None]:
from sklearn.svm import LinearSVC
clf_svc = LinearSVC()

In [None]:
# Bag-of-words counts built from the raw, unprocessed posts.
# Both models are refitted here, so the scores refer to this feature set only.
for label, model in (('LogisticRegression: ', clf), ('LinearSVC: ', clf_svc)):
    model.fit(X_train_bow, newsgroups_train.target)
    # accuracy_score's documented signature is (y_true, y_pred).
    print(label,
          accuracy_score(newsgroups_test.target, model.predict(X_test_bow)))

In [None]:
# Bag-of-words counts built from the cleaned (stop-word-filtered) texts.
# Both models are refitted here, so the scores refer to this feature set only.
for label, model in (('LogisticRegression: ', clf), ('LinearSVC: ', clf_svc)):
    model.fit(X_train_bow_preprocess, newsgroups_train.target)
    # accuracy_score's documented signature is (y_true, y_pred).
    print(label,
          accuracy_score(newsgroups_test.target,
                         model.predict(X_test_bow_preprocess)))



In [None]:
# Bag-of-words counts built from the stemmed texts.
# Both models are refitted here, so the scores refer to this feature set only.
for label, model in (('LogisticRegression: ', clf), ('LinearSVC: ', clf_svc)):
    model.fit(X_train_bow_stem, newsgroups_train.target)
    # accuracy_score's documented signature is (y_true, y_pred).
    print(label,
          accuracy_score(newsgroups_test.target,
                         model.predict(X_test_bow_stem)))

In [None]:
# TF-IDF features built from the raw, unprocessed posts.
# Both models are refitted here, so the scores refer to this feature set only.
for label, model in (('LogisticRegression: ', clf), ('LinearSVC: ', clf_svc)):
    model.fit(X_train_tfidf, newsgroups_train.target)
    # accuracy_score's documented signature is (y_true, y_pred).
    print(label,
          accuracy_score(newsgroups_test.target, model.predict(X_test_tfidf)))

In [None]:
# TF-IDF features built from the cleaned (stop-word-filtered) texts.
# Both models are refitted here, so the scores refer to this feature set only.
for label, model in (('LogisticRegression: ', clf), ('LinearSVC: ', clf_svc)):
    model.fit(X_train_tfidf_preprocess, newsgroups_train.target)
    # accuracy_score's documented signature is (y_true, y_pred).
    print(label,
          accuracy_score(newsgroups_test.target,
                         model.predict(X_test_tfidf_preprocess)))

In [None]:
# TF-IDF features built from the stemmed texts.
# Both models are refitted here, so the scores refer to this feature set only.
for label, model in (('LogisticRegression: ', clf), ('LinearSVC: ', clf_svc)):
    model.fit(X_train_tfidf_stem, newsgroups_train.target)
    # accuracy_score's documented signature is (y_true, y_pred).
    print(label,
          accuracy_score(newsgroups_test.target,
                         model.predict(X_test_tfidf_stem)))

In [None]:
# TF-IDF over unigrams and bigrams of the stemmed texts.
# clf_svc is left fitted on these features once this cell finishes.
for label, model in (('LogisticRegression: ', clf), ('LinearSVC: ', clf_svc)):
    model.fit(X_train_ngram_stem, newsgroups_train.target)
    # accuracy_score's documented signature is (y_true, y_pred).
    print(label,
          accuracy_score(newsgroups_test.target,
                         model.predict(X_test_ngram_stem)))

In [None]:
import seaborn
from sklearn.metrics import confusion_matrix

In [None]:
# Depends on notebook state: clf_svc must have been fitted last on
# X_train_ngram_stem (the preceding evaluation cell), otherwise its feature
# space will not match X_test_ngram_stem.
predict_targets = clf_svc.predict(X_test_ngram_stem)

In [None]:
seaborn.heatmap(confusion_matrix(newsgroups_test.target, predict_targets))

In [None]:
newsgroups_test.target_names[18]

In [None]:
newsgroups_test.target_names[16]

In [None]:
# Print two randomly chosen misclassified test documents (drawn with
# replacement, so the same document may appear twice).
ind = np.where(newsgroups_test.target != predict_targets)
ind = ind[0]
if ind.shape[0] == 0:
    # np.random.randint(0, 0, ...) raises ValueError, so handle the
    # "nothing misclassified" case explicitly.
    print('Нет ошибочно классифицированных документов.')
else:
    random_ind = np.random.randint(0, ind.shape[0], (2))
    for i in random_ind:
        doc_idx = ind[i]
        text = newsgroups_test.data[doc_idx]
        print('Правильный класс: ',
              newsgroups_test.target_names[newsgroups_test.target[doc_idx]])
        print('Предсказанный класс: ',
              newsgroups_test.target_names[predict_targets[doc_idx]])
        print(text)
