In [1]:
!pip install nltk
!python -m nltk.downloader stopwords



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HoriFox\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.0.1-cp38-cp38-win_amd64.whl (23.9 MB)
Installing collected packages: gensim
Successfully installed gensim-4.0.1


In [9]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
import zipfile
import os
from IPython.display import Image
from sklearn.datasets import load_iris, load_boston
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
import seaborn as sns
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import gensim
from gensim.models import word2vec

import re
from sklearn.pipeline import Pipeline
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

%matplotlib inline 
sns.set(style="ticks")

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HoriFox\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Обозначим категории, которые желаем загрузить
categories = ["sci.crypt", 
              "sci.electronics", 
              "talk.religion.misc", 
              "rec.sport.baseball"]
newsgroups = fetch_20newsgroups(subset='train', categories=categories)
data = newsgroups['data']

In [11]:
# Используемся отработанной функцией вычисления метрики
def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Вычисление метрики accuracy для каждого класса
    y_true - истинные значения классов
    y_pred - предсказанные значения классов
    Возвращает словарь: ключ - метка класса, 
    значение - Accuracy для данного класса
    """
    # Для удобства фильтрации сформируем Pandas DataFrame 
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют 
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values, 
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    """
    Вывод метрики accuracy для каждого класса
    """
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

In [12]:
# С помощью CountVectorizer преобразуем коллекцию текстовых данных в матрицу счётчиков токенов
vocabVect = CountVectorizer()
vocabVect.fit(data)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 33282


In [13]:
# Отобразим нашу матрицу
for i in list(corpusVocab)[1:10]:
    print('{}={}'.format(i, corpusVocab[i]))

philly=23632
ravel=25268
udel=30929
edu=12456
robert=26356
hite=16186
subject=29047
re=25308
dave=10693


In [14]:
test_features = vocabVect.transform(data)

In [15]:
test_features

<2160x33282 sparse matrix of type '<class 'numpy.int64'>'
	with 339881 stored elements in Compressed Sparse Row format>

In [16]:
test_features.todense()

matrix([[0, 3, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# Размер нулевой строки
len(test_features.todense()[0].getA1())

33282

In [18]:
vocabVect.get_feature_names()[100:120]

['00101101b',
 '00101110',
 '00101110b',
 '00101111',
 '00101111b',
 '0011',
 '00110000',
 '00110000b',
 '00110001',
 '00110001b',
 '00110010',
 '00110010b',
 '00110011',
 '00110011b',
 '00110100',
 '00110100b',
 '00110101',
 '00110101b',
 '00110110',
 '00110110b']

In [19]:
# Функция применения к текстовым данных различных вариантов векторизации и классификации с использованием кросс-валидации
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    for v in vectorizers_list:
        for c in classifiers_list:
            pipeline1 = Pipeline([("vectorizer", v), ("classifier", c)])
            score = cross_val_score(pipeline1, newsgroups['data'], newsgroups['target'], scoring='accuracy', cv=3).mean()
            print('Векторизация - {}'.format(v))
            print('Модель для классификации - {}'.format(c))
            print('Accuracy = {}'.format(score))
            print('===========================')

In [20]:
# Определяем варианты векторизации и классификации и устанавливаем преднастройки
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab)]
classifiers_list = [LogisticRegression(C=3.0), LinearSVC(), KNeighborsClassifier()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '00000000': 2, '00000000b': 3,
                            '00000001': 4, '00000001b': 5, '00000010': 6,
                            '00000010b': 7, '00000011': 8, '00000011b': 9,
                            '00000100': 10, '00000100b': 11, '00000101': 12,
                            '00000101b': 13, '00000110': 14, '00000110b': 15,
                            '00000111': 16, '00000111b': 17, '00001000': 18,
                            '00001000b': 19, '00001001': 20, '00001001b': 21,
                            '00001010': 22, '00001010b': 23, '00001011': 24,
                            '00001011b': 25, '00001100': 26, '00001100b': 27,
                            '00001101': 28, '00001101b': 29, ...})
Модель для классификации - LogisticRegression(C=3.0)
Accuracy = 0.9560185185185186
Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '00000000': 2, '00000000b': 3,
                            '00000001': 4, '000000

In [21]:
# Using the stopwords.
from nltk.corpus import stopwords

# Initialize the stopwords
stoplist = stopwords.words('english')

In [22]:
# Подготовим корпус
corpus = []
stop_words = stopwords.words('english')
tok = WordPunctTokenizer()
for line in newsgroups['data']:
    line1 = line.strip().lower()
    line1 = re.sub("[^a-zA-Z]"," ", line1)
    text_tok = tok.tokenize(line1)
    text_tok1 = [w for w in text_tok if not w in stop_words]
    corpus.append(text_tok1)

In [26]:
corpus[1]

['sweda',
 'css',
 'itd',
 'umich',
 'edu',
 'sean',
 'sweda',
 'subject',
 'royals',
 'final',
 'run',
 'total',
 'organization',
 'university',
 'michigan',
 'itd',
 'consulting',
 'support',
 'lines',
 'nntp',
 'posting',
 'host',
 'stimpy',
 'css',
 'itd',
 'umich',
 'edu',
 'x',
 'newsreader',
 'tin',
 'version',
 'pl',
 'saying',
 'quite',
 'time',
 'absent',
 'net',
 'figured',
 'stick',
 'neck',
 'bit',
 'royals',
 'set',
 'record',
 'fewest',
 'runs',
 'scored',
 'al',
 'team',
 'since',
 'inception',
 'dh',
 'rule',
 'p',
 'ideas',
 'fall',
 'easily',
 'short',
 'runs',
 'damn',
 'sure',
 'believe',
 'media',
 'fools',
 'picking',
 'win',
 'division',
 'like',
 'tom',
 'gage',
 'detroit',
 'news',
 'claiming',
 'herk',
 'robinson',
 'kind',
 'genius',
 'trades',
 'aquisitions',
 'made',
 'c',
 'ya',
 'sean',
 'sean',
 'sweda',
 'sweda',
 'css',
 'itd',
 'umich',
 'edu',
 'css',
 'itd',
 'consultant',
 'president',
 'bob',
 'sura',
 'fan',
 'club',
 'gm',
 'manager',
 'motor',

In [27]:
%time model = word2vec.Word2Vec(corpus)

Wall time: 1.95 s


In [28]:
# Проверим, что модель обучилась
print(model.wv.most_similar(positive=['subject'], topn=5))

[('elvis', 0.8723364472389221), ('anchor', 0.8669361472129822), ('smb', 0.8620975613594055), ('att', 0.8582748770713806), ('jgk', 0.853305459022522)]


In [29]:
def sentiment(v, c):
    model = Pipeline(
        [("vectorizer", v), 
         ("classifier", c)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print_accuracy_score_for_classes(y_test, y_pred)

In [30]:
class EmbeddingVectorizer(object):
    '''
    Для текста усредним вектора входящих в него слов
    '''
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([np.mean(
            [self.model[w] for w in words if w in self.model] 
            or [np.zeros(self.size)], axis=0)
            for words in X])

In [31]:
# Обучающая и тестовая выборки
boundary = 700
X_train = corpus[:boundary] 
X_test = corpus[boundary:]
y_train = newsgroups['target'][:boundary]
y_test = newsgroups['target'][boundary:]
sentiment(EmbeddingVectorizer(model.wv), LogisticRegression(C=5.0))

Метка 	 Accuracy
0 	 0.8956310679611651
1 	 0.8948717948717949
2 	 0.8098765432098766
3 	 0.7233201581027668
