# Лабораторная работа №6
## Корнеева Анна, ИУ5-23М
**Тема: Классификация текста.**

**Задание:**

Для произвольного набора данных, предназначенного для классификации текстов, решите задачу классификации текста двумя способами:

Способ 1. На основе CountVectorizer или TfidfVectorizer.

Способ 2. На основе моделей word2vec или Glove или fastText.

Сравните качество полученных моделей.
Для поиска наборов данных в поисковой системе можно использовать ключевые слова "datasets for text classification".

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from typing import Dict, Tuple
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
from sklearn.naive_bayes import ComplementNB

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

pd.set_option("display.max_columns", None)

/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv


In [2]:
def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Вычисление метрики accuracy для каждого класса
    y_true - истинные значения классов
    y_pred - предсказанные значения классов
    Возвращает словарь: ключ - метка класса, 
    значение - Accuracy для данного класса
    """
    # Для удобства фильтрации сформируем Pandas DataFrame 
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют 
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values, 
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    """
    Вывод метрики accuracy для каждого класса
    """
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

In [3]:
train = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv')
test = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv')

In [4]:
print(train.shape)
print(test.shape)

(41157, 6)
(3798, 6)


In [5]:
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [6]:
train.Sentiment.value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [7]:
train.Sentiment = train.Sentiment.replace({'Extremely Positive':'Positive','Extremely Negative':'Negative'})
test.Sentiment = test.Sentiment.replace({'Extremely Positive':'Positive','Extremely Negative':'Negative'})

lenc = LabelEncoder()
test.Sentiment = lenc.fit_transform(test.Sentiment)
train.Sentiment = lenc.fit_transform(train.Sentiment)

In [8]:
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,1
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,2
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,2
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,2
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",0


In [9]:
x_train = train['OriginalTweet']
y_train = train['Sentiment']
x_test = test['OriginalTweet']
y_test = test['Sentiment']

##  Очистка данных

1. приведение всех слов к нижнему регистру
2. удаление ссылок
3. отделение слов и знаков пунктуации пробелом
4. удаление все кроме (a-z, A-Z, ".", "?", "!", ",")
5. тоенизация - разделение строки на слова, для удаления стоп-слов
6. удалние стоп-слов, т.е. часто используемых слов не несущих большой смысловой нагрузки
7. создание строки из токенов

In [10]:
import re
def preprocess_sentence(w):

    w = w.lower()
    w = re.sub('\t\n', '', w)
    w = re.sub(r'http\S+', '', w)
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    w = re.sub(r"[^a-zA-Zа-яА-Я?.!,`']+", " ", w)

    w = w.strip()
    tokens = w.split(' ')

    stop_words = set(stopwords.words('english')) # remove stopwords
    tokens = [word for word in tokens if not word in stop_words]
    tokens = ' '.join(tokens)
    return tokens

In [11]:
x_train = x_train.apply(preprocess_sentence)
x_test = x_test.apply(preprocess_sentence)

In [12]:
# Сформируем общий словарь для обучения моделей из обучающей и тестовой выборки
vocab_list = x_train.tolist() + x_test.tolist()
print(len(vocab_list))
vocab_list[1:10]

44955


['advice talk neighbours family exchange phone numbers create contact list phone numbers neighbours schools employer chemist gp set online shopping accounts poss adequate supplies regular meds order',
 'coronavirus australia woolworths give elderly , disabled dedicated shopping hours amid covid outbreak',
 'food stock one empty . . . please , panic , enough food everyone take need . stay calm , stay safe . covid france covid covid coronavirus confinement confinementotal confinementgeneral',
 ", ready go supermarket covid outbreak . i'm paranoid , food stock litteraly empty . coronavirus serious thing , please , panic . causes shortage . . . coronavirusfrance restezchezvous stayathome confinement",
 'news region first confirmed covid case came sullivan county last week , people flocked area stores purchase cleaning supplies , hand sanitizer , food , toilet paper goods , tim dodson reports',
 "cashier grocery store sharing insights covid prove credibility commented i'm civics class know 

In [13]:
train['OriginalTweet'][0:10][4]

"Me, ready to go at supermarket during the #COVID19 outbreak.\r\r\n\r\r\nNot because I'm paranoid, but because my food stock is litteraly empty. The #coronavirus is a serious thing, but please, don't panic. It causes shortage...\r\r\n\r\r\n#CoronavirusFrance #restezchezvous #StayAtHome #confinement https://t.co/usmuaLq72n"

In [14]:
vocabVect = CountVectorizer()
vocabVect.fit(vocab_list)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 54546


In [15]:
for i in list(corpusVocab)[1:10]:
    print('{}={}'.format(i, corpusVocab[i]))

phil=36274
gahan=18888
chrisitv=8219
advice=670
talk=47397
neighbours=32568
family=16737
exchange=16272
phone=36318


In [16]:
tfidfv = TfidfVectorizer(ngram_range=(1,3))
tfidf_ngram_features = tfidfv.fit_transform(vocab_list)
tfidf_ngram_features

<44955x1141506 sparse matrix of type '<class 'numpy.float64'>'
	with 2263622 stored elements in Compressed Sparse Row format>

In [21]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    for v in vectorizers_list:
        for c in classifiers_list:
            pipeline1 = Pipeline([("vectorizer", v), ("classifier", c)])
            score = cross_val_score(pipeline1, x_train[:10000], y_train[:10000], scoring='accuracy', cv=3,).mean()
            print('Векторизация - {}'.format(v))
            print('Модель для классификации - {}'.format(c))
            print('Accuracy = {}'.format(score))
            print('===========================')

In [18]:
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab), TfidfVectorizer(vocabulary = corpusVocab)]
classifiers_list = [RandomForestClassifier(), ComplementNB(), LogisticRegression(C=3.0, solver='lbfgs', max_iter=1000), LinearSVC()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

Векторизация - CountVectorizer(vocabulary={'aa': 0, 'aaa': 1, 'aaaaakubosan': 2, 'aaaaas': 3,
                            'aaaand': 4, 'aaachatterjee': 5, 'aaanews': 6,
                            'aaannnddd': 7, 'aaanortheast': 8, 'aabutan': 9,
                            'aacopd': 10, 'aacounty': 11, 'aacountygovt': 12,
                            'aadeshrawal': 13, 'aadya': 14, 'aadyasitara': 15,
                            'aafp': 16, 'aahealth': 17, 'aahh': 18, 'aai': 19,
                            'aaisp': 20, 'aajeevika': 21, 'aajtak': 22,
                            'aakash': 23, 'aalonzowatt': 24, 'aalto': 25,
                            'aaltouniversity': 26, 'aalwajih': 27,
                            'aamaadmi': 28, 'aamaadmiparty': 29, ...})
Модель для классификации - RandomForestClassifier()
Accuracy = 0.6910992838935964
Векторизация - CountVectorizer(vocabulary={'aa': 0, 'aaa': 1, 'aaaaakubosan': 2, 'aaaaas': 3,
                            'aaaand': 4, 'aaachatterjee': 

### Лучший результат покаазала модель LogisticRegression(C=3.0, max_iter=1000) с CountVectorizer
### word2vec

In [15]:
import gensim
from gensim.models import word2vec

In [16]:
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
# Подготовим корпус
corpus = []
stop_words = stopwords.words('english')
tok = WordPunctTokenizer()
for line in vocab_list:
    line1 = line.strip().lower()
    line1 = re.sub("[^a-zA-Z]"," ", line1)
    text_tok = tok.tokenize(line1)
    text_tok1 = [w for w in text_tok if not w in stop_words]
    corpus.append(text_tok1)

In [17]:
corpus[1]

['advice',
 'talk',
 'neighbours',
 'family',
 'exchange',
 'phone',
 'numbers',
 'create',
 'contact',
 'list',
 'phone',
 'numbers',
 'neighbours',
 'schools',
 'employer',
 'chemist',
 'gp',
 'set',
 'online',
 'shopping',
 'accounts',
 'poss',
 'adequate',
 'supplies',
 'regular',
 'meds',
 'order']

In [18]:
%time model = word2vec.Word2Vec(corpus, workers=4, min_count=10, window=10, sample=1e-3)

CPU times: user 12.3 s, sys: 80.1 ms, total: 12.4 s
Wall time: 4.44 s


In [19]:
# Проверим, что модель обучилась
print(model.wv.most_similar(positive=['find'], topn=5))

[('looking', 0.8133401274681091), ('try', 0.7777470946311951), ('gift', 0.7415961027145386), ('easy', 0.7386958599090576), ('meal', 0.7276292443275452)]


In [22]:
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab), TfidfVectorizer(vocabulary = corpusVocab)]
classifiers_list = [RandomForestClassifier(), ComplementNB(), LogisticRegression(C=3.0, solver='lbfgs', max_iter=1000), LinearSVC()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

Векторизация - CountVectorizer(vocabulary={'aa': 0, 'aaa': 1, 'aaaaakubosan': 2, 'aaaaas': 3,
                            'aaaand': 4, 'aaachatterjee': 5, 'aaanews': 6,
                            'aaannnddd': 7, 'aaanortheast': 8, 'aabutan': 9,
                            'aacopd': 10, 'aacounty': 11, 'aacountygovt': 12,
                            'aadeshrawal': 13, 'aadya': 14, 'aadyasitara': 15,
                            'aafp': 16, 'aahealth': 17, 'aahh': 18, 'aai': 19,
                            'aaisp': 20, 'aajeevika': 21, 'aajtak': 22,
                            'aakash': 23, 'aalonzowatt': 24, 'aalto': 25,
                            'aaltouniversity': 26, 'aalwajih': 27,
                            'aamaadmi': 28, 'aamaadmiparty': 29, ...})
Модель для классификации - RandomForestClassifier()
Accuracy = 0.6933987639915761
Векторизация - CountVectorizer(vocabulary={'aa': 0, 'aaa': 1, 'aaaaakubosan': 2, 'aaaaas': 3,
                            'aaaand': 4, 'aaachatterjee': 

In [23]:
len(corpus)

44955

In [24]:
x_train.value.values[0]

AttributeError: 'Series' object has no attribute 'value'

In [None]:
boundary = 30000
X_train = corpus[:boundary] 
X_test = corpus[boundary:boundary+1000]
Y_train = y_train[:boundary]
Y_test = y_train[boundary:boundary+1000]

def sentiment(v, c):
    for v in vectorizers_list:
        for c in classifiers_list:
            model = Pipeline([("vectorizer", v), ("classifier", c)])
            model.fit(X_train, Y_train)
            y_pred = model.predict(X_test)
            
            print('Модель для классификации - {}'.format(c))
            
            print_accuracy_score_for_classes(Y_test, y_pred)

            print('===========================')

In [None]:
class EmbeddingVectorizer(object):
    '''
    Для текста усредним вектора входящих в него слов
    '''
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([np.mean(
            [self.model[w] for w in words if w in self.model] 
            or [np.zeros(self.size)], axis=0)
            for words in X])

In [None]:
vectorizers_list = [EmbeddingVectorizer(model.wv)]
classifiers_list = [RandomForestClassifier(),  LogisticRegression(C=3.0, solver='lbfgs', max_iter=1000), LinearSVC()]
sentiment(vectorizers_list, classifiers_list)