In [1]:
import re
import string
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import spacy
import random
import keras
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
import warnings
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline    
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [2]:
text1 = 'Обработка естественного языка Natural language processing (NLP) – это сфера искусственного интеллекта, которая занимается применением алгоритмов машинного обучения и лингвистики для анализа текстовых данных. Цель NLP – понимание и воспроизведение естественного человеческого языка.'
text2 = 'Без труда не выловишь и рыбку из пруда.'
text3 = 'Маша поедет в отпуск в Сочи'


#### Токенизация

In [3]:
from spacy.lang.ru import Russian
import spacy
nlp = spacy.load('ru_core_news_sm')
spacy_text1 = nlp(text1)
spacy_text1

Обработка естественного языка Natural language processing (NLP) – это сфера искусственного интеллекта, которая занимается применением алгоритмов машинного обучения и лингвистики для анализа текстовых данных. Цель NLP – понимание и воспроизведение естественного человеческого языка.

In [4]:
for t in spacy_text1:
    print(t)

Обработка
естественного
языка
Natural
language
processing
(
NLP
)
–
это
сфера
искусственного
интеллекта
,
которая
занимается
применением
алгоритмов
машинного
обучения
и
лингвистики
для
анализа
текстовых
данных
.
Цель
NLP
–
понимание
и
воспроизведение
естественного
человеческого
языка
.


In [5]:
spacy_text2 = nlp(text2)
spacy_text2

Без труда не выловишь и рыбку из пруда.

In [6]:
for t in spacy_text2:
    print(t)

Без
труда
не
выловишь
и
рыбку
из
пруда
.


In [7]:
spacy_text3 = nlp(text3)
spacy_text3

Маша поедет в отпуск в Сочи

#### Частеречная разметка

In [8]:
for token in spacy_text1:
    print('{} - {} - {}'.format(token.text, token.pos_, token.dep_))

Обработка - NOUN - nsubj
естественного - ADJ - amod
языка - NOUN - nmod
Natural - X - appos
language - X - flat:foreign
processing - X - flat:foreign
( - PUNCT - punct
NLP - PROPN - appos
) - PUNCT - punct
– - PUNCT - punct
это - PART - expl
сфера - NOUN - ROOT
искусственного - ADJ - amod
интеллекта - NOUN - nmod
, - PUNCT - punct
которая - PRON - nsubj
занимается - VERB - acl:relcl
применением - NOUN - obl
алгоритмов - NOUN - nmod
машинного - ADJ - amod
обучения - NOUN - nmod
и - CCONJ - cc
лингвистики - NOUN - conj
для - ADP - case
анализа - NOUN - nmod
текстовых - ADJ - amod
данных - NOUN - nmod
. - PUNCT - punct
Цель - NOUN - nsubj
NLP - PROPN - appos
– - PUNCT - punct
понимание - NOUN - ROOT
и - CCONJ - cc
воспроизведение - NOUN - conj
естественного - ADJ - amod
человеческого - ADJ - amod
языка - NOUN - nmod
. - PUNCT - punct


#### Лемматизация

In [9]:
for token in spacy_text1:
      print(token, token.lemma, token.lemma_)

Обработка 4439142451406220892 обработка
естественного 10815020827654897299 естественный
языка 14510553211863083651 язык
Natural 3743574233330547430 natural
language 8740476009882919263 language
processing 10935198773122488114 processing
( 12638816674900267446 (
NLP 11273594034978133401 nlp
) 3842344029291005339 )
– 10118409446379451916 –
это 1823958246850563701 это
сфера 1978723639801592318 сфера
искусственного 2308121456052185967 искусственный
интеллекта 16386099370530278165 интеллект
, 2593208677638477497 ,
которая 5533670911196217694 которая
занимается 4118051555953163819 заниматься
применением 1606230362371498159 применение
алгоритмов 12533358430471453281 алгоритм
машинного 13159888129191933160 машинный
обучения 11207406484676134031 обучение
и 15015917632809974589 и
лингвистики 6350707646557701949 лингвистика
для 10075485332184864679 для
анализа 10328217384203977629 анализ
текстовых 6741918539707866411 текстовый
данных 16346024248312543907 данных
. 12646065887601541794 .
Цель 10764

In [10]:
for token in spacy_text2:
      print(token, token.lemma, token.lemma_)

Без 12777514208095456290 без
труда 5037404031944352030 труд
не 5319710824202933802 не
выловишь 16148160567256055770 выловить
и 15015917632809974589 и
рыбку 9364984535100188714 рыбка
из 12183146372738139588 из
пруда 7646409547435807020 пруд
. 12646065887601541794 .


#### Выделение (распознавание) именованных сущностей

In [11]:
for ent in spacy_text3.ents:
    print(ent.text, ent.label_)

Маша PER
Сочи LOC


In [12]:
from spacy import displacy
displacy.render(spacy_text3, style='ent', jupyter=True)

In [13]:
print(spacy.explain("LOC"))

Non-GPE locations, mountain ranges, bodies of water


In [14]:
print(spacy.explain("PER"))

Named person or family.


#### Разбор предложения

In [15]:
from spacy import displacy

In [16]:
displacy.render(spacy_text1, style='dep', jupyter=True)

In [17]:
displacy.render(spacy_text3, style='dep', jupyter=True)

In [18]:
print(spacy.explain("conj"))

conjunct


### Natasha

In [19]:
from natasha import Doc, Segmenter, NewsEmbedding, NewsMorphTagger, MorphVocab


In [20]:
def n_lemmatize(text):
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    segmenter = Segmenter()
    morph_vocab = MorphVocab()
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return doc

In [21]:
n_doc3 = n_lemmatize(text3)
{_.text: _.lemma for _ in n_doc3.tokens}

{'Маша': 'маша',
 'поедет': 'поехать',
 'в': 'в',
 'отпуск': 'отпуск',
 'Сочи': 'сочи'}

In [22]:
from natasha import NewsSyntaxParser

In [23]:
emb = NewsEmbedding()
syntax_parser = NewsSyntaxParser(emb)

In [24]:
n_doc3.parse_syntax(syntax_parser)
n_doc3.sents[0].syntax.print()

    ┌► Маша   nsubj
┌─┌─└─ поедет 
│ │ ┌► в      case
│ └►└─ отпуск obl
│   ┌► в      case
└──►└─ Сочи   obl


### Для произвольного набора данных, предназначенного для классификации текстов, решите задачу классификации текста двумя способами:
- Способ 1. На основе CountVectorizer или TfidfVectorizer.
- Способ 2. На основе моделей word2vec или Glove или fastText.


In [25]:
df=pd.read_csv("../spam.csv", encoding="latin-1")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [26]:
to_drop = ["Unnamed: 2","Unnamed: 3","Unnamed: 4"]
df = df.drop(df[to_drop], axis=1)
df.rename(columns = {"v1":"Target", "v2":"message"}, inplace = True)
df.head()

Unnamed: 0,Target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
#Lets have a look at a sample of texts before cleaning
print("The First 5 Texts: \n",*df["message"][:5], sep = "\n")

The First 5 Texts: 

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Ok lar... Joking wif u oni...
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
U dun say so early hor... U c already then say...
Nah I don't think he goes to usf, he lives around here though


#### Предобработка

In [28]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [29]:
df['message_clean'] = df['message'].apply(clean_text)
df.head()

Unnamed: 0,Target,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [30]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text
    
df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df.head()

Unnamed: 0,Target,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,dun say early hor already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [31]:
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [32]:
df['message_clean'] = df['message_clean'].apply(stemm_text)
df.head()

Unnamed: 0,Target,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkts m...
3,ham,U dun say so early hor... U c already then say...,dun say earli hor alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [33]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['Target'])

df['target_encoded'] = le.transform(df['Target'])
df.head()

Unnamed: 0,Target,message,message_clean,target_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,0
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkts m...,1
3,ham,U dun say so early hor... U c already then say...,dun say earli hor alreadi say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though,0


In [34]:
# Сформируем общий словарь для обучения моделей из обучающей и тестовой выборки
vocab_list = df['message_clean'].tolist()
vocab_list[1:5]

['ok lar joke wif oni',
 'free entri  wkli comp win fa cup final tkts  may  text fa  receiv entri questionstd txt ratetc appli ',
 'dun say earli hor alreadi say',
 'nah dont think goe usf live around though']

In [35]:
vocabVect = CountVectorizer()
vocabVect.fit(vocab_list)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 6737


In [36]:
for i in list(corpusVocab)[1:10]:
    print('{}={}'.format(i, corpusVocab[i]))

jurong=3005
point=4384
crazi=1228
avail=391
bugi=761
great=2353
world=6536
la=3138
buffet=759


### CountVectorizer

In [37]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    for v in vectorizers_list:
        for c in classifiers_list:
            pipeline1 = Pipeline([("vectorizer", v), ("classifier", c)])
            score = cross_val_score(pipeline1, df['message_clean'], df['target_encoded'], scoring='accuracy', cv=3).mean()
            print('Векторизация - {}'.format(v))
            print('Модель для классификации - {}'.format(c))
            print('Accuracy = {}'.format(score))
            print('===========================')

In [38]:
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab)]
classifiers_list = [LogisticRegression(C=3.0), LinearSVC(), KNeighborsClassifier()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

Векторизация - CountVectorizer(vocabulary={'aa': 0, 'aah': 1, 'aaniy': 2, 'aaooooright': 3,
                            'aathilov': 4, 'aathiwher': 5, 'ab': 6, 'abbey': 7,
                            'abdomen': 8, 'abeg': 9, 'abel': 10, 'aberdeen': 11,
                            'abi': 12, 'abil': 13, 'abiola': 14, 'abj': 15,
                            'abl': 16, 'abnorm': 17, 'abouta': 18, 'abroad': 19,
                            'absenc': 20, 'absolut': 21, 'abstract': 22,
                            'abt': 23, 'abta': 24, 'aburo': 25, 'abus': 26,
                            'ac': 27, 'academ': 28, 'acc': 29, ...})
Модель для классификации - LogisticRegression(C=3.0)
Accuracy = 0.9793612508571704
Векторизация - CountVectorizer(vocabulary={'aa': 0, 'aah': 1, 'aaniy': 2, 'aaooooright': 3,
                            'aathilov': 4, 'aathiwher': 5, 'ab': 6, 'abbey': 7,
                            'abdomen': 8, 'abeg': 9, 'abel': 10, 'aberdeen': 11,
                            'abi': 1

In [39]:
X_train, X_test, y_train, y_test = train_test_split(df['message_clean'], df['target_encoded'], test_size=0.3, random_state=1)

In [40]:
def sentiment(v, c):
    model = Pipeline(
        [("vectorizer", v), 
         ("classifier", c)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))

In [41]:
sentiment(CountVectorizer(), LogisticRegression(C=3.0))

              precision    recall  f1-score   support

           0     0.9798    0.9993    0.9894      1454
           1     0.9947    0.8624    0.9238       218

    accuracy                         0.9815      1672
   macro avg     0.9872    0.9308    0.9566      1672
weighted avg     0.9817    0.9815    0.9809      1672



## word2vec 

In [42]:
import gensim
from gensim.models import word2vec

In [43]:
import re
import pandas as pd
import numpy as np
from typing import Dict, Tuple
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk

In [44]:
# Подготовим корпус
corpus = []
tok = WordPunctTokenizer()
for line in df['message_clean'].values:
    text_tok = tok.tokenize(line)
   # text_tok1 = [w for w in text_tok if not w in stop_words]
    corpus.append(text_tok)

In [45]:
corpus[:5]


[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though']]

In [46]:
print(len(corpus))

5572


In [47]:
# количество текстов в корпусе не изменилось и соответствует целевому признаку
assert df.shape[0]==len(corpus)

In [48]:
%time model = word2vec.Word2Vec(corpus, workers=4, min_count=10, window=10, sample=1e-3)


Wall time: 342 ms


In [49]:
# Проверим, что модель обучилась
print(model.wv.most_similar(positive=['get'], topn=5))

[('use', 0.9994774460792542), ('sent', 0.9994766116142273), ('guy', 0.9994684457778931), ('mean', 0.999464213848114), ('bring', 0.9994499683380127)]


In [50]:
class EmbeddingVectorizer(object):
    '''
    Для текста усредним вектора входящих в него слов
    '''
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([np.mean(
            [self.model[w] for w in words if w in self.model] 
            or [np.zeros(self.size)], axis=0)
            for words in X])

In [51]:
# Обучающая и тестовая выборки
boundary = 5000
X_train = corpus[:boundary] 
X_test = corpus[boundary:]
y_train = df.target_encoded.values[:boundary]
y_test = df.target_encoded.values[boundary:]

In [52]:
sentiment(EmbeddingVectorizer(model.wv), LogisticRegression(C=3.0))


              precision    recall  f1-score   support

           0     0.9574    0.9920    0.9744       498
           1     0.9286    0.7027    0.8000        74

    accuracy                         0.9545       572
   macro avg     0.9430    0.8473    0.8872       572
weighted avg     0.9536    0.9545    0.9518       572



На данном корпусе лучшее качетво удалось получить при использовании CountVectorizer.

При решении задачи классификации на основе модели word2vec получили меньшую точность.