In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm
from pymystem3 import Mystem
from gensim.corpora import Dictionary

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from timeit import default_timer as timer
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# test = pd.read_csv('SMS_test.csv', encoding='latin-1')
train = pd.read_csv('SMS_train.csv', encoding='latin-1')

In [5]:
train.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam


In [6]:
train.Label.value_counts()

Non-Spam    835
Spam        122
Name: Label, dtype: int64

## Предобработка текстов

In [7]:
# Пример текста без обработки
train.Message_body[5]

'REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name, house no and postcode'

In [8]:
nltk.download("stopwords")
stemmer = Mystem()
lemmatize = WordNetLemmatizer()
#Слова паразиты
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [9]:
def preproc(simps):
    corpus = []
    for simp in tqdm(simps):
        tokens = nltk.word_tokenize(simp.lower()) # преобразование к нижнему регистру
        tokens = [w for w in tokens if w.isalpha()] # выбор только алфавитных значений
        tokens = [w for w in tokens if w not in stopwords] # удаление слов паразитов и предлогов
        # tokens = [stemmer.lemmatize(w) for w in tokens] # преобразование к начальной форме слова
        tokens = [w for w in tokens if len(w)>2] # удление слов длиной меньше 3 сиволов
        tokens = set(tokens) # оставляем только уникальные значения
        corpus.append(' '.join(tokens))
    return corpus

In [10]:
textes = train.Message_body.to_list()

In [11]:
corpus = preproc(textes)

  0%|          | 0/957 [00:00<?, ?it/s]

In [12]:
# Пример текста после обработки
corpus[5]

'name free pounds offers call reply reminder great house postcode text credit details valid pls get'

## Векторизация текстов

In [13]:
# векторизация каждого слова метод CountVectorizer
corpus_cvec = corpus.copy()
cv = CountVectorizer(min_df=2, max_df=1.)
cv.fit(corpus_cvec)
transformed_cvec = cv.transform(corpus_cvec)
dense_cvec = transformed_cvec.todense()

In [14]:
# векторизация каждого слова метод TF/IDF
corpus_tf = corpus.copy()
tf = TfidfVectorizer(min_df=2, max_df=1.)
tf.fit(corpus_tf)
transformed_tf = tf.transform(corpus_tf)
dense_tf = transformed_tf.todense()

In [15]:
# Разбивка выборки на тренировочную и тестовою
X_tr, X_te, y_tr, y_te = train_test_split(np.array(dense_cvec), train['Label'])

In [16]:
# Логистическая регрессия
lr_1 = LogisticRegression()
lr_1.fit(X_tr, y_tr)
svc_1 = SVC()
svc_1.fit(X_tr, y_tr)

SVC()

In [17]:
# Разбивка выборки на тренировочную и тестовою
X_tr, X_te, y_tr, y_te = train_test_split(np.array(dense_tf), train['Label'])

In [18]:
# Логистическая регрессия
lr_2 = LogisticRegression()
lr_2.fit(X_tr, y_tr)
svc_2 = SVC()
svc_2.fit(X_tr, y_tr)

SVC()

In [19]:
print('Точность при CountVectorizer + Логистическая регрессия : ', round(accuracy_score(lr_1.predict(X_te), y_te),3))
print('Точность при CountVectorizer + МОП : ', round(accuracy_score(svc_1.predict(X_te), y_te),3))
print('Точность при TF/IDF + Логистическая регрессия : ', round(accuracy_score(lr_2.predict(X_te), y_te), 3))
print('Точность при TF/IDF + МОП : ', round(accuracy_score(svc_2.predict(X_te), y_te), 3))

Точность при CountVectorizer + Логистическая регрессия :  0.867
Точность при CountVectorizer + МОП :  0.867
Точность при TF/IDF + Логистическая регрессия :  0.892
Точность при TF/IDF + МОП :  0.925


Наилучшие значения точности получены с помощью TF/IDF + МОП.
## Нейронная сеть + OHE

In [30]:
import tensorflow as tf
from tensorflow.keras import layers, models
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Dense

In [20]:
all_words = []
for sent in corpus:
    tokenize_word = nltk.word_tokenize(sent)
    for word in tokenize_word:
        all_words.append(word)

In [21]:
unique_words = set(all_words)
print(len(unique_words))

2540


In [22]:
vocab_size = 2600

In [23]:
word_count = lambda dense_tf: len(nltk.word_tokenize(dense_tf))
longest_sentence = max(corpus, key= word_count)
length_long_sentence = len(nltk.word_tokenize(longest_sentence))

In [24]:
o_h_e = [one_hot(sent, vocab_size) for sent in corpus]

In [25]:
padded_ = pad_sequences(o_h_e, length_long_sentence, padding='post')

In [26]:
model = Sequential([
    Embedding(vocab_size,100,input_length=length_long_sentence),
    Dropout(0.2),
    LSTM(200),
    Dropout(0.2),
    Dense(1,activation='sigmoid')
])

In [31]:
opt = tf.keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
predict = pd.DataFrame(train['Label'])
predict['Code'] = 0
predict.loc[predict.Label == 'Spam', 'Code'] = 1

In [33]:
# Разбивка выборки на тренировочную и тестовою
X_tr, X_te, y_tr, y_te = train_test_split(np.array(padded_), predict['Code'])

In [34]:
model.fit(X_tr, y_tr, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fba94734850>

In [35]:
y_pred = model.predict(X_te)



In [36]:
round(accuracy_score(round(pd.DataFrame(y_pred)[0], 0),y_te)*100,2)

87.92

Комбинация TF/IDF + МОП является сильным инструментом, дающий хороший результат ( работает на уравне нейронных сетей или лучше) и на эту тему есть множество статей.

In [41]:
pip freeze > requirements.txt