1. Импорт библиотек

In [86]:
import pickle, gzip, re, nltk
from sklearn.model_selection import train_test_split
import pandas as pd
from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report

2. Загрузка коллекции слов при отсутствии

In [87]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

3. Разархивация данных

In [88]:
data = gzip.open("data/SMSSpamCollection.txt.train.gz", "rb")
text = data.read()
data.close()
file = open('data/SMSSpamCollection.txt.train', 'wb')
file.write(text)
file.close()

4. Открываем файл

In [89]:
sms = pd.read_csv('data/SMSSpamCollection.txt.train', sep='\t', names=["label", "text"])
sms.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


5. Разделение датасета на тестовую часть и часть для обучения

In [90]:
X_train, X_test, y_train, y_test = train_test_split(sms['text'], sms['label'], test_size=0.2)

6. Объявление функции, токенизатора (превращает строку текста в список токенов)

In [91]:
def tokenizer(line):
    line = re.sub('[^ A-Za-z]', '', line)
    line = line.lower().split()
    for word in line:
        if word in stopwords.words('english'):
            line.remove(word)
    return line

7. Обучение модели

In [92]:
count_vector, tfidf, classifier = CountVectorizer(analyzer=tokenizer), TfidfTransformer(), SGDClassifier()

training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

tfidfX = tfidf.fit_transform(training_data)

classifier.fit(tfidfX, y_train)

SGDClassifier()

8. Проверка модели

In [93]:
print(classification_report(classifier.predict(testing_data),y_test))

              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       851
        spam       0.96      0.73      0.83       149

    accuracy                           0.96      1000
   macro avg       0.96      0.86      0.90      1000
weighted avg       0.96      0.96      0.95      1000



9. Сохранение модели и векторизатора в файл

In [94]:
with open('models/spam_classifier.obj', 'wb') as file:
    pickle.dump(classifier, file)
with open('models/data_vectorizer.obj', 'wb') as file:
    pickle.dump(count_vector, file)