In [69]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk import stem
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn import preprocessing

In [2]:
messages = pandas.read_csv('SMSSpamCollection', sep='\t',
                           names=["label", "message"])
def tokenize(text):
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [3]:
messages.groupby('label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,message
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,count,4825
ham,unique,4516
ham,top,"Sorry, I'll call later"
ham,freq,30
spam,count,747
spam,unique,653
spam,top,Please call our customer service representativ...
spam,freq,4


4825 VS 747 - выборка явно несбалансированна, спама гораздо меньше

In [4]:
messages[messages['label']=='ham'].count() / messages.count()

label      0.865937
message    0.865937
dtype: float64

Несмотря на то, что точность около 86 процентов, доверять дамми нельзя, т.к. выборка несбалансированна. Это можно доказать:

In [5]:
DummyClas = np.array(['ham']*len(messages))

In [6]:
print(classification_report(DummyClas, messages['label']))

             precision    recall  f1-score   support

        ham       1.00      0.87      0.93      5572
       spam       0.00      0.00      0.00         0

avg / total       1.00      0.87      0.93      5572



  'recall', 'true', average, warn_for)


Что и требовалось доказать: precision и recall в спаме по нулям.

Следовательно, нужно примерно уровнять спам и неспам.

In [7]:
ham = messages[messages['label'] == 'ham']
spam = messages[messages['label'] == 'spam']
hamNEW = ham.sample(n=len(spam))

In [8]:
equal = pandas.concat([hamNEW, spam])
print(equal.groupby('label').describe())

                                                        message
label                                                          
ham   count                                                 747
      unique                                                738
      top     Night has ended for another day, morning has c...
      freq                                                    3
spam  count                                                 747
      unique                                                653
      top     Please call our customer service representativ...
      freq                                                    4


Теперь у нас их поровну, можно начинать работу.

In [9]:
bow1 = CountVectorizer()
bow1.fit_transform(equal['message'])

<1494x4695 sparse matrix of type '<class 'numpy.int64'>'
	with 25890 stored elements in Compressed Sparse Row format>

In [10]:
bowed_messages = bow1.transform(equal['message'])
naive_model = MultinomialNB()
naive_model.fit(bowed_messages, equal['label'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
cv_results = cross_val_score(naive_model, bowed_messages, equal['label'], cv=10, scoring='accuracy')
print(cv_results.mean(), cv_results.std())

0.961207207207 0.0138734760678


Получаем основные параметры, связанные с токенизацией, теперь посмотрим на знаки препинания:

In [12]:
bow2 = CountVectorizer(token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'|,")
bow2.fit_transform(equal['message'])

<1494x4700 sparse matrix of type '<class 'numpy.int64'>'
	with 27366 stored elements in Compressed Sparse Row format>

In [13]:
bowed_messages = bow2.transform(equal['message'])
naive_model = MultinomialNB()
naive_model.fit(bowed_messages, equal['label'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
cv_results = cross_val_score(naive_model, bowed_messages, equal['label'], cv=10, scoring='accuracy')
print(cv_results.mean(), cv_results.std())

0.960531531532 0.0141047509036


(0.960522522523 0.00807850451558) и (0.958513513514 0.00930812014104) - ничего не поменялось, продолжаем работу:

In [38]:
from nltk.stem.lancaster import LancasterStemmer
equal['stem'] = equal.message.apply(LancasterStemmer().stem)

In [39]:
from nltk.tokenize import RegexpTokenizer
bow3 = CountVectorizer(RegexpTokenizer(r'\w+').tokenize)
bow3.fit_transform(equal['stem'])

<1494x4703 sparse matrix of type '<class 'numpy.int64'>'
	with 25894 stored elements in Compressed Sparse Row format>

In [40]:
cv_results = cross_val_score(naive_model, bowed_messages, equal['label'], cv=10, scoring='accuracy')
print(cv_results.mean(), cv_results.std())

0.960531531532 0.0141047509036


Стемминг ничего не изменил

In [57]:
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [59]:
bow4 = CountVectorizer(LemmaTokenizer())
bow4.fit_transform(equal['message'])

<1494x4695 sparse matrix of type '<class 'numpy.int64'>'
	with 25890 stored elements in Compressed Sparse Row format>

In [60]:
bowed_messages = bow4.transform(equal['message'])
naive_model = MultinomialNB()
naive_model.fit(bowed_messages, equal['label'])
cv_results = cross_val_score(naive_model, bowed_messages, equal['label'], cv=10, scoring='accuracy')
print(cv_results.mean(), cv_results.std())

0.961207207207 0.0138734760678


Лемматизация равносильна стандартным настройкам, так что не играет никакой роли.

In [62]:
bow5 = CountVectorizer(stop_words='english')
bow5.fit_transform(equal['message'])

<1494x4461 sparse matrix of type '<class 'numpy.int64'>'
	with 16642 stored elements in Compressed Sparse Row format>

In [63]:
bowed_messages = bow5.transform(equal['message'])
naive_model = MultinomialNB()
naive_model.fit(bowed_messages, equal['label'])
cv_results = cross_val_score(naive_model, bowed_messages, equal['label'], cv=10, scoring='accuracy')
print(cv_results.mean(), cv_results.std())

0.958468468468 0.0211833968917


Стоп слова также не играет большой роли, а только ухудшают.

Сравним с TfidfVectorizer:


In [70]:
bow6 = TfidfVectorizer()
bow6.fit_transform(equal['message'])

<1494x4695 sparse matrix of type '<class 'numpy.float64'>'
	with 25890 stored elements in Compressed Sparse Row format>

In [75]:
bowed_messages = bow6.transform(equal['message'])
naive_model = MultinomialNB()
naive_model.fit(bowed_messages, equal['label'])
cv_results = cross_val_score(naive_model, bowed_messages, equal['label'], cv=10, scoring='accuracy')
print(cv_results.mean(), cv_results.std())
print(classification_report(equal['label'], naive_model.predict(bowed_messages)))

0.965225225225 0.0148225989344
             precision    recall  f1-score   support

        ham       0.98      0.99      0.98       747
       spam       0.99      0.97      0.98       747

avg / total       0.98      0.98      0.98      1494



NameError: name 'draw_learning_curve' is not defined