In [2]:
import pandas as pd
messages = pd.read_csv('SMSSpamCollection.txt', sep = '\t', header = None, names = ['Labels', 'SMS'])

In [3]:
print(messages.head())
print(messages.shape)

  Labels                                                SMS
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...
(5572, 2)


In [4]:
messages['Labels'].value_counts()

ham     4825
spam     747
Name: Labels, dtype: int64

In [5]:
messages['Labels'].value_counts(normalize = True)

ham     0.865937
spam    0.134063
Name: Labels, dtype: float64

In [6]:
messages_random = messages.sample(frac = 1, random_state = 1)
training_test_index = round(len(messages_random) * 0.7)
training_set = messages_random.iloc[: training_test_index, :].reset_index(drop = True)
test_set = messages_random.iloc[training_test_index :, :].reset_index(drop = True)

In [7]:

print("Тренировочная выборка")
print(training_set['Labels'].value_counts(normalize = True))
print()
print("Тестовая выборка")
print(test_set['Labels'].value_counts(normalize = True))


Тренировочная выборка
ham     0.865897
spam    0.134103
Name: Labels, dtype: float64

Тестовая выборка
ham     0.866029
spam    0.133971
Name: Labels, dtype: float64


In [8]:
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ').str.lower()
training_set.head()

  training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ').str.lower()


Unnamed: 0,Labels,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [9]:
vocabulary = []
training_set['SMS'] = training_set['SMS'].str.split()

In [10]:
for text in training_set['SMS']:
    for word in text:
        vocabulary.append(word)        
vocabulary = list(set(vocabulary))

In [11]:
word_counts_per_sms = { word: [0] * len(training_set['SMS']) for word in vocabulary }
for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [12]:
word_counts = pd.DataFrame(word_counts_per_sms)
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Labels,SMS,weeks,sugababes,honi,08718727870150ppm,thirunelvali,santa,cthen,ads,...,aids,applyed,gimmi,jus,inches,come,theplace,wright,fine,authorise
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
alpha = 1
p_ham_tr = training_set['Labels'].value_counts(normalize = True)['ham']
p_spam_tr = training_set['Labels'].value_counts(normalize = True)['spam']

In [14]:
spam_messages = training_set_clean[training_set_clean['Labels'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Labels'] == 'ham']

n_words_per_spam = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam.sum()

n_words_per_ham = ham_messages['SMS'].apply(len)
n_ham=n_words_per_ham.sum()

In [15]:
parameters_spam={unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

In [16]:
spam_messages.sum().head()

Labels       spamspamspamspamspamspamspamspamspamspamspamsp...
SMS          [freemsg, why, haven, t, you, replied, to, my,...
weeks                                                        4
sugababes                                                    1
honi                                                         0
dtype: object

In [17]:
spam_words = spam_messages.sum()
ham_words = ham_messages.sum()

In [18]:
n_v = len(vocabulary)
denominator_spam = n_spam+alpha*n_v
denominator_ham = n_ham+alpha*n_v
for word in vocabulary:
    numerator_spam = spam_words[word]+alpha
    parameters_spam[word] = numerator_spam/denominator_spam

    numerator_ham = ham_words[word]+alpha
    parameters_ham[word] = numerator_ham/denominator_ham

In [19]:
parameters_ham['a']

0.013102260270397244

In [20]:
import re

def naive_bayes(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = n_spam
    p_ham_given_message = n_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [22]:
def naive_bayes_classifier(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = n_spam
    p_ham_given_message = n_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'requires human classification'

In [23]:
test_set['predicted'] = test_set['SMS'].apply(naive_bayes_classifier)

In [24]:
test_set

Unnamed: 0,Labels,SMS,predicted
0,ham,"Camera quite good, 10.1mega pixels, 3optical a...",ham
1,ham,At 4. Let's go to bill millers,ham
2,ham,Is there coming friday is leave for pongal?do ...,ham
3,spam,WINNER! As a valued network customer you hvae ...,spam
4,ham,Yar... I tot u knew dis would happen long ago ...,ham
...,...,...,...
1667,ham,"We're all getting worried over here, derek and...",ham
1668,ham,Oh oh... Den muz change plan liao... Go back h...,ham
1669,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...,ham
1670,spam,Text & meet someone sexy today. U can find a d...,spam


In [25]:
test_set['correct'] = test_set['Labels'] == test_set['predicted']

In [26]:
test_set

Unnamed: 0,Labels,SMS,predicted,correct
0,ham,"Camera quite good, 10.1mega pixels, 3optical a...",ham,True
1,ham,At 4. Let's go to bill millers,ham,True
2,ham,Is there coming friday is leave for pongal?do ...,ham,True
3,spam,WINNER! As a valued network customer you hvae ...,spam,True
4,ham,Yar... I tot u knew dis would happen long ago ...,ham,True
...,...,...,...,...
1667,ham,"We're all getting worried over here, derek and...",ham,True
1668,ham,Oh oh... Den muz change plan liao... Go back h...,ham,True
1669,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...,ham,True
1670,spam,Text & meet someone sexy today. U can find a d...,spam,True


In [32]:
from sklearn.metrics import classification_report, accuracy_score

accuracy_score(test_set['Labels'], test_set['predicted'])
print(classification_report(test_set['Labels'],test_set['predicted']))

                               precision    recall  f1-score   support

                          ham       0.99      0.99      0.99      1448
requires human classification       0.00      0.00      0.00         0
                         spam       0.97      0.95      0.96       224

                     accuracy                           0.99      1672
                    macro avg       0.65      0.65      0.65      1672
                 weighted avg       0.99      0.99      0.99      1672



  and should_run_async(code)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
