# Building a Spam Filter with Naive Bayes

The dataset was put together by Tiago A. Almeida and José María Gómez Hidalgo, and it can be downloaded from the The UCI Machine Learning Repository. Dataquest provided the tutorial.

In [144]:
import pandas as pd

In [193]:
sms = pd.read_csv('/SMSSpamCollection.csv', sep ='\t', header = None, names = ['Label', 'SMS'])

In [194]:
sms.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [195]:
spam = 0
ham = 0
for item in sms["Label"]:
    if item == 'spam':
        spam += 1
    else:
        ham += 1      

In [196]:
sms.shape

(5572, 2)

In [197]:
spam

747

In [198]:
ham

4825

In [199]:
percent_sp = spam/len(sms)*100
percent_hm = ham/len(sms)*100

In [200]:
print(percent_sp)
print(percent_hm)

13.406317300789663
86.59368269921033


The dataset contains ~87% non spam and ~13% spam. We will spilt 80% of our data for the training set and the rest will be used to test our algorithm later.

In [201]:
random = sms.sample(frac = 1, random_state = 1)

index = round(len(random) * 0.8)

train = random[:index].reset_index(drop = True)
test = random[index:].reset_index(drop= True)

In [202]:
train.shape

(4458, 2)

In [203]:
test.shape

(1114, 2)

Examining the percent spam and non-spam in both sets.

In [204]:
sp_train = 0
hm_train = 0
for item in train["Label"]:
    if item == 'spam':
        sp_train += 1
    else:
        hm_train += 1

sp_test = 0
hm_test = 0
for item in test["Label"]:
    if item == 'spam':
        sp_test += 1
    else:
        hm_test += 1   

In [205]:
train['Label'].value_counts(normalize=True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [206]:
test['Label'].value_counts(normalize=True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

The percent of spam for both the test and the training set is close to the value of the entire set.

In [207]:
# test['SMS'] = test['SMS'].str.replace('\W', ' ').str.lower()
train['SMS']= train['SMS'].str.replace('\W', ' ').str.lower()

In [208]:
test.head()

Unnamed: 0,Label,SMS
0,ham,Later i guess. I needa do mcat study too.
1,ham,But i haf enuff space got like 4 mb...
2,spam,Had your mobile 10 mths? Update to latest Oran...
3,ham,All sounds good. Fingers . Makes it difficult ...
4,ham,"All done, all handed in. Don't know if mega sh..."


In [209]:
train['SMS'] = train["SMS"].str.split()

In [210]:
train.head()

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."


In [211]:
vocabulary = []
# test['SMS'] = test["SMS"].str.split()

In [212]:
for item in train['SMS']:
        for word in item:
            vocabulary.append(word)

In [213]:
vocabulary = list(set(vocabulary))

In [214]:
len(train['SMS'])

4458

In [215]:
word_counts_per_sms = {unique_word: [0] * len(train['SMS']) for unique_word in vocabulary}

In [216]:
for index, sms in enumerate(train['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [217]:
word_counts = pd.DataFrame(word_counts_per_sms)

In [218]:
print(len(word_counts), len(train))
word_counts.head()

4458 4458


Unnamed: 0,twittering,engagement,masters,invited,pobox334,shouting,land,see,ecstasy,packs,...,fixed,correct,george,evrey,50award,recovery,loving,unjalur,fit,divorce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [219]:
train

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."
...,...,...
4453,ham,"[sorry, i, ll, call, later, in, meeting, any, ..."
4454,ham,"[babe, i, fucking, love, you, too, you, know, ..."
4455,spam,"[u, ve, been, selected, to, stay, in, 1, of, 2..."
4456,ham,"[hello, my, boytoy, geeee, i, miss, you, alrea..."


In [220]:
training_set = pd.concat([train, word_counts],axis =1)

In [221]:
training_set.head()

Unnamed: 0,Label,SMS,twittering,engagement,masters,invited,pobox334,shouting,land,see,...,fixed,correct,george,evrey,50award,recovery,loving,unjalur,fit,divorce
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [222]:
N_ham = 0
N_spam = 0
for index, item in enumerate(training_set["Label"]):
    if item == 'ham':
        N_ham += len(training_set["SMS"].iloc[index])
    if item == 'spam':
        N_spam += len(training_set["SMS"].iloc[index])
        
n_spam = training_set[training_set['Label'] == 'spam'].sum(
                                                    axis=1).sum()
n_ham = training_set[training_set['Label'] == 'ham'].sum(
                                                    axis=1).sum()        

In [223]:
print(N_ham,n_ham, n_spam, N_spam )

57237 57237 15190 15190


In [224]:
alpha = 1
n_vocab = len(vocabulary)

In [225]:
p_ham = training_set['Label'].value_counts(normalize = True)['ham']
p_spam = training_set['Label'].value_counts(normalize = True)['spam']

In [226]:
spam_dict = {word:0 for word in vocabulary}
ham_dict = {word:0 for word in vocabulary}
spam_df = training_set[training_set["Label"] == 'spam']
ham_df = training_set[training_set["Label"] == 'ham']

In [227]:
for word in vocabulary:
    n_w_spam = spam_df[word].sum()
    p_w_spam = (n_w_spam+alpha)/(n_spam+alpha*n_vocab)
    spam_dict[word] =  p_w_spam
    n_w_ham = ham_df[word].sum()
    p_w_ham = (n_w_ham+alpha)/(n_ham+alpha*n_vocab)
    ham_dict[word] =  p_w_ham

In [240]:
import re
def classify(message):
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham  
    
    for word in message:
        if word in spam_dict:
            p_spam_given_message *= (spam_dict[word])
        if word in ham_dict:    
             p_ham_given_message *= ham_dict[word]  

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal propabilities, have a human classify this!')

In [181]:
test_msg1 = 'WINNER!! This is the secret code to unlock the money: C3421.'
test_msg2 = "Sounds good, Tom, then see u there"

In [182]:
classify(test_msg1)


P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [183]:
classify(test_msg2)

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


In [229]:
test.head()

Unnamed: 0,Label,SMS
0,ham,Later i guess. I needa do mcat study too.
1,ham,But i haf enuff space got like 4 mb...
2,spam,Had your mobile 10 mths? Update to latest Oran...
3,ham,All sounds good. Fingers . Makes it difficult ...
4,ham,"All done, all handed in. Don't know if mega sh..."


In [252]:
def classify_test(message):
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham  
    
    for word in message:
        if word in spam_dict:
            p_spam_given_message *= (spam_dict[word])
        if word in ham_dict:    
             p_ham_given_message *= ham_dict[word]  

    if p_ham_given_message > p_spam_given_message:
        return('ham')
    elif p_ham_given_message < p_spam_given_message:
        return('spam')
    else:
        return('Equal propabilities')

In [253]:
test['predicted'] = test['SMS'].apply(classify_test)

In [248]:
test['Label'].value_counts()

ham     967
spam    147
Name: Label, dtype: int64

In [254]:
test['predicted'].value_counts()

ham                    969
spam                   144
Equal propabilities      1
Name: predicted, dtype: int64

In [255]:
test.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


Measuring accuracy of our spam filter

In [258]:
correct = 0
total = len(test)
for row in test.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1
        
accuracy = correct/total
print(accuracy)

0.9874326750448833


Our accuracy for our spam filter is 98%