# Bayes - Spam Filtering
# Introduction

Throughout this laboratory you will be working with an SMS database. The text messages in this database are either spam or ham (not spam). You are to classify them using Bayesian probabilities and will therefore create a Naive Bayes classifier.

The code comments and the following formulas will serve as your guide for this laboratory:

**P(A∣B) = (P(A) * P(B∣A)) / P(B)**

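P(A∣B1, B2, ..., Bn) = (P(B1∣A) * P(B2∣A) * ... * P(Bn∣A) * P(A)) / (P(B1) * P(B2) * ... * P(Bn))

This second form is the "naive" part of Naive Bayes: B1, B2, ..., Bn are treated as independent of each other.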

where:

P(A) = The probability of A occurring

P(B) = The probability of B occurring

P(A∣B) = The probability of A given B

P(B∣A) = The probability of B given A

E.g., P(Ham∣w1, w2, w3) is the probability that a message is ham given that it contains the words w1, w2 and w3.
​


# Warming up


In [None]:
def prob_A_if_B(prob_A, prob_B, prob_B_if_A):
    """Return P(A|B) computed with Bayes' theorem.

    prob_A: P(A), the probability of A occurring
    prob_B: P(B), the probability of B occurring
    prob_B_if_A: P(B|A), the probability of B given A
    """
    # Numerator of Bayes' theorem: P(A) * P(B|A)
    joint = prob_A * prob_B_if_A
    posterior = joint / prob_B
    return posterior

# Known quantities for the rain example
prob_rain = 0.1  # P(rain): it rains at some point during any given day
prob_morning_cloud = 0.4  # P(cloud): the morning of any given day is cloudy
prob_morning_cloud_if_rain = 0.5  # P(cloud | rain): the morning of a rainy day was cloudy

# Bayes' theorem gives P(rain | cloudy morning); the expected value is 0.125
prob_rain_if_morning_cloud = prob_A_if_B(
    prob_A=prob_rain,
    prob_B=prob_morning_cloud,
    prob_B_if_A=prob_morning_cloud_if_rain,
)
print(prob_rain_if_morning_cloud)


0.125


# Read the data from file
You will read a set of 5572 messages into a pandas DataFrame.

In [None]:
import pandas as pd
import re

# The file is tab-separated and has no header row, so the column names are supplied here
column_names = ['Label', 'SMS']
sms_spam = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=column_names,
)

# Quick look at the size and the first rows of the loaded data
print(sms_spam.shape)
print(sms_spam.head())

(5572, 2)
  Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


# Split data into train and test sets

In [None]:
# Shuffle every row; the fixed seed makes the shuffle reproducible
data_randomized = sms_spam.sample(frac=1, random_state=1)

# Position of the cut: the first 80% of the shuffled rows are used for training
training_test_index = round(0.8 * len(data_randomized))

# Take the rows on either side of the cut and renumber each part from 0
training_set = data_randomized.iloc[:training_test_index].reset_index(drop=True)
test_set = data_randomized.iloc[training_test_index:].reset_index(drop=True)

print(training_set.shape)  # We train on 4458 messages
print(test_set.shape)  # At the end we'll test our classifier on 1114 messages

(4458, 2)
(1114, 2)


# Clean data

In [None]:
# Rebuild both sets from the shuffled data so that this cell can be re-run safely:
# after one run the 'SMS' column holds lists of words, and calling the .str
# text methods on it a second time would fail.
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

# Share of ham/spam in each set.
# NOTE: the results are not displayed; wrap the calls in print() to inspect them.
training_set['Label'].value_counts(normalize=True)
test_set['Label'].value_counts(normalize=True)

# Before cleaning
print("Before cleaning:")
print(training_set.head(3))

# Normalize and clean the training messages (remove punctuation, lowercase).
# regex=True must be passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave all punctuation in place.
# The raw string keeps '\W' from being read as a (deprecated) string escape.
training_set['SMS'] = training_set['SMS'].str.replace(
    r'\W', ' ', regex=True)  # Replaces every non-word character with a space
training_set['SMS'] = training_set['SMS'].str.lower()

print("\n After cleaning:")
print(training_set.head(3))

# Split string into words
training_set['SMS'] = training_set['SMS'].str.split()

print("\n After splitting into words:")
print(training_set.head(3))

Before cleaning:
  Label                                            SMS
0   ham                   Yep, by the pretty sculpture
1   ham  Yes, princess. Are you going to make me moan?
2   ham                     Welp apparently he retired

 After cleaning:
  Label                                            SMS
0   ham                   yep  by the pretty sculpture
1   ham  yes  princess  are you going to make me moan 
2   ham                     welp apparently he retired

 After splitting into words:
  Label                                                SMS
0   ham                  [yep, by, the, pretty, sculpture]
1   ham  [yes, princess, are, you, going, to, make, me,...
2   ham                    [welp, apparently, he, retired]


  training_set['SMS'] = training_set['SMS'].str.replace(


# Create dictionary of word occurrences

In [None]:
# Gather every distinct word that occurs in the training messages;
# a set drops the duplicates as the words are added
unique_words = set()
for sms in training_set['SMS']:
    unique_words.update(sms)

# Store the vocabulary as a list
vocabulary = list(unique_words)

print(vocabulary)



In [None]:
# Start with one list of zeros per vocabulary word, with one slot per training message
n_messages = len(training_set['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_messages

# Tally how often each word occurs in each message
# e.g. word_counts_per_sms['ur'][5] = 2 means that "ur" appears twice in the message at index 5
for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        counts = word_counts_per_sms[word]
        counts[index] = counts[index] + 1

# One column per word, one row per message
word_counts = pd.DataFrame(word_counts_per_sms)

# Put the label and the tokenized SMS in front of the word-count columns
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,aaniye,hooked,02073162414,accommodation,7oz,part,firmware,review,...,reacting,m8,str8,suggestions,attack,chef,gumby,nike,diff,karaoke
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Number and probability of spam, ham messages (to be implemented)
# Calculate individual word probabilities for both ham & spam

In [None]:
# Partition the training rows by label
is_spam = training_set_clean['Label'] == 'spam'
is_ham = training_set_clean['Label'] == 'ham'
spam_messages = training_set_clean[is_spam]
ham_messages = training_set_clean[is_ham]

# Priors: share of each class among all training messages
n_training_messages = len(training_set_clean)
p_spam = len(spam_messages) / n_training_messages
p_ham = len(ham_messages) / n_training_messages

# N_Spam: total number of words over all spam messages
n_words_per_spam_message = spam_messages['SMS'].map(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham: total number of words over all ham messages
n_words_per_ham_message = ham_messages['SMS'].map(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary: number of distinct words
n_vocabulary = len(vocabulary)

# Laplace smoothing constant, added to each word count so that no probability is 0
alpha = 1

print("p_spam =", p_spam)
print("p_ham =", p_ham)


p_spam = 0.13458950201884254
p_ham = 0.8654104979811574


In [None]:
# The smoothed denominators are the same for every word, so compute them once
spam_denominator = n_spam + alpha * n_vocabulary
ham_denominator = n_ham + alpha * n_vocabulary

# P(word | spam): smoothed count of the word in spam messages over the spam denominator
parameters_spam = {
    unique_word: (spam_messages[unique_word].sum() + alpha) / spam_denominator
    for unique_word in vocabulary
}

# P(word | ham): smoothed count of the word in ham messages over the ham denominator
parameters_ham = {
    unique_word: (ham_messages[unique_word].sum() + alpha) / ham_denominator
    for unique_word in vocabulary
}

# Probability of word existing given that message is ham
print("ham word probabilities:", parameters_ham, '\n')

# Probability of word existing given that message is spam
print("spam word probabilities:", parameters_spam)





# Classify message function (to be implemented)


In [None]:
def classify_message(message):
    '''
    Classify a message with the trained Naive Bayes parameters.

    message: a string

    Returns 'spam' or 'ham', or 'unknown' when both classes score exactly the same.

    Reads the module-level p_spam, p_ham, parameters_spam and parameters_ham.
    Words that are missing from a parameter dictionary are ignored for that class.
    '''
    import math  # imported here because the notebook's import cell does not provide it

    def log_probability(probability):
        # log(0) is undefined; use -inf so that an impossible class can never win
        return math.log(probability) if probability > 0 else float('-inf')

    # Remove punctuation from the message, change message to lowercase, and then split it into words
    # (message becomes a list of strings, each string is a word)
    message = re.sub(r'\W', ' ', message)
    message = message.lower().split()

    # Work with sums of logarithms instead of products of probabilities: the product of
    # many small probabilities underflows to 0.0 for long messages, which made both
    # scores equal and the result 'unknown'. The logarithm is monotonic, so the
    # comparison between the two classes is unchanged.
    log_p_spam_given_message = log_probability(p_spam)
    log_p_ham_given_message = log_probability(p_ham)

    for word in message:
        if word in parameters_spam:
            log_p_spam_given_message += log_probability(parameters_spam[word])
        if word in parameters_ham:
            log_p_ham_given_message += log_probability(parameters_ham[word])

    if log_p_spam_given_message > log_p_ham_given_message:
        return 'spam'
    elif log_p_spam_given_message < log_p_ham_given_message:
        return 'ham'
    else:
        return 'unknown'


# Predictions and accuracy

In [None]:
# Classify every test message, store the result in a new column and preview ten rows
predictions = test_set['SMS'].apply(classify_message)
test_set['predicted'] = predictions
test_set.head(10)

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham
5,ham,But my family not responding for anything. Now...,ham
6,ham,U too...,ham
7,ham,Boo what time u get out? U were supposed to ta...,ham
8,ham,Genius what's up. How your brother. Pls send h...,ham
9,ham,I liked the new mobile,ham


In [None]:
# Try the classifier on your own messages. Replace the two below with whatever you want.
my_message = 'click shopping now fast urgent'
my_spam_message = "Call fast BIG reward"

# Print one prediction per message, in the order given
for custom_message in (my_message, my_spam_message):
    print(classify_message(custom_message))

spam
spam


In [None]:
total = len(test_set)

# Count the rows whose predicted label equals the true label
correct = sum(
    1
    for label, predicted in zip(test_set['Label'], test_set['predicted'])
    if label == predicted
)

print('Correct:', correct)
print('Incorrect:', total - correct)

# If accuracy is above 0.98, your code is correct
print('Accuracy:', correct/total)

Correct: 1100
Incorrect: 14
Accuracy: 0.9874326750448833


# Create tests

In [None]:
def test_0_p_spam_ham():
    '''
    Check that the priors lie in the expected ranges:
    p_spam within [0.11, 0.14] and p_ham within [0.84, 0.88].
    '''
    spam_out_of_range = p_spam < 0.11 or p_spam > 0.14
    ham_out_of_range = p_ham > 0.88 or p_ham < 0.84
    return not (spam_out_of_range or ham_out_of_range)

def test_1_ham():
    '''
    Check that four selected test messages are all classified as ham.
    '''
    ham_rows = (100, 300, 400, 777)
    pred_test_list = [classify_message(test_set['SMS'][row]) for row in ham_rows]
    corr_test_list = ['ham'] * len(ham_rows)
    return pred_test_list == corr_test_list

def test_2_spam():
    '''
    Check that four selected test messages are all classified as spam.
    '''
    spam_rows = (500, 2, 28, 121)
    pred_test_list = [classify_message(test_set['SMS'][row]) for row in spam_rows]
    corr_test_list = ['spam'] * len(spam_rows)
    return pred_test_list == corr_test_list

def test_3_combined():
    '''
    Check the predictions for a mix of ham and spam test messages.
    '''
    # Each row of the test set is paired with its expected label
    expected_by_row = [
        (111, 'ham'),
        (222, 'ham'),
        (333, 'ham'),
        (444, 'spam'),
        (555, 'ham'),
        (556, 'ham'),
        (777, 'ham'),
        (678, 'ham'),
    ]
    pred_test_list = [classify_message(test_set['SMS'][row]) for row, _ in expected_by_row]
    corr_test_list = [label for _, label in expected_by_row]
    return pred_test_list == corr_test_list

# Run tests

In [None]:
def run_tests():
    '''
    Run every test, print one passed/failed line per test and a final summary.

    A test counts as passed only when it returns exactly True.
    '''
    # Reference the test functions directly instead of building source strings for eval()
    tests = [test_0_p_spam_ham, test_1_ham, test_2_spam, test_3_combined]
    total = str(len(tests))
    tests_passed = 0
    for number, test in enumerate(tests, start=1):
        if test() is True:
            print("Test " + str(number) + '/' + total + ' passed')
            tests_passed += 1
        else:
            print("Test " + str(number) + '/' + total + ' failed')
    if tests_passed == len(tests):
        print("\nAll tests have passed")
    else:
        print(str(tests_passed) + "/" + total + " tests passed in total")
# Run all tests now and print their results
run_tests()

Test 1/4 passed
Test 2/4 passed
Test 3/4 passed
Test 4/4 passed

All tests have passed
