# Coding a spam classifier with naive Bayes

Based on the book "grokking Machine Learning" by Luis G. Serrano

In [21]:
# Import modules
import pandas as pd
import numpy as np

In [3]:
emails = pd.read_csv('emails.csv')
emails.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
def process_email(text):
    text = text.lower()
    return list(text.split())

emails['words'] = emails['text'].apply(process_email)
emails.head(10)

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[subject:, naturally, irresistible, your, corp..."
1,Subject: the stock trading gunslinger fanny i...,1,"[subject:, the, stock, trading, gunslinger, fa..."
2,Subject: unbelievable new homes made easy im ...,1,"[subject:, unbelievable, new, homes, made, eas..."
3,Subject: 4 color printing special request add...,1,"[subject:, 4, color, printing, special, reques..."
4,"Subject: do not have money , get software cds ...",1,"[subject:, do, not, have, money, ,, get, softw..."
5,"Subject: great nnews hello , welcome to medzo...",1,"[subject:, great, nnews, hello, ,, welcome, to..."
6,Subject: here ' s a hot play in motion homela...,1,"[subject:, here, ', s, a, hot, play, in, motio..."
7,Subject: save your money buy getting this thin...,1,"[subject:, save, your, money, buy, getting, th..."
8,Subject: undeliverable : home based business f...,1,"[subject:, undeliverable, :, home, based, busi..."
9,Subject: save your money buy getting this thin...,1,"[subject:, save, your, money, buy, getting, th..."


In [12]:
num_emails = len(emails)
spam_emails = sum(emails['spam'])

print(f'Number of emails: {num_emails}')
print(f'Number of spam emails: {spam_emails}')

# Calculate prior probability, that email is spam
p_spam = spam_emails / num_emails
print(f'\nProbability of spam: {p_spam}')

p_ham = 1 - p_spam
print(f'Probability that an email is ham: {p_ham}')

Number of emails: 5728
Number of spam emails: 1368

Probability of spam: 0.2388268156424581
Probability that an email is ham: 0.7611731843575419


## Training a naive Bayes model

In [13]:
model = {}

# Training process
for index, email in emails.iterrows():
    for word in email['words']:
        if word not in model:
            model[word] = {'spam': 1, 'ham': 1}
        if word in model:
            if email['spam']:
                model[word]['spam'] += 1
            else:
                model[word]['ham'] += 1

In [14]:
model['lottery']

{'spam': 21, 'ham': 1}

In [15]:
model['sale']

{'spam': 51, 'ham': 57}

## Using model to make predicitons

In [16]:
def predict_bayes(word):
    word = word.lower()
    num_spam_with_word = model[word]['spam']
    num_ham_with_word = model[word]['ham']
    return 1.0*num_spam_with_word/(num_spam_with_word + num_ham_with_word)

In [17]:
predict_bayes('lottery')

0.9545454545454546

In [18]:
predict_bayes('sale')

0.4722222222222222

In [33]:
def predict_naive_bayes(email):
    total = len(emails)
    num_spam = sum(emails['spam'])
    num_ham = total - num_spam
    email = email.lower()
    words = set(email.split())
    spams = [1.0]
    hams = [1.0]
    for word in words:
        if word in model:
            spams.append(model[word]['spam']/num_spam*total)
            hams.append(model[word]['ham']/num_ham*total)
    # Multiplies all the previous probabilities times the prior probability of the email being spam/ham
    prod_spams = np.compat.long(np.prod(spams)*num_spam)
    prod_hams = np.compat.long(np.prod(hams)*num_ham)
    return prod_spams/(prod_spams+prod_hams)

In [34]:
predict_naive_bayes('lottery sale')

[1.0, 87.9298245614035, 213.5438596491228]
25686763


0.9835755120483081

In [25]:
predict_naive_bayes('hi mom how are you')

0.20298303946655008

In [26]:
predict_naive_bayes('enter the lottery to win three million dollars')

0.999849057446422

In [28]:
predict_naive_bayes('meeting tomorrow at 7')

0.000892658664893121