# Dataset

- Source: https://www.kaggle.com/datasets/karthickveerakumar/spam-filter

In [15]:
import pandas as pd
import numpy as np

In [3]:
pd.read_csv("data/emails.csv")

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [4]:
emails = pd.read_csv("data/emails.csv")
emails.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


- The first column is the text of the email in string format
- The second column tells us if the email is spam (1) or ham (0)

# Preprocessing

In [6]:
def process_email(text):
    """Turning the text string into a list of words"""
    text = text.lower()
    return list(set(text.split()))

In [7]:
emails['words'] = emails['text'].apply(process_email)
emails.head()

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[logos, content, be, here, ', irresistible, ga..."
1,Subject: the stock trading gunslinger fanny i...,1,"[mcdougall, earmark, einsteinian, herald, like..."
2,Subject: unbelievable new homes made easy im ...,1,"[this, ., dorcas, post, factor, $, for, 169, v..."
3,Subject: 4 color printing special request add...,1,"[., rd, ca, this, additional, fax, graphix, he..."
4,"Subject: do not have money , get software cds ...",1,"[., cds, compatibility, t, ', here, be, ain, b..."


In [8]:
num_emails = len(emails)
num_spam = sum(emails['spam'])

print("Number of emails:", num_emails)
print("Number of spam emails:", num_spam)
print()

# Calculating the prior probability that an email is spam
print("Probability of spam:", num_spam/num_emails)

Number of emails: 5728
Number of spam emails: 1368

Probability of spam: 0.2388268156424581


# Training a naive Bayes model

In [10]:
# write a dictionary, and in this dictionary record every word, 
# and its pair of occurrences in spam and ham

model = dict()

for index, email in emails.iterrows():
    for word in email['words']:
        if word not in model:
            model[word] = {'spam': 1, 'ham': 1}
        if word in model:
            if email['spam']:
                model[word]['spam'] += 1
            else:
                model[word]['ham'] += 1

In [11]:
model['lottery']

{'spam': 9, 'ham': 1}

In [12]:
model['sale']

{'spam': 39, 'ham': 42}

# Using the model to make predictions

The input of the algorithm is the email. It goes through all the words in the email, and for each word, it calculates the probabilities that a spam email contains it and that a ham email contains it. These probabilities arecalculated using the dictionary we defined in the previous section. Then we multiply these probabilities (the naive assumption) and apply Bayes’ theorem to find the probability that an email is spam given that it contains the words on this particular email. 

In [16]:
def predict_naive_bayes(email):
    total = len(emails)
    num_spam = sum(emails['spam'])
    num_ham = total - num_spam
    
    email = email.lower()
    words = set(email.split())
    
    spams = [1.0]
    hams = [1.0]
    
    for word in words:
        if word in model:
            spams.append(model[word]['spam'] / num_spam * total)
            hams.append(model[word]['ham'] / num_ham * total)
            
    prod_spams = np.long(np.prod(spams)*num_spam)
    prod_hams = np.long(np.prod(hams)*num_ham)
    
    return prod_spams/(prod_spams + prod_hams)

In [17]:
predict_naive_bayes('lottery sale')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  prod_spams = np.long(np.prod(spams)*num_spam)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  prod_hams = np.long(np.prod(hams)*num_ham)


0.9638144992048691

In [18]:
predict_naive_bayes('Hi mom how are you')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  prod_spams = np.long(np.prod(spams)*num_spam)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  prod_hams = np.long(np.prod(hams)*num_ham)


0.12554358867164464