# CMSC 197 Section 1  - Problem Set 2 - HW 4<br>## Aren Deza, 2019-50022
# Naive Bayes Spam Filter

In [1]:
# Packages used for Data Processing
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import email

# Packages used for File Management
import os

# Packages for text replacement and vectorizing
import re
from sklearn.feature_extraction.text import CountVectorizer as cv

### Importing the data

In [3]:
# Importing the data and placing it in a dataframe
filelist = os.listdir('Data/trec06p-cs280/data')
labels = pd.read_csv('Data/trec06p-cs280/labels', sep = ' ')

# creating a list of stop words from the text file
stop_words = list(open('stop_words.txt', 'r').read().split())

In [4]:
# Making the dataframe easier to use
df = pd.DataFrame({'ham':['ham'],'../data/000/000':['../data/000/000']})
df = pd.concat([df,labels])
df.reset_index(inplace = True)

df.rename(columns = {'index':'content','ham':'type','../data/000/000':'location'}, inplace = True)
df['content'] = ''
df['location'] = df['location'].str.replace('^..', '', regex = True)

df.head()

Unnamed: 0,content,type,location
0,,ham,/data/000/000
1,,spam,/data/000/001
2,,spam,/data/000/002
3,,ham,/data/000/003
4,,spam,/data/000/004


### Extracting the bodies of the emails

In [7]:
def email_extractor(df, stopwords_yes):
    for i in df.index:
        # iterate through each location and then extract the email body
        # note to self, do a more elegant version of this next time.
        with open("Data/trec06p-cs280" + df['location'][i], "rb") as f:
            msg = f.read().decode('latin-1')
            msg = email.message_from_string(msg)
        # contingency if msg is multipart
        if msg.is_multipart():
            for part in msg.walk():
                payload = part.get_payload(decode=True)
                # contingency in case of no payload, just decode
                if str(type(payload)) == "<class 'NoneType'>":
                    msg = msg.get_payload()[0].get_payload();
                    break
                msg = payload.decode()
        else:
            payload = msg.get_payload(decode=True)
            if str(type(payload)) == "<class 'NoneType'>":
                msg = msg.get_payload()[0].get_payload();
                break
            msg = payload.decode('latin-1')

        # in case of list, convert to single uninterrupted string
        if isinstance(msg,list):
            temp = ""
            for j in msg:
                payload = j.get_payload(decode='True')
                if str(type(payload)) == "<class 'NoneType'>":
                    j = j.get_payload()[0].get_payload();
                    temp = temp + j + " "
                    continue
                j = payload.decode('latin-1')
                temp = temp + j + " "
            msg = temp

        # Lowercase + Removing special chars, stopwords, extra whitespace
        if msg != '':
            msg = msg.lower()
            msg = re.sub(r"[^a-z\n ]", "", msg)
            msg = re.sub(r'\n' , ' ', msg)
            if stopwords_yes:
                li = [word for word in msg.split() if word not in stop_words]
                msg = " ".join(li)

        # adds msg
        df['content'][i] = msg
    return df

In [None]:
df2 = df.copy()
df = email_extractor(df, True)
df

### Separating into Testing and Training Sets

In [6]:
# separating df into a training and testing dataset.
# Knew it was 213000 because there are 300 emails per file & we cutoff at 70.
training = df.iloc[:21300,]
testing = df.iloc[21300:]

### Creating a feature matrix

In [7]:
# Using countvectorizer to create feature matrices for spam and ham
# type will be dict
spam_fm = cv().fit(training[training['type']=='spam']['content'].values)
ham_fm = cv().fit(training[training['type']=='ham']['content'].values)

### Computing the Prior Probabilities

In [8]:
hamcount = training[(training['type'] == 'ham' )].count()[0]
spamcount = training[(training['type'] == 'spam' )].count()[0]
print("\nnumber of messages that are ham: ", hamcount, "\nnumber of messages that are spam:", spamcount)

h_prior = 7523/21300
s_prior = 13777/21300
s_log_prior = np.log(s_prior)
h_log_prior = np.log(h_prior)
print("---\nP(c = ham) =", h_prior, "\nP(c = spam) =", s_prior)


number of messages that are ham:  7523 
number of messages that are spam: 13777
---
P(c = ham) = 0.3531924882629108 
P(c = spam) = 0.6468075117370892


### Computing the likelihood of each word (with Laplace smoothing)

In [9]:
s_fm = pd.DataFrame(spam_fm.vocabulary_.copy().items(), columns=['word', 's-count'])
h_fm = pd.DataFrame(ham_fm.vocabulary_.copy().items(), columns=['word', 'h-count'])
fm = pd.merge(h_fm, s_fm, on = "word", how = "inner")

# This is the alpha value we can adjust for laplace smoothing
alpha = 1

# each word has likelihood for spam and ham
# likelihood = (number of times word in s/h + alpha) / (words in spam) + alpha(number of words)
fm['s%'] = (fm['s-count'] + alpha)/(len(s_fm) + (alpha * len(fm)))
fm['h%'] = (fm['h-count'] + alpha)/(len(h_fm) + (alpha * len(fm)))

In [10]:
fm

Unnamed: 0,word,h-count,s-count,s%,h%
0,mailing,49103,25697,0.498816,0.520467
1,list,47137,25144,0.488082,0.499629
2,weeks,80497,39624,0.769149,0.853221
3,ago,6346,1143,0.022206,0.067274
4,running,66506,32550,0.631837,0.704927
...,...,...,...,...,...
10601,employers,27986,15506,0.301002,0.296642
10602,downgrade,25746,14364,0.278835,0.272900
10603,barnett,12300,3459,0.067161,0.130382
10604,sore,70664,34531,0.670290,0.748998


### Classifying Emails in the testing set

In [11]:
from collections import Counter

In [12]:
def NaiveBayes(email):
    is_spam = s_log_prior
    is_ham = h_log_prior
    email = Counter(email.split())
    for word in email:
        if word in fm['word'].values:
            is_spam += np.multiply(np.log((fm[fm['word'] == word]['s%'].values[0])), email.get(word))
            is_ham += np.multiply(np.log((fm[fm['word'] == word]['h%'].values[0])), email.get(word))
    is_spam = np.exp(is_spam)
    is_ham = np.exp(is_ham)
    if is_spam > is_ham:
        return 'spam'
    else:
        return 'ham'

In [13]:
# Testing the function we've created using emails selected from the training set.

print(NaiveBayes(df['content'][0]))
print(NaiveBayes(df['content'][68]))

ham
spam


In [14]:
# Proceeding to testing using the testing set.
# We only use 600 emails because I do not currently have a faster method of doing this.
test = testing.iloc[:600]
test = test.copy()
test['prediction'] = ''
len(test)

600

In [15]:
test['prediction'] = test['content'].apply(NaiveBayes)

test

Unnamed: 0,content,type,location,prediction
21300,hesitantly derive perverse satisfaction clodho...,spam,/data/071/000,spam
21301,things perform experiment display will remain ...,ham,/data/071/001,ham
21302,best offer month viggra ci ialis vaiium xa naa...,spam,/data/071/002,ham
21303,de ar wne cr doesnt matter ow real st mmed ia ...,spam,/data/071/003,ham
21304,special offer adobe video collection adobe pre...,spam,/data/071/004,ham
...,...,...,...,...
21895,html head meta httpequivcontenttype contenttex...,spam,/data/072/295,ham
21896,ra httpwwwbasioscuacom whistling voices releas...,spam,/data/072/296,ham
21897,bwsekfkb bpqqib httpiknowyoucoukdeai bldkb jio...,spam,/data/072/297,spam
21898,pm wrote question set mouse input device would...,ham,/data/072/298,ham


### Performance Evaluation

In [16]:
test['type'][21300]

'spam'

In [17]:
def performance_eval(test):
    FP = 0
    FN = 0
    TP = 0
    TN = 0
    for i in test.index:
        # if the prediction does not match type
        if test['type'][i] != test['prediction'][i]:
            # misclassified spam email
            if test['type'][i] == 'spam':
                FN += 1
            # misclassified ham email
            else: 
                FP += 1
        # if the prediction does match type
        else:
            # correctly classified spam
            if test['type'][i] == 'spam':
                TP += 1
            # correctly classified ham
            else: 
                TN += 1           
    Acc = (TN + TP)/(TN + TP + FN + FP)
    r = TP/(TP + FN)
    P = TP/(TP + FP)
    print("\nFP:", FP, "\nFN:", FN, "\nTP:", TP, "\nTN:", TN, "\n-----\nAcc:", Acc, "\nr:  ", r, "\nP:  ", P)
    
    return [Acc, r, P]

wst = performance_eval(test)


FP: 17 
FN: 272 
TP: 119 
TN: 192 
-----
Acc: 0.5183333333333333 
r:   0.30434782608695654 
P:   0.875


# Results and Discussion
This is just the experiments. See the pdf for the full discussion.

In [18]:
df_nostop = email_extractor(df2, False)
df_nostop

Unnamed: 0,content,type,location
0,the mailing list i queried about a few weeks a...,ham,/data/000/000
1,...,spam,/data/000/001
2,academic qualifications available from prestig...,spam,/data/000/002
3,greetings all this is to verify your subscrip...,ham,/data/000/003
4,try chauncey may conferred the luscious not co...,spam,/data/000/004
...,...,...,...
37817,great news expec ted infinex ventures inc inf...,spam,/data/126/017
37818,the oil sector is going crazy this is our week...,spam,/data/126/018
37819,httpvdtobjdocscaninfo suffering from pain dep...,spam,/data/126/019
37820,u n i v e r s i t y d i p l o m a s do you w...,spam,/data/126/020


In [19]:
training = df_nostop.iloc[:21300,]
testing = df_nostop.iloc[21300:]

spam_fm = cv().fit(training[training['type']=='spam']['content'].values)
ham_fm = cv().fit(training[training['type']=='ham']['content'].values)

hamcount = training[(training['type'] == 'ham' )].count()[0]
spamcount = training[(training['type'] == 'spam' )].count()[0]

h_prior = 7523/21300
s_prior = 13777/21300
s_log_prior = np.log(s_prior)
h_log_prior = np.log(h_prior)

s_fm = pd.DataFrame(spam_fm.vocabulary_.copy().items(), columns=['word', 's-count'])
h_fm = pd.DataFrame(ham_fm.vocabulary_.copy().items(), columns=['word', 'h-count'])
fm = pd.merge(h_fm, s_fm, on = "word", how = "inner")

alpha = 1

fm['s%'] = (fm['s-count'] + alpha)/(len(s_fm) + (alpha * len(fm)))
fm['h%'] = (fm['h-count'] + alpha)/(len(h_fm) + (alpha * len(fm)))

test2 = testing.iloc[:600]
test2 = test.copy()
test2['prediction'] = ''

In [20]:
test2['prediction'] = test2['content'].apply(NaiveBayes)

test2

Unnamed: 0,content,type,location,prediction
21300,hesitantly derive perverse satisfaction clodho...,spam,/data/071/000,ham
21301,things perform experiment display will remain ...,ham,/data/071/001,ham
21302,best offer month viggra ci ialis vaiium xa naa...,spam,/data/071/002,ham
21303,de ar wne cr doesnt matter ow real st mmed ia ...,spam,/data/071/003,ham
21304,special offer adobe video collection adobe pre...,spam,/data/071/004,ham
...,...,...,...,...
21895,html head meta httpequivcontenttype contenttex...,spam,/data/072/295,ham
21896,ra httpwwwbasioscuacom whistling voices releas...,spam,/data/072/296,ham
21897,bwsekfkb bpqqib httpiknowyoucoukdeai bldkb jio...,spam,/data/072/297,spam
21898,pm wrote question set mouse input device would...,ham,/data/072/298,ham


In [21]:
wost = performance_eval(test2)


FP: 17 
FN: 274 
TP: 117 
TN: 192 
-----
Acc: 0.515 
r:   0.29923273657289 
P:   0.8731343283582089


In [22]:
val1 = ['removed', 'kept']
val2 = [wst[0], wost[0]]
val3 = [wst[1], wost[1]]
val4 = [wst[2], wost[2]]

results = pd.DataFrame(columns = ['Stopwords','Accuracy','Recall','Precision'])
results['Stopwords'] = ['removed', 'kept']
results['Accuracy'] = [wst[0], wost[0]]
results['Recall'] = [wst[1], wost[1]]
results['Precision'] = [wst[2], wost[2]]

results

Unnamed: 0,Stopwords,Accuracy,Recall,Precision
0,removed,0.518333,0.304348,0.875
1,kept,0.515,0.299233,0.873134


### no. 2

In [23]:
training = df.iloc[:21300,]
testing = df.iloc[21300:]
spam_fm = cv().fit(training[training['type']=='spam']['content'].values)
ham_fm = cv().fit(training[training['type']=='ham']['content'].values)
hamcount = training[(training['type'] == 'ham' )].count()[0]
spamcount = training[(training['type'] == 'spam' )].count()[0]
h_prior = 7523/21300
s_prior = 13777/21300
s_log_prior = np.log(s_prior)
h_log_prior = np.log(h_prior)

s_fm = pd.DataFrame(spam_fm.vocabulary_.copy().items(), columns=['word', 's-count'])
h_fm = pd.DataFrame(ham_fm.vocabulary_.copy().items(), columns=['word', 'h-count'])
fm = pd.merge(h_fm, s_fm, on = "word", how = "inner")

fm['total'] = fm['h-count'] + fm['s-count']
fm = fm[fm['total'] > 100000]

In [24]:
alpha = 1

fm['s%'] = (fm['s-count'] + alpha)/(len(s_fm) + (alpha * len(fm)))
fm['h%'] = (fm['h-count'] + alpha)/(len(h_fm) + (alpha * len(fm)))

test3 = testing.iloc[:600]
test3 = test.copy()
test3['prediction'] = ''

In [25]:
fm

Unnamed: 0,word,h-count,s-count,total,s%,h%
2,weeks,80497,39624,120121,0.919502,0.936873
5,set,68473,33330,101803,0.773449,0.796932
7,server,68393,33295,101688,0.772637,0.796001
14,serious,68366,33283,101649,0.772358,0.795687
20,will,81039,39870,120909,0.925210,0.943181
...,...,...,...,...,...,...
10594,xga,82397,40279,122676,0.934701,0.958986
10596,toolbar,75631,37708,113339,0.875041,0.880240
10598,shabby,68683,33390,102073,0.774841,0.799376
10600,toto,75768,37762,113530,0.876294,0.881835


In [26]:
test3['prediction'] = test3['content'].apply(NaiveBayes)

test3

Unnamed: 0,content,type,location,prediction
21300,hesitantly derive perverse satisfaction clodho...,spam,/data/071/000,spam
21301,things perform experiment display will remain ...,ham,/data/071/001,ham
21302,best offer month viggra ci ialis vaiium xa naa...,spam,/data/071/002,spam
21303,de ar wne cr doesnt matter ow real st mmed ia ...,spam,/data/071/003,spam
21304,special offer adobe video collection adobe pre...,spam,/data/071/004,spam
...,...,...,...,...
21895,html head meta httpequivcontenttype contenttex...,spam,/data/072/295,spam
21896,ra httpwwwbasioscuacom whistling voices releas...,spam,/data/072/296,spam
21897,bwsekfkb bpqqib httpiknowyoucoukdeai bldkb jio...,spam,/data/072/297,spam
21898,pm wrote question set mouse input device would...,ham,/data/072/298,spam


In [27]:
performance_eval(test3)


FP: 194 
FN: 25 
TP: 366 
TN: 15 
-----
Acc: 0.635 
r:   0.9360613810741688 
P:   0.6535714285714286


[0.635, 0.9360613810741688, 0.6535714285714286]

### no. 3

In [28]:
training = df.iloc[:21300,]
testing = df.iloc[21300:]

spam_fm = cv().fit(training[training['type']=='spam']['content'].values)
ham_fm = cv().fit(training[training['type']=='ham']['content'].values)

hamcount = training[(training['type'] == 'ham' )].count()[0]
spamcount = training[(training['type'] == 'spam' )].count()[0]
print("\nnumber of messages that are ham: ", hamcount, "\nnumber of messages that are spam:", spamcount)

h_prior = 7523/21300
s_prior = 13777/21300
s_log_prior = np.log(s_prior)
h_log_prior = np.log(h_prior)

s_fm = pd.DataFrame(spam_fm.vocabulary_.copy().items(), columns=['word', 's-count'])
h_fm = pd.DataFrame(ham_fm.vocabulary_.copy().items(), columns=['word', 'h-count'])
fm = pd.merge(h_fm, s_fm, on = "word", how = "inner")


number of messages that are ham:  7523 
number of messages that are spam: 13777


In [29]:
alpha = 2

fm = pd.merge(h_fm, s_fm, on = "word", how = "inner")
fm['s%'] = (fm['s-count'] + alpha)/(len(s_fm) + (alpha * len(fm)))
fm['h%'] = (fm['h-count'] + alpha)/(len(h_fm) + (alpha * len(fm)))

test = testing.iloc[:600]
test = test.copy()
test['prediction'] = ''
test['prediction'] = test['content'].apply(NaiveBayes)

print('alpha: 2.0')
performance_eval(test)

alpha: 2.0

FP: 5 
FN: 280 
TP: 111 
TN: 204 
-----
Acc: 0.525 
r:   0.28388746803069054 
P:   0.9568965517241379


[0.525, 0.28388746803069054, 0.9568965517241379]

In [30]:
alpha = 0.5

fm = pd.merge(h_fm, s_fm, on = "word", how = "inner")
fm['s%'] = (fm['s-count'] + alpha)/(len(s_fm) + (alpha * len(fm)))
fm['h%'] = (fm['h-count'] + alpha)/(len(h_fm) + (alpha * len(fm)))

test = testing.iloc[:600]
test = test.copy()
test['prediction'] = ''
test['prediction'] = test['content'].apply(NaiveBayes)

print('alpha: 0.5')
performance_eval(test)

alpha: 0.5

FP: 66 
FN: 225 
TP: 166 
TN: 143 
-----
Acc: 0.515 
r:   0.42455242966751916 
P:   0.7155172413793104


[0.515, 0.42455242966751916, 0.7155172413793104]

In [31]:
alpha = 0.1

fm = pd.merge(h_fm, s_fm, on = "word", how = "inner")
fm['s%'] = (fm['s-count'] + alpha)/(len(s_fm) + (alpha * len(fm)))
fm['h%'] = (fm['h-count'] + alpha)/(len(h_fm) + (alpha * len(fm)))

test = testing.iloc[:600]
test = test.copy()
test['prediction'] = ''
test['prediction'] = test['content'].apply(NaiveBayes)

print('alpha: 0.1')
performance_eval(test)

alpha: 0.1

FP: 157 
FN: 145 
TP: 246 
TN: 52 
-----
Acc: 0.49666666666666665 
r:   0.629156010230179 
P:   0.6104218362282878


[0.49666666666666665, 0.629156010230179, 0.6104218362282878]

In [32]:
alpha = 0.005

fm = pd.merge(h_fm, s_fm, on = "word", how = "inner")
fm['s%'] = (fm['s-count'] + alpha)/(len(s_fm) + (alpha * len(fm)))
fm['h%'] = (fm['h-count'] + alpha)/(len(h_fm) + (alpha * len(fm)))

test = testing.iloc[:600]
test = test.copy()
test['prediction'] = ''
test['prediction'] = test['content'].apply(NaiveBayes)

print('alpha: 0.005')
performance_eval(test)

alpha: 0.005

FP: 165 
FN: 120 
TP: 271 
TN: 44 
-----
Acc: 0.525 
r:   0.6930946291560103 
P:   0.6215596330275229


[0.525, 0.6930946291560103, 0.6215596330275229]