In [1]:
def parse_email(txt):
    """Split raw email text into a list of tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore), hyphens, apostrophes and dollar signs. Every other
    character acts as a separator and is dropped.

    Args:
        txt: The email text to tokenize.

    Returns:
        The tokens in order of appearance, duplicates included. An input
        with no token characters gives an empty list.
    """
    import re

    token_re = re.compile(r"[\w\-'\$]+")
    return token_re.findall(txt)


In [2]:
def add_email(email, dataholder):
    """Tally the tokens of one email into a running word-count dict.

    Args:
        email: Raw email text; it is tokenized with parse_email.
        dataholder: Dict mapping token -> count. It is updated in place:
            every occurrence of a token adds 1, and a token that is not
            yet a key starts at 1.

    Returns:
        None. The result is the mutation of ``dataholder``.
    """
    for token in parse_email(email):
        if token in dataholder:
            dataholder[token] += 1
        else:
            dataholder[token] = 1

In [3]:
def get_probs(spamCounts, hamCounts):
    """Turn per-class word counts into per-class relative frequencies.

    Each count is divided by the sum of all counts in the same dict, so
    the values of each returned dict are that word's share of the class.

    Args:
        spamCounts: Dict mapping word -> count for the spam class.
        hamCounts: Dict mapping word -> count for the non-spam class.

    Returns:
        A ``(spam_probs, ham_probs)`` tuple of dicts with the same keys as
        the inputs. An empty input dict gives an empty output dict.

    Raises:
        ZeroDivisionError: If a dict has keys but its counts sum to zero.
    """
    def scale(counts, total):
        # Share of the class total taken by each word.
        return {word: count / total for word, count in counts.items()}

    spam_total = sum(spamCounts.values())
    ham_total = sum(hamCounts.values())
    return scale(spamCounts, spam_total), scale(hamCounts, ham_total)

In [4]:
def get_spam_filter(spamfile):
    """Build per-class word probabilities from a labelled CSV of emails.

    Each row is read as ``text, label``: column 0 is the email text and
    column 1 is its label, where '1' marks spam and '0' marks non-spam
    (ham). Surrounding whitespace on the label is ignored.

    Rows that cannot be assigned to a class are skipped instead of being
    counted as ham: rows with fewer than two columns (csv.reader yields a
    blank line as an empty list) and rows whose label is neither '0' nor
    '1', such as an unlabelled row or a header row.

    Args:
        spamfile: Path of the CSV file to read.

    Returns:
        ``(spam_probs, ham_probs)`` as returned by get_probs: two dicts
        mapping each word seen in that class to its relative frequency.
    """
    import csv

    spam = {}  # spam[word] = number of times word appears in spam emails
    ham = {}   # ham[word] = number of times word appears in non-spam emails
    with open(spamfile, newline='') as csvfile:
        for row in csv.reader(csvfile):
            if len(row) < 2:
                # Blank or truncated line: there is no label to train on.
                continue
            txt, label = row[0], row[1].strip()
            if label == '1':
                add_email(txt, spam)
            elif label == '0':
                add_email(txt, ham)
            # Any other label is not a known class, so the row is ignored.
    return get_probs(spam, ham)

In [21]:
def get_email_logits(emailtxt, spam_probs, ham_probs, ham_weight=1.2):
    """Score an email against the spam and ham word-probability tables.

    The email is tokenized with parse_email and reduced to its distinct
    tokens. For each class the score is the sum, over those tokens, of the
    log of the token's probability in that class, or of a fixed fallback
    value when the token is not in that class's table.

    Args:
        emailtxt: Raw email text.
        spam_probs: Dict mapping word -> probability for the spam class.
        ham_probs: Dict mapping word -> probability for the ham class.
        ham_weight: Factor multiplied into every ham term (seen or unseen)
            before taking the log. An email with k distinct tokens
            therefore gets k * log(ham_weight) added to its ham score.
            Defaults to 1.2, the value that was previously hard-coded.
            Must be positive, since its product is passed to math.log.

    Returns:
        ``(log_prob_spam, log_prob_ham)``. Both are 0 when the email
        contains no tokens.
    """
    import math

    # Fallback for words missing from a class table.
    # NOTE(review): the original comments call this Laplace smoothing, but
    # the totals below are sums of the supplied probability values, not
    # word counts, and vocab_count counts a word present in both tables
    # twice. The arithmetic is kept as-is; confirm whether it is intended.
    spamTotal, hamTotal = sum(spam_probs.values()), sum(ham_probs.values())
    vocab_count = len(spam_probs) + len(ham_probs)

    # These two values do not depend on the word, so compute them once.
    unseen_spam_log = math.log(1 / (spamTotal + vocab_count + 1))
    unseen_ham_log = math.log(ham_weight * (1 / (hamTotal + vocab_count + 1)))

    log_prob_spam = 0
    log_prob_ham = 0
    # Iterate in sorted order: set iteration order for strings can differ
    # between interpreter runs, and float addition is order-sensitive, so
    # an unsorted loop can change the last digits of the scores per run.
    for word in sorted(set(parse_email(emailtxt))):
        if word in spam_probs:
            log_prob_spam += math.log(spam_probs[word])
        else:
            log_prob_spam += unseen_spam_log

        if word in ham_probs:
            log_prob_ham += math.log(ham_weight * ham_probs[word])
        else:
            log_prob_ham += unseen_ham_log

    return log_prob_spam, log_prob_ham

In [25]:
def classify_email(emailtxt, spam_probs, ham_probs, spam_odds_threshold=1000):
    """Label an email as spam (1) or not spam (0).

    The email is scored with get_email_logits. It is labelled spam only
    when its spam score exceeds its ham score by more than
    ``log(spam_odds_threshold)``; a larger threshold makes the filter more
    reluctant to flag an email.

    Args:
        emailtxt: Raw email text.
        spam_probs: Dict mapping word -> probability for the spam class.
        ham_probs: Dict mapping word -> probability for the ham class.
        spam_odds_threshold: Positive number whose natural log is the
            required margin between the two scores. Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        1 if the email is classified as spam, otherwise 0.

    Raises:
        ValueError: If ``spam_odds_threshold`` is not positive (raised by
            math.log).
    """
    import math

    logit_spam, logit_ham = get_email_logits(emailtxt, spam_probs, ham_probs)
    margin = math.log(spam_odds_threshold)
    return 1 if logit_spam > logit_ham + margin else 0


In [26]:
# Build the filter: per-class word probabilities estimated from emails.csv.
spam_probs, ham_probs = get_spam_filter('emails.csv')

In [27]:
import csv
# Score the classifier on the labelled rows of emails.csv.
# NOTE(review): this is the same file the filter was built from, so the
# figures below describe fit on the training data, not held-out accuracy.
accuracy = 0         # running count of correct predictions
total = 0            # rows evaluated
negatives = 0        # rows whose true label is 0 (not spam)
false_positives = 0  # true label 0 but predicted 1
bad = []             # text of every false positive, for later inspection
with open("emails.csv", newline='') as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        # An empty label ends the evaluation.
        if row[1] == '':
            break
        txt, y = row[0], int(row[1])
        y_pred = classify_email(txt, spam_probs=spam_probs, ham_probs=ham_probs)
        total += 1
        accuracy += int(y == y_pred)
        if y != 0:
            continue
        negatives += 1
        if y_pred == 1:
            false_positives += 1
            bad.append(row[0])

print(f'number of false positives = {false_positives}')
print(f'number of ground negatives = {negatives}')
print(f'total is {total}')
# The last two lines are percentages: correct / total and false positives / negatives.
print(f'accuracy = {accuracy/total * 100}')
print(f'false positives = {false_positives/negatives * 100}')

number of false positives = 129
number of ground negatives = 4358
total is 5726
accuracy = 91.40761439049948
false positives = 2.960073428178063


In [28]:
# Take the third email collected in `bad` (the false positives) and show its text.
txt = bad[2]
print(f'the following text was classified as spam when it is really not:\n{txt}')

the following text was classified as spam when it is really not:
Subject: folks ,  attached is a conservative ( and fairly rough ) estimate of the size of the  petrochemicals and refining market that is potentially exposed to prolonged  drought in southern texas which could result in extremely low riverflows and  possible curtailed production . the total annual revenue generated by these  assets is no less than $ 20 b and could be substantially higher as the  estimated capacity on some of these facilties is likely understated and other  facilties not yet identified are likely to be vulnerable .  note that this data does not include any facilities in the industrial  complexes from houston northward and eastward as they are much less likely to  experience such a drought - induced interruption . the only facilties  identified thus far lie on or near the following rivers : brazos , colorado ,  navidad , guadalupe , and nueces .  please let me know if you have any questions / comments as we

In [31]:
# Show the raw spam and ham scores that get_email_logits gives this email.
logit_spam, logit_ham = get_email_logits(txt, spam_probs=spam_probs, ham_probs=ham_probs)
print(f'logit_spam={logit_spam}, \nlogit_ham={logit_ham}')

logit_spam=-968.0513427769715, 
logit_ham=-984.9200298850855


In [29]:
# Classify the same email again; a result of 1 means it is labelled spam.
classify_email(txt, spam_probs=spam_probs, ham_probs=ham_probs)

1