In [14]:
def parse_email(txt):
    import re 
    pattern = r"[\w\-'\$]+"
    return re.findall(pattern, txt)


In [68]:
def add_email(email, dataholder):
    words = parse_email(email)
    for word in words:
        dataholder[word] = dataholder.get(word, 0) + 1
    return

In [17]:
def get_probs(spamCounts, hamCounts):
    spamTotal = sum(spamCounts.values())
    hamTotal = sum(hamCounts.values())
    spam_probs = {word: spamCounts[word]/spamTotal for word in spamCounts}
    ham_probs = {word: hamCounts[word]/hamTotal for word in hamCounts}
    return spam_probs, ham_probs

In [207]:
def get_spam_filter(spamfile):
    import csv
    spam = {} # spam[word] = # of times word appears in spam email
    ham = {} # ham[word] = # of times word appears in non-spam email
    with open(spamfile, newline='') as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            txt = row[0]
            add_email(txt, spam) if row[1] == '1' else add_email(txt, ham)
    spam_probs, ham_probs = get_probs(spam, ham)
    return spam_probs, ham_probs

In [217]:
def classify_email(emailtxt, spam_probs, ham_probs):
    import math
    spamTotal, hamTotal = sum(spam_probs.values()), sum(ham_probs.values()) # for smoothing
    vocab_count = len(spam_probs) + len(ham_probs)
    words = set(parse_email(emailtxt))
    log_prob_spam = 0
    log_prob_ham = 0
    for word in words:
        if word in spam_probs:
            log_prob_spam += math.log(spam_probs[word])
            # prob_spam *= spam_probs[word]
        else:
            # laplace smoothing 
            log_prob_spam += math.log((1/(spamTotal + vocab_count + 1)))
            # prob_spam *= (1/(spamTotal + vocab_count + 1))

        if word in ham_probs:
            log_prob_ham += math.log(1.2*ham_probs[word])
            # prob_ham *= ham_probs[word]
        else:
            # laplace smoothing 
            # prob_ham *= (1/(hamTotal + vocab_count + 1))
            log_prob_ham += math.log(1.2*(1/(hamTotal + vocab_count + 1)))
            
    print(log_prob_spam, log_prob_ham)
    return 1 if log_prob_spam > log_prob_ham + math.log(1000) else 0
    # return 1 if prob_spam > 9 * prob_ham else 0



In [209]:
spam_probs, ham_probs = get_spam_filter('emails.csv')

In [218]:
import csv
accuracy = 0
total = 0
negatives = 0
false_positives = 0
bad = []
with open("emails.csv", newline='') as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        if row[1] == '':
            break
        txt, y = row[0], int(row[1])
        y_pred = classify_email(txt, spam_probs=spam_probs, ham_probs=ham_probs)
        if y == y_pred:
            accuracy += 1
        if y == 0:
            negatives += 1
        if y == 0 and y_pred == 1:
            false_positives += 1
            bad.append(row[0])
        total += 1
        
print(f'number of false positives = {false_positives}')
print(f'number of ground negatives = {negatives}')
print(f'total is {total}')
print(f'accuracy = {accuracy/total * 100}')
print(f'false positives = {false_positives/negatives * 100}')

-1016.6007330151376 -1123.3204273855017
-714.5287150461209 -640.2705031348845
-503.6418623235985 -512.9499140960498
-417.138654912208 -417.29825388615484
-273.40732127023307 -277.51820934783063
-622.9233796285798 -624.2477287246909
-5458.922583846778 -5822.424215583738
-474.9021880768454 -519.3615281632045
-583.0866292849018 -553.991968063309
-475.30765318495355 -520.4362575246665
-739.8017325358893 -741.8763934357128
-478.23902285540476 -522.3039911128155
-701.5609738432862 -726.4233923280661
-4969.901019385923 -5259.561245548633
-576.392302392657 -592.7763423270643
-799.4835413313975 -832.7382233572844
-240.34496442378472 -247.98207056252247
-633.9703355116936 -719.8994832945132
-72.05599080003034 -73.53094160059327
-1973.4133890777625 -1974.5011123110319
-402.1011814318845 -437.68028498121896
-675.339401662885 -669.3029045528715
-640.8253434362083 -718.5359421809989
-309.2172952750977 -321.8749679476477
-399.08164279253236 -417.7083051245638
-958.1898325628126 -993.7862827988635
-74

In [187]:
txt = bad[2]

In [188]:
txt

'Subject: folks ,  attached is a conservative ( and fairly rough ) estimate of the size of the  petrochemicals and refining market that is potentially exposed to prolonged  drought in southern texas which could result in extremely low riverflows and  possible curtailed production . the total annual revenue generated by these  assets is no less than $ 20 b and could be substantially higher as the  estimated capacity on some of these facilties is likely understated and other  facilties not yet identified are likely to be vulnerable .  note that this data does not include any facilities in the industrial  complexes from houston northward and eastward as they are much less likely to  experience such a drought - induced interruption . the only facilties  identified thus far lie on or near the following rivers : brazos , colorado ,  navidad , guadalupe , and nueces .  please let me know if you have any questions / comments as we work to determine  whether or not a low riverflow insurance pro

In [189]:
classify_email(txt, spam_probs=spam_probs, ham_probs=ham_probs)

-968.051342776972 -1006.0693304731841


1