In [76]:
%matplotlib inline
import numpy as np
import pandas as pd
from pprint import pprint
from os import listdir
from os.path import isfile, join
from decimal import *

In [164]:
class NaiveBayes():
    def __init__(self):
        self.words = {}
        self.words["legit"] = {"subj" : {}, "body" : {}}
        self.words["spam"] = {"subj" : {}, "body" : {}}
        self.total_count = 0
        self.count = {"legit" : 0, "spam" : 0}
        
    def __calc_prob(self, cls):
        #total = sum(list(words[cls]["subj"].values())))
        total = Decimal(sum(list(self.words[cls]["body"].values())))
        self.words[cls]["prob"] = {}
        for word, count in self.words[cls]["body"].items():
            self.words[cls]["prob"][word] = Decimal(count) / total
    
        
    def train(self, mails):
        self.total_count = len(mails)
        for mail in mails:
            cls, subj, body = mail
            self.count[cls] += 1
            for w in subj:
                self.words[cls]["subj"][w] = self.words[cls]["subj"].get(w, 0) + 1
            for w in body:
                self.words[cls]["body"][w] = self.words[cls]["body"].get(w, 0) + 1
        self.__calc_prob("legit")
        self.__calc_prob("spam")
        
    def classify(self, subj, body):
        score_legit = Decimal(self.count["legit"]) / Decimal(self.total_count)
        score_spam = Decimal(self.count["spam"]) / Decimal(self.total_count)
        pprint((score_legit, score_spam))
        for word in body:
            score_legit *= self.words["legit"]["prob"].get(word, Decimal(1))
            score_spam *= self.words["spam"]["prob"].get(word, Decimal(1))
        #pprint ((score_legit, score_spam))
        if score_spam > score_legit:
            return "spam"
        return "legit"

In [155]:
def read_mail(filename):
    lines = [line[:-1] for line in open(filename, "r").readlines()]
    subj = lines[0][len("Subject: "):].split(" ")
    if subj and subj[0]:
        subj = [int(i) for i in subj]
    else:
        subj = []
    body = lines[2].split(" ")
    if body and body[0]:
        body = [int(i) for i in body]
    else:
        body = []
    cls = "legit" if "legit" in filename else "spam"
    return (cls, subj, body)

def files_in_block(index):
    mypath = "data/part{}/".format(index)
    onlyfiles = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))]
    return onlyfiles

In [98]:
def f1score(theory, practice):
    tp, fn, fp, tn = 0, 0, 0, 0
    for i in range(len(theory)):
        tp += theory[i] == practice[i] and practice[i] == 1
        tn += theory[i] == practice[i] and practice[i] == 0
        fn = sum(practice) - tp
        fp = (len(practice) - sum(practice)) - tn
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return (2 * precision * recall / (precision + recall), fn, fp)

In [165]:
all_files = []
for i in range(1, 10):
    for f in files_in_block(i):
        all_files.append(f)
mails = [read_mail(m) for m in all_files]
classifier = NaiveBayes()
classifier.train(mails)

In [162]:
pprint(classifier.count)

{'legit': 549, 'spam': 432}


In [163]:
classifier.total_count

981

In [169]:
mail = [read_mail(f) for f in files_in_block(10)] 
ans = list(map(lambda x: 1 if x is "spam" else 0, [i[0] for i in mail]))
subj = [i[1] for i in mail]
body = [i[2] for i in mail]
data = list(zip(subj, body))

In [170]:
ans[0]

1

In [171]:
classifier.classify(data[0][0], data[0][1])

(Decimal('0.5596330275229357798165137615'),
 Decimal('0.4403669724770642201834862385'))


'legit'

In [160]:
predicted = [classifier.classify(i[0], i[1]) for i in data]
binary = list(map(lambda x: 1 if x is "spam" else 0 , predicted))

In [161]:
f1score(ans, binary)

(0.3464566929133858, 57, 26)