In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
from pprint import pprint
from os import listdir
from os.path import isfile, join
from decimal import *

In [ ]:
class NaiveBayes():
    """Naive Bayes spam/legit classifier over tokenised mails.

    A mail is a ``(cls, subj, body)`` tuple where ``cls`` is ``"legit"`` or
    ``"spam"`` and ``subj`` / ``body`` are iterables of hashable tokens.

    State:
        words: ``words[cls]["subj"]`` and ``words[cls]["body"]`` map a token
            to its occurrence count; ``words[cls]["prob"]`` (present only
            after training) maps a token to its relative frequency as a
            ``Decimal``.
        count: number of training mails seen per class.
        total_count: number of training mails seen overall.
    """

    def __init__(self):
        self.words = {}
        self.words["legit"] = {"subj" : {}, "body" : {}}
        self.words["spam"] = {"subj" : {}, "body" : {}}
        self.total_count = 0
        self.count = {"legit" : 0, "spam" : 0}

    def __store_prob(self, cls, counts):
        """Normalise ``counts`` to relative frequencies in ``words[cls]["prob"]``."""
        total = Decimal(sum(counts.values()))
        # An empty ``counts`` yields an empty table, so ``total == 0`` never
        # takes part in a division.
        self.words[cls]["prob"] = {
            word: Decimal(count) / total for word, count in counts.items()
        }

    def __calc_prob_body_only(self, cls):
        """Build the probability table of ``cls`` from body tokens only."""
        self.__store_prob(cls, self.words[cls]["body"])

    def __calc_prob_body_subj(self, cls):
        """Build the probability table of ``cls`` from subject + body counts."""
        merged = dict(self.words[cls]["subj"])
        for word, count in self.words[cls]["body"].items():
            merged[word] = merged.get(word, 0) + count
        self.__store_prob(cls, merged)

    def __calc_prob_body_subj_weighted(self, cls):
        """Build the subject + body probability table of ``cls``.

        NOTE: no extra weight is applied to subject tokens at present, so
        the result is identical to the unweighted subject + body table.
        """
        self.__calc_prob_body_subj(cls)

    def __add_words(self, mails):
        """Accumulate token counts from ``mails`` and fill cross-class gaps.

        Args:
            mails: sized iterable of ``(cls, subj, body)`` tuples.
        """
        # Accumulate, like ``self.count`` does, so that the class priors stay
        # consistent when a classifier is trained on more than one batch.
        self.total_count += len(mails)
        for cls, subj, body in mails:
            self.count[cls] += 1
            subj_counts = self.words[cls]["subj"]
            for w in subj:
                subj_counts[w] = subj_counts.get(w, 0) + 1
            body_counts = self.words[cls]["body"]
            for w in body:
                body_counts[w] = body_counts.get(w, 0) + 1
        # A token seen in only one class gets a pseudo-count of 1 in the
        # other, so both classes share a vocabulary and no probability is 0.
        for part in ("subj", "body"):
            legit_counts = self.words["legit"][part]
            spam_counts = self.words["spam"][part]
            for w in spam_counts:
                legit_counts.setdefault(w, 1)
            for w in legit_counts:
                spam_counts.setdefault(w, 1)

    def train_body_subj_weighted(self, mails):
        """Train on subject and body tokens (weighted variant)."""
        self.__add_words(mails)
        self.__calc_prob_body_subj_weighted("legit")
        self.__calc_prob_body_subj_weighted("spam")

    def train_body_subj(self, mails):
        """Train on subject and body tokens."""
        self.__add_words(mails)
        self.__calc_prob_body_subj("legit")
        self.__calc_prob_body_subj("spam")

    def train(self, mails):
        """Train on body tokens only."""
        self.__add_words(mails)
        self.__calc_prob_body_only("legit")
        self.__calc_prob_body_only("spam")

    def classify(self, subj, body):
        """Return ``"spam"`` or ``"legit"`` for a tokenised mail.

        The classifier must have been trained first. ``subj`` is accepted for
        interface symmetry but is not used: only ``body`` tokens contribute
        to the score. Tokens missing from either probability table are
        skipped, and a tie is resolved as ``"legit"``.
        """
        total = Decimal(self.total_count)
        score_legit = Decimal(self.count["legit"]) / total
        score_spam = Decimal(self.count["spam"]) / total
        legit_prob = self.words["legit"]["prob"]
        spam_prob = self.words["spam"]["prob"]
        for word in body:
            if word in legit_prob and word in spam_prob:
                score_legit *= legit_prob[word]
                score_spam *= spam_prob[word]
        if score_spam > score_legit:
            return "spam"
        return "legit"

In [ ]:
def read_mail(filename):
    """Parse one encoded mail file into a ``(cls, subj, body)`` tuple.

    The first line is expected to start with ``"Subject: "`` followed by
    whitespace-separated integer tokens; the third line holds the body
    tokens in the same format. Either token list may be empty.

    Args:
        filename: path of the mail file. The class is ``"legit"`` when the
            path contains the substring ``"legit"``, otherwise ``"spam"``.

    Returns:
        ``(cls, subj, body)`` where ``subj`` and ``body`` are lists of ints.

    Raises:
        IndexError: if the file has fewer than three lines.
        ValueError: if a token is not an integer.
    """
    # ``with`` closes the handle; stripping only "\n" (rather than blindly
    # dropping the last character) keeps the final token intact when the
    # file does not end with a newline.
    with open(filename, "r") as mail_file:
        lines = [line.rstrip("\n") for line in mail_file]
    # ``split()`` without arguments tolerates repeated, leading and trailing
    # whitespace and yields [] for an empty line.
    subj = [int(token) for token in lines[0][len("Subject: "):].split()]
    body = [int(token) for token in lines[2].split()]
    cls = "legit" if "legit" in filename else "spam"
    return (cls, subj, body)

def files_in_block(index):
    """Return the paths of the regular files directly inside ``data/part<index>/``.

    Sub-directories are skipped; the order follows ``listdir``.
    """
    block_dir = "data/part{}/".format(index)
    paths = []
    for entry in listdir(block_dir):
        candidate = join(block_dir, entry)
        if isfile(candidate):
            paths.append(candidate)
    return paths

In [ ]:
def f1score(theory, practice):
    """Compute the F1 score of two parallel sequences of 0/1 labels.

    Args:
        theory: sequence of 0/1 labels.
        practice: sequence of 0/1 labels, parallel to ``theory``.

    Returns:
        ``(f1, fn, fp)`` where, for 0/1 labels,
        ``fn`` is the number of positions with ``practice == 1`` and
        ``theory != 1`` and ``fp`` is the number of positions with
        ``practice == 0`` and ``theory != 0``. The names follow the reading
        in which ``practice`` is the reference labelling; F1 itself is
        symmetric in the two arguments. ``f1`` is ``0.0`` when there are no
        true positives (including empty input), where precision/recall
        would otherwise be undefined.
    """
    tp = sum(1 for t, p in zip(theory, practice) if t == p == 1)
    tn = sum(1 for t, p in zip(theory, practice) if t == p == 0)
    # Loop-invariant totals are computed once instead of on every iteration.
    positives = sum(practice)
    fn = positives - tp
    fp = (len(practice) - positives) - tn
    if tp == 0:
        # Precision and recall are both 0 or undefined; report F1 as 0
        # rather than dividing by zero.
        return (0.0, fn, fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return (2 * precision * recall / (precision + recall), fn, fp)

In [ ]:
def calc_score(train_range, test, train_function_name):
    """Train a fresh classifier on some blocks and score it on another.

    Args:
        train_range: iterable of block indices whose mails form the
            training set.
        test: index of the block used for evaluation.
        train_function_name: name of the ``NaiveBayes`` training method to
            call (looked up with ``getattr``), e.g. ``"train"``.

    Returns:
        The tuple produced by ``f1score(true_y, predicted_y)``:
        ``(f1, good_as_spam, spam_as_good)``.
    """
    train_mails = []
    for block in train_range:
        train_mails.extend(read_mail(path) for path in files_in_block(block))

    classifier = NaiveBayes()
    getattr(classifier, train_function_name)(train_mails)

    test_mails = [read_mail(path) for path in files_in_block(test)]
    # Compare labels with ``==``: ``is`` tests object identity and only
    # happened to work for interned string literals.
    true_y = [1 if cls == "spam" else 0 for cls, _, _ in test_mails]
    predicted_y = [
        1 if classifier.classify(subj, body) == "spam" else 0
        for _, subj, body in test_mails
    ]
    return f1score(true_y, predicted_y)  # f1, good_as_spam, spam_as_good

In [ ]:
def f1score_list(train_function_name):
    """Run leave-one-block-out evaluation over blocks 1..10.

    Each block in turn is held out for testing while the other nine are
    used for training; the ``calc_score`` result of every run is collected
    in block order.
    """
    scores = []
    for held_out in range(1, 11):
        training_blocks = [block for block in range(1, 11) if block != held_out]
        scores.append(calc_score(training_blocks, held_out, train_function_name))
    return scores

def f1score_average(scores):
    """Return the component-wise mean of a non-empty list of score tuples.

    The number of components is taken from the first tuple; the result is
    a list of floats of that length.
    """
    run_count = float(len(scores))
    width = len(scores[0])
    totals = [0] * width
    for row in scores:
        for column in range(width):
            totals[column] = totals[column] + row[column]
    return [float(total) / run_count for total in totals]
    

In [ ]:
# Per-fold score tuples for the classifier trained with ``train``
# (body tokens only); the bare name displays the list in the notebook.
li = f1score_list("train")
li

In [ ]:
# Component-wise mean of the per-fold score tuples currently held in ``li``.
f1score_average(li)

In [ ]:
# Per-fold score tuples for the classifier trained with ``train_body_subj``
# (subject + body tokens); the bare name displays the list in the notebook.
li = f1score_list("train_body_subj")
li

In [ ]:
# Component-wise mean of the per-fold score tuples currently held in ``li``.
f1score_average(li)

In [ ]:
# Per-fold score tuples for the classifier trained with
# ``train_body_subj_weighted``; the bare name displays the list in the notebook.
li = f1score_list("train_body_subj_weighted")
li

In [ ]:
# Component-wise mean of the per-fold score tuples currently held in ``li``.
f1score_average(li)