In [39]:
import os
import re
import email
import numpy as np
import pandas as pd
import bs4
import nltk
from collections import Counter
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from math import log

In [28]:
notspam_folder = "./notspam/"
spam_folder = "./spam/"
unknown_files_folder = "./unknown/"


def get_file_names(path):
    filenames = []
    for f in os.listdir(path):
        if not f.endswith('.ipynb_checkpoints'):
            filenames.append(f)         
    return filenames

notspam_files = get_file_names(notspam_folder)
spam_files = get_file_names(spam_folder)
unknown_files = get_file_names(unknown_files_folder)

In [29]:
class NBClassifier(object):
    def __init__(self, alpha=10**(-7)):
        self.alpha = alpha 
        self.classes = defaultdict(lambda:.0)
        self.prob = defaultdict(lambda:.0)
        
    def fit(self, data_set):
        for feats, label in data_set:
            self.classes[label] += 1
            for feat in feats:
                self.prob[label, feat] += 1
                
        for label, feat in self.prob:   
            self.prob[label, feat] /= self.classes[label]
        for cl in self.classes:
            self.classes[cl] /= len(data_set)
        return self
    
    def get_class(self, feats):
        def get_log_prob(cl):
            return -log(self.classes[cl]) + \
                   sum(-log(self.prob.get((cl,feat), self.alpha)) for feat in feats)
        return min(self.classes.keys(), key=get_log_prob)
    
    def predict(self, data_to_predict):
        return [self.get_class(words.split()) for words in data_to_predict]

In [30]:
def get_messages_from_files(folder):
    messages = []
    for filename in os.listdir(folder):
        if not filename.endswith('.ipynb_checkpoints'):
            with open(folder + filename) as mail:
                msg = email.message_from_file(mail)
                messages.append(clean_msg(msg))
    return messages

def clean_msg(msg):
    m = msg.get_payload()
    if msg.is_multipart():
        m = ''
        for _ in [k.get_payload() for k in msg.walk() if k.get_content_type() == 'text/html']:
            m += _
    return bs4.BeautifulSoup(m , 'lxml', exclude_encodings=["ISO-8859-7"]).get_text()

In [31]:
# def get_tokenized_message(message):
#     return ' '.join(nltk.word_tokenize(message.lower()))

In [32]:
def get_filtered_message(message):
    tm = nltk.word_tokenize(message.lower())
    
    def stopwords_filter(word):
        return word not in nltk.corpus.stopwords.words('english')
    
    message = ' '.join(filter(stopwords_filter, tm))
    return message

In [33]:
def get_framed_train_data():
    spam_words = [(get_filtered_message(m), 1) for m in get_messages_from_files('./spam/')]
    notspam_words = [(get_filtered_message(m), 0) for m in get_messages_from_files('./notspam/')]
    
    return pd.DataFrame(spam_words+notspam_words, columns=['words','label'])

In [34]:
train_df = get_framed_train_data()
words_train, words_test, label_train, label_test = train_test_split(train_df.words, train_df.label, test_size=0.3)

In [35]:
data_to_fit = [(words.split(), label) for words, label in zip(words_train, label_train)]

In [36]:
NBCpredicted_test = NBClassifier().fit(data_to_fit).predict(words_test)

In [40]:
accuracy_score(label_test,NBCpredicted_test)

0.95416666666666672

In [41]:
unknown_words = [get_filtered_message(m) for  m in get_messages_from_files('./unknown/')]
NBCpredicted = NBClassifier().fit(data_to_fit).predict(unknown_words)

KeyboardInterrupt: 

In [None]:
pd.DataFrame({'filename': unknown_files,'label': NBCpredicted}).to_csv('bayes_result.csv', index=False)