In [12]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score 

# Preprocessing phase
def make_Dictionary(root_dir, vocab_size=1000):
    """Build a vocabulary of the most frequent words found in a mail folder.

    Every entry of ``root_dir`` is opened as a text file and each line is
    split on whitespace.  Tokens that are not purely alphabetic, as well as
    single-character tokens, are dropped before the ranking is made.

    Args:
        root_dir: Directory whose entries are the e-mail text files.
        vocab_size: Maximum number of words to keep.  Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, containing at most ``vocab_size`` entries.
    """
    word_counts = Counter()

    # os.listdir returns entries in arbitrary order.  Counter.most_common
    # orders equally frequent words by first encounter, so sorting the file
    # names makes the tie-break at the cutoff reproducible between runs.
    for name in sorted(os.listdir(root_dir)):
        with open(os.path.join(root_dir, name)) as mail:
            for line in mail:
                # Count while reading instead of first collecting every
                # token of every mail in one large list.
                word_counts.update(line.split())

    # Keep alphabetic words of at least two characters; building a new
    # Counter from the filtered items preserves first-encounter order.
    kept = Counter({
        word: count
        for word, count in word_counts.items()
        if word.isalpha() and len(word) > 1
    })
    return kept.most_common(vocab_size)

def extract_features(mail_dir, dictionary):
    """Turn every mail in a folder into a word-count feature vector.

    Only the third line of each file (line index 2) is read for features;
    presumably that is where the message body sits in this corpus -- confirm
    against the data set.  A mail is labelled spam (1) when its file name
    starts with ``"spmsg"`` and ham (0) otherwise.

    Args:
        mail_dir: Directory whose entries are the e-mail text files.
        dictionary: Sequence of ``(word, count)`` entries such as the list
            returned by ``make_Dictionary``.  The position of an entry is the
            column index of its word; words are expected to be unique.

    Returns:
        A tuple ``(features_matrix, train_labels)``.  ``features_matrix`` is
        a float array with one row per file (rows follow the sorted file
        names) and ``train_labels`` is a float array of 0/1 labels in the
        same row order.
    """
    # Sorted so the row order does not depend on os.listdir's arbitrary order.
    files = [os.path.join(mail_dir, name)
             for name in sorted(os.listdir(mail_dir))]

    # The matrix has always been 1000 columns wide; keep that minimum so
    # existing callers see the same shape, but grow it when the dictionary is
    # larger instead of failing with an IndexError.
    n_features = max(1000, len(dictionary))
    features_matrix = np.zeros((len(files), n_features))
    train_labels = np.zeros(len(files))

    # word -> column lookup built once, replacing a linear scan of the
    # dictionary for every word of every mail.
    column_of = {entry[0]: col for col, entry in enumerate(dictionary)}

    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_no, line in enumerate(mail):
                if line_no == 2:
                    for word, occurrences in Counter(line.split()).items():
                        col = column_of.get(word)
                        if col is not None:
                            features_matrix[doc_id, col] = occurrences
                    # Nothing after the third line is used.
                    break

        # basename works with either path separator; splitting on '/' left
        # the directory attached on Windows and mislabelled every spam mail.
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1

    return features_matrix, train_labels

# Script entry point: build the vocabulary from the training folder, turn
# both mail folders into feature matrices, fit a Naive Bayes model and print
# its accuracy on the test folder.
if __name__ == '__main__':
    # Folders holding the training and test mails, relative to the working
    # directory.
    TRAIN_DIR = './train-mails'
    TEST_DIR = './test-mails'
    
    print('Start making word dictionary from trainning folder.....')
    dictionary = make_Dictionary(TRAIN_DIR)
    # The slice shows entries 1 to 4; the entry at index 0 is not printed.
    print('Dictionary sample: ', dictionary[1:5])
    
    print('reading and processing emails from file.')
    # The same dictionary is used for both folders, so a given column refers
    # to the same word in the training and test matrices.
    features_matrix, labels = extract_features(TRAIN_DIR,dictionary)
    test_feature_matrix, test_labels = extract_features(TEST_DIR,dictionary)

    # Classifier under test; swap in one of the commented alternatives below
    # to compare the three Naive Bayes variants.
    model = GaussianNB()
    #model = MultinomialNB()
    #model = BernoulliNB()

    print('Training model.')
    # Fit the model on the training features and labels.
    model.fit(features_matrix, labels)

    # Predict a label for every test mail.
    predicted_labels = model.predict(test_feature_matrix)

    print('FINISHED classifying. accuracy score : ')
    print(accuracy_score(test_labels, predicted_labels))
    
    # Accuracy scores noted for each classifier:
    # GaussianNB, accuracy score = 0.896153846154
    # MultinomialNB, accuracy_score = 0.946153846154
    # BernoulliNB, accuracy_score = 0.746153846154

Start making word dictionary from trainning folder.....
Dictionary sample:  [('address', 1299), ('report', 1217), ('mail', 1133), ('language', 1099)]
reading and processing emails from file.
Training model.
FINISHED classifying. accuracy score : 
0.746153846154
