In [None]:
#Naive Bayes Classifier By Daniel McDonough (12/6/18)

import numpy as np
import math
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
import os.path
#p_keep given labels
def P_keep(labels):
    """Return the fraction of entries in ``labels`` that equal 1 (kept).

    ``labels`` is indexed with a boolean mask, so it must be a NumPy array.
    An empty array raises ``ZeroDivisionError``.
    """
    kept_mask = labels == 1
    kept_entries = labels[kept_mask]
    total = len(labels)
    return len(kept_entries) / total

#p_spam given labels
def P_spam(labels):
    """Return the fraction of entries in ``labels`` that equal 0 (spam).

    ``labels`` is indexed with a boolean mask, so it must be a NumPy array.
    An empty array raises ``ZeroDivisionError``.
    """
    total = len(labels)
    spam_entries = labels[labels == 0]
    spam_count = len(spam_entries)
    return spam_count / total

#returns array of means and variances for each feature
def mean_var(training):
    """Compute the mean and variance of every column of ``training``.

    ``training`` is read as ``training[:, j]`` for each ``j`` below
    ``training.shape[1]``, so it must be an array with at least two
    dimensions.

    Returns a ``(means, variances)`` tuple of float arrays with one entry
    per column.
    """
    n_features = training.shape[1]
    # Pull each feature column out once so both statistics share it.
    columns = [training[:, j] for j in range(n_features)]
    feature_means = np.array([np.mean(col) for col in columns], dtype=float)
    feature_vars = np.array([np.var(col) for col in columns], dtype=float)
    return feature_means, feature_vars

#probability of x given y w/ Laplace Smoothing
def p_x_given_y(x, y, k=1, N=2):
    """Laplace-smoothed estimate of P(feature present | class).

    x: NumPy array holding one feature's values for the messages of a
       class; every non-zero entry counts as a message with the feature.
    y: the messages of that class; only ``len(y)`` is used.
    k: smoothing pseudo-count added to the numerator.
    N: multiplier applied to ``k`` in the denominator.

    Returns ``(hits + k) / (len(y) + N * k)``.
    """
    class_size = len(y)
    feature_hits = len(x[x != 0])
    numerator = feature_hits + k
    denominator = class_size + N * k
    return numerator / denominator

def naive_bayes(training, labels, tests):
    """Predict a label for every row of ``tests`` with naive Bayes.

    Parameters:
        training: 2-D NumPy array whose last column is the class label
            (1 = keep, 0 = spam); the preceding columns are the features.
        labels: NumPy array of training labels, used only for the class
            priors (via ``P_keep`` / ``P_spam``).
        tests: iterable of message rows. Only the first ``n`` positions of
            each row are read, where ``n`` is the number of feature columns
            in ``training``; a feature counts as present when it equals 1.

    Returns:
        A list with one int per test row: 0 (spam) or 1 (keep). Ties are
        resolved in favour of 1.
    """
    # Split the training rows by the label in the last column, then drop
    # that column so only features remain.
    class_column = training[:, -1]
    keep = training[class_column == 1][:, :-1]
    spam = training[class_column == 0][:, :-1]
    # shape[1] is valid even when a class has no rows (keep[0] would not be).
    n_features = keep.shape[1]

    # The smoothed likelihoods depend only on the training data, so compute
    # them once instead of once per test message. Logs are used because a
    # product of many probabilities underflows to 0.0.
    log_like_keep = np.array(
        [math.log(p_x_given_y(keep[:, i], keep, 1)) for i in range(n_features)],
        dtype=float,
    )
    log_like_spam = np.array(
        [math.log(p_x_given_y(spam[:, i], spam, 1)) for i in range(n_features)],
        dtype=float,
    )

    p_keep = P_keep(labels)  # prior probability of a kept message
    p_spam = P_spam(labels)  # prior probability of a spam message
    # A class absent from ``labels`` has prior 0; log(0) is treated as -inf.
    log_prior_keep = math.log(p_keep) if p_keep > 0 else -math.inf
    log_prior_spam = math.log(p_spam) if p_spam > 0 else -math.inf

    predlabels = []
    for m in tests:
        # Start every message from the priors; carrying the score over from
        # the previous message would contaminate the prediction.
        present = np.asarray(m)[:n_features] == 1
        score_keep = log_prior_keep + log_like_keep[present].sum()
        score_spam = log_prior_spam + log_like_spam[present].sum()

        # P(M) is the same for both classes, so comparing the unnormalised
        # (log) posteriors is sufficient.
        predlabels.append(0 if score_spam > score_keep else 1)

    return predlabels

def test_Naive_Bayes(data):
    """Evaluate the from-scratch classifier against ``MultinomialNB``.

    ``data`` is a 2-D array whose first row is a header, whose first column
    is an ID and whose last column is the label; the remaining cells must be
    convertible to ``int``. The first 80% of the rows (after the header) are
    used for training and the rest for testing. True labels, predictions and
    the F1 score (positive label 1) of both classifiers are printed.
    """
    # All labels (header removed).
    email_body_labels = data[1:, -1]

    # All rows with header and IDs removed. The label column is still the
    # last column here because naive_bayes() reads it from its training set.
    email_body = data[1:, 1:]

    # Split the data into training and testing sets.
    training_size = math.floor(email_body.shape[0] * .8)

    email_body_train = email_body[:training_size].astype(int)
    email_body_labels_train = email_body_labels[:training_size].astype(int)
    email_body_test = email_body[training_size:].astype(int)
    email_body_labels_test = email_body_labels[training_size:].astype(int)

    pred_labels = naive_bayes(email_body_train, email_body_labels_train, email_body_test)

    print("True Labels",email_body_labels_test.tolist())
    print("Self Pred labels",pred_labels)

    print("Naive Bayes from Scratch f-measure:",f1_score(email_body_labels_test, pred_labels,pos_label=1))

    clf = MultinomialNB()
    # Fit and predict on the feature columns only: the last column is the
    # label itself, and feeding it to the model would leak the answer.
    clf.fit(email_body_train[:, :-1], email_body_labels_train)

    pred_labels = clf.predict(email_body_test[:, :-1])
    print("SciPy Pred Labels",pred_labels.tolist())

    print("Naive Bayes from SciPy f-measure:",f1_score(email_body_labels_test, pred_labels, pos_label=1))



def Main():
    """Prompt for the two dataset paths and evaluate each dataset.

    Both paths must name existing files; otherwise "Bad files..." is printed
    and the process exits with status 1. Each file is loaded as a
    comma-delimited array of strings and passed to ``test_Naive_Bayes``.
    """
    body_input = input("Please enter location of email body dataset")
    subjects_input = input("Please enter location of email subjects dataset")

    if not (os.path.isfile(body_input) and os.path.isfile(subjects_input)):
        print("Bad files...")
        # exit() is a helper injected by the site module for interactive
        # sessions and may be missing; raising SystemExit always works.
        raise SystemExit(1)

    # Unprocessed CSV contents, kept as strings (header row included).
    email_body = np.genfromtxt(body_input, delimiter=",",dtype='str')
    email_subjects = np.genfromtxt(subjects_input, delimiter=",",dtype='str')

    print("\nTesting Email Body...")
    test_Naive_Bayes(email_body)

    print("\nTesting Email Subject...")
    test_Naive_Bayes(email_subjects)


# Run the interactive evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    Main()

