In [1]:
#### CA02 Building a Spam Detector using Naive Bayes Algorithm

import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


In [2]:

def make_Dictionary(root_dir, vocab_size=3000):
    """Build the vocabulary of the most frequent words found in a mail folder.

    Every regular, non-hidden file directly inside ``root_dir`` is read line by
    line and split on whitespace. Only purely alphabetic tokens longer than one
    character are counted.

    Args:
        root_dir: Path of the directory holding the e-mail text files.
        vocab_size: Maximum number of words to keep. Defaults to 3000; pass
            ``None`` to keep every counted word.

    Returns:
        A list of ``(word, frequency)`` tuples ordered from most to least
        frequent, at most ``vocab_size`` entries long.
    """
    dictionary_counter = Counter()

    # os.listdir() returns entries in arbitrary order. Counter.most_common()
    # breaks ties by first-encounter order, so the listing is sorted to make
    # the resulting vocabulary (and its cut-off) reproducible across runs.
    for name in sorted(os.listdir(root_dir)):
        mail = os.path.join(root_dir, name)

        # Skip sub-directories and hidden files (e.g. macOS ".DS_Store"),
        # which are not e-mails and would either crash open() or add noise.
        if name.startswith('.') or not os.path.isfile(mail):
            continue

        # errors="ignore" drops undecodable bytes instead of aborting the whole
        # run; files that already decode cleanly are read exactly as before.
        with open(mail, errors="ignore") as m:
            for line in m:
                # Count per line so the full token list is never held in memory
                dictionary_counter.update(
                    word for word in line.split()
                    if word.isalpha() and len(word) > 1
                )

    # Return the most common words as a list of tuples (word, frequency)
    return dictionary_counter.most_common(vocab_size)



In [3]:
def extract_features(mail_dir, dictionary):
    """Convert a folder of e-mails into a word-count matrix and a label vector.

    Only the third line (index 2) of each file is tokenised; files with fewer
    than three lines yield an all-zero row.
    NOTE(review): presumably the corpus stores subject / blank line / body on
    lines 0-2 — confirm against the data set.

    Args:
        mail_dir: Path of the directory holding the e-mail text files.
        dictionary: Vocabulary as returned by ``make_Dictionary`` — a sequence
            of ``(word, frequency)`` tuples. A sequence of plain word strings
            is accepted as well. The position of a word in this sequence is
            its column index in the feature matrix.

    Returns:
        A tuple ``(features_matrix, train_labels)`` where ``features_matrix``
        has shape ``(n_files, len(dictionary))`` with per-file word counts and
        ``train_labels`` has shape ``(n_files,)`` with 1 for files whose name
        contains ``'spmsg'`` and 0 otherwise. Rows follow the sorted file names.
    """
    # Map each vocabulary word to its column index. Plain strings are used
    # as-is; indexing them with [0] would silently key on the first character.
    word_index = {
        (entry if isinstance(entry, str) else entry[0]): idx
        for idx, entry in enumerate(dictionary)
    }

    # Sorted for a reproducible row order; sub-directories and hidden files
    # (e.g. macOS ".DS_Store") are not e-mails and are left out.
    names = [
        name for name in sorted(os.listdir(mail_dir))
        if not name.startswith('.') and os.path.isfile(os.path.join(mail_dir, name))
    ]

    features_matrix = np.zeros((len(names), len(dictionary)), dtype=np.int_)
    train_labels = np.zeros(len(names), dtype=np.int_)

    for docID, name in enumerate(names):
        # errors="ignore" drops undecodable bytes instead of aborting the run;
        # files that already decode cleanly are read exactly as before.
        with open(os.path.join(mail_dir, name), 'r', errors="ignore") as fi:
            lines = fi.readlines()

        if len(lines) > 2:
            for word in lines[2].split():
                wordID = word_index.get(word, -1)
                if wordID >= 0:
                    features_matrix[docID, wordID] += 1

        # Test the file NAME only: matching against the full path would label
        # every mail as spam whenever a parent folder contains 'spmsg'.
        train_labels[docID] = 1 if 'spmsg' in name else 0

    return features_matrix, train_labels


In [4]:
# Pathnames for testing and training data
#
# The default base folder is the original author's machine-specific location.
# Set the CA02_DATA_DIR environment variable to run the notebook against a copy
# of the data stored elsewhere, without editing this cell.
# NOTE: the default folder name ends with a space ("... Naive Bayes "); it is
# part of the path and must be preserved.
DATA_DIR = os.environ.get(
    "CA02_DATA_DIR",
    "/Users/lanceroyston/Downloads/MSBA 2023 - 2024/Spring 2024/Intro to Machine Learning BSAN 6070/CA's/CA02 Spam Detector Using Naive Bayes ",
)

TRAIN_DIR = os.path.join(DATA_DIR, "train-mails")
TEST_DIR = os.path.join(DATA_DIR, "test-mails")


In [5]:
# Build the vocabulary of most frequent words from the training e-mails only
dictionary_list = make_Dictionary(root_dir=TRAIN_DIR)

# Turn each mail folder into a word-count matrix plus a label vector.
# Both calls share the training vocabulary so the feature columns line up.
features_matrix, labels = extract_features(
    mail_dir=TRAIN_DIR, dictionary=dictionary_list
)
test_features_matrix, test_labels = extract_features(
    mail_dir=TEST_DIR, dictionary=dictionary_list
)


In [6]:
print("reading and processing emails from TRAIN and TEST folders")

# Fit a Gaussian Naive Bayes classifier on the training word counts.
# fit() returns the estimator itself, so construction and fitting are chained.
print("Training Model using Gaussian Naive Bayes algorithm .....")
model = GaussianNB().fit(features_matrix, labels)
print("Training completed")

# Classify the test e-mails with the fitted model
print("testing trained model to predict Test Data labels")
predicted_labels = model.predict(test_features_matrix)
print("Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:")

# Accuracy = fraction of test e-mails whose predicted label equals the true label
accuracy = accuracy_score(y_true=test_labels, y_pred=predicted_labels)
print(accuracy)

reading and processing emails from TRAIN and TEST folders
Training Model using Gaussian Naive Bayes algorithm .....
Training completed
testing trained model to predict Test Data labels
Completed classification of the Test Data .... now printing Accuracy Score by comparing the Predicted Labels with the Test Labels:
0.9653846153846154
