In [1]:
# Imports
import os, email, string, re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics



In [2]:
# extract email body from raw email
def get_email_body(email_string):
    # parse emial
    email_body = email.message_from_string(email_string)
    # only keep the main email body, discard any attachments
    for i in email_body.walk():
        if i.get_content_type() == 'text/plain':
            # get payload in bytes
            body = i.get_payload(decode=True)
            # decode utf-8 character set
            try:
                return body.decode('windows-1252')
            except UnicodeDecodeError:
                # decode windows-1252 character set
                return ''
    # return empty string if main body not found
    return ''
# remove the stop words from email body
def remove_stop_words(text, stop_words):
    # split sentences into tokens for individual identical check of stop_words 
    text_tokens = text.split()
    filtered_tokens = [token for token in text_tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

# extract features from email body
def extract_features(text):
    # convert to lowercase and remove punctuation
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # split text into words
    words = text.split()
    # count word frequencies
    return Counter(words)
# load stop words
with open('stop_words.txt', 'r') as f:
    stop_words = f.read().split()
    
# initialize data structures
emails = []
labels = []
features = []
# read in labels
skip_count = 0
with open('labels', 'r') as label_n_file_path:
    for line in label_n_file_path:
        try:
            # extract label and file path
            label, file_path = line.strip().split(' ../')
#             colon, normalized_path = file_path.split('../')
            # read in email
            with open(file_path, 'r') as email_file:
                raw_email = email_file.read()
                
                body = get_email_body(raw_email)
                
                body = remove_stop_words(body, stop_words)

                emails.append(raw_email)

                labels.append(label)

                features.append(extract_features(body))
                
        except UnicodeDecodeError:
            # due to other unrecognizable formats, skip that file
            skip_count += 1
            
# split data into training and testing sets 
# data reduced down to 36747 emails
X_train=features[:20659]
y_train=labels[:20659]
X_test=features[20659:]
y_test = labels[20659:]


In [3]:
#folders 0-70 emails 20659
#folders 0-70 emails 16088

# Creating Feature Matrix


In [4]:
# create dictionary of top 10000 most common words
word_counts = Counter()
# get most common words/ 10k top words
for email_features in features:
    for word, count in email_features.items():
        word_counts[word] += count
top_words = [word for word, count in word_counts.most_common(10000)]

# create feature matrix for training data
#initialize matrix w/ all zeros initialy## %^remove
X_train_matrix = np.zeros((len(X_train), len(top_words)))
# turn to 2D array matrix with x_train rows, top_w colunm
X_train_matrix = X_train_matrix.reshape(len(X_train), len(top_words))
#check if word in top_words appeared in email
for i, email_features in enumerate(X_train):
    for j, word in enumerate(top_words):
        if word in email_features:
            X_train_matrix[i, j] = 1

# create feature matrix for testing data
X_test_matrix = np.zeros((len(X_test), len(top_words)))
#
X_test_matrix = X_test_matrix.reshape(len(X_test), len(top_words))
#
for i, email_features in enumerate(X_test):
    for j, word in enumerate(top_words):
        if word in email_features:
            X_test_matrix[i, j] = 1


# Computing Priors

In [13]:
# count number of spam and ham emails in training set
num_spam = sum(1 for label in y_train if label == 'spam')
num_ham = sum(1 for label in y_train if label == 'ham')

# calculate prior probabilities
# X_train or y_train can be use, it will have same len of emails
prior_prob_spam = num_spam / len(X_train)
prior_prob_ham = num_ham / len(X_train)
print('prior probability of spam: ', prior_prob_spam,'prior probability of ham: ', prior_prob_ham)

prior probability of spam:  0.6360424028268551 prior probability of ham:  0.36395759717314485


# Computing the Likelihood of each word 

In [15]:
## Computing the Likelihood of each word
# create dictionary of word counts for spam and ham emails
spam_word_counts = Counter()
ham_word_counts = Counter()

for i, label in enumerate(y_train):
    if label == 'spam':
        for word, count in X_train[i].items():
            spam_word_counts[word] += count
    else:
        for word, count in X_train[i].items():
            ham_word_counts[word] += count

# apply Laplace smoothing
lambda_ = 1
vocab_size = len(top_words)

# compute likelihood of each word given spam
likelihood_spam = {}
for word in top_words:
    likelihood_spam[word] = (spam_word_counts[word] + lambda_) / (num_spam + lambda_ * vocab_size)

# compute likelihood of each word given ham
likelihood_ham = {}
for word in top_words:
    likelihood_ham[word] = (ham_word_counts[word] + lambda_) / (num_ham + lambda_ * vocab_size)
    


# Testing the Classifier

## Test classifier with unknown message

In [7]:
# Create an instance of the NaiveBayesClassifier
classifier = MultinomialNB()
cv = CountVectorizer()
# Train the classifier
classifier.fit(X_train_matrix, y_train)

# fed with unknown message I got from the test dataset
with open('data/071/001', 'r') as f:
    raw_email = f.read()
# Extract the features for the unknown message
body = get_email_body(raw_email)
# Remove stop words
body = remove_stop_words(body, stop_words)
# Extract features from the message
features = extract_features(body)

unknown_matrix = np.zeros((1, len(top_words)))
for j, word in enumerate(top_words):
    if word in word_counts:
        unknown_matrix[0, j] = word_counts[word]
prediction = classifier.predict(unknown_matrix.reshape(-1, len(top_words)))

In [8]:
# Predict the labels for the test data
predictions = classifier.predict(X_test_matrix.reshape(-1, len(top_words))) #0-topwords

# Calculate the accuracy of the classifier using the test set
# can use the score function of the sk-learn to return mean accuracy or can use .mean()
accuracy = classifier.score(X_test_matrix, y_test)
# print(accuracy) = 0.9320611636001989


# Performance Evaluation

In [22]:
# performance evaluation of test data
accuracy = metrics.accuracy_score(y_test,predictions)
precision = metrics.precision_score(y_test,predictions,pos_label='ham')
recall = metrics.recall_score(y_test,predictions,pos_label='ham')

cm = confusion_matrix(y_test,predictions)
tn,fp,fn,tp = cm.ravel()

fpr = fp/(fp+tn)
fnr = fn/(tp+fn)
tpr = tp/(tp+fn)
tnr = tn/(tn+fp)
print(accuracy,precision,recall)
print("fpr=",fpr,"fnr=",fnr,"tpr=",tpr,"tnr=",tnr)

0.9315638985579314 0.8610032089174126 0.9482886904761905
fpr= 0.05171130952380952 fnr= 0.07682972367438387 tpr= 0.9231702763256161 tnr= 0.9482886904761905


# effect of removing stop words in terms of precision, recall, and accuracy

In [10]:
def extract_features_without_stop_words(text):
    # convert to lowercase and remove punctuation
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # split text into words
    words = text.split()
    # remove stop words
    words = [word for word in words if word not in stop_words]
    # create a dictionary of word counts
    word_counts = Counter(words)
    # return the dictionary as a regular python dictionary (not a Counter object)
    return dict(word_counts)
features_with_stopwords=[]
with open('labels', 'r') as label_n_file_path:
    for line in label_n_file_path:
        try:
            # extract label and file path
            label, file_path = line.strip().split(' ../')
#             colon, normalized_path = file_path.split('../')
            # read in email
            with open(file_path, 'r') as email_file:
                raw_email = email_file.read()
                
                body = get_email_body(raw_email)
                
                emails.append(raw_email)

                labels.append(label)

                features_with_stopwords.append(extract_features_without_stop_words(body))
                
        except UnicodeDecodeError:
            # due to other unrecognizable formats, skip that file
            skip_count += 1
# create dictionary of top 10000 most common words
word_counts_wsw = Counter()
# get most common words/ 10k top words
for email_features in features_with_stopwords:
    for word, count in email_features.items():
        word_counts_wsw[word] += count
top_words_wsw = [word for word, count in word_counts_wsw.most_common(10000)]


# create feature matrix for training data
#initialize matrix w/ all zeros initialy## %^remove
X_train_matrix_wsw = np.zeros((len(X_train), len(top_words_wsw)))
# turn to 2D array matrix with x_train rows, top_w colunm
X_train_matrix_wsw = X_train_matrix_wsw.reshape(len(X_train), len(top_words_wsw))
#check if word in top_words appeared in email
for i, email_features in enumerate(X_train):
    for j, word in enumerate(top_words_wsw):
        if word in email_features:
            X_train_matrix_wsw[i, j] = 1

# create feature matrix for testing data
X_test_matrix_wsw = np.zeros((len(X_test), len(top_words_wsw)))
#
X_test_matrix_wsw = X_test_matrix_wsw.reshape(len(X_test), len(top_words_wsw))
#
for i, email_features in enumerate(X_test):
    for j, word in enumerate(top_words_wsw):
        if word in email_features:
            X_test_matrix_wsw[i, j] = 1
            
# Predict the labels for the test data
predictions_wsw = classifier.predict(X_test_matrix_wsw.reshape(-1, len(top_words_wsw))) #0-topwords

# Calculate the accuracy of the classifier using the test set
# can use the score function of the sk-learn to return mean accuracy or can use .mean()


accuracy_wsw = metrics.accuracy_score(y_test,predictions_wsw)
precision_wsw = metrics.precision_score(y_test,predictions_wsw,pos_label='ham')
recall_wsw = metrics.recall_score(y_test,predictions_wsw,pos_label='ham')

cm_wsw = confusion_matrix(y_test,predictions_wsw)
tn,fp,fn,tp = cm_wsw.ravel()

fpr_wsw = fp/(fp+tn)
fnr_wsw = fn/(tp+fn)
tpr_wsw = tp/(tp+fn)
tnr_wsw = tn/(tn+fp)
print(accuracy_wsw,precision_wsw,recall_wsw)


0.5468672302337145 0.39067854694996573 0.6361607142857143


In [11]:
print(fpr_wsw,fnr_wsw,tpr_wsw,tnr_wsw)


0.3638392857142857 0.4979462285287528 0.5020537714712472 0.6361607142857143
