# Spam Classification using Naive Bayes

In [1]:
# for traversing directories
import os
# for email message extraction and text preprocessing
import email
import email.policy
import re

import pandas as pd
import numpy as np
import math

## Preprocessing

### Splitting the Data Set

In [2]:
#create dictionary mapping file path to spam/ham category
spamHam_dict = {}

label_f = open("labels", 'r')
for line in label_f:
    key_value = line.split()
    key = key_value[1]
    value = key_value[0]
    spamHam_dict[key] = value

label_f.close()

In [3]:
#split train and test files
data_file_list = os.listdir('../data')
data_file_list.remove('.DS_Store')

train_folders = []
test_folders = []
spam_files = []
ham_files = []

#separate directories as train or test folders
for dirs in data_file_list:
    if int(dirs) <= 70:
        train_folders.append(dirs)
    elif int(dirs) > 70:
        test_folders.append(dirs)

In [4]:
#open reference file for train files

for dirs in train_folders:
    #list all file names under train_folders
    train_folder_files = os.listdir(f"../data/{dirs}")
    
    #list of file paths of all files under train_folders
    file_paths = []
    for file_names in train_folder_files:
        file_paths.append(f"../data/{dirs}/{file_names}")
        
    for paths in file_paths:
        #write path in spam reference file if labeled spam
        if(spamHam_dict[paths] == 'spam'):
            spam_files.append(paths)
        #write path in ham reference file if labeled ham
        elif(spamHam_dict[paths] == 'ham'):
            ham_files.append(paths)

In [5]:
test_file_paths = []

for dirs in test_folders:
    #list all file names under test_folders
    test_folder_files = os.listdir(f"../data/{dirs}")
    
    #list of file paths of all files under test_folders
    for file_names in test_folder_files:
        test_file_paths.append(f"../data/{dirs}/{file_names}")

### Cleaning the data set (Removing uselesss, stop words)

In [6]:
def extract_msg(file):
    
    body = ""
    # convert email content into one string
    str_email = ""
    for line in file:
        str_email+=line
    raw_msg = email.message_from_string(str_email)
    
    try:
        # extract only plain text part of email if multipart
        if raw_msg.is_multipart():
            for part in raw_msg.walk():
                ctype = part.get_content_type()
                if ctype == 'text/plain':
                    msg = email.message_from_string(str_email, policy=email.policy.default)
                    body = msg.get_body(('plain',))
                    if body:
                        body = body.get_content()
                    break
        # else proceed to getting content of the body                
        else:
            msg = email.message_from_string(str_email, policy=email.policy.default)
            body = msg.get_body(('plain',))
            if body:
                body = body.get_content()
        return body
    # ignore if lookup error, i.e., encoding not recognized
    except LookupError as e:
        print(f'{e}, {e.__class__}')
        pass
    


In [7]:
def tokenize_msg(body):
    # tokenize body content
    str_tkns = body.split()

    # replace non-alphanumeric characters occuring in each word
    for word in str_tkns:
        # replace non-alphanumeric characters at the start or end
        # of each word into an empty string
        word2 = re.sub(r'^\W+|\W+$',"", word)
        index = str_tkns.index(word)
        # replace word at current index with new word w/out alphanum chars
        str_tkns[index] = word2

        # change stop words in messages to 0
        if(word in stp_wrds):
            str_tkns[index] = '0'
        # change non-alphanumeric words in messages to 0
        elif(word.isalnum() == False):
            str_tkns[index] = '0'
        # change other symbols/characters not caught in isalnum check to 0
        else:
            match = re.search(r"[^a-zA-Z0-9]", word)
            if match:
                str_tkns[index] = '0'

        # remove empty strings
        if(len(word) == 0):
                str_tkns[index] = '0'
    
    # remove words change to 0
    str_tkns = [word for word in str_tkns if word != '0']

    return str_tkns

In [8]:
# create a list of stop words from the stop_words.txt file
stp_wrds = []

open_stp_wrds = open("stop_words.txt","r")
for line in open_stp_wrds:
    line = line.strip("\n")
    stp_wrds.append(line)

### Extracting Spam Features

In [9]:
# list of words per spam message
spam_messages = []

for paths in spam_files:
    open_file = open(paths, 'r',  encoding="ISO-8859-1")
    print(paths)

    # extract body of messages
    body = extract_msg(open_file)

    if(type(body) == str):
        # convert all chars to lowercase
        body = body.lower()
    else:
        # if body is not string, turn into empty string (no message)
        body = ""

    # tokenize body of messages
    words = tokenize_msg(body)
    # append to spam words
    spam_messages.append(words)

../data/000/001
../data/000/002
../data/000/004
../data/000/007
../data/000/008
../data/000/009
../data/000/011
../data/000/012
../data/000/013
../data/000/014
../data/000/015
../data/000/016
../data/000/017
../data/000/018
../data/000/019
../data/000/022
../data/000/023
../data/000/028
../data/000/029
../data/000/030
../data/000/031
../data/000/033
../data/000/036
../data/000/037
../data/000/040
../data/000/041
../data/000/042
../data/000/043
../data/000/048
../data/000/050
../data/000/051
../data/000/052
../data/000/053
../data/000/054
../data/000/055
../data/000/056
../data/000/057
../data/000/058
../data/000/059
../data/000/060
../data/000/062
../data/000/063
../data/000/064
../data/000/065
../data/000/066
../data/000/067
../data/000/068
../data/000/071
../data/000/080
../data/000/084
../data/000/085
../data/000/086
../data/000/087
../data/000/088
../data/000/089
../data/000/090
../data/000/091
../data/000/092
../data/000/093
unknown encoding: %CHARSET, <class 'LookupError'>
../dat

In [10]:
# create a set of unique words from the spam training set
spam_features = [word for message in spam_messages for word in message]
spam_features = set(spam_features)

In [11]:
# count occurence of each unique word in above set
# dict of features (key) with value = feature occurrence
# per message
spam_feature_count = {}
for feature in spam_features:
    count_ls = []
    # for each message
    for message in spam_messages:
        count = 0
        # if feature/word is found in message
        if(feature in message):
            # count all occurences
            matches = filter(lambda word: (word == feature), message)
            matches = list(matches)
            count = len(matches)
        else:
            count = 0
        count_ls.append(count)
    spam_feature_count[feature] = count_ls

In [12]:
spam_features_df = pd.DataFrame(spam_feature_count, index = spam_files)

In [13]:
spam_features_df

Unnamed: 0,credible,hobbles,moreplease,kofi,pole,palmetto,jams,examines,qjcwg,theses,...,loterij,downtown,shoddy,mercator,enter,initially,infeasible,alternative,anticipate,aeolus
../data/000/001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
../data/070/294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/296,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/297,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# calculate total number of occurences per feature in spam training dataset
total_occurrence = spam_features_df.sum(axis='rows')
total_occurrence = pd.DataFrame(total_occurrence)
total_occurrence = total_occurrence.rename(columns={0: 'total count'})

In [15]:
total_occurrence = total_occurrence.sort_values(by=['total count'],ascending=False)

In [16]:
spam_features_df2 = total_occurrence.head(10000)

In [17]:
spam_features_df2

Unnamed: 0,total count
will,4105
adobe,3464
company,3071
professional,2389
gold,2310
...,...
expedient,2
lash,2
prearranged,2
carrara,2


### Extracting Ham Features

In [18]:
# list of words per spam message
ham_messages = []

for paths in ham_files:
    open_file = open(paths, 'r',  encoding="ISO-8859-1")
    print(paths)

    # extract body of messages
    body = extract_msg(open_file)

    if(type(body) == str):
        # convert all chars to lowercase
        body = body.lower()
    else:
        # if body is not string, turn into empty string (no message)
        body = ""

    # tokenize body of messages
    words = tokenize_msg(body)
    # append to spam words
    ham_messages.append(words)

../data/000/000
../data/000/003
../data/000/005
../data/000/006
../data/000/010
../data/000/020
../data/000/021
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/024
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/025
../data/000/026
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/027
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/032
../data/000/034
../data/000/035
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/038
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/039
../data/000/044
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/045
../data/000/046
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/047
../data/000/049
../data/000/061
../data/000/069
../data/000/070
../data/000/072
../data/000/073
../data/000/074
../data/000/075
../data/000/076
../data/000/077
../data/000/078
../data/000/079
../data/000/081
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/082
../data/000/083

In [19]:
# create a set of unique words from the ham training set
ham_features = [word for message in ham_messages for word in message]
ham_features = set(ham_features)

In [20]:
# count occurence of each unique word in above set
# dict of features (key) with value = feature occurrence
# per message
ham_feature_count = {}
for feature in ham_features:
    count_ls = []
    # for each message
    for message in ham_messages:
        count = 0
        # if feature/word is found in message
        if(feature in message):
            # count all occurences
            matches = filter(lambda word: (word == feature), message)
            matches = list(matches)
            count = len(matches)
        else:
            count = 0
        count_ls.append(count)
    ham_feature_count[feature] = count_ls

In [21]:
# cannot create pandas dataframe for feature occurence per message like in spam training data
# because of memory error

In [22]:
# create dictionary with key = total number of occurence in ham training set, 
# value = the corresponding feature
ham_feature_total_count = {}
for item in ham_feature_count:
    sum_count = sum(ham_feature_count[item])
    ham_feature_total_count[item] = sum_count

In [23]:
# create data frame of feature and total count
ham_feature_total_count_df = pd.DataFrame(ham_feature_total_count, index=["total count"])
ham_feature_total_count_df = ham_feature_total_count_df.T

In [24]:
# sort in descending order
ham_features_df = ham_feature_total_count_df.sort_values(by=["total count"],ascending=False)

In [25]:
# exrract first 10000 features
ham_features_df2 = ham_features_df.head(10000)

In [26]:
ham_features_df2

Unnamed: 0,total count
will,6269
send,3283
board,3218
nil,2951
list,2808
...,...
spectral,7
loud,7
bureau,7
subsidiaries,7


## Creating the feature matrices (5 points)

### Spam Feature Matrix

In [27]:
# dataframe with features as columns, document as rows
# each cell is the number of occurrences for each features in each message
spam_matrix = spam_features_df[spam_features_df2.index]

In [28]:
spam_matrix

Unnamed: 0,will,adobe,company,professional,gold,3,pro,microsoft,ms,office,...,audiotape,ernestine,sorting,monkeyflower,hochqualifizierte,expedient,lash,prearranged,carrara,dross
../data/000/001,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/007,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/008,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
../data/070/294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/296,0,8,0,5,0,2,4,4,4,3,...,0,0,0,0,0,0,0,0,0,0
../data/070/297,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Ham Feature Matrix

In [29]:
ham_matrix = {}
for feature in ham_features_df2.index:
    ham_matrix[feature] = ham_feature_count[feature]

In [30]:
# dataframe with features as columns, document as rows
# each cell is the number of occurrences for each features in each message
ham_matrix = pd.DataFrame(ham_matrix, index=ham_files)

In [31]:
ham_matrix

Unnamed: 0,will,send,board,nil,list,1998,message,help,university,time,...,interstate,arms,authorised,lettering,1ea,spectral,loud,bureau,subsidiaries,wins
../data/000/000,2,1,0,0,6,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/003,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/006,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/010,0,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
../data/070/270,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/271,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/288,1,0,0,0,0,0,0,1,1,3,...,0,0,0,0,0,0,0,0,0,0
../data/070/293,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Computing Priors

In [49]:
ham_msgs = len(ham_files)
spam_msgs = len(spam_files)
total = ham_msgs + spam_msgs

ham_prob = ham_msgs/total
spam_prob = spam_msgs/total
print(ham_prob, spam_prob)

0.3531924882629108 0.6468075117370892


## Computing the Likelihood of each word

In [51]:
lambda_var = 1

### Spam Words

In [34]:
total_spam_word_occurrence = spam_features_df2.sum(axis='rows')
total_spam_word_occurrence = total_spam_word_occurrence["total count"]
total_spam_word_occurrence

361962

In [35]:
denom = total_spam_word_occurrence + (lambda_var*len(spam_features_df2.index))
spam_features_df2['likelihood'] = (spam_features_df2[['total count']] + lambda_var) / denom

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spam_features_df2['likelihood'] = (spam_features_df2[['total count']] + lambda_var) / denom


In [36]:
spam_features_df2

Unnamed: 0,total count,likelihood
will,4105,0.011039
adobe,3464,0.009315
company,3071,0.008259
professional,2389,0.006425
gold,2310,0.006213
...,...,...
expedient,2,0.000008
lash,2,0.000008
prearranged,2,0.000008
carrara,2,0.000008


In [37]:
tuple(spam_features_df2.loc['will'])

(4105.0, 0.011038762024077728)

### Ham Words

In [38]:
total_ham_word_occurrence = ham_features_df2.sum(axis='rows')
total_ham_word_occurrence = total_ham_word_occurrence["total count"]
total_ham_word_occurrence

556461

In [39]:
denom = total_ham_word_occurrence + (lambda_var*len(ham_features_df2.index))
ham_features_df2['likelihood'] = (ham_features_df2[['total count']] + lambda_var) / denom

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ham_features_df2['likelihood'] = (ham_features_df2[['total count']] + lambda_var) / denom


In [40]:
ham_features_df2

Unnamed: 0,total count,likelihood
will,6269,0.011069
send,3283,0.005797
board,3218,0.005683
nil,2951,0.005211
list,2808,0.004959
...,...,...
spectral,7,0.000014
loud,7,0.000014
bureau,7,0.000014
subsidiaries,7,0.000014


In [41]:
ham_features_df2.loc["will"]['likelihood']

0.011068723177765107

## Classifying the emails


In [46]:
def computeLog(x):
    return math.log(x)

In [43]:
document = "../data/000/000"

def classify(document, lambda_var=1):
    # open document
    open_file = open(f"{document}", 'r', encoding="ISO-8859-1")
    # extract body of email
    body = extract_msg(open_file)
    # list of spam likelihood for each word
    spam_probabilities = []
    # list of ham likelihood for each word
    ham_probabilities = []
    
    
    if(type(body) == str):
        # convert all chars to lowercase
        body = body.lower()
    else:
        # if body is not string, turn into empty string (no message)
        body = ""
        
    # tokenize the messasge
    tokens = tokenize_msg(body)
    
    # get features from training set (spam and ham)
    spam_features = spam_features_df2.index
    ham_features = ham_features_df2.index
    
    # retrieve spam and ham likelihood for each word
    for word in tokens:
        if word in spam_features:
            spam_likelihood = spam_features_df2.loc["word"]["likelihood"]
        else:
            spam_likelihood = (0 + lambda_var) / denom
        
        if word in ham_features:
            ham_likelihood = ham_features_df2.loc["word"]["likelihood"]
        else:
            ham_likelihood = (0 + lambda_var) / denom
        
        spam_probabilities.append(spam_likelihood)
        ham_probabilities.append(ham_likelihood)
    
    # compute for sum of all logs
    spam_sum_logs = sum(map(computeLog, spam_probabilities))
    ham_sum_logs = sum(map(computeLog,ham_probabilities))
    
    # compute for final proabability
    final_spam_prob = computeLog(spam_prob) + spam_sum_logs
    final_ham_prob = computeLog(ham_prob) + ham_sum_logs
    
    if final_spam_prob > final_ham_prob:
        return "spam"
    elif final_spam_prob < final_ham_prob:
        return "ham"

In [44]:
classify(document)

'ham'

## Testing the Classifier

In [45]:
classifications = []
for path in test_file_paths:
    # classify whether spam or ham
    email_class = classify(path)
    classifications.append(email_class)

unknown encoding: iso-3629-1, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: iso-2671-9, <class 'LookupError'>
unknown encoding: iso-9001-6, <class 'LookupError'>
unknown encoding: 134, <class 'LookupError'>
unknown encoding: iso-1354-9, <class 'LookupError'>
unknown encoding: iso-3629-1, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: iso-1354-9, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: iso-9001-6, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: windows-874, <class 'LookupError'>
unknown encoding: DEFAULT, <class 'LookupError'>
unknown encoding: windows-874, <class 'LookupError'>
unknown encoding: DEFAULT, <class 'LookupError'>
unknown encoding: iso-8446-6, <class 'LookupError'>
unknown encoding: iso-5474-8, <clas

In [46]:
classifications

['spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'ham',
 

In [47]:
open_file = open("classifier_labels", "w")
for class_label in classifications:
    open_file.write(class_label + "\n")


## Evaluation

In [57]:
# list of given labels for each email
labels = []
for path in test_file_paths:
    label = spamHam_dict[path]
    labels.append(label)
labels = np.array(labels)

In [49]:
# list of labels give by classifier
classifier_labels = []
open_file = open("classifier_labels","r")
for line in open_file:
    classifier_labels.append(line.strip("\n"))
classifier_labels = np.array(classifier_labels)

In [50]:
print(len(labels))
print(len(classifier_labels))

16522
16522


In [51]:
eval_info = pd.DataFrame(labels,index=test_file_paths)

In [52]:
eval_info = eval_info.rename(columns={0: 'given labels'})

In [53]:
eval_info["classifer_labels"] = classifier_labels

In [54]:
eval_info

Unnamed: 0,given labels,classifer_labels
../data/071/000,spam,spam
../data/071/001,ham,ham
../data/071/002,spam,spam
../data/071/003,spam,spam
../data/071/004,spam,spam
...,...,...
../data/126/017,spam,spam
../data/126/018,spam,spam
../data/126/019,spam,spam
../data/126/020,spam,spam


In [55]:
evaluation_labels = (labels == classifier_labels)

In [56]:
evaluation_labels

array([ True,  True,  True, ...,  True,  True,  True])

In [57]:
eval_info["evaluation labels"] = evaluation_labels

In [58]:
eval_info

Unnamed: 0,given labels,classifer_labels,evaluation labels
../data/071/000,spam,spam,True
../data/071/001,ham,ham,True
../data/071/002,spam,spam,True
../data/071/003,spam,spam,True
../data/071/004,spam,spam,True
...,...,...,...
../data/126/017,spam,spam,True
../data/126/018,spam,spam,True
../data/126/019,spam,spam,True
../data/126/020,spam,spam,True


In [59]:
# evaluate whether result of classification is:
# TP (true positive), TN (true Negative), FP (false positive), or FN (false negative)
accuracy = []
for path in test_file_paths:
    if eval_info.loc[path]["evaluation labels"] == True:
        if  eval_info.loc[path]["given labels"] == "spam":
            accuracy.append("TN")
        elif eval_info.loc[path]["given labels"] == "ham":
            accuracy.append("TP")
    elif eval_info.loc[path]["evaluation labels"] == False:
        if  eval_info.loc[path]["given labels"] == "spam":
            accuracy.append("FP")
        elif eval_info.loc[path]["given labels"] == "ham":
            accuracy.append("FN")

In [60]:
accuracy

['TN',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TP',
 'TP',
 'TP',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TN',
 'FP',
 'TN',
 'TP',
 'TN',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TN',
 'TN',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'FP',
 'TP',
 'TN',
 'TP',
 'TP',
 'TP',
 'TN',
 'TN',
 'TP',
 'TP',
 'TP',
 'TN',
 'FN',
 'TP',
 'TP',
 'TP',
 'TN',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TP',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TP',
 'TN',
 'TN',
 'TN',
 'TP',
 'TN',
 'TN',
 'TP',
 'TN',
 'TP',
 'TN',
 'TP',
 'TN',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TP',
 'TP',
 'TN',
 'TN',
 'TP',
 'TP',
 'TP',
 'FP',
 'TN',
 'TP',
 'TP',
 'TN',
 'TP',
 'TN',
 'TN',
 'FP',
 'TP',
 'TN',
 'TN',
 'TN',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',

In [61]:
for item in accuracy:
    tp = filter(lambda item: (item == "TP"), accuracy)
    tn = filter(lambda item: (item == "TN"), accuracy)
    fp = filter(lambda item: (item == "FP"), accuracy)
    fn = filter(lambda item: (item == "FN"), accuracy) 

In [62]:
tp = list(tp)
tn = list(tn)
fp = list(fp)
fn = list(fn)

In [63]:
len_tp = len(tp)
len_tn = len(tn)
len_fp = len(fp)
len_fn = len(fn)

In [64]:
accuracy = (len_tp + len_tn)/(len_tp + len_tn + len_fp + len_fn)
recall = len_tp/ (len_tp + len_fn)
precision = len_tp / (len_tp + len_fp)
print(accuracy, recall, precision)

0.9213170318363394 0.940226471134212 0.838159854376965


In [65]:
remove_stop_words = [accuracy, recall, precision]

## Classification without removing stop words

In [9]:
def tokenize_msg2(body):
    # tokenize body content
    str_tkns = body.split()

    # replace non-alphanumeric characters occuring in each word
    for word in str_tkns:
        # replace non-alphanumeric characters at the start or end
        # of each word into an empty string
        word2 = re.sub(r'^\W+|\W+$',"", word)
        index = str_tkns.index(word)
        # replace word at current index with new word w/out alphanum chars
        str_tkns[index] = word2

        # change non-alphanumeric words in messages to 0
        if(word.isalnum() == False):
            str_tkns[index] = '0'
        # change other symbols/characters not caught in isalnum check to 0
        else:
            match = re.search(r"[^a-zA-Z0-9]", word)
            if match:
                str_tkns[index] = '0'

        # remove empty strings
        if(len(word) == 0):
                str_tkns[index] = '0'
    
    # remove words change to 0
    str_tkns = [word for word in str_tkns if word != '0']

    return str_tkns

### extracting spam features without removing stop words

In [10]:
# list of words per spam message
spam_messages2 = []

for paths in spam_files:
    open_file = open(paths, 'r',  encoding="ISO-8859-1")
    print(paths)

    # extract body of messages
    body = extract_msg(open_file)

    if(type(body) == str):
        # convert all chars to lowercase
        body = body.lower()
    else:
        # if body is not string, turn into empty string (no message)
        body = ""

    # tokenize body of messages
    words = tokenize_msg2(body)
    # append to spam words
    spam_messages2.append(words)

../data/000/001
../data/000/002
../data/000/004
../data/000/007
../data/000/008
../data/000/009
../data/000/011
../data/000/012
../data/000/013
../data/000/014
../data/000/015
../data/000/016
../data/000/017
../data/000/018
../data/000/019
../data/000/022
../data/000/023
../data/000/028
../data/000/029
../data/000/030
../data/000/031
../data/000/033
../data/000/036
../data/000/037
../data/000/040
../data/000/041
../data/000/042
../data/000/043
../data/000/048
../data/000/050
../data/000/051
../data/000/052
../data/000/053
../data/000/054
../data/000/055
../data/000/056
../data/000/057
../data/000/058
../data/000/059
../data/000/060
../data/000/062
../data/000/063
../data/000/064
../data/000/065
../data/000/066
../data/000/067
../data/000/068
../data/000/071
../data/000/080
../data/000/084
../data/000/085
../data/000/086
../data/000/087
../data/000/088
../data/000/089
../data/000/090
../data/000/091
../data/000/092
../data/000/093
unknown encoding: %CHARSET, <class 'LookupError'>
../dat

In [11]:
# create a set of unique words from the spam training set
spam_features2 = [word for message in spam_messages2 for word in message]
spam_features2 = set(spam_features2)

In [12]:
# list of lists, each sublist contains count of each feature per message
spam_feature_count2 = []

for message in spam_messages2:
    # list of count of each feature in a particular message
    count_per_feature = []
    for feature in spam_features2:
        if(feature in message):
            matches = filter(lambda word: (word == feature), message)
            matches = list(matches)
            count = len(matches)
        else:
            count = 0
        count_per_feature.append(count)
    spam_feature_count2.append(count_per_feature)
    
spam_feature_count2 = np.array(spam_feature_count2) 

In [13]:
spam_features2_df = pd.DataFrame(spam_feature_count2, index = spam_files, columns = spam_features2)

In [14]:
spam_features2_df

Unnamed: 0,fees,ogress,harmony,awhile,commendation,labours,annually,xannax,coil,altered,...,58,breadwinner,calvin,betoken,carpet,levin,compulsory,bloomington,dapper,levitera
../data/000/001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
../data/070/294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/296,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/297,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# get total occurence of each feature across all messages
total_per_feature_spam = spam_features2_df.sum(axis='rows')
total_per_feature_spam = pd.DataFrame(total_per_feature_spam)
total_per_feature_spam = total_per_feature_spam.rename(columns={0: 'total count'})
# sort occurences by total count from largest to smallest value
total_per_feature_spam = total_per_feature_spam.sort_values(by=["total count"],ascending=False)
total_per_feature_spam

Unnamed: 0,total count
the,30144
a,23937
and,21519
to,20538
of,16458
...,...
edwards,1
queue,1
chronology,1
belfry,1


In [19]:
total_overall_spam = total_per_feature_spam.sum(axis='rows')
total_overall_spam = int(total_overall_spam)
total_overall_spam

798230

In [22]:
spam_prob_per_feature = np.array((total_per_feature_spam/total_overall_spam))
spam_prob_per_feature = pd.DataFrame(spam_prob_per_feature,columns=["likelihood"],index=total_per_feature_spam.index)
spam_prob_per_feature = spam_prob_per_feature.head(10000)
spam_prob_per_feature

Unnamed: 0,likelihood
the,0.037764
a,0.029988
and,0.026958
to,0.025729
of,0.020618
...,...
rosetta,0.000005
opec,0.000005
stammer,0.000005
constitutes,0.000005


### extracting ham features without removing stop words

In [27]:
# list of words per ham message
ham_messages2 = []

for paths in ham_files:
    open_file = open(paths, 'r',  encoding="ISO-8859-1")
    print(paths)

    # extract body of messages
    body = extract_msg(open_file)

    if(type(body) == str):
        # convert all chars to lowercase
        body = body.lower()
    else:
        # if body is not string, turn into empty string (no message)
        body = ""

    # tokenize body of messages
    words = tokenize_msg2(body)
    # append to spam words
    ham_messages2.append(words)

../data/000/000
../data/000/003
../data/000/005
../data/000/006
../data/000/010
../data/000/020
../data/000/021
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/024
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/025
../data/000/026
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/027
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/032
../data/000/034
../data/000/035
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/038
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/039
../data/000/044
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/045
../data/000/046
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/047
../data/000/049
../data/000/061
../data/000/069
../data/000/070
../data/000/072
../data/000/073
../data/000/074
../data/000/075
../data/000/076
../data/000/077
../data/000/078
../data/000/079
../data/000/081
unknown encoding: X-utf-2, <class 'LookupError'>
../data/000/082
../data/000/083

In [28]:
# create a set of unique words from the ham training set
ham_features2 = [word for message in ham_messages2 for word in message]
ham_features2 = set(ham_features2)

In [29]:
# list of lists, each sublist contains count of each feature per message
ham_feature_count2 = []

for message in ham_messages2:
    # list of count of each feature in a particular message
    count_per_feature = []
    for feature in ham_features2:
        if(feature in message):
            matches = filter(lambda word: (word == feature), message)
            matches = list(matches)
            count = len(matches)
        else:
            count = 0
        count_per_feature.append(count)
    ham_feature_count2.append(count_per_feature)
    
ham_feature_count2 = np.array(ham_feature_count2) 

In [30]:
ham_features2_df = pd.DataFrame(ham_feature_count2, index = ham_files, columns = ham_features2)

In [31]:
ham_features2_df

Unnamed: 0,ac8kaadhjwaaaaar8aqaaaataaaadwae8egaaabcaqrwcaaaag0haacccgaaiwal8awaaadl,livingstone,dqahaaaagasfbxqfjwjtagqaaaataqgabaaaapabdaaeaaaalqefaaqaaadwaq0abwaaapwc,awhile,coil,le1,zooplankters,ju,series,reasoned,...,bwaaapwcaad8asgaaaaeaaaalqenaa4aaaakawuavgs6blwefqshbg0eogsrbfyeugqeaaaa,pretends,aad6agaabaaaaaaaaaaiaaqaaaataqwabaaaac0bagaiaaaajqmcakacywlzakecbaaaac0b,zqaaadcfaacfaaaaxguaaaqaaaataqmabwaaapwcaad5wrcaaaaeaaaalqehabiaaaakawca,compulsory,finiteness,enfermagem,bloomington,economics,reboost
../data/000/000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/000/010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
../data/070/270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
../data/070/288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
../data/070/293,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# get total occurence of each feature across all messages
total_per_feature_ham = ham_features2_df.sum(axis='rows')
total_per_feature_ham = pd.DataFrame(total_per_feature_ham)
total_per_feature_ham = total_per_feature_ham.rename(columns={0: 'total count'})
# sort occurences by total count from largest to smallest value
total_per_feature_ham = total_per_feature_ham.sort_values(by=["total count"],ascending=False)
total_per_feature_ham

Unnamed: 0,total count
the,90645
to,52485
of,35576
a,35505
and,33736
...,...
akgkaaayiqaausqaacwhaaaaabhwbaaaaaiaaaapaatwvaaaadiacvaiaaaacaqaaaikaabd,1
teorica,1
clauses,1
aaataqgabwaaabgevwlyb0ocywceaaaalqeeaaqaaadwaqcabaaaac0bagaeaaaa8aeiaaca,1


In [33]:
total_overall_ham = total_per_feature_ham.sum(axis='rows')
total_overall_ham = int(total_overall_ham)
total_overall_ham

1486100

In [34]:
ham_prob_per_feature = np.array((total_per_feature_ham/total_overall_ham))
ham_prob_per_feature = pd.DataFrame(ham_prob_per_feature,columns=["likelihood"],index=total_per_feature_ham.index)
ham_prob_per_feature = ham_prob_per_feature.head(10000)
ham_prob_per_feature

Unnamed: 0,likelihood
the,0.060995
to,0.035317
of,0.023939
a,0.023891
and,0.022701
...,...
arctic,0.000005
minuit,0.000005
sustain,0.000005
rings,0.000005


### Classifying without removing stop words during training

In [54]:
def classify2(document, lambda_var=1):
    # open document
    open_file = open(f"{document}", 'r', encoding="ISO-8859-1")
    # extract body of email
    body = extract_msg(open_file)
    # list of spam likelihood for each word
    spam_probabilities = []
    # list of ham likelihood for each word
    ham_probabilities = []
    
    
    if(type(body) == str):
        # convert all chars to lowercase
        body = body.lower()
    else:
        # if body is not string, turn into empty string (no message)
        body = ""
        
    # tokenize the messasge
    tokens = tokenize_msg2(body)
    
    # get features from training set (spam and ham)
    spam_features = spam_features2
    ham_features = ham_features2
    
    # retrieve spam and ham likelihood for each word
    for word in tokens:
        if(word in spam_features):
            spam_likelihood = spam_prob_per_feature.loc["word"]["likelihood"]
        else:
            spam_likelihood = (0 + lambda_var) / (total_overall_spam + (lambda_var*len(spam_features)))
        
        if(word in ham_features):
            ham_likelihood = ham_prob_per_feature.loc["word"]["likelihood"]
        else:
            ham_likelihood = (0 + lambda_var) / (total_overall_ham + (lambda_var*len(ham_features)))
        
        spam_probabilities.append(spam_likelihood)
        ham_probabilities.append(ham_likelihood)
    
    # compute for sum of all logs
    spam_sum_logs = sum(map(computeLog, spam_probabilities))
    ham_sum_logs = sum(map(computeLog,ham_probabilities))
    
    # compute for final proabability
    final_spam_prob = computeLog(spam_prob) + spam_sum_logs
    final_ham_prob = computeLog(ham_prob) + ham_sum_logs
    
    if final_spam_prob > final_ham_prob:
        return "spam"
    elif final_spam_prob < final_ham_prob:
        return "ham"

In [55]:
classifications2 = []
for path in test_file_paths:
    # classify whether spam or ham
    email_class = classify2(path)
    classifications2.append(email_class)
classifications2

unknown encoding: iso-3629-1, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: iso-2671-9, <class 'LookupError'>
unknown encoding: iso-9001-6, <class 'LookupError'>
unknown encoding: 134, <class 'LookupError'>
unknown encoding: iso-1354-9, <class 'LookupError'>
unknown encoding: iso-3629-1, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: iso-1354-9, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: iso-9001-6, <class 'LookupError'>
unknown encoding: %CHARSET, <class 'LookupError'>
unknown encoding: windows-874, <class 'LookupError'>
unknown encoding: DEFAULT, <class 'LookupError'>
unknown encoding: windows-874, <class 'LookupError'>
unknown encoding: DEFAULT, <class 'LookupError'>
unknown encoding: iso-8446-6, <class 'LookupError'>
unknown encoding: iso-5474-8, <clas

['spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'spam',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'ham',
 'spam',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',
 'ham',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'spam',
 'spam',
 'spam',
 'spam',
 'ham',
 'ham',
 'spam',
 'ham',
 'ham',
 'ham',
 'ham',

In [59]:
evaluate = (labels == classifications2)

In [102]:
eval_info = pd.DataFrame(labels,index=test_file_paths)
eval_info = eval_info.rename(columns={0: 'given labels'})
eval_info["evaluation label"] = evaluate

In [103]:
eval_info

Unnamed: 0,given labels,evaluation label
../data/071/000,spam,True
../data/071/001,ham,True
../data/071/002,spam,True
../data/071/003,spam,True
../data/071/004,spam,True
...,...,...
../data/126/017,spam,True
../data/126/018,spam,True
../data/126/019,spam,True
../data/126/020,spam,True


In [104]:
# evaluate whether result of classification is:
# TP (true positive), TN (true Negative), FP (false positive), or FN (false negative)
accuracy2 = []
for path in test_file_paths:
    if eval_info.loc[path]["evaluation label"] == True:
        if  eval_info.loc[path]["given labels"] == "spam":
            accuracy2.append("TN")
        elif eval_info.loc[path]["given labels"] == "ham":
            accuracy2.append("TP")
    elif eval_info.loc[path]["evaluation label"] == False:
        if  eval_info.loc[path]["given labels"] == "spam":
            accuracy2.append("FP")
        elif eval_info.loc[path]["given labels"] == "ham":
            accuracy2.append("FN")

In [105]:
accuracy2

['TN',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TP',
 'TP',
 'TP',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'FP',
 'FP',
 'TN',
 'TP',
 'TN',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'FP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TN',
 'TN',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'FP',
 'FP',
 'TP',
 'FP',
 'TP',
 'TP',
 'TP',
 'TN',
 'TN',
 'TP',
 'TP',
 'TP',
 'TN',
 'TP',
 'TP',
 'TP',
 'TP',
 'FP',
 'FP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'FP',
 'TN',
 'TP',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TN',
 'TP',
 'TN',
 'FP',
 'TN',
 'TP',
 'FP',
 'TN',
 'TP',
 'FP',
 'TP',
 'FP',
 'TP',
 'TN',
 'TP',
 'TN',
 'TN',
 'TN',
 'TN',
 'FP',
 'TN',
 'TN',
 'TN',
 'TN',
 'TP',
 'TP',
 'TN',
 'FP',
 'TP',
 'TP',
 'TP',
 'FP',
 'TN',
 'TP',
 'TP',
 'TN',
 'TP',
 'TN',
 'TN',
 'FP',
 'TP',
 'TN',
 'TN',
 'TN',
 'TP',
 'TN',
 'FP',
 'TN',
 'TN',
 'TN',
 'TN',
 'FP',
 'TN',
 'TN',
 'FP',

In [106]:
for item in accuracy2:
    tp2 = filter(lambda item: (item == "TP"), accuracy2)
    tn2 = filter(lambda item: (item == "TN"), accuracy2)
    fp2 = filter(lambda item: (item == "FP"), accuracy2)
    fn2 = filter(lambda item: (item == "FN"), accuracy2) 

In [107]:
tp2 = list(tp2)
tn2 = list(tn2)
fp2 = list(fp2)
fn2 = list(fn2)

In [108]:
len_tp2 = len(tp2)
len_tn2 = len(tn2)
len_fp2 = len(fp2)
len_fn2 = len(fn2)

In [110]:
accuracy2 = (len_tp2 + len_tn2)/(len_tp2 + len_tn2 + len_fp2 + len_fn2)
recall2 = len_tp2/ (len_tp2 + len_fn2)
precision2 = len_tp2 / (len_tp2 + len_fp2)
print(accuracy2, recall2, precision2)

0.8459024331194771 0.9456098013736773 0.6933442221314823


In [119]:
comparison = np.array([[0.9213170318363394, 0.940226471134212,0.838159854376965],
                       [0.8459024331194771, 0.9456098013736773, 0.6933442221314823]])
comparison = pd.DataFrame(comparison, columns=["accuracy", "recall", "precision"], index=["with stop words","without stop words"])

In [120]:
comparison

Unnamed: 0,accuracy,recall,precision
with stop words,0.921317,0.940226,0.83816
without stop words,0.845902,0.94561,0.693344


In [None]:
def count_per_feature2(files):
    messages = []
    for file in files:
        open_file = open(file, 'r',  encoding="ISO-8859-1")
        print(file)

        # extract body of messages
        body = extract_msg(open_file)

        if(type(body) == str):
            # convert all chars to lowercase
            body = body.lower()
        else:
            # if body is not string, turn into empty string (no message)
            body = ""

        # tokenize body of messages
        words = tokenize_msg2(body)
        # append to list of words per message
        messages.append(words)

    # create a set of unique words from the ham training set
    features= [word for message in messages for word in message]
    features= set(features)

    # list of lists, each sublist contains count of each feature per message
    feature_count = []

    for message in messages:
        # list of count of each feature in a particular message
        count_per_feature = []
        for feature in features:
            # if feature occurs in message
            if(feature in message):
                # count occurence
                matches = filter(lambda word: (word == feature), message)
                matches = list(matches)
                count = len(matches)
            else:
                # else count is 0
                count = 0
            count_per_feature.append(count)
        feature_count.append(count_per_feature)
    
    print("finished counting. creating dataframe")
    # creates dataframe of feature and its count per message
    feature_count = np.array(feature_count) 
    features_df = pd.DataFrame(feature_count, index = files, columns = features)

    return features_df