In [1]:
from os import listdir, getcwd, chdir
from os.path import isfile, join, dirname, realpath
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
#change working directory to where data folders are
chdir('C:\\Users\Mburg\\Documents\\SMU\\Spring 2022\\Quantifying the World\\Case Study 3')

# Data Ingestion

In [3]:
#function to strip the message header from the email since we only want the body data
#adapted from code by Brad Blanchard
def strip_head(text):
    tmp=[]
    flag = False
    for i in text:
        if i=='':
            flag = True
        if flag == True:
            tmp.append(i)
    return tmp

In [4]:
#create list of data containing directories
directories = [
    'easy_ham',
    'easy_ham_2',
    'spam',
    'spam_2'
]    

In [5]:
#create data frame to hold email info
email_info = pd.DataFrame()

In [6]:
"""
fill email dataframe
info includes whether email is spam, whether the email is a reply,
if the subject is in all caps, if there are any attachemnts,
the total numbers of lines, the number of lines containing text(body lines), and the number of empty lines
finally, the body of the email is retrieved and added as the data column
adapted from Email Processing by Brad Blanchard
"""

#get the list of files in each directory
for d in directories:
    mypath = getcwd() + '\\SpamAssassinMessages\\' + d + '\\'
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    
    try:
        onlyfiles.remove('.DS_Store')
    except:
        pass
    
    #for each file read the lines inside the email
    for file in onlyfiles:
        with open(mypath + file, encoding='latin1') as f:
            data = [i.strip() for i in f]
            s = ' '
            lines = f.readlines()
            f.close()
            
            in_reply_count = 0
            sub_line_all_caps = 0
            attachments = 0
            subject_line = []
            n_lines = 0
            blank_lines = 0
            
            #for each line return info about the email content
            for line in lines:
                n_lines += 1
                if "Subject: Re: " in line:
                    in_reply_count += 1
                if "Subject: " in line:
                    s_line = line.strip().replace('Subject: ','')
                    s_line = ''.join(e for e in s_line if e.isalnum())
                    num_upper = sum(1 for c in s_line if c.isupper())
                    ttl_chars = len(s_line)
                    if num_upper == ttl_chars:
                        sub_line_all_caps += 1
                    subject_line.append(s_line)
                if "content-type: multipart" in line.lower():
                    attachments += 1
                if line == "\n":
                    blank_lines += 1
            
          
            
            #temp storage of data for each email file
            temp_frame = pd.DataFrame({
                'filename':file,
                'is_spam':['1' if 'spam' in d else '0'],
                'in_reply': ['1' if in_reply_count > 0 else '0'], 
                'subj_caps': ['1' if sub_line_all_caps > 0 else '0'], 
                'attachments': ['1' if attachments > 0 else '0'],
                'num_lines' : n_lines,
                'body_lines': n_lines - blank_lines,
                'blank_lines': blank_lines,
                'data' :  s.join(strip_head(data))
            }, index=[0])
            
            #move data to the email dataframe
            email_info = email_info.append(temp_frame, ignore_index=True)

#write the data to a csv output file
email_info.to_csv('output_file.csv', index=False)

In [7]:
#drop the CMDS file info, these are not emails
email_info = email_info[email_info.filename != 'cmds']

In [8]:
#sanity check - make sure data was written correctly
email_info

Unnamed: 0,filename,is_spam,in_reply,subj_caps,attachments,num_lines,body_lines,blank_lines,data
0,00001.7c53336b37003a9286aba55d2945844c,0,0,0,0,0,0,0,"Date: Wed, 21 Aug 2002 10:54:46 -0500 ..."
1,00002.9c4069e25e1ef370c078db7ee85ff9ac,0,0,0,0,0,0,0,"Martin A posted: Tassos Papadopoulos, the Gre..."
2,00003.860e3c3cee1b42ead714c5c874fe25f7,0,0,0,0,0,0,0,Man Threatens Explosion In Moscow Thursday A...
3,00004.864220c5b6930b209cc287c361c99af1,0,0,0,0,0,0,0,Klez: The Virus That Won't Die Already the m...
4,00005.bf27cdeaf0b8c4647ecd61b1d09da613,0,0,0,0,0,0,0,"> in adding cream to spaghetti carbonara, wh..."
...,...,...,...,...,...,...,...,...,...
8846,01396.e80a10644810bc2ae3c1b58c5fd38dfa,1,0,0,0,0,0,0,"<html> <head> <meta http-equiv=""content-type""..."
8847,01397.f75f0dd0dd923faefa3e9cc5ecb8c906,1,0,0,0,0,0,0,This is a multi-part message in MIME format. ...
8848,01398.8ca7045aae4184d56e8509dc5ad6d979,1,0,0,0,0,0,0,"Dear Subscriber, If I could show you a way t..."
8849,01399.2319643317e2c5193d574e40a71809c2,1,0,0,0,0,0,0,****Mid-Summer Customer Appreciation SALE!***...


# Data prep

In [9]:
#shuffle data set so not all spam emails are at bottom
email_cv = shuffle(email_info, random_state = 42)
email_cv.reset_index(drop = True, inplace = True)

In [10]:
#check balance of dataset
email_cv['is_spam'].value_counts()

0    6451
1    2397
Name: is_spam, dtype: int64

Data is unbalanced. 73% not spam, 27% spam

In [11]:
#create a 70/30 train test set
train_set = email_cv.sample(frac=0.7, replace = False, random_state = 42)
test_set = email_cv.drop(train_set.index)

train_set.reset_index(drop = True, inplace = True)
test_set.reset_index(drop = True, inplace = True)

In [12]:
#check that original spam/not spam balance perserved
train_set['is_spam'].value_counts()

0    4539
1    1655
Name: is_spam, dtype: int64

In [13]:
#check that original spam/not spam balance perserved
test_set['is_spam'].value_counts()

0    1912
1     742
Name: is_spam, dtype: int64

Percentages of spam/not spam are within 1 percent of the orignal dataset

# Baseline models with no additional features? 

Just using in_reply, subj_caps,	attachments, num_lines,	body_lines,	blank_lines or other similiar features we want to add. Not looking at the body of the email.

# Add TF-IDF features

All the previous features plus creating TF-IDF values from the bodies of the emails. Do we want to baseline just using the TF-IDF features or are we okay with the combined model? That would give us a baseline, a TF-IDF, and a baseline + TF-IDF to compare.

## Create training set

In [14]:
#create vectorizer and get values for training data
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_train = tfidf.fit_transform(train_set['data'], train_set['is_spam'])

In [15]:
tfidf_matrix_train

<6194x86395 sparse matrix of type '<class 'numpy.float64'>'
	with 723553 stored elements in Compressed Sparse Row format>

In [16]:
tfidf_matrix_train.shape

(6194, 86395)

86,395 unique words in 6,194 emails.

In [17]:
#append the training vectors to the training dataframe
train_set2 = train_set.join(pd.DataFrame.sparse.from_spmatrix(tfidf_matrix_train))

In [18]:
#drop the filename and data column as no longer needed for predictions
train_set2 = train_set2.drop(columns = ['filename', 'data'])

In [19]:
train_set2

Unnamed: 0,is_spam,in_reply,subj_caps,attachments,num_lines,body_lines,blank_lines,0,1,2,...,86385,86386,86387,86388,86389,86390,86391,86392,86393,86394
0,0,0,0,0,0,0,0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6189,0,0,0,0,0,0,0,0.020107,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6190,1,0,0,0,0,0,0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6191,0,0,0,0,0,0,0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6192,0,0,0,0,0,0,0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#create training X and Ys
X_train = train_set2.drop(columns = ['is_spam']).values
y_train = train_set2['is_spam']

## Create testing set

In [21]:
#create the tf-idf values for the test data
tfidf_matrix_test = tfidf.transform(test_set['data'])

In [22]:
tfidf_matrix_test

<2654x86395 sparse matrix of type '<class 'numpy.float64'>'
	with 286933 stored elements in Compressed Sparse Row format>

In [23]:
tfidf_matrix_test.shape

(2654, 86395)

In [24]:
#create new dataframe for testing with tf-idf features
test_set2 = test_set.join(pd.DataFrame.sparse.from_spmatrix(tfidf_matrix_test))

In [25]:
#drop unused columns
test_set2 = test_set2.drop(columns = ['filename', 'data'])

In [26]:
test_set2

Unnamed: 0,is_spam,in_reply,subj_caps,attachments,num_lines,body_lines,blank_lines,0,1,2,...,86385,86386,86387,86388,86389,86390,86391,86392,86393,86394
0,1,0,0,0,0,0,0,0.004368,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,1,0,0,0,0,0,0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.099122,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0.107368,0.116065,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649,0,0,0,0,0,0,0,0.023217,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2650,1,0,0,0,0,0,0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2651,0,0,0,0,0,0,0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2652,1,0,0,0,0,0,0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [27]:
#create the test x and y sets
X_test = test_set2.drop(columns = ['is_spam']).values
y_test = test_set2['is_spam']

## Multinomial Naive Bayes on Data with TF-IDF features added

In [28]:
#create the classifier and fit the model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

MultinomialNB()

In [29]:
#adapted from https://iq.opengenus.org/naive-bayes-on-tf-idf-vectorized-matrix/
#create spam or not spam predictions for test data
spam_pred = nb_classifier.predict(X_test)

# compute the performance measures
score1 = metrics.accuracy_score(y_test, spam_pred)
print("accuracy:   %0.3f" % score1)

#1 is spam, 0 is not spam
print(metrics.classification_report(y_test, spam_pred))

#for the matrix remember that 1 is spam, 0 is not spam
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, spam_pred, labels = ['1','0']))

print('------------------------------')

accuracy:   0.927
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1912
           1       0.99      0.74      0.85       742

    accuracy                           0.93      2654
   macro avg       0.95      0.87      0.90      2654
weighted avg       0.93      0.93      0.92      2654

confusion matrix:
[[ 551  191]
 [   4 1908]]
------------------------------


Our untuned Naive Bayes model performs very well with about 93% accuracy (f1-score) at classifying spam or not spam. There were 191 non-spam emails classified as spam out of 1,912 not spam emails, about a 10% false positive rate. Only 4 spam emails out of 742 emails got through the filter. A false negative rate of 0.5%.

## Put Clustering Model Here

# Count Vectorizer

Basically the same steps as above, but in every step where TF-IDF occurs it will need to be replaced with Count Vectorizer instead.

In [30]:
count = CountVectorizer(stop_words = 'english')