In [1]:
from os import listdir, getcwd, chdir
from os.path import isfile, join, dirname, realpath
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#change working directory to where data folders are
#raw string so every backslash is taken literally (a bare "\M" inside a normal
#string literal is an invalid escape sequence that newer Python versions warn about)
#NOTE: this is a machine-specific absolute path; adjust it when running elsewhere
chdir(r'C:\Users\Mburg\Documents\SMU\Spring 2022\Quantifying the World\Case Study 3')

In [3]:
#names of the sub-folders that hold the raw message files
directories = ['easy_ham', 'easy_ham_2', 'spam', 'spam_2']

In [4]:
#start from an empty frame (no rows, no columns) that will hold the per-email info
email_info = pd.DataFrame(data=None)

In [5]:
"""
fill email dataframe
info includes whether email is spam, whether the email is a reply,
if the subject is in all caps, if there are any attachments,
the total numbers of lines, the number of lines containing text(body lines), and the number of empty lines
adapted from Email Processing by Brad Blanchard
"""

def _email_features(lines):
    """Summarise one email given as the list of its raw text lines.

    Parameters
    ----------
    lines : list of str
        The lines of the message file, newline characters included.

    Returns
    -------
    dict
        Keys 'in_reply', 'subj_caps', 'attachments' ('Y'/'N' flags) and
        'num_lines', 'body_lines', 'blank_lines' (integer counts).
    """
    in_reply = False
    subj_caps = False
    multipart = False
    blank_lines = 0

    for line in lines:
        #these are plain substring tests on every line, so matching text
        #anywhere in the file (not only in the header) sets the flag
        if "Subject: Re: " in line:
            in_reply = True
        if "Subject: " in line:
            #keep only the letters and digits of the subject text
            subject = ''.join(
                ch for ch in line.strip().replace('Subject: ', '') if ch.isalnum()
            )
            #require at least one character: an empty subject is not "all caps"
            #(digits are not uppercase, so a subject containing digits never qualifies)
            if subject and all(ch.isupper() for ch in subject):
                subj_caps = True
        if "content-type: multipart" in line.lower():
            multipart = True
        if line == "\n":
            blank_lines += 1

    n_lines = len(lines)
    return {
        'in_reply': 'Y' if in_reply else 'N',
        'subj_caps': 'Y' if subj_caps else 'N',
        'attachments': 'Y' if multipart else 'N',
        'num_lines': n_lines,
        'body_lines': n_lines - blank_lines,
        'blank_lines': blank_lines,
    }


#collect one record per file and build the frame once at the end
#(growing a DataFrame row by row is quadratic, and DataFrame.append no longer
#exists in pandas 2.x)
email_rows = []

#get the list of files in each directory
for d in directories:
    mypath = join(getcwd(), 'SpamAssassinMessages', d)
    #skip the macOS '.DS_Store' artefact; sort because listdir order is arbitrary
    onlyfiles = sorted(
        f for f in listdir(mypath)
        if f != '.DS_Store' and isfile(join(mypath, f))
    )

    #for each file read the lines inside the email
    for file in onlyfiles:
        #the with-block closes the file, no explicit close() needed
        with open(join(mypath, file), 'r', encoding='latin1') as f:
            lines = f.readlines()

        #the spam label comes from the directory name, not from the content
        row = {'filename': file, 'is_spam': 'Y' if 'spam' in d else 'N'}
        row.update(_email_features(lines))
        email_rows.append(row)

#explicit column list keeps the column order fixed and the columns present
#even when no files were found
email_info = pd.DataFrame(email_rows, columns=[
    'filename',
    'is_spam',
    'in_reply',
    'subj_caps',
    'attachments',
    'num_lines',
    'body_lines',
    'blank_lines',
])

#write the data to a csv output file
email_info.to_csv('output_file.csv', index=False)

In [6]:
#the 'cmds' entries are not emails, so keep every row whose filename is anything else
email_info = email_info.loc[email_info['filename'] != 'cmds']

In [7]:
#sanity check - preview the in-memory data frame; as the last expression of the
#cell it is rendered as a table (this cell does not re-read output_file.csv)
email_info

Unnamed: 0,filename,is_spam,in_reply,subj_caps,attachments,num_lines,body_lines,blank_lines
0,00001.7c53336b37003a9286aba55d2945844c,N,Y,N,N,113,93,20
1,00002.9c4069e25e1ef370c078db7ee85ff9ac,N,N,N,N,73,65,8
2,00003.860e3c3cee1b42ead714c5c874fe25f7,N,N,N,N,82,73,9
3,00004.864220c5b6930b209cc287c361c99af1,N,N,N,N,78,69,9
4,00005.bf27cdeaf0b8c4647ecd61b1d09da613,N,Y,N,N,77,66,11
...,...,...,...,...,...,...,...,...
8846,01396.e80a10644810bc2ae3c1b58c5fd38dfa,Y,N,N,N,271,264,7
8847,01397.f75f0dd0dd923faefa3e9cc5ecb8c906,Y,N,N,Y,368,358,10
8848,01398.8ca7045aae4184d56e8509dc5ad6d979,Y,N,N,N,78,54,24
8849,01399.2319643317e2c5193d574e40a71809c2,Y,N,N,N,304,239,65


In [8]:
#create vectorizer using standard English stop words from SciKit Learn
#(this line only constructs the vectorizer; nothing is fitted or transformed here)
tfidf = TfidfVectorizer(stop_words='english')