In [1]:
#basic algorithm
#convert email into feature vector
#add hyperparameters to:
# - strip email headers
# - convert to lowercase
# - remove punctuation
# - replace url with "URL" 
# - replace numbers with "NUMBERS"
# - Preform Stemming (trim word endings with library)

In [2]:
import pandas as pd
import numpy as np
import os
import email
import email.policy
import nltk 
from collections import Counter
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator, TransformerMixin


os.listdir('./hamnspam/')

['spam', 'ham']

DATA EXPLORATION

In [13]:
ham_filenames = [name for name in sorted(os.listdir('./hamnspam/ham')) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir('./hamnspam/spam')) if len(name) > 20]


In [7]:
print('Amount of ham files: ', len(ham_filenames))
print('Amount of spam files: ', len(spam_filenames))
print('ham, spam ratio: ', len(ham_filenames)/len(spam_filenames))
print('spam, ham ratio: ', len(spam_filenames)/len(ham_filenames))

Amount of ham files:  2551
Amount of spam files:  501
ham, spam ratio:  5.091816367265469
spam, ham ratio:  0.1963935711485692


In [14]:
def load_emails(is_spam, filename):
    directory = "./hamnspam/spam" if is_spam else "./hamnspam/ham"
    with open(os.path.join(directory, filename), 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_email = [load_emails(is_spam = False, filename = name) for name in ham_filenames]
spam_email = [load_emails(is_spam = True, filename = name) for name in spam_filenames]


TURNING EMAILS TO TO PLAIN TEXT

In [79]:
def get_email_structure(email):
    if isinstance(email, str): #if email is a string, it is basically already in plain text so just return the email type
        return email 
    payload = email.get_payload()
    if isinstance (payload, list): #if email payload is a list then it means there are multiple emails, so we loop through all the emails in the list and return each email type the "get email structure" function
        return 'multipart({})'.format(', '.join([get_email_structure(sub_email) for sub_email in payload])) #kinda recursive lol
    else: 
        return email.get_content_type()

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1 #basically just increasing the value count (frequency) by 1
    return structures 


In [80]:
ham_structure = structures_counter(ham_email)
spam_structure = structures_counter(spam_email)

In [151]:
# - Strip email headers
# - Convert to lowercase
# - Remove punctuation
# - Replace url with "URLS"
# - Replace numbers with "NUMBER"
# - Perform stemming (trim word endings with library)

class EmailToWords(BaseEstimator, TransformerMixin):
    def __init__(self, stripHeaders=True, lowercaseConversion=True, punctuationRemoval=True, urlReplace=True, numberReplacement=True, stemming=True):
        self.stripHeaders = stripHeaders
        self.punctuationRemoval = punctuationRemoval
        self.urlReplace = urlReplace
        self.numberReplacement = numberReplacement
        self.stemming = stemming
        self.stemmer = nltk.PorterStemmer()
        self.lowercaseConversion = lowercaseConversion
    
    def html_to_plain(self, email):
        try:
            soup = BeautifulSoup(email.get_content(), 'html.parser')
            return soup.text.replace('\n\n', '')
        except:
            return 'empty'
    
    def email_to_plain(self, email):
        struct = get_email_structure(email)
        for part in email.walk():
            part_content_type = part.get_content_type()
            if part_content_type not in ['text/plain', 'test/html']:
                continue 
            try:
                part_content = part.get_content()
            except:  #in case of encoding issues
                part_content = str(part.get_payload())
            if part_content_type == 'text/plain':
                return part_content 
            else:
                return self.html_to_plain(part)

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_to_words = []
        for email in X:
            text = self.email_to_plain(email)
            if text is None:
                text = 'empty'
            if self.lowercaseConversion:
                text = text.lower()
            
            if self.punctuationRemoval:
                text = text.replace('.', '')
                text = text.replace(',', '')
                text = text.replace('!', '')
                text = text.replace('?', '')
            
            word_count = Counter(text.split())
            if self.stemming:
                stemmed_word_count = Counter()
                for word, count in word_count.items():
                    stemmed_word = self.stemmer.stem(word)
                    stemmed_word_count[stemmed_word] += count
                word_counts = stemmed_word_count
            X_to_words.append(word_count)
        return np.array(X_to_words)

In [152]:
X_few = ham_email[:2]
Xwordcounts = EmailToWords().fit_transform(X_few)
Xwordcounts


array([Counter({'the': 15, 'pick': 9, '-lbrace': 6, 'of': 5, '-rbrace': 5, 'i': 4, 'is': 4, '-list': 4, 'this': 3, '+inbox': 3, '-subject': 3, 'ftp': 3, '-sequence': 3, '18:19:04': 3, 'command': 3, 'delta$': 3, 'from': 3, 'error': 2, '18:19:03': 2, '4852-4852': 2, 'mercury': 2, '1': 2, "that's": 2, 'comes': 2, 'version': 2, 'using': 2, 'on': 2, 'and': 2, 'one': 2, 'date:': 1, 'wed': 1, '21': 1, 'aug': 1, '2002': 1, '10:54:46': 1, '-0500': 1, 'from:': 1, 'chris': 1, 'garrigues': 1, '<cwg-dated-103037728706fa6d@deepeddycom>': 1, 'message-id:': 1, '<10299452874797tmda@deepeddyvirciocom>': 1, '|': 1, "can't": 1, 'reproduce': 1, 'for': 1, 'me': 1, 'it': 1, 'very': 1, 'repeatable': 1, '(like': 1, 'every': 1, 'time': 1, 'without': 1, 'fail)': 1, 'debug': 1, 'log': 1, 'happening': 1, 'pick_it': 1, '{exec': 1, '-rbrace}': 1, '{4852-4852': 1, 'mercury}': 1, 'exec': 1, 'ftoc_pickmsgs': 1, '{{1': 1, 'hit}}': 1, 'marking': 1, 'hits': 1, 'tkerror:': 1, 'syntax': 1, 'in': 1, 'expression': 1, '"int': 

In [153]:
from scipy.sparse import csr_matrix

class WordCountToVector(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    
    def fit(self, X, y=None):
        total_word_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_word_count[word] += min(count, 10)
        self.most_common = total_word_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(self.most_common)}
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []

        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))


In [154]:
vocab_transformer = WordCountToVector(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(Xwordcounts)
X_few_vectors.toarray()

array([[164,  15,   9,   5,   4,   6,   3,   5,   3,   2,   4],
       [ 94,   5,   0,   3,   2,   0,   2,   0,   2,   3,   0]],
      dtype=int64)