In [1]:
#basic algorithm
#convert email into feature vector
#add hyperparameters to:
# - strip email headers
# - convert to lowercase
# - remove punctuation
# - replace url with "URL" 
# - replace numbers with "NUMBERS"
# - Preform Stemming (trim word endings with library)

In [2]:
import pandas as pd
import numpy as np
import os
import email
import email.policy
import nltk 
from collections import Counter
from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split

os.listdir('./hamnspam/')

['spam', 'ham']

DATA EXPLORATION

In [3]:
ham_filenames = [name for name in sorted(os.listdir('./hamnspam/ham')) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir('./hamnspam/spam')) if len(name) > 20]


In [4]:
print('Amount of ham files: ', len(ham_filenames))
print('Amount of spam files: ', len(spam_filenames))
print('ham, spam ratio: ', len(ham_filenames)/len(spam_filenames))
print('spam, ham ratio: ', len(spam_filenames)/len(ham_filenames))

Amount of ham files:  2551
Amount of spam files:  501
ham, spam ratio:  5.091816367265469
spam, ham ratio:  0.1963935711485692


In [5]:
def load_emails(is_spam, filename):
    directory = "./hamnspam/spam" if is_spam else "./hamnspam/ham"
    with open(os.path.join(directory, filename), 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_email = [load_emails(is_spam = False, filename = name) for name in ham_filenames]
spam_email = [load_emails(is_spam = True, filename = name) for name in spam_filenames]


TURNING EMAILS TO TO PLAIN TEXT

In [6]:
def get_email_structure(email):
    if isinstance(email, str): #if email is a string, it is basically already in plain text so just return the email type
        return email 
    payload = email.get_payload()
    if isinstance (payload, list): #if email payload is a list then it means there are multiple emails, so we loop through all the emails in the list and return each email type the "get email structure" function
        return 'multipart({})'.format(', '.join([get_email_structure(sub_email) for sub_email in payload])) #kinda recursive lol
    else: 
        return email.get_content_type()

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1 #basically just increasing the value count (frequency) by 1
    return structures 


In [7]:
ham_structure = structures_counter(ham_email)
spam_structure = structures_counter(spam_email)

In [8]:
# - Strip email headers
# - Convert to lowercase
# - Remove punctuation
# - Replace url with "URLS"
# - Replace numbers with "NUMBER"
# - Perform stemming (trim word endings with library)

class EmailToWords(BaseEstimator, TransformerMixin):
    def __init__(self, stripHeaders=True, lowercaseConversion=True, punctuationRemoval=True, urlReplace=True, numberReplacement=True, stemming=True):
        self.stripHeaders = stripHeaders
        self.punctuationRemoval = punctuationRemoval
        self.urlReplace = urlReplace
        self.numberReplacement = numberReplacement
        self.stemming = stemming
        self.stemmer = nltk.PorterStemmer()
        self.lowercaseConversion = lowercaseConversion
    
    def html_to_plain(self, email):
        try:
            soup = BeautifulSoup(email.get_content(), 'html.parser')
            return soup.text.replace('\n\n', '')
        except:
            return 'empty'
    
    def email_to_plain(self, email):
        struct = get_email_structure(email)
        for part in email.walk():
            part_content_type = part.get_content_type()
            if part_content_type not in ['text/plain', 'test/html']:
                continue 
            try:
                part_content = part.get_content()
            except:  #in case of encoding issues
                part_content = str(part.get_payload())
            if part_content_type == 'text/plain':
                return part_content 
            else:
                return self.html_to_plain(part)

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_to_words = []
        for email in X:
            text = self.email_to_plain(email)
            if text is None:
                text = 'empty'
            if self.lowercaseConversion:
                text = text.lower()
            
            if self.punctuationRemoval:
                text = text.replace('.', '')
                text = text.replace(',', '')
                text = text.replace('!', '')
                text = text.replace('?', '')
            
            word_count = Counter(text.split())
            if self.stemming:
                stemmed_word_count = Counter()
                for word, count in word_count.items():
                    stemmed_word = self.stemmer.stem(word)
                    stemmed_word_count[stemmed_word] += count
                word_counts = stemmed_word_count
            X_to_words.append(word_count)
        return np.array(X_to_words)

In [9]:
from scipy.sparse import csr_matrix

class WordCountToVector(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    
    def fit(self, X, y=None):
        total_word_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_word_count[word] += min(count, 10)
        self.most_common = total_word_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(self.most_common)}
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []

        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        print(len(data))
        print(len(rows))
        print(len(cols))
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))


Creating A Pipeline (A simple one)  

Basically Just Preprocessing The Data

In [10]:
email_pipeline = Pipeline([
    ('Email To Words', EmailToWords()),
    ('Word Count To Vectors', WordCountToVector()),
])

In [11]:
X = np.array(ham_email + spam_email)
y = np.array([0] * len(ham_email) + [1] * len(spam_email))

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  """Entry point for launching an IPython kernel.


In [12]:
X_augumented_train = email_pipeline.fit_transform(x_train)

284436
284436
284436


In [13]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression(solver='liblinear', random_state=42)
score = cross_val_score(log_clf, X_augumented_train, y_train, cv=3)
score.mean()



0.9913974692572478

In [18]:
from sklearn.metrics import precision_score, recall_score

X_augmented_test = email_pipeline.transform(x_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_augumented_train, y_train)

y_pred = log_clf.predict(X_augmented_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

80997
80997
80997
0 0
Precision: 94.68%
Recall: 96.74%


In [24]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(log_clf, open(filename, 'wb'))