Change '../data/' to your local directory where the two folders 'easy_ham' and 'spam' are saved.

In [79]:
dir_data = '../data/'

Read email

In [80]:
import email
from bs4 import BeautifulSoup

def parse_email(file_path):
    with open(file_path, 'rb') as f:
        msg = email.message_from_bytes(f.read())
    # Get email headers
    subject = msg.get('Subject')

    # Read email's body
    body = str(msg.get_payload())

    # Remove HTML tags
    body = BeautifulSoup(body).get_text()
    
    return subject, body

Preprocessing

In [81]:
import spacy
import re

def preprocess(text):
    text = str(text) # make sure input type is string.
    nlp = spacy.load("en_core_web_sm")
    
    text = text.lower() # Make lower case.
    text = re.sub(r'=\n', '', text)  # Remove =\n.
    text = re.sub(r'\n', ' ', text)  # Remove \n.
    text = ' '.join(text.split())  # Replace all whitespaces to one space.
    
    doc = nlp(text)
    processed_tokens = []
    for token in doc:
        if token.like_url: # Check for URLs.
            processed_tokens.append("urllink") # Replace actual URL link to a word 'urllink'
            continue
        if token.like_email: # Check for email addresses.
            processed_tokens.append("emailaddress") # Replace actual email address to a word 'emailaddress'
            continue
        if not token.is_stop and not token.is_punct: # Check for stopwords and punctuations.
            processed_tokens.append(token.lemma_) # Lemmatization.
    text = " ".join(processed_tokens)
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Leave only alphabets and whitespaces.
    return text

Preprocessing: TF-IDF

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Add an additional filter for stopwords from TF-IDF.
vectorizer = TfidfVectorizer(stop_words='english')

Pathlib to read files in data folder.

In [83]:
from pathlib import Path

dir_spam = Path(dir_data+"spam")
dir_ham = Path(dir_data+"easy_ham")

files_spam = list(dir_spam.iterdir())
files_ham = list(dir_ham.iterdir())

Test above functions on the first ham mail.

In [84]:
subject, body = parse_email(files_ham[0])
body_preprocess = preprocess(body)
body_tfidf = vectorizer.fit_transform([body_preprocess])
feature_names = vectorizer.get_feature_names_out()
# print(body)
print(f"*Preprocessed body:\n    {body_preprocess}")
print(f"*Feature names for TF-IDF:\n {feature_names}")
print(f"*Shape of TF-IDF:\n      {body_tfidf.shape}")

*Preprocessed body:
    url urllink date supply blake blogs mpt want mozilla look like msie admit evidence pretty compelling recall ask agree mpt  list quote list agree item mpt mention problem course people problem mind mpt agree problem necessarily agree solution problem maybe different idea solve particular issue believe issue need address cover list specifically navigator chrome structure necessarily agree mpt propose default configuration agree chrome structure painfully restrictive customizable toolbar need implement order acquire flexibility deal problem speed argue cut lot useless ui feature chrome help substantially reduce bloat gain speed text editing use chimera mac textfield widget easily painful entire application buggy slow misbehave edit way expect imo chimera usability problem message display yes argument search yeah mess know mess menu structure get blog app separate menu structure complicate order deal multiple application clean separation naturally simplify menu eg e

Create a dataframe with preprocessed data.

In [85]:
from sklearn.model_selection import train_test_split
import pandas as pd

# label = [0] * len(files_ham)
# label.append([1] * len(files_spam))

subject = []
body = []
label = []

for file in files_ham:
    subject_ham, body_ham = parse_email(file)
    subject.append(preprocess(subject_ham))
    body.append(preprocess(body_ham))
    label.append(0)

for file in files_spam:
    subject_spam, body_spam = parse_email(file)
    subject.append(preprocess(subject_spam))
    body.append(preprocess(body_spam))
    label.append(1)

df = pd.DataFrame(
    {
        'subject': subject,
        'body': body,
        
        'target': label
    }
)
df.to_csv(dir_data+'data_spam.csv', index=False) # Load csv file instead of running preprocessing again.

Above cell requires about 24 minutes on preprocessing, so I recommend directly using the preprocessed csv file.

In [86]:
df  = pd.read_csv(dir_data+"data_spam.csv")
df.head(-1)

Unnamed: 0,subject,body,target
0,usability problem mozilla,url urllink date supply blake blogs mpt want m...,0
1,exmh speed,,0
2,hanson sept message national review,hanson good sci fi author plan slip follow lin...,0
3,use new apt null rh upgrade,matthias saou emailaddress write red hat reco...,0
4,satalk funny,wednesday august cet theo van dinter write ...,0
...,...,...,...
2996,low cost easy use conference,low rate service conference easy cent minute ...,1
2997,shape summer,see nbc cbs cnn oprah health discovery actuall...,1
2998,american millionaire reveal secret source weal...,hello ...,1
2999,gain low interest rate year,opportunity knock mortgage rate rise national ...,1
