# Importing Data


In [1]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

In [2]:
def fetch_spam_dataset(ham_url:str=HAM_URL, spam_url:str=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for file_name, url in (("ham.ter.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, file_name)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

In [3]:
# fetch_spam_dataset()

In [4]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [5]:
for item in zip(["Spam", "Ham"],[len(spam_filenames), len(ham_filenames)]):
    print(item)

('Spam', 500)
('Ham', 2500)


In [6]:
import email 
import email.policy

def load_email(is_spam:bool, file_name:str, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, file_name), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [7]:

ham_emails = [load_email(is_spam=False, file_name=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, file_name=name) for name in spam_filenames]


In [8]:
print(ham_emails[0].get_content().strip())

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [9]:
print(spam_emails[4].get_content().strip())

I thought you might like these:
1) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
http://www.freeyankee.com/cgi/fy2/to.cgi?l=822slim1

2) Fight The Risk of Cancer! 
http://www.freeyankee.com/cgi/fy2/to.cgi?l=822nic1 

3) Get the Child Support You Deserve - Free Legal Advice 
http://www.freeyankee.com/cgi/fy2/to.cgi?l=822ppl1

Offer Manager
Daily-Deals








If you wish to leave this list please use the link below.
http://www.qves.com/trim/?social@linux.ie%7C29%7C134077


-- 
Irish Linux Users' Group Social Events: social@linux.ie
http://www.linux.ie/mailman/listinfo/social for (un)subscription information.
List maintainer: listmaster@linux.ie


In [10]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return f"multipart({','.join([get_email_structure(sub_email) for sub_email in payload])})"
    else:
        return email.get_content_type()

In [15]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [40]:
import pandas as pd
s_ham = structures_counter(ham_emails).most_common()
s = {}

for key,val in s_ham:
    s[key] = val
df = pd.Series(s)

In [43]:
df/len(ham_emails)

text/plain                                                                                                               0.9632
multipart(text/plain,application/pgp-signature)                                                                          0.0264
multipart(text/plain,text/html)                                                                                          0.0032
multipart(text/plain,text/plain)                                                                                         0.0016
multipart(text/plain)                                                                                                    0.0012
multipart(text/plain,application/octet-stream)                                                                           0.0008
multipart(text/plain,text/enriched)                                                                                      0.0004
multipart(text/plain,application/ms-tnef,text/plain)                                                    

In [46]:
s_spam = structures_counter(spam_emails).most_common()
s = {}

for key,val in s_spam:
    s[key] = val
df = pd.Series(s)

In [47]:
df / len(spam_emails)

text/plain                                                             0.436
text/html                                                              0.366
multipart(text/plain,text/html)                                        0.090
multipart(text/html)                                                   0.040
multipart(text/plain)                                                  0.038
multipart(multipart(text/html))                                        0.010
multipart(text/plain,image/jpeg)                                       0.006
multipart(text/html,application/octet-stream)                          0.004
multipart(text/plain,application/octet-stream)                         0.002
multipart(text/html,text/plain)                                        0.002
multipart(multipart(text/html),application/octet-stream,image/jpeg)    0.002
multipart(multipart(text/plain,text/html),image/gif)                   0.002
multipart/alternative                                                  0.002

In [52]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [53]:
ham_emails[0]["Subject"]

'Re: New Sequences Window'

In [54]:
import numpy as np 
from sklearn.model_selection import train_test_split

x = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [81]:
from bs4 import BeautifulSoup

def html_to_plain_text(email):
    soup = BeautifulSoup(email, 'html.parser')
    links = soup.find_all("a")
    for link in links:
        text = soup.new_tag("p")
        text.string = "Hyperlink"
        link.replace_with(text)
    return soup.text    

In [85]:
html_spam_emails = [email for email in x_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<html>

<body>

<p><font face="Arial" color="#FF0000"><b>Attention U.S. HomeOwners</b></font> </p>

<p><font face="Arial"><b>If you want to save an extra $30,000 (average savings)</b></font> </p>
<p><font face="Arial"><b>on your mortgage even if you have already refinanced</b></font></p>
<p><a href="http://www.Smartest_Move_U_Could_Make.com%40w%77%77%2E%74%65%72%72%61%2Ee%73/pe%72%73o%6E%61%6C9/chunk102/"><font face="Arial"><b>CLICK HERE</b></font></a></p>
<p><font face="Arial"><b>We also have the lowest rates and most professional</b></font> </p>
<p><font face="Arial"><b>and friendly service you will experience.&nbsp; We will</b></font> </p>
<p><font face="Arial"><b>answer your questions with no obligation.</b></font></p>
<p><a href="http://www.Mortgage_Opportunity_777.com%40w%77%77%2E%74%65%72%72%61%2Ee%73/pe%72%73o%6E%61%6C9/pants105/"><font face="Arial"><b>CLICK HERE</b></font></a></p>
<p><font face="Arial"><b>We have rates as low as 4.65% and Loans for all</b></font></p>
<p><font 

In [86]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")



Attention U.S. HomeOwners 
If you want to save an extra $30,000 (average savings) 
on your mortgage even if you have already refinanced
Hyperlink
We also have the lowest rates and most professional 
and friendly service you will experience.  We will 
answer your questions with no obligation.
Hyperlink
We have rates as low as 4.65% and Loans for all
types of people and situations.
For those of you who have a mortgage and have been 
turned down we can still save you around $30,000.
Hyperlink for a FREE, friendly
quote.
 
 
If you no longer wish to receive our offers and updates Hyperlink 
 and we will promptly honor your request.


 ...


In [87]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [88]:
print(email_to_text(sample_html_spam)[:100], "...")



Attention U.S. HomeOwners 
If you want to save an extra $30,000 (average savings) 
on your mortgag ...


In [107]:
import nltk
import urlextract
import re

stemmer = nltk.PorterStemmer()
url_extractor = urlextract.URLExtract()


In [108]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, 
                 strip_headers=True,
                 lower_case=True,
                 remove_punctuations=True,
                 replace_urls=True,
                 replace_numbers=True,
                 stemming=True) -> None:
        super().__init__()
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuations = remove_punctuations
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        x_transformed = []
        for email in x:
            text = email_to_text(email) or ''
            if self.lower_case:
                text = text.lower()
            if self.replace_urls:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text.replace(url, "URL")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuations:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            x_transformed.append(word_counts)
        return np.array(x_transformed)    

In [116]:
X_few = x_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'s': 2, 'like': 2, 'were': 2, 'ye': 1, 'it': 1, 'nice': 1, 'to': 1, 'be': 1, 'back': 1, 'in': 1, 'america': 1, 'flaccid': 1, 'state': 1, 'seem': 1, 'onli': 1, 'yesterday': 1, 'we': 1, 'suffer': 1, 'electil': 1, 'dysfunct': 1, 'mayb': 1, 'if': 1, 'they': 1, 'made': 1, 'the': 1, 'ballot': 1, 'oval': 1, 'look': 1, 'littl': 1, 'blue': 1, 'pill': 1, 'no': 1, 'serious': 1, 'i': 1, 'm': 1, 'here': 1, 'all': 1, 'week': 1, 'you': 1, 'great': 1, 'nite': 1, 'everybodi': 1}),
       Counter({'number': 9, 'the': 7, 'of': 4, 'it': 4, 'and': 3, 's': 3, 'thi': 2, 'gay': 2, 'or': 2, 'numberk': 2, 'in': 2, 'suggest': 2, 'that': 2, 'a': 2, 'rah': 2, 'who': 2, 'for': 2, 'ha': 2, 'been': 2, 'ibuc': 2, 'com': 2, 'at': 1, 'am': 1, 'on': 1, 'gordon': 1, 'mohr': 1, 'wrote': 1, 'calcul': 1, 'elid': 1, 'cours': 1, 'say': 1, 'veri': 1, 'littl': 1, 'almost': 1, 'noth': 1, 'about': 1, 'overal': 1, 'popul': 1, 'behavior': 1, 'straight': 1, 'rel': 1, 'preval': 1, 'individu': 1, 'either': 1, 'group': 1

In [117]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [118]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.intc'>'
	with 19 stored elements in Compressed Sparse Row format>

In [119]:
X_few_vectors.toarray()

array([[ 35,   0,   1,   1,   2,   0,   1,   2,   0,   1,   2],
       [132,   9,   7,   4,   3,   4,   2,   1,   3,   1,   0],
       [  1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]],
      dtype=int32)

In [121]:
vocab_transformer.vocabulary_

{'number': 1,
 'the': 2,
 'it': 3,
 's': 4,
 'of': 5,
 'in': 6,
 'were': 7,
 'and': 8,
 'to': 9,
 'like': 10}

In [128]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(x_train)

In [129]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.974) total time=   0.4s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] END ................................ score: (test=0.980) total time=   0.3s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] END ................................ score: (test=0.989) total time=   0.4s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s finished


0.9809523809523809