# Spam Classifier: Preparing data

In [1]:
import os 

In [2]:
try:
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
except Exception:
    ROOT_DIR = os.path.abspath('..')

In [3]:
def get_folder_names(data_path):
    return [os.path.join(data_path, file_name) for file_name in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, file_name))]

In [4]:
data_path = os.path.join(ROOT_DIR, 'data')

data_folder_names = get_folder_names(data_path)
data_folder_names

['d:\\repos\\spam-classifier\\data\\easy_ham',
 'd:\\repos\\spam-classifier\\data\\easy_ham_2',
 'd:\\repos\\spam-classifier\\data\\hard_ham',
 'd:\\repos\\spam-classifier\\data\\spam',
 'd:\\repos\\spam-classifier\\data\\spam_2']

In [5]:
import email
import email.policy

In [6]:
def get_files(folder_path: str) -> iter:
    return (file for file in os.listdir(folder_path))

In [7]:
def retrieve_email(file_path: str) -> 'bytes':
    with open(file_path, 'rb') as file:
        return email.parser.BytesParser(policy= email.policy.default).parse(file)

In [8]:
from concurrent import futures

In [9]:
def load_one_data(data_folder_path: str) -> list:
    return [retrieve_email(os.path.join(data_folder_path, file)) for file in get_files(data_folder_path)]

In [10]:
# def load_data(data_folder_path: list) -> dict:
#     emails = {}
#     for folder_path in data_folder_path:
#         folder_name = os.path.basename(folder_path)
#         emails[folder_name] = [retrieve_email(os.path.join(folder_path, file)) for file in get_files(folder_path)]
    
#     return emails

In [11]:
def load_data(data_folder_path: list) ->dict:
    emails = {}
    with futures.ThreadPoolExecutor(len(data_folder_path)) as executor:
        res = executor.map(load_one_data, data_folder_path)
    
    for folder_path, mails in zip(data_folder_path, res):
        folder_name = os.path.basename(folder_path)
        emails[folder_name] = mails

    return emails


In [12]:
# emails = load_data_parallel(data_folder_names)
# emails

In [13]:
emails = load_data(data_folder_names)
emails

{'easy_ham': [<email.message.EmailMessage at 0x2235c0e31f0>,
  <email.message.EmailMessage at 0x2235c0e34c0>,
  <email.message.EmailMessage at 0x2235c0e3280>,
  <email.message.EmailMessage at 0x2235c0e35e0>,
  <email.message.EmailMessage at 0x2235c0e3a60>,
  <email.message.EmailMessage at 0x2235c0e3130>,
  <email.message.EmailMessage at 0x2235c145040>,
  <email.message.EmailMessage at 0x2235c145370>,
  <email.message.EmailMessage at 0x2235c0e3d60>,
  <email.message.EmailMessage at 0x2235c04ff70>,
  <email.message.EmailMessage at 0x2235c1450a0>,
  <email.message.EmailMessage at 0x2235c145cd0>,
  <email.message.EmailMessage at 0x2235c145c10>,
  <email.message.EmailMessage at 0x2235c1454c0>,
  <email.message.EmailMessage at 0x2235c1455e0>,
  <email.message.EmailMessage at 0x2235c145ee0>,
  <email.message.EmailMessage at 0x2235c145d90>,
  <email.message.EmailMessage at 0x2235c145af0>,
  <email.message.EmailMessage at 0x2235c1459d0>,
  <email.message.EmailMessage at 0x2235c145f10>,
  <email

In [14]:
print(emails['easy_ham'][1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [15]:
def get_structures(email):
    if isinstance(email,str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multiapart({})".format(', '.join([get_structures(sub_email) 
                                                  for sub_email in payload]))
    else:
        return email.get_content_type()

    

In [16]:
from collections import Counter 

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_structures(email)
        structures[structure] += 1
    
    return structures

In [17]:
structures_counter(emails['easy_ham']).most_common()

[('text/plain', 2409),
 ('multiapart(text/plain, application/pgp-signature)', 66),
 ('multiapart(text/plain, text/html)', 8),
 ('multiapart(text/plain, text/plain)', 4),
 ('multiapart(text/plain)', 3),
 ('multiapart(text/plain, application/octet-stream)', 2),
 ('multiapart(text/plain, text/enriched)', 1),
 ('multiapart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multiapart(multiapart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multiapart(text/plain, video/mng)', 1),
 ('multiapart(text/plain, multiapart(text/plain))', 1),
 ('multiapart(text/plain, application/x-pkcs7-signature)', 1),
 ('multiapart(text/plain, multiapart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multiapart(text/plain, multiapart(text/plain, text/plain), multiapart(multiapart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multiapart(text/plain, application/x-java-applet)', 1)]

In [18]:
structures_counter(emails['spam']).most_common()


[('text/plain', 219),
 ('text/html', 183),
 ('multiapart(text/plain, text/html)', 45),
 ('multiapart(text/html)', 20),
 ('multiapart(text/plain)', 19),
 ('multiapart(multiapart(text/html))', 5),
 ('multiapart(text/plain, image/jpeg)', 3),
 ('multiapart(text/html, application/octet-stream)', 2),
 ('multiapart(text/plain, application/octet-stream)', 1),
 ('multiapart(text/html, text/plain)', 1),
 ('multiapart(multiapart(text/html), application/octet-stream, image/jpeg)',
  1),
 ('multiapart(multiapart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [19]:
emails.keys()

dict_keys(['easy_ham', 'easy_ham_2', 'hard_ham', 'spam', 'spam_2'])

In [20]:
for header, value in emails['spam'][0].items():
    print(header, ":", value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [21]:
emails['spam'][0]['Subject']

'Life Insurance - Why Pay More?'

In [157]:
import numpy as np 
from sklearn.model_selection import train_test_split

In [224]:
def create_dataset(emails: dict) -> np.array:
    y = []
    mails = []

    for key, values in emails.items():
        v = [1]
        if 'ham' in key:
            v = [0]
        
        y += v * len(values)

        mails += values
            
    return np.array(mails, object), np.array(y)


In [216]:
X, y = create_dataset(emails)

In [217]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [218]:
from bs4 import BeautifulSoup

In [219]:
import re
from html import unescape


def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)


In [220]:
html_spam_emails = [email for email in X_train[y_train == 1]
                    if get_structures(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")


Dear cpunks ,

<BODY bgColor=#ffccff>
<TABLE border=0 cellPadding=0 cellSpacing=0 width=475>
  <TBODY>
  <TR>
    <TD align=middle vAlign=top></TD></TR></TBODY></TABLE><BR>
<TABLE>
  <TBODY>
  <TR>
    <TD width="5%"></TD>
    <TD bgColor=#b8ecff borderColor=#0000ff width="90%"><FONT color=#ff0000 
      face="Arial Black" 
      size=6>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Want   
      To Harvest A Lot Of Email&nbsp;&nbsp; Addresses In A Very Short Time?</FONT>   
      <P><B><FONT color=#0000ff face=Arial size=4>Easy Email   
      Searcher</FONT><FONT color=#ff00ff face=Arial size=4>&nbsp; is&nbsp;   
      a&nbsp; powerful&nbsp; Email&nbsp; software&nbsp;&nbsp; that&nbsp;   
      harvests general Email lists from mail servers&nbsp;&nbsp; </FONT><FONT   
      color=#0000ff face=Arial size=4>Easy Email Searcher </FONT><FONT   
      color=#ff00ff face=Arial size=4>can get 100,000 Email</FONT></B> <FONT   
      color=#ff00ff face=Arial size=4><B>addresses di

In [222]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


Dear cpunks ,
               Want
      To Harvest A Lot Of Email   Addresses In A Very Short Time?
      Easy Email
      Searcher  is 
      a  powerful  Email  software   that 
      harvests general Email lists from mail servers   Easy Email Searcher can get 100,000 Email addresses directly from the Email
      servers in only one hour! 
        Easy Email
        Searcher is a 32 bit Windows Program for e-mail marketing. It
        is intended for easy and convenient search large e-mail address lists
        from mail servers. The program can be operated on Windows 95/98/ME/2000
        and NT.
        Easy Email
        Searcher support multi-threads (up to 512
        connections).
        Easy Email
        Searcher has the ability  to reconnect to the mail
        server if the server has disconnected and continue the searching at the
        point where it has been interrupted.
        Easy Email
        Searcher has an ergonomic interface that is easy to set up
        and s

In [225]:
print(email_to_text(sample_html_spam)[:100], "...")


Dear cpunks ,
               Want
      To Harvest A Lot Of Email   Addresses In A Very Short Time?
 ...
