In [211]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return "".join(self.fed)

In [213]:
def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [215]:
t = '<tr><td aling="left"><a href="../../issues/51/16.html#article">Phrack World News</a></td>'
strip_tags(t)

'Phrack World News'

In [217]:
import email
import string
import nltk

class Parser:
    def __init__(self):
        self.stemmer = nltk.PorterStemmer()
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.punctuation = list(string.punctuation)

    def parse(self, email_path):
        with open(email_path, errors='replace') as e:
            msg = email.message_from_file(e)
        return None if not msg else self.get_email_content(msg)

    def get_email_content(self, msg):
        subject = self.tokenize(msg['Subject']) if msg['Subject'] else []
        body = self.get_email_body(msg.get_payload(), msg.get_content_type())
        content_type = msg.get_content_type()

        return {"subject":subject,
                 "body": body,
                  "content_type":content_type}

    def get_email_body(self, payload, content_type):
        body = []
        if type(payload) is str and content_type == 'text/plain':
            return self.tokenize(payload)
        elif type(payload) is str and content_type == 'text/html':
            return self.tokenize(strip_tags(payload))
        elif type(payload) is list:
            for p in payload:
                body += self.get_email_body(p.get_payload(), p.get_content_type())
        return body

    def tokenize(self, text):
        for c in self.punctuation:
            text = text.replace(c,"")
        text = text.replace("\t","")
        text = text.replace("\n","")
        tokens = list(filter(None, text.split(" ")))
        return [self.stemmer.stem(w) for w in tokens if w not in self.stopwords]

In [219]:
inmail = open("data/inmail.1").read()
print(inmail)

From RickyAmes@aol.com  Sun Apr  8 13:07:32 2007
Return-Path: <RickyAmes@aol.com>
Received: from 129.97.78.23 ([211.202.101.74])
	by speedy.uwaterloo.ca (8.12.8/8.12.5) with SMTP id l38H7G0I003017;
	Sun, 8 Apr 2007 13:07:21 -0400
Received: from 0.144.152.6 by 211.202.101.74; Sun, 08 Apr 2007 19:04:48 +0100
Message-ID: <WYADCKPDFWWTWTXNFVUE@yahoo.com>
From: "Tomas Jacobs" <RickyAmes@aol.com>
Reply-To: "Tomas Jacobs" <RickyAmes@aol.com>
To: the00@speedy.uwaterloo.ca
Subject: Generic Cialis, branded quality@ 
Date: Sun, 08 Apr 2007 21:00:48 +0300
X-Mailer: Microsoft Outlook Express 6.00.2600.0000
MIME-Version: 1.0
Content-Type: multipart/alternative;
	boundary="--8896484051606557286"
X-Priority: 3
X-MSMail-Priority: Normal
Status: RO
Content-Length: 988
Lines: 24

----8896484051606557286
Content-Type: text/html;
Content-Transfer-Encoding: 7Bit

<html>
<body bgcolor="#ffffff">
<div style="border-color: #00FFFF; border-right-width: 0px; border-bottom-width: 0px; margin-bottom: 0px;" align="

In [221]:
import nltk
nltk.download('stopwords')
p  = Parser()
p.parse("data/inmail.1")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'subject': ['gener', 'ciali', 'brand', 'qualiti'],
 'body': ['do',
  'feel',
  'pressur',
  'perform',
  'rise',
  'occasiontri',
  'viagrayour',
  'anxieti',
  'thing',
  'past',
  'willb',
  'back',
  'old',
  'self'],
 'content_type': 'multipart/alternative'}

In [223]:
index = open("full/index").readlines()
index

['spam ../data/inmail.1\n',
 'ham ../data/inmail.2\n',
 'spam ../data/inmail.3\n',
 'spam ../data/inmail.4\n',
 'spam ../data/inmail.5\n',
 'spam ../data/inmail.6\n',
 'spam ../data/inmail.7\n',
 'spam ../data/inmail.8\n',
 'spam ../data/inmail.9\n',
 'ham ../data/inmail.10\n',
 'spam ../data/inmail.11\n',
 'spam ../data/inmail.12\n',
 'spam ../data/inmail.13\n',
 'spam ../data/inmail.14\n',
 'spam ../data/inmail.15\n',
 'spam ../data/inmail.16\n',
 'spam ../data/inmail.17\n',
 'spam ../data/inmail.18\n',
 'spam ../data/inmail.19\n',
 'ham ../data/inmail.20\n',
 'ham ../data/inmail.21\n',
 'spam ../data/inmail.22\n',
 'spam ../data/inmail.23\n',
 'spam ../data/inmail.24\n',
 'spam ../data/inmail.25\n',
 'spam ../data/inmail.26\n',
 'spam ../data/inmail.27\n',
 'spam ../data/inmail.28\n',
 'ham ../data/inmail.29\n',
 'spam ../data/inmail.30\n',
 'ham ../data/inmail.31\n',
 'spam ../data/inmail.32\n',
 'spam ../data/inmail.33\n',
 'ham ../data/inmail.34\n',
 'spam ../data/inmail.35\n',
 

In [225]:
def parse_index(path_to_index, n_elements):
    ret_index = []
    index = open("full/index").readlines()
    for i in range(n_elements):
        mail = index[i].split("../")
        label  = mail[0]
        path  = mail[1][:-1]
        ret_index.append({"label":label, "email_path":path})
    return ret_index



In [227]:
def parse_email(index):
    p = Parser()
    pmail = p.parse(index["email_path"])
    index["label"]
    return pmail, index["label"]

In [229]:
indexes = parse_index("full/index", 10)
indexes

[{'label': 'spam ', 'email_path': 'data/inmail.1'},
 {'label': 'ham ', 'email_path': 'data/inmail.2'},
 {'label': 'spam ', 'email_path': 'data/inmail.3'},
 {'label': 'spam ', 'email_path': 'data/inmail.4'},
 {'label': 'spam ', 'email_path': 'data/inmail.5'},
 {'label': 'spam ', 'email_path': 'data/inmail.6'},
 {'label': 'spam ', 'email_path': 'data/inmail.7'},
 {'label': 'spam ', 'email_path': 'data/inmail.8'},
 {'label': 'spam ', 'email_path': 'data/inmail.9'},
 {'label': 'ham ', 'email_path': 'data/inmail.10'}]

In [231]:
index = parse_index("full/index", 1)

In [233]:
open(index[0]["email_path"]).read()


'From RickyAmes@aol.com  Sun Apr  8 13:07:32 2007\nReturn-Path: <RickyAmes@aol.com>\nReceived: from 129.97.78.23 ([211.202.101.74])\n\tby speedy.uwaterloo.ca (8.12.8/8.12.5) with SMTP id l38H7G0I003017;\n\tSun, 8 Apr 2007 13:07:21 -0400\nReceived: from 0.144.152.6 by 211.202.101.74; Sun, 08 Apr 2007 19:04:48 +0100\nMessage-ID: <WYADCKPDFWWTWTXNFVUE@yahoo.com>\nFrom: "Tomas Jacobs" <RickyAmes@aol.com>\nReply-To: "Tomas Jacobs" <RickyAmes@aol.com>\nTo: the00@speedy.uwaterloo.ca\nSubject: Generic Cialis, branded quality@ \nDate: Sun, 08 Apr 2007 21:00:48 +0300\nX-Mailer: Microsoft Outlook Express 6.00.2600.0000\nMIME-Version: 1.0\nContent-Type: multipart/alternative;\n\tboundary="--8896484051606557286"\nX-Priority: 3\nX-MSMail-Priority: Normal\nStatus: RO\nContent-Length: 988\nLines: 24\n\n----8896484051606557286\nContent-Type: text/html;\nContent-Transfer-Encoding: 7Bit\n\n<html>\n<body bgcolor="#ffffff">\n<div style="border-color: #00FFFF; border-right-width: 0px; border-bottom-width: 0

In [235]:
mail, label = parse_email(index[0])
print("El correo es: ", label)
print(mail)

El correo es:  spam 
{'subject': ['gener', 'ciali', 'brand', 'qualiti'], 'body': ['do', 'feel', 'pressur', 'perform', 'rise', 'occasiontri', 'viagrayour', 'anxieti', 'thing', 'past', 'willb', 'back', 'old', 'self'], 'content_type': 'multipart/alternative'}


In [237]:
from sklearn.feature_extraction.text import CountVectorizer

prep_email = [" ".join(mail["subject"]) + " ".join(mail["body"])]
vectorizer = CountVectorizer()
x = vectorizer.fit(prep_email)
print("Email: ", prep_email, "\n")
print("Caracterisiticas de entrada", vectorizer.get_feature_names_out())

Email:  ['gener ciali brand qualitido feel pressur perform rise occasiontri viagrayour anxieti thing past willb back old self'] 

Caracterisiticas de entrada ['anxieti' 'back' 'brand' 'ciali' 'feel' 'gener' 'occasiontri' 'old'
 'past' 'perform' 'pressur' 'qualitido' 'rise' 'self' 'thing' 'viagrayour'
 'willb']


In [239]:
x = vectorizer.transform(prep_email)
print("\nValues:\n", x.toarray())


Values:
 [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


In [241]:
#Aprendizaje OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

prep_email = [[w] for w in mail["subject"] + mail["body"]]
enc = OneHotEncoder(handle_unknown="ignore")
x = enc.fit_transform(prep_email)

print("Features:\n", enc.get_feature_names_out())
print("\nValues:\n", x.toarray())

Features:
 ['x0_anxieti' 'x0_back' 'x0_brand' 'x0_ciali' 'x0_do' 'x0_feel' 'x0_gener'
 'x0_occasiontri' 'x0_old' 'x0_past' 'x0_perform' 'x0_pressur'
 'x0_qualiti' 'x0_rise' 'x0_self' 'x0_thing' 'x0_viagrayour' 'x0_willb']

Values:
 [[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0

In [243]:
def create_prep_dataset(index_path, n_elements):
    x = []
    y = []
    indexes = parse_index(index_path, n_elements)
    for i in range(n_elements):
        print("\rParsing email: {0}".format(i+1), end='')
        mail, label = parse_email(indexes[i])
        x.append(" ".join(mail["subject"]) + " ".join(mail["body"]))
        y.append(label)

    return x, y

In [245]:
#Entrenamiento del algoritmo
x_train, y_train = create_prep_dataset('full/index', 100)

Parsing email: 100

In [247]:
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)


In [249]:
print(x_train.toarray())
print("\nValues:\n", len(vectorizer.get_feature_names_out()))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Values:
 4936


In [251]:
import pandas as pd
pd.DataFrame(x_train.toarray(), columns=[vectorizer.get_feature_names_out()])

Unnamed: 0,0000,000000,00085,002,003,00450,009avisit,01,01000u,0107,...,ö¹,öð,öôööµæ,öø³ðåµ,öþ,öˆ,ù8251354545871wov84954mtft,úàí,þîñòµ¼,šè
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [254]:
#Entrenamiento del algoritmo
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(x_train, y_train)

In [256]:
x, y = create_prep_dataset('full/index', 150)
x_test = x[100:]
y_test = y[100:]

Parsing email: 150

In [258]:
x_test = vectorizer.transform(x_test)


In [260]:
y_pred = clf.predict(x_test)
y_pred

array(['spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ',
       'spam ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ',
       'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ',
       'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ',
       'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ',
       'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ',
       'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ',
       'spam '], dtype='<U5')

In [262]:
print("Prediccion:\n", y_pred)
print("\nEtiquetas Reales\n", y_test)

Prediccion:
 ['spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam '
 'spam ' 'ham ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'ham ' 'spam '
 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'ham ' 'spam ' 'spam ' 'spam '
 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam '
 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam ' 'spam '
 'spam ' 'spam ' 'spam ' 'spam ' 'spam ']

Etiquetas Reales
 ['spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ']


In [264]:
from sklearn.metrics import accuracy_score

print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))

Accuracy: 0.960


In [278]:
x, y = create_prep_dataset('full/index', 22000)

Parsing email: 22000

In [279]:
x_train, y_train = x[:20000], y[:20000]
x_test, y_test = x[20000:], y[20000:]

In [280]:
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)

In [281]:
clf = LogisticRegression()
clf.fit(x_train, y_train)

In [282]:
x_test = vectorizer.transform(x_test)
print(y_test)

['ham ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'ham ', 'spam ', 'ham ', 'ham ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'ham ', 'ham ', 'spam ', 'ham ', 'ham ', 'ham ', 'ham ', 'ham ', 'spam ', 'ham ', 'ham ', 'spam ', 'spam ', 'ham ', 'spam ', 'spam ', 'ham ', 'ham ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'ham ', 'ham ', 'ham ', 'ham ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'ham ', 'ham ', 'ham ', 'ham ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'ham ', 'ham ', 'ham ', 'ham ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'spam ', 'ham ', 'ham ', 'spam ', 'spam ', 'spam ', 'ham ', 'ham ', 'spam ', 'ham ', 'spam ', 'ham ', 'ham ', 'ham ', 'spam ', 'spam ', 'ham ', 'ham ', 'spam ', 'spam ', 'spam ', 'spam ', 'ham ', 'spam ', 'ham ', 'spam ', 'ham ', 'spam ', 'spam ', 'spam ', 'ham ', 'ham ', 'ham 

In [288]:
y_pred = clf.predict(x_test)

print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))

Accuracy: 0.991
