<h2><left>Hello, This Notebook Contains example of Email-Classification Algorithm </left></h2>
<h4><left>- We classify emails as Spam or Ham based on scikit-learn Library</left></h4>
<div class="alert alert-block alert-info" style="margin-top: 20px">
        <ul>
            <li><h4>Contents:</h4></li>
    <ul>
        <li><a href="#load">Loading Data</a></li>
        <li><a href="#prepare">Data Preparation</a></li>
        <li><a href="#modeling">Modeling</a></li>
        <ul>
            <li><a href="#log_reg">Logistic Regression</a></li>
            <li><a href="#another_models">More Models</a></li>
        </ul>
    </ul>
    </ul>
</div>
<hr>

<a id="load"></a>
<h3><left>- Getting Data:</left></h3>
<h4><left>- Loading, Reading and Extracting Texts. </left></h4>

----------

In [2]:
import urllib
import tarfile
import os

In [2]:
DOWNLOAD_ROOT= "http://spamassassin.apache.org/old/publiccorpus/"
DATASETS_PATH= os.path.join(os.getcwd(),'datasets')

pathes = {
    'SpamDataSet':os.path.join(DATASETS_PATH,'spam'),
}

file_names ={
    'Spam': "20030228_spam.tar.bz2",
    'Ham':  "20030228_easy_ham.tar.bz2"
}

urls = {
    'Spam': DOWNLOAD_ROOT + "20030228_spam.tar.bz2",
    'Ham': DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
}

In [3]:
def load_data(urls,file_names,direc):
    print('start loading>>>')
    if not os.path.isdir(direc):
        os.makedirs(direc)
    for filename, url in zip(file_names,urls):
        file_path = os.path.join(direc, file_names[filename])
        if not os.path.isfile(file_path):
            print('loading: ',urls[url])
            urllib.request.urlretrieve(urls[url], file_path)
        tar_bz2_file = tarfile.open(file_path)
        print('extracting to: ', file_path)
        tar_bz2_file.extractall(path=direc)
        tar_bz2_file.close()
    print('finished')

In [None]:
load_data(urls,file_names,pathes['SpamDataSet'])

In [4]:
import email
import email.policy

def get_files_in_dir(direc):
    files = [file_name for file_name in sorted(os.listdir(direc)) if len(file_name)>20 ]
    return files
def load_email(direc, file_name):
    with open(os.path.join(direc,file_name),'rb') as f:
        return email.parser.BytesParser(policy= email.policy.default).parse(f)

In [5]:

SPAMS_DATA_PATH = os.path.join(pathes['SpamDataSet'],'spam')
HAMS_DATA_PATH = os.path.join(pathes['SpamDataSet'],'easy_ham')

spam_files= get_files_in_dir(SPAMS_DATA_PATH)
ham_files= get_files_in_dir(HAMS_DATA_PATH)

In [6]:
len(spam_files),len(ham_files)

(500, 2500)

In [7]:
ham_emails = [load_email(HAMS_DATA_PATH,file_name) for file_name in ham_files]
spam_emails = [load_email(SPAMS_DATA_PATH,file_name) for file_name in spam_files]

In [8]:
from collections import Counter

def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()
    
def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

def get_emails_by_type(e_type,emails):
    return [email for email in emails
                    if get_email_structure(email) == e_type]

In [9]:
structures_counter(ham_emails)

Counter({'text/plain': 2408,
         'multipart(text/plain, application/pgp-signature)': 66,
         'multipart(text/plain, text/html)': 8,
         'multipart(text/plain, text/enriched)': 1,
         'multipart(text/plain, application/ms-tnef, text/plain)': 1,
         'multipart(text/plain)': 3,
         'multipart(text/plain, application/octet-stream)': 2,
         'multipart(text/plain, text/plain)': 4,
         'multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)': 1,
         'multipart(text/plain, video/mng)': 1,
         'multipart(text/plain, multipart(text/plain))': 1,
         'multipart(text/plain, application/x-pkcs7-signature)': 1,
         'multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)': 1,
         'multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))': 1,
         'multipart(text/plain, application/x-java-applet)': 1})

In [10]:
structures_counter(spam_emails)#.most_common(10)

Counter({'text/html': 183,
         'text/plain': 218,
         'multipart(text/plain, application/octet-stream)': 1,
         'multipart(text/html)': 20,
         'multipart(text/plain, text/html)': 45,
         'multipart(text/plain)': 19,
         'multipart(text/html, text/plain)': 1,
         'multipart(text/html, application/octet-stream)': 2,
         'multipart(multipart(text/html))': 5,
         'multipart(text/plain, image/jpeg)': 3,
         'multipart(multipart(text/html), application/octet-stream, image/jpeg)': 1,
         'multipart(multipart(text/plain, text/html), image/gif)': 1,
         'multipart/alternative': 1})

<a id="prepare"></a>
<h3><left>- Data Preparation:</left></h3>
<h4><left>- Splitting, Cleaning and Tokenization. </left></h4>

In [11]:
from sklearn.model_selection import train_test_split
import numpy as np

X_data= np.array(ham_emails+spam_emails)
y_data = np.array([0]*len(ham_emails) + [1]*len(spam_emails))
X_train,X_test,y_train,y_test = train_test_split(X_data,y_data,test_size=0.2,random_state=42)

In [12]:
import re
from html import unescape

def html_to_text(html):
    text = re.sub(r'<head.*?>.*?</head>','',html)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return  unescape(text)

def email_to_text(email):
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = part.get_payload()
        if ctype == 'text/plain':
            return str(content)
        elif ctype == 'text/html':
            return html_to_text(content)

In [13]:
import urlextract
from sklearn.base import BaseEstimator , TransformerMixin
from stemming.porter2 import stem

url_extractor = urlextract.URLExtract()

class emailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,**settings):
        self.lower_case = settings.get('lower_case',True) 
        self.strip_header= settings.get('strip_header',True) 
        self.remove_punctuation= settings.get('remove_punctuation',True)  
        self.replace_urls = settings.get('replace_urls',True) 
        self.replace_numbers = settings.get('replace_numbers',True)  
        self.stemming= settings.get('stemming',True)

    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        X_transformed = []
        for e in X:
            text = email_to_text(e) or ""
            if self.lower_case: 
                text = text.lower()
                
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " url ")
                    
            if self.replace_numbers:
                text = re.sub('\d+(?:\.\d*(?:[eE]+\d*)?)?','number',text)  
                
            if self.remove_punctuation:
                text = re.sub('\W+',' ',text,flags=re.M)
                
            words_count = Counter(text.split())
            if self.stemming and stem is not None: 
                stemmed_words_count = Counter()
                for (word, count) in words_count.items():
                    stemmed_words_count[stem(word)]+= count
                X_transformed.append(stemmed_words_count)
                
        return np.array(X_transformed)

In [14]:
e2w = emailToWordCounterTransformer()
co= e2w.fit_transform(X_train[:3])
# e2w.lower_case
co

array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),
       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'has': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesus': 2, 'some': 1, 'interest': 1, 'quot': 1, 'url': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'this': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'be

In [15]:
from scipy.sparse import csr_matrix

class wordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, **settings):
        self.voc_size = settings.get('voc_size',1000)
        
    def fit(self,X,y= None):
        total_count = Counter()
        for word_count in X:
            for (word,count) in word_count.items():
                total_count[word]+= min(count,10)
        self.most_common = total_count.most_common(self.voc_size)
        self.vocabulary_ = {word: index+1 for index, (word,_) in enumerate(self.most_common)}
        
        return self
    def transform(self,X,y=None):
        rows = np.array([])
        cols = np.array([])
        data = np.array([])
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows = np.append(rows,row)
                cols = np.append(cols,self.vocabulary_.get(word,0))
                data= np.append(data,count)
        return csr_matrix((data, (rows, cols)),shape=(len(X), self.voc_size + 1))

In [16]:
w2v = wordCounterToVectorTransformer(voc_size=10)
vec= w2v.fit_transform(co)

vec.toarray()
w2v.vocabulary_

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'url': 5,
 'all': 6,
 'in': 7,
 'christian': 8,
 'on': 9,
 'by': 10}

<a id="modeling"></a>
<h3><left>- Modeling:</left></h3>
<h4><left>- Creating, Training and Validation Model Piplines. </left></h4>

In [23]:
from sklearn.pipeline import Pipeline
features_preprocess = Pipeline([
    ('e2c',emailToWordCounterTransformer()),
    ('c2v',wordCounterToVectorTransformer())
])
X_train_transformed = features_preprocess.fit_transform(X_train)

<a id="log_reg"></a>
<h4><I><left>- Logistic Regression Model. </left></I></h4>

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  ................................................................
[CV] .................................... , score=0.985, total=   0.2s
[CV]  ................................................................
[CV] .................................... , score=0.985, total=   0.2s
[CV]  ................................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] .................................... , score=0.993, total=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.5s finished


0.9874999999999999

In [26]:
# Test Logistic Regression:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = features_preprocess.transform(X_test)

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 96.88%
Recall: 97.89%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


<a id="another_models"></a>
<h4><left>- Another Models: SVC, NuSVC, SGD Classifier and more. </left></h4>

In [51]:
from sklearn.metrics import accuracy_score
def classification(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print("Accuracy: {:.2f}%".format(100 * accuracy_score(y_test, y_pred)))
    print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
    print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))
    return { 'model' : model, 'predictions': y_pred }

In [52]:
from sklearn.svm import SVC,NuSVC
from sklearn.naive_bayes import MultinomialNB ,BernoulliNB
from sklearn.linear_model import SGDClassifier

svc = SVC(gamma = 'auto')
mnb = MultinomialNB()
bnb =BernoulliNB()
nu_svc = NuSVC()
sgd = SGDClassifier()

In [54]:
mnb_ = classification(mnb,X_train_transformed,X_test_transformed,y_train,y_test)

Accuracy: 98.67%
Precision: 96.77%
Recall: 94.74%


In [55]:
bnb_ = classification(bnb,X_train_transformed,X_test_transformed,y_train,y_test)

Accuracy: 94.67%
Precision: 81.19%
Recall: 86.32%


In [56]:
sgd_ = classification(sgd,X_train_transformed,X_test_transformed,y_train,y_test)

Accuracy: 96.83%
Precision: 86.54%
Recall: 94.74%


In [58]:
svc_= classification(svc,X_train_transformed,X_test_transformed,y_train,y_test)

Accuracy: 96.00%
Precision: 100.00%
Recall: 74.74%


<h3><I><center>...The End... </center></I></h3>