# Chapter 4. Detecting Spam Email with Naive Bayes

## Prepare Data

### Open Legitimate email

In [2]:
file_path = 'enron1/ham/0007.1999-12-14.farmer.ham.txt'
with open(file_path, 'r') as infile:
    ham_sample = infile.read()
print(ham_sample)

Subject: mcmullen gas for 11 / 99
jackie ,
since the inlet to 3 river plant is shut in on 10 / 19 / 99 ( the last day of
flow ) :
at what meter is the mcmullen gas being diverted to ?
at what meter is hpl buying the residue gas ? ( this is the gas from teco ,
vastar , vintage , tejones , and swift )
i still see active deals at meter 3405 in path manager for teco , vastar ,
vintage , tejones , and swift
i also see gas scheduled in pops at meter 3404 and 3405 .
please advice . we need to resolve this as soon as possible so settlement
can send out payments .
thanks


### Open Spam email

In [3]:
file_path = 'enron1/spam/0058.2003-12-21.GP.spam.txt'
with open(file_path, 'r') as infile:
    spam_sample = infile.read()
print(spam_sample)

Subject: stacey automated system generating 8 k per week parallelogram
people are
getting rich using this system ! now it ' s your
turn !
we ' ve
cracked the code and will show you . . . .
this is the
only system that does everything for you , so you can make
money
. . . . . . . .
because your
success is . . . completely automated !
let me show
you how !
click
here
to opt out click here % random _ text



### Build dataset

In [4]:
import glob
import os

In [7]:
emails,labels=[],[]
file_path = 'enron1/spam/'
for filename in glob.glob(os.path.join(file_path,'*.txt')):
    with open(filename,'r',encoding='ISO-8859-1') as infile:
        emails.append(infile.read())
        labels.append(1) # 1 = spam
file_path = 'enron1/ham/'
for filename in glob.glob(os.path.join(file_path,'*.txt')):
    with open(filename,'r',encoding='ISO-8859-1') as infile:
        emails.append(infile.read())
        labels.append(0) # 0 = legitimate
print(len(emails))
print(len(labels))

5172
5172


### Clean dataset

In [17]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
def is_letter_only(word):
    return word.isalpha()
all_names=set(names.words())
lemmatizer= WordNetLemmatizer()

def clean_text(docs):
    docs_cleaned = []
    for doc in docs:
        doc = doc.lower()
        doc_cleaned = ' '.join(lemmatizer.lemmatize(word)
                              for word in doc.split()
                              if is_letter_only(word)
                               and word not in all_names)
        docs_cleaned.append(doc_cleaned)
    return docs_cleaned

In [18]:
emails_cleaned = clean_text(emails)

*Stop word removal.* The max_features parameter is set to 1000, so it only considers the 1,000 most frequent terms, excluding those that are too common (50% max_df) and too rare (2 min_df). We can definitely tweak this parameter later on in order to achieve higher classification accuracy.

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english',max_features=1000,
                    max_df=0.5,min_df=2)
docs_cv = cv.fit_transform(emails_cleaned)

In [20]:
print(docs_cv[0])

  (0, 932)	1
  (0, 968)	1
  (0, 715)	1
  (0, 151)	1
  (0, 585)	1
  (0, 864)	1
  (0, 506)	1
  (0, 691)	1
  (0, 897)	1
  (0, 476)	1
  (0, 72)	1
  (0, 86)	2
  (0, 997)	1
  (0, 103)	1
  (0, 361)	2
  (0, 229)	1
  (0, 363)	2
  (0, 482)	2
  (0, 265)	2


In [22]:
terms = cv.get_feature_names()
print(terms[932])
print(terms[968])
print(terms[715])

unsubscribe
website
read


## Build model

In [27]:
def get_label_index(labels):
    from collections import defaultdict
    label_index = defaultdict(list)
    for index,label in enumerate(labels):
        label_index[label].append(index)
    return label_index
label_index = get_label_index(labels)

In [30]:
label_index

defaultdict(list,
            {1: [0,
              1,
              2,
              3,
              4,
              5,
              6,
              7,
              8,
              9,
              10,
              11,
              12,
              13,
              14,
              15,
              16,
              17,
              18,
              19,
              20,
              21,
              22,
              23,
              24,
              25,
              26,
              27,
              28,
              29,
              30,
              31,
              32,
              33,
              34,
              35,
              36,
              37,
              38,
              39,
              40,
              41,
              42,
              43,
              44,
              45,
              46,
              47,
              48,
              49,
              50,
              51,
              52,
              53,
              54,

### Naive Bayes from scratch

#### Prior

In [31]:
def get_prior(label_index):
    '''
    Compute prior based on training samples
    @param label_index: grouped sample indices by class
    @return: dictionary, with class label as key, correspoding
    prior as the value
    '''
    prior = {label: len(index) for label,index in label_index.items()}
    total_count=sum(prior.values())
    for label in prior:
        prior[label] /= float(total_count)
    return prior

In [32]:
prior = get_prior(label_index)
print('Prior:',prior)

Prior: {1: 0.2900232018561485, 0: 0.7099767981438515}


#### Likelihood

In [34]:
import numpy as np
def get_likelihood(term_matrix,label_index,smoothing=0):
    """
    Compute likelihood based on training samples
    @param term_matrix: sparse matrix of the term frequency features
    @param label_index: grouped sample indices by class
    @param smoothing: integer, additive Laplace smoothing parameter
    @return: dictionary, with class as key, corresponding conditional
    probability P(feature|class) vector as value
    """
    likelihood = {}
    for label, index in label_index.items():
        likelihood[label] = term_matrix[index, :].sum(axis=0) + smoothing
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_count = likelihood[label].sum()
        likelihood[label] = likelihood[label] / float(total_count)
    return likelihood
smoothing = 1
likelihood = get_likelihood(docs_cv,label_index,smoothing)
len(likelihood[0])

1000

#### Posterior

In [44]:
def get_posterior(term_matrix, prior, likelihood):
    '''
    ... Compute posterior of testing samples, based on prior and likelihood
    ... @param term_matrix: sparse matrix of the term frequency features
    ... @param prior: dictionary, with class label as key, corresponding prior as the value
    ... @param likelihood: dictionary, with class label as key, corresponding
            conditional probability vector as value
    ... @return: dictionary, with class label as key, corresponding
            posterior as value
    '''
    num_docs = term_matrix.shape[0]
    posteriors = []
    for i in range(num_docs):
        posterior = {key:np.log(prior_label) for key,prior_label
                                             in prior.items()}
        for label, likelihood_label in likelihood.items():
            term_document_vector = term_matrix.getrow(1)
            counts=term_document_vector.data
            indices=term_document_vector.indices
            for count,index in zip(counts,indices):
                posterior[label]+=np.log(likelihood_label[index])*count
        min_log_posterior = min(posterior.values())
        for label in posterior:
            try:
                posterior[label]=np.exp(posterior[label]-min_log_posterior)
            except:
                posterior[label]=float('inf')
        sum_posterior=sum(posterior.values())
        for label in posterior:
            if posterior[label]==float('inf'):
                posterior[label]=1.0
            else:
                posterior[label]/=sum_posterior
        posteriors.append(posterior.copy())
    return posteriors

### Train model

In [37]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(emails_cleaned,labels,test_size=0.33,random_state=42)

In [39]:
print(len(X_train), len(Y_train))
print(len(X_test), len(Y_test))

3465 3465
1707 1707


In [45]:
term_docs_train = cv.fit_transform(X_train)
label_index = get_label_index(Y_train)
prior=get_prior(label_index)
likelihood=get_likelihood(term_docs_train,label_index,smoothing)

term_docs_test = cv.transform(X_test)
posterior = get_posterior(term_docs_test, prior, likelihood)

#### Evaluate model

In [46]:
correct = 0.0
for pred,actual in zip(posterior,Y_test):
    if actual==1:
        if pred[1]>=0.5:
            correct+=1
    elif pred[0]>0.5:
        correct+=1
print('The accuracy on {0} testing samples is: {1:.1f}%'.format(len(Y_test), correct/len(Y_test)*100))

The accuracy on 1707 testing samples is: 69.8%


### Naive bayes from scikit-learn

In [48]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB(alpha=1.0,fit_prior=True)
clf.fit(term_docs_train,Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [50]:
prediction_prob = clf.predict_proba(term_docs_test)
prediction_prob[0:10]

array([[1.00000000e+00, 3.96500362e-13],
       [1.00000000e+00, 2.15303766e-81],
       [6.59774100e-01, 3.40225900e-01],
       [1.00000000e+00, 2.28043493e-15],
       [1.00000000e+00, 1.77156705e-15],
       [5.53261316e-05, 9.99944674e-01],
       [0.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 3.49697719e-28],
       [1.00000000e+00, 4.43498548e-14],
       [3.39263684e-01, 6.60736316e-01]])

In [53]:
prediction = clf.predict(term_docs_test)
prediction[:10]

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1])

In [54]:
accuracy = clf.score(term_docs_test, Y_test)
print('The accuracy using MultinomialNB is:{0:.1f}%'.format(accuracy*100))

The accuracy using MultinomialNB is:93.0%
