# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [30]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle

def readFiles(path):
    """Yield ``(file_path, message_body)`` for every file found under *path*.

    The directory tree is walked recursively with ``os.walk``. Within each
    file, every line up to and including the first empty line is skipped;
    only the lines after it are kept as the message body. A file that
    contains no empty line yields an empty body.

    Args:
        path: Root directory to search.

    Yields:
        Tuples of the file's path (walk root joined with the file name) and
        the body text of that file.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not rebound.
            file_path = os.path.join(root, filename)

            in_body = False
            lines = []
            # ``with`` guarantees the handle is closed even if reading raises.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if in_body:
                        lines.append(line)
                    elif line == '\n':
                        # First empty line: everything after it is body.
                        in_body = True

            # Each kept line still carries its own trailing newline, so
            # joining with '\n' leaves an empty line between body lines.
            yield file_path, '\n'.join(lines)


def dataFrameFromDirectory(path, classification):
    """Build a DataFrame with one row per file found under *path*.

    Every row holds the file's message text in ``'message'`` and the given
    *classification* in ``'class'``; the row index is the file's path as
    produced by ``readFiles``.
    """
    # Materialise the generator once so index and rows stay aligned.
    entries = list(readFiles(path))

    file_names = [name for name, _ in entries]
    records = [
        {'message': body, 'class': classification}
        for _, body in entries
    ]

    return DataFrame(records, index=file_names)

# DataFrame.append was removed in pandas 2.0; pandas.concat is the supported
# way to stack frames row-wise.
from pandas import concat

# Label each message by the directory it was loaded from and stack both sets
# into a single frame (columns: 'message', 'class'; index: file path).
data = concat([
    dataFrameFromDirectory('emails/spam', 'spam'),
    dataFrameFromDirectory('emails/ham', 'ham'),
])

# Randomise the row order so spam and ham are interleaved.
data = shuffle(data)

Let's have a look at that DataFrame:

In [31]:
# NOTE: without call parentheses this evaluates to the bound method itself, so
# the notebook echoes its repr (which embeds the DataFrame) rather than the
# per-column counts that data.count() would return.
data.count

<bound method DataFrame.count of                                                    class  \
emails/ham/02022.5ecf975278edd8b82be5e9afaaf5f4ff    ham   
emails/ham/00629.370fec99ddca8da57ef5cb0bf30375e5    ham   
emails/ham/01419.97da4f8a986b55cbe1f81bb22836ac58    ham   
emails/spam/00082.0341a767bbaca01fd89b6236ef681257  spam   
emails/ham/00668.c788422df192a179d3a6ddbcb8b8b612    ham   
emails/ham/00005.bf27cdeaf0b8c4647ecd61b1d09da613    ham   
emails/spam/00161.ae33257753c9bdaaadc9221347868496  spam   
emails/ham/02433.9cfb47708291604f2c38393706175160    ham   
emails/spam/00469.ee3b2f31459cc2ec43ae7cae00d40cf6  spam   
emails/ham/02298.501991b65594ba4937fc54a5c23ee1c3    ham   
emails/spam/00466.ecb11c98ec4511b5422b20476d935bd1  spam   
emails/ham/01271.519b987eddc0633ac3a5908c33a1fa2c    ham   
emails/ham/00388.18e2a6069150c2c9139f760fda7668ac    ham   
emails/ham/01626.daf72a49b735dc3319a809ec520f2283    ham   
emails/ham/00674.6dcadfb64e1a333f826a1c7b5c722f5f    ham   
emails/

In [32]:
# Preview the first few rows of the shuffled DataFrame.
data.head()

Unnamed: 0,class,message
emails/ham/02022.5ecf975278edd8b82be5e9afaaf5f4ff,ham,"URL: http://www.newsisfree.com/click/-5,830431..."
emails/ham/00629.370fec99ddca8da57ef5cb0bf30375e5,ham,"Church, AA, same diff?\n\n\n\n;-).\n\n\n\nChee..."
emails/ham/01419.97da4f8a986b55cbe1f81bb22836ac58,ham,[Skip Montanaro]\n\n> Any thought to wrapping ...
emails/spam/00082.0341a767bbaca01fd89b6236ef681257,spam,This message is in MIME format. Since your mai...
emails/ham/00668.c788422df192a179d3a6ddbcb8b8b612,ham,"\n\n> On Thu, Sep 19, 2002 at 11:11:47AM -0400..."


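Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. We train on only the first 2000 messages of the shuffled data, so the remaining messages can be used to test the classifier afterwards. Call fit() and we've got a trained spam filter ready to go! It's just that easy.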

In [33]:
# Train on the first 2000 rows of the shuffled data; later rows are not seen here.
targets = data['class'][:2000].values
train_messages = data['message'][:2000].values

# Learn the vocabulary from the training messages and count words per message.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(train_messages)

# Fit the Naive Bayes model; as the cell's last expression it is echoed back.
classifier = MultinomialNB()
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [34]:
# To classify hand-written messages instead, swap in a list such as:
# examples = ['Your Apple ID Invoice is here!', "Hi Bob, how about a game of golf tomorrow?"]

# Rows from position 2000 onwards were not used for fitting; keep their true labels.
real = data['class'][2000:].values
examples = data['message'][2000:].values

# Reuse the fitted vocabulary (transform, not fit_transform) before predicting.
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       '

In [35]:
# Number of held-out predictions; derived from the data rather than assuming
# the test set holds exactly 1000 rows.
count = len(predictions)

# Count the predictions that match the true label at the same position.
right = sum(1 for pred, r in zip(predictions, real) if r == pred)

# Fraction classified correctly (0.0 when there is nothing to score).
accuracy = right / count if count else 0.0
accuracy

0.953

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying a train/test split to this spam classifier - train it on one subset of the ham and spam emails and see how well it can predict a held-out subset.