# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [25]:
import io
import os

import numpy
from pandas import DataFrame
from pandas import concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    """Yield ``(file_path, body)`` for every file found beneath ``path``.

    The directory tree is walked recursively with ``os.walk``. Each file is
    read as latin-1 text; everything up to and including the first blank line
    is skipped (treated as the header section) and the remaining lines form
    the body.

    Parameters
    ----------
    path : str
        Directory to scan.

    Yields
    ------
    tuple of (str, str)
        The file's path (``os.path.join(root, filename)``) and its body text.
        The body is an empty string when the file contains no blank line.
    """
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not overwritten.
            file_path = os.path.join(root, filename)

            inBody = False
            lines = []
            # ``with`` guarantees the handle is closed even if reading raises.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if inBody:
                        lines.append(line)
                    elif line == '\n':
                        # The first blank line ends the headers.
                        inBody = True

            # Each line still carries its own trailing newline, so joining on
            # '\n' leaves a blank line between consecutive body lines.
            message = '\n'.join(lines)
            yield file_path, message


def dataFrameFromDirectory(path, classification):
    """Build a DataFrame with one row per file read from ``path``.

    Every row holds the file's body under ``'message'`` and the supplied
    ``classification`` under ``'class'``; the row index is the file path
    yielded by ``readFiles``.
    """
    # Materialise the generator once so both lists come from the same pass.
    entries = list(readFiles(path))
    labels = [file_path for file_path, _ in entries]
    records = [
        {'message': body, 'class': classification}
        for _, body in entries
    ]
    return DataFrame(records, index=labels)

# Folder that contains the `spam` and `ham` sub-folders of raw emails.
# Change this single constant to point at your own copy of the data.
EMAILS_DIR = 'C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails'

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the per-class frames are combined with one concat call instead.
data = concat([
    dataFrameFromDirectory(EMAILS_DIR + '/spam', 'spam'),
    dataFrameFromDirectory(EMAILS_DIR + '/ham', 'ham'),
])




Let's have a look at that DataFrame:

In [26]:
# Preview the first rows; each row is indexed by the path of the email file it came from.
data.head()

Unnamed: 0,class,message
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00001.7848dde101aa985090474a91ec93fcf0,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr..."
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00002.d94f1b97e48ed3b553b3508d116e6a09,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00003.2ee33bc6eacdb11f38d052c44819ba6c,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00004.eac8de8d759b7e74154f142194282724,spam,##############################################...
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00005.57696a39d7d84318ce497886896bf90d,spam,I thought you might like these:\n\n1) Slim Dow...


Now we'll use a CountVectorizer to split each message into its list of words, and feed the resulting word counts into a MultinomialNB classifier.
 

In [54]:
# Learn a vocabulary from every message and encode each message as a row of token counts.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

# Train a multinomial Naive Bayes model on those counts against the 'spam'/'ham' labels.
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)



MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [60]:
# Two made-up messages to run through the trained classifier.
examples = ['Free Viagra now!!!', "Free Viagra World of Warcraft now!!!"]
# transform (not fit_transform): reuse the already-fitted vocabulary so the
# columns line up with what the classifier was trained on.
example_counts = vectorizer.transform(examples)

predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'],
      dtype='<U4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [64]:
# Split the data with pandas: randomly sample 80% of the rows for training
# (random_state fixes the sample) and keep the remaining rows for testing.
train=data.sample(frac=0.8, random_state=200)
# Dropping by index label relies on the labels being unique (here they are file paths).
test=data.drop(train.index)

# NOTE(review): this rebinds `vectorizer` and refits the existing `classifier`,
# so any cell executed after this one sees the model trained on `train` only.
vectorizer = CountVectorizer()
counts_new = vectorizer.fit_transform(train['message'].values)
targets_new = train['class'].values
classifier.fit(counts_new, targets_new)

# Show the true labels of the first held-out rows.
test['class'].head()


C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00018.5b2765c42b7648d41c93b9b27140b23a    spam
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00019.bbc97ad616ffd06e93ce0f821ca8c381    spam
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00049.09e42d433e0661f264a25c7d4ed6e3ea    spam
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00068.d10af636a6082d5172ceb34a944486e6    spam
C:/Users/Lucian-PC/Desktop/DataScience/DataScience-Python3/emails/spam\00074.51aab41b27a9ba7736803318a2e4c8de    spam
Name: class, dtype: object

In [62]:
# Encode the held-out messages with the current `vectorizer` and predict a label for each one.
test_data = vectorizer.transform(test['message'].values)
predict_test = classifier.predict(test_data)
# NOTE(review): this displays every prediction; comparing predict_test with
# test['class'].values would turn it into an accuracy figure.
predict_test


array(['ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'spam',
       'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham',
       'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam',
       'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam',
       'spam', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam',
       'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam',
       'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam',
       'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'spam', 'spam', 'spam',
       'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'h