# Chapter 13 - Naive Bayes

This chapter is about starting naively to reduce bias and get the right answer from a Bayesian perspective. Worth reading the spam filter intro about Bayesian formulation and assumptions.

...and underflow of floats...

In [43]:
# bringing in old functions from previous chapters

import re
import math
import glob
from collections import Counter, defaultdict
import random

def split_data(data, prob):
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

def precision(tp, fp, fn, tn):
    return tp / (tp+fp)

# recall is the fraction of the posinives identified
def recall(tp, fp, fn, tn):
    return tp / (tp + fn)

In [52]:
def tokenize(message):
    message = message.lower()                       # convert to lowercase
    all_words = re.findall("[a-z0-9']+", message)   # extract the words
    return set(all_words)                           # remove duplicates


def count_words(training_set):
    """training set consists of pairs (message, is_spam)"""
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """turn the word_counts into a list of triplets
    w, p(w | spam) and p(w | ~spam)"""
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
             for w, (spam, non_spam) in counts.items()]

def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # for each word in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # for each word that's not in the message
        # add the log probability of _not_ seeing it
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [47]:
# let's make a Class classifier

class NaiveBayesClassifier:
    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []
        
    def train(self, training_set):
        num_spams = len([is_spam for message, is_spam in training_set if is_spam])
        num_non_spams = len(training_set) - num_spams
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts, num_spams, num_non_spams, self.k)
        
    def classify(self, message):
        return spam_probability(self.word_probs, message)
    
  

In [54]:
# lets test it!
path = r'C:\Users\klada\PycharmProjects\dsfs\SpamAssassin\*\*'

def get_subject_data(path):

    data = []

    # regex for stripping out the leading "Subject:" and any spaces after it
    subject_regex = re.compile(r"^Subject:\s+")

    # glob.glob returns every filename that matches the wildcarded path
    for fn in glob.glob(path):
        is_spam = "ham" not in fn

        with open(fn,'r',encoding='ISO-8859-1') as file:
            for line in file:
                if line.startswith("Subject:"):
                    subject = subject_regex.sub("", line).strip()
                    data.append((subject, is_spam))

    return data
              
def train_and_test_model(path):
    random.seed(0)
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject)) for subject, is_spam in test_data]

#     for _, is_spam, spam_probability in classified:
#         print(is_spam, spam_probability)
    
    counts = Counter((is_spam, spam_probability > 0.5) for _, is_spam, spam_probability in classified)

    print(counts)
    
    classified.sort(key=lambda row: row[2])
    spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
    hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)
    
train_and_test_model(path)

Counter({(False, False): 737, (True, False): 25, (True, True): 4})
spammiest_hams [('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.021156030237741198), ('Conversations From GDC Europe: Bill Fulton, Zeno Colaco, Harvey Smith', False, 0.03935973252681327), ('"Free" Elvis Costello CD a trojan horse for DRM malware', False, 0.04245442887102641), ('EFFector 15.28: Motions Filed in Morpheus Peer-to-Peer Case,', False, 0.06113067570726612), ('Attn programmers: support offered [FLOSS-Sarai Initiative]', False, 0.15546104822167903)]
hammiest_spams [('[scoop] CEVIRI YAZILIMLARI', True, 4.6256295236250034e-08), ("** You're -Approved-!", True, 4.625629523627452e-08), ('Re: Instant Quote', True, 2.073925960198574e-07), ('Email Marketing', True, 9.717356917327071e-07), ('Immediate Reply Needed', True, 2.1309305436763246e-06)]
spammiest_words [('zzzz', 0.03723404255319149, 0.00022893772893772894), ('guaranteed', 0.047872340425531915, 0.00022893772893772894), ('reps', 0.047872340

## This concludes Chapter 13

This chapter might need revisiting because I had some trouble, but it's also the end of a long coding session so we'll see how I ues it in the future. \

scikit-learn contains BernoulliNB model which is the same Naive Bayes algorithm we implemented here, as well as other variations on the model. Joel lists ways the model can be improved, which I can revisit when I ever use this stuff in the future. 

