In [4]:
import re
import numpy as np
import collections

def tokenize(message):
    """Return the set of distinct lowercase tokens found in a message.

    The message is lowercased first; every maximal run of the characters
    ``a-z``, ``0-9`` and the apostrophe then counts as one token.

    Args:
        message: The text to split (a ``str``).

    Returns:
        A ``set`` of the unique tokens; empty when nothing matches.
    """
    lowered = message.lower()
    return {match.group(0) for match in re.finditer("[a-z0-9']+", lowered)}

In [5]:
def count_words(training_set):
    """Tally, for each word, how many spam and non-spam messages contain it.

    Args:
        training_set: Iterable of ``(message, is_spam)`` pairs.

    Returns:
        A ``collections.defaultdict`` mapping each word to a two-element list
        ``[spam_count, ham_count]``. A message contributes at most once per
        word, because its words come from ``tokenize`` as a set. Looking up a
        word that was never seen yields (and stores) ``[0, 0]``.
    """
    SPAM, HAM = 0, 1  # positions inside each [spam_count, ham_count] list
    tallies = collections.defaultdict(lambda: [0, 0])
    for text, flagged in training_set:
        slot = SPAM if flagged else HAM
        for token in tokenize(text):
            tallies[token][slot] += 1
    return tallies

In [6]:
def word_probability(counts,total_spams,total_non_spams,k=0.5):
    """Estimate smoothed P(word | spam) and P(word | not spam) for every word.

    Each probability is computed as ``(k + count) / (2 * k + total)``, so a
    word that never appeared in one class still gets a non-zero probability
    whenever ``k > 0``.

    Args:
        counts: Mapping of word -> ``[spam_count, ham_count]``.
        total_spams: Number of spam messages in the training data.
        total_non_spams: Number of non-spam messages in the training data.
        k: Smoothing pseudo-count added to every word count.

    Returns:
        A list of ``(word, P(word | spam), P(word | not spam))`` tuples, in the
        iteration order of ``counts``.
    """
    # Each entry must be ONE tuple: list.append accepts exactly one argument,
    # so passing the three values separately raised TypeError.
    return [
        (
            word,
            (k + spam_count) / (2 * k + total_spams),
            (k + ham_count) / (2 * k + total_non_spams),
        )
        for word, (spam_count, ham_count) in counts.items()
    ]

In [7]:
def spam_probability(word_probs,message):
    """Return the naive Bayes estimate of P(spam | message).

    Every vocabulary word in ``word_probs`` contributes to both class
    likelihoods: its probability if it occurs in the message, or one minus
    that probability if it does not. No class prior is applied, so spam and
    non-spam are treated as equally likely a priori.

    Args:
        word_probs: Iterable of ``(word, P(word | spam), P(word | not spam))``
            tuples.
        message: The text to score.

    Returns:
        ``P(X | spam) / (P(X | spam) + P(X | not spam))``, where ``X`` is the
        presence/absence pattern of the vocabulary words in ``message``.
        Returns 0.5 when ``word_probs`` is empty.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_ham = 0.0

    # Iterate over the trained vocabulary, not over the words of the message.
    for word, prob_if_spam, prob_if_ham in word_probs:
        if word in message_words:
            # Vocabulary word occurs in the message: use P(word | class).
            log_prob_if_spam += np.log(prob_if_spam)
            log_prob_if_ham += np.log(prob_if_ham)
        else:
            # Vocabulary word is absent from the message: use
            # 1 - P(word | class), the probability of NOT seeing it.
            log_prob_if_spam += np.log(1.0 - prob_if_spam)
            log_prob_if_ham += np.log(1.0 - prob_if_ham)

    # With a large vocabulary both log-likelihoods are very negative, and
    # exponentiating them directly underflows to 0.0, giving 0/0 = nan.
    # Subtracting the larger one first rescales numerator and denominator by
    # the same factor (the ratio is unchanged) and keeps the larger term at
    # exp(0) == 1.
    shift = max(log_prob_if_spam, log_prob_if_ham)
    likelihood_spam = np.exp(log_prob_if_spam - shift)
    likelihood_ham = np.exp(log_prob_if_ham - shift)

    return likelihood_spam / (likelihood_spam + likelihood_ham)

In [8]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier built on the module-level helper functions.

    Attributes are documented in ``__init__``. Call ``train`` first, then
    ``classify``.
    """

    def __init__(self,k=0.5):
        """Create an untrained classifier.

        Args:
            k: Smoothing pseudo-count passed to ``word_probability``.
        """
        self.k = k
        # List of (word, P(word | spam), P(word | not spam)); filled by train().
        self.word_probs = []

    def train(self,training_set):
        """Fit the per-word probabilities from labelled messages.

        Args:
            training_set: Sized, re-iterable collection (e.g. a list) of
                ``(message, is_spam)`` pairs. It is iterated more than once
                and ``len()`` is taken of it.
        """
        num_spams = sum(1 for _, is_spam in training_set if is_spam)
        num_hams = len(training_set) - num_spams

        word_counts = count_words(training_set)
        self.word_probs = word_probability(word_counts,num_spams,num_hams,self.k)

    def classify(self,message):
        """Return the estimated probability that ``message`` is spam."""
        return spam_probability(self.word_probs,message)

    # The method was originally published under the misspelled name
    # ``classifiy``; keep it as an alias so existing callers keep working.
    classifiy = classify