In [29]:
import re
import glob
import random, math
from machine_learning import split_data
from collections import defaultdict, Counter

In [30]:
# Lower-case the text, extract the words, and drop duplicates.
def tokenize(message):
    """Return the distinct lowercase alphanumeric tokens found in ``message``.

    The message is lower-cased first; a token is any maximal run of the
    characters ``a-z`` and ``0-9``.  Every other character (punctuation,
    whitespace, apostrophes, non-ASCII letters) acts as a separator.

    Args:
        message: The text to split (a ``str``).

    Returns:
        A ``set`` of token strings; empty when the message has no tokens.
    """
    token_pattern = "[a-z0-9]+"
    lowered = message.lower()
    return {found.group() for found in re.finditer(token_pattern, lowered)}

In [31]:
def count_words(training_set):
    """Tally, per word, how many spam and non-spam messages contain it.

    Args:
        training_set: Iterable of ``(message, is_spam)`` pairs.

    Returns:
        A ``defaultdict`` mapping each word to a two-element list
        ``[spam_count, non_spam_count]``.  Because messages are tokenized
        into a set, a word is counted at most once per message.
    """
    tallies = defaultdict(lambda: [0,0])
    for text, spam_flag in training_set:
        # Slot 0 accumulates spam messages, slot 1 non-spam messages.
        slot = 0 if spam_flag else 1
        for token in tokenize(text):
            tallies[token][slot] += 1
    return tallies

In [32]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word counts into smoothed per-class probabilities.

    Args:
        counts: Mapping ``word -> (spam_count, non_spam_count)``.
        total_spams: Number of spam messages in the training data.
        total_non_spams: Number of non-spam messages in the training data.
        k: Additive smoothing pseudo-count applied to every word.

    Returns:
        A list of ``(word, p_word_given_spam, p_word_given_non_spam)``
        tuples, one per key of ``counts``, in the mapping's iteration order.
    """
    spam_denominator = total_spams + 2 * k
    non_spam_denominator = total_non_spams + 2 * k

    triplets = []
    for word, (spam_count, non_spam_count) in counts.items():
        triplets.append((word,
                         (spam_count + k) / spam_denominator,
                         (non_spam_count + k) / non_spam_denominator))
    return triplets

In [33]:
def spam_probability(word_probs, message):
    """Estimate the probability that ``message`` is spam.

    Every vocabulary word contributes to both class log-likelihoods: its
    probability when it occurs in the message, and the complement of its
    probability when it does not.

    Args:
        word_probs: Iterable of ``(word, prob_if_spam, prob_if_not_spam)``
            tuples, as produced by ``word_probabilities``.
        message: The text to score.

    Returns:
        A float in ``[0, 1]``: ``P_spam / (P_spam + P_not_spam)``, where each
        ``P`` is the product of the per-word factors.  An empty
        ``word_probs`` yields ``0.5``.

    Raises:
        ValueError: If a probability is exactly 0 or 1, since ``math.log``
            is then asked for the log of zero.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    # Iterate over every word in the vocabulary.
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # Stay in log space: exponentiating each log-likelihood separately
    # underflows to 0.0 for large vocabularies and turns the final division
    # into 0/0.  With d = log P_not_spam - log P_spam the result is
    # 1 / (1 + e^d); the branch keeps the exponent non-positive so that
    # math.exp can never overflow either.
    log_ratio = log_prob_if_not_spam - log_prob_if_spam
    if log_ratio >= 0:
        inverse_ratio = math.exp(-log_ratio)
        return inverse_ratio / (1.0 + inverse_ratio)
    return 1.0 / (1.0 + math.exp(log_ratio))

In [34]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier built on the module-level helper functions.

    Attributes are set in ``__init__``: ``k`` is the smoothing pseudo-count
    and ``word_probs`` is the list of per-word probability triplets, which is
    empty until ``train`` is called.
    """

    def __init__(self, k=0.5):
        """Create an untrained classifier with smoothing pseudo-count ``k``."""
        self.word_probs = []
        self.k = k

    def train(self, training_set):
        """Fit the per-word probabilities from labelled messages.

        Args:
            training_set: A sized collection of ``(message, is_spam)`` pairs
                (``len()`` is taken on it).
        """
        # Count the spam messages and the ordinary (non-spam) messages.
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total

        self.word_probs = word_probabilities(count_words(training_set),
                                             spam_total,
                                             non_spam_total,
                                             self.k)

    def classify(self, message):
        """Return the spam probability of ``message`` under the trained model."""
        return spam_probability(self.word_probs, message)

In [35]:
def get_subject_data(path):
    """Collect labelled subject lines from every file matching ``path``.

    Args:
        path: A glob pattern; each matching path is opened as an
            ISO-8859-1 text file.

    Returns:
        A list of ``(subject, is_spam)`` tuples, one for every line that
        starts with ``"Subject:"``.  ``is_spam`` is ``True`` unless the
        substring ``"ham"`` occurs anywhere in the matched file path.
    """
    # Strips the leading "Subject:" and the whitespace after it, leaving the
    # rest of the line unchanged.
    subject_prefix = re.compile(r"^Subject:\s+")

    labelled_subjects = []
    for filename in glob.glob(path):
        spam_flag = not ("ham" in filename)

        with open(filename,'r',encoding='ISO-8859-1') as handle:
            labelled_subjects.extend(
                (subject_prefix.sub("", line).strip(), spam_flag)
                for line in handle
                if line.startswith("Subject:")
            )

    return labelled_subjects

In [41]:

def train_and_test_model(path, train_fraction=0.5, threshold=0.7, seed=0, top_n=5):
    """Train a classifier on subject lines and print an evaluation report.

    The labelled subjects found under ``path`` are split into training and
    test portions, a ``NaiveBayesClassifier`` is fitted on the training
    portion, and the following are printed, in order: the confusion counts,
    the most spam-like non-spam subjects, the least spam-like spam subjects,
    and the most / least spam-indicating words.

    Args:
        path: Glob pattern handed to ``get_subject_data``.
        train_fraction: Second argument passed to ``split_data``.
        threshold: A test message is predicted as spam when its spam
            probability is strictly greater than this value.
        seed: Value passed to ``random.seed`` before splitting.  Note that
            this reseeds the global ``random`` generator.
        top_n: Non-negative number of entries shown in each "top" list.

    Returns:
        None.  All results are printed.
    """
    data = get_subject_data(path)
    random.seed(seed)
    train_data, test_data = split_data(data, train_fraction)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # Triplets of (subject, actual is_spam, predicted spam probability).
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    # Keys are (actual is_spam, predicted is_spam).  The loop variable is
    # deliberately not called `spam_probability`, which would shadow the
    # module-level function of that name.
    counts = Counter((is_spam, predicted_prob > threshold)
                     for _, is_spam, predicted_prob in classified)

    print(counts)
    print('\n\n')

    # Ascending by predicted spam probability.
    classified.sort(key=lambda row: row[2])

    hams = [row for row in classified if not row[1]]
    spams = [row for row in classified if row[1]]

    # Slice from an explicit start index: `seq[-0:]` would return the whole
    # list rather than an empty one when top_n is 0.
    spammiest_hams = hams[max(len(hams) - top_n, 0):]
    hammiest_spams = spams[:top_n]

    print("spammiest_hams", spammiest_hams)
    print('\n\n')

    print("hammiest_spams", hammiest_spams)
    print('\n\n')

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[max(len(words) - top_n, 0):]
    hammiest_words = words[:top_n]

    print("spammiest_words", spammiest_words)
    print('\n\n')
    print("hammiest_words", hammiest_words)

In [42]:
def p_spam_given_word(word_prob):
    """Return the spam share of a word's two class-conditional probabilities.

    Args:
        word_prob: A ``(word, prob_if_spam, prob_if_not_spam)`` triplet.

    Returns:
        ``prob_if_spam / (prob_if_spam + prob_if_not_spam)``.  The word
        itself is ignored.
    """
    _, given_spam, given_not_spam = word_prob
    combined = given_spam + given_not_spam
    return given_spam / combined

In [44]:
# Run the train/evaluate/report pipeline on every path matching ./spam/*/*.
# A file is labelled non-spam when "ham" appears anywhere in its path.
train_and_test_model(r"./spam/*/*")

Counter({(False, False): 1452, (True, True): 145, (True, False): 116, (False, True): 26})



spammiest_hams [("Subject: There's still time to enter Lifetime's New Season Sweeps!", False, 0.9879840719297605), ('The MIME information you requested (last changed 3154 Feb 14)', False, 0.9903077361496861), ('"I meditated in a cave for 12 years and now I\'m here to tell you', False, 0.9950825332495429), ('=?iso-2022-jp?B?UmU6IBskQjswSSkyPTNYJSglcyU4JUslIiVqJXMlME1NJVcbKEI=?=', False, 0.9989289969518657), ('=?iso-8859-1?Q?Matrox_Parhelia=99_now_available?=', False, 0.9997057283083132)]



hammiest_spams [('A revolution in the PC world has arrived.          RSIRTR', True, 0.0004262931153700218), ('Re: girls', True, 0.0006195185177120998), ('Re: Hi', True, 0.0007359278142652607), ('Introducing Chase Platinum for Students with a 0% Introductory APR', True, 0.0015672571297456453), ('Testing a system, please delete', True, 0.0015685834538947101)]



spammiest_words [('norton', 0.026748971193415638,