# proszę podzielić dane na treningowe i testowe z innym współczynnikiem niż 0.75

In [9]:
from collections import Counter, defaultdict
from machine_learning import split_data
import math, random, re, glob

def tokenize(message):
    """Return the set of distinct lowercase tokens found in *message*.

    The message is lowercased first; a token is then any maximal run of
    ASCII letters, digits, or apostrophes. Duplicates collapse because the
    result is a set.
    """
    token_pattern = "[a-z0-9']+"
    lowered = message.lower()
    return {match.group() for match in re.finditer(token_pattern, lowered)}


def count_words(training_set):
    """Count, per word, how many spam and non-spam messages contain it.

    training_set is an iterable of (message, is_spam) pairs. The result is
    a defaultdict mapping each word to a two-item list
    [spam_count, non_spam_count]; words never seen default to [0, 0].
    """
    spam_slot, non_spam_slot = 0, 1
    tallies = defaultdict(lambda: [0, 0])
    for text, flagged in training_set:
        slot = spam_slot if flagged else non_spam_slot
        for token in tokenize(text):
            tallies[token][slot] += 1
    return tallies

def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn per-word counts into smoothed conditional probabilities.

    counts maps each word to a (spam_count, non_spam_count) pair. The result
    is a list of (word, p(word | spam), p(word | not spam)) triples, one per
    entry of counts, where each probability is (count + k) / (total + 2 * k)
    and k is the smoothing pseudocount.
    """
    triples = []
    for word, (in_spam, in_non_spam) in counts.items():
        p_given_spam = (in_spam + k) / (total_spams + 2 * k)
        p_given_non_spam = (in_non_spam + k) / (total_non_spams + 2 * k)
        triples.append((word, p_given_spam, p_given_non_spam))
    return triples

def spam_probability(word_probs, message):
    """Return the naive-Bayes estimate of P(spam | message).

    word_probs is an iterable of (word, p(word | spam), p(word | not spam))
    triples covering the whole vocabulary. Every vocabulary word contributes
    to both likelihoods: its probability when it occurs in the message, and
    one minus its probability when it does not. The two likelihoods are then
    compared without any prior weighting.

    Raises:
        ValueError: from math.log if any probability is exactly 0 or 1.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    # Accumulate in log space so that multiplying many small probabilities
    # does not underflow.
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            # Word seen in the message: add log p(word | class).
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            # Word absent: add the log probability of NOT seeing it,
            # log(1 - p(word | class)).
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # e^s / (e^s + e^h) == 1 / (1 + e^(h - s)). Exponentiating only the
    # difference avoids the ZeroDivisionError that occurs when both e^s and
    # e^h underflow to 0.0 for a large vocabulary.
    log_odds_against = log_prob_if_not_spam - log_prob_if_spam
    if log_odds_against > 0:
        # Keep the exponent non-positive so math.exp cannot overflow.
        ratio = math.exp(-log_odds_against)
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + math.exp(log_odds_against))


class NaiveBayesClassifier:
    """Naive Bayes spam classifier over (message, is_spam) training pairs."""

    def __init__(self, k=0.5):
        """Store the smoothing pseudocount k and start with no word model."""
        self.k = k
        self.word_probs = []

    def train(self, training_set):
        """Fit the per-word probabilities from training_set.

        training_set must support both iteration and len(); each item is a
        (message, is_spam) pair.
        """
        # Class totals: spam is counted directly, the rest is non-spam.
        spam_total = sum(1 for _, flagged in training_set if flagged)
        non_spam_total = len(training_set) - spam_total

        # Per-word counts, then smoothed probabilities.
        tallies = count_words(training_set)
        self.word_probs = word_probabilities(
            tallies, spam_total, non_spam_total, self.k
        )

    def classify(self, message):
        """Return the spam probability of message under the trained model."""
        return spam_probability(self.word_probs, message)


def get_subject_data(path):
    """Collect (subject, is_spam) pairs from every file matching *path*.

    path is a glob pattern. Each file is read as ISO-8859-1 text, and every
    line starting with "Subject:" yields one pair: the line with the
    "Subject:" prefix and surrounding whitespace removed, plus a flag that
    is True unless the substring "ham" occurs anywhere in the file's path.
    """
    # Strips the leading "Subject:" marker together with the whitespace
    # that follows it.
    prefix = re.compile(r"^Subject:\s+")
    rows = []

    # glob.glob returns every filename that matches the wildcard pattern.
    for filename in glob.glob(path):
        flagged = not ("ham" in filename)

        with open(filename, 'r', encoding='ISO-8859-1') as handle:
            rows.extend(
                (prefix.sub("", line).strip(), flagged)
                for line in handle
                if line.startswith("Subject:")
            )

    return rows

def p_spam_given_word(word_prob):
    """Return p(word | spam) / (p(word | spam) + p(word | not spam)).

    word_prob is a (word, p(word | spam), p(word | not spam)) triple; the
    word itself is ignored.
    """
    _, given_spam, given_non_spam = word_prob
    combined = given_spam + given_non_spam
    return given_spam / combined

def _print_section(*values, trailer='\r\n'):
    """Print values between two asterisk rules, then trailer on its own line."""
    rule = '******************************************************'
    print(rule)
    print(*values)
    print(rule)
    print(trailer)


def train_and_test_model(path, train_fraction=0.60):
    """Train a NaiveBayesClassifier on subject lines and print a report.

    Args:
        path: glob pattern handed to get_subject_data.
        train_fraction: second argument passed to split_data. Presumably the
            share of the data assigned to the training set; confirm against
            machine_learning.split_data. Defaults to 0.60.

    Prints, in order: the confusion counts keyed by (actual, predicted),
    the five non-spam subjects with the highest spam score, the five spam
    subjects with the lowest spam score, and the five most and least
    spam-indicative words according to p_spam_given_word.
    """
    data = get_subject_data(path)
    # Fixed seed so results are repeatable (assumes split_data draws from
    # the `random` module).
    random.seed(0)
    train_data, test_data = split_data(data, train_fraction)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # (subject, actual is_spam, predicted spam probability)
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    # A message is predicted spam when its score exceeds 0.5. The loop
    # variable is named `score` so it does not shadow spam_probability().
    counts = Counter((is_spam, score > 0.5)  # (actual, predicted)
                     for _, is_spam, score in classified)
    _print_section(counts, trailer='\n\n')

    # Ascending by score: the highest-scoring rows are at the end.
    classified.sort(key=lambda row: row[2])
    spammiest_hams = [row for row in classified if not row[1]][-5:]
    hammiest_spams = [row for row in classified if row[1]][:5]
    _print_section("spammiest_hams", spammiest_hams)
    _print_section("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)
    spammiest_words = words[-5:]
    hammiest_words = words[:5]
    _print_section("spammiest_words", spammiest_words)
    _print_section("hammiest_words", hammiest_words)

# Run the experiment on every path matching ./spam/*/* (entries two levels
# below ./spam).
train_and_test_model(r"./spam/*/*")

******************************************************
Counter({(False, False): 1534, (True, True): 489, (False, True): 130, (True, False): 94})
******************************************************



******************************************************
spammiest_hams [('Professional dog poop scooper has 300 customers', False, 0.9937672327820787), ('Digital service launches with 30 free channels', False, 0.9966400866954005), ("PFI hospital's £97m pay bill could cost NHS billions", False, 0.9994993395930126), ('"I meditated in a cave for 12 years and now I\'m here to tell you', False, 0.9998043865412485), ('Ah ... so they ARE coming for your porn next ...', False, 0.9999379800857013)]
******************************************************


******************************************************
hammiest_spams [('Re: change of plans', True, 0.00017890939686024095), ('[ILUG] Re: whats up -colonize', True, 0.00044203813546092987), ('RE: Own An Automated Shopping Mall                  

# za pomocą pakietu scikit-learn zaimplementuj wybraną wersję algorytmu ‘Naive Bayes’

In [4]:
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB


# Define the data: the three lists are parallel, one entry per observation.
weather = ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
           'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy']
temp = ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool',
        'Mild', 'Mild', 'Mild', 'Hot', 'Mild']

play = ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes',
        'Yes', 'Yes', 'Yes', 'No']


# The string labels must first be converted to numbers, e.g. "Overcast",
# "Rainy", "Sunny" become 0, 1, 2. This is known as label encoding.
# scikit-learn's LabelEncoder encodes labels with values from 0 to
# n_classes - 1, assigning the integers in sorted label order.

# Encoder for the feature columns. The original code used `le` without ever
# defining it, which raises NameError on a fresh run.
le = preprocessing.LabelEncoder()
# Encoder for the target labels. The misspelled name is kept so that any
# later code referring to it keeps working.
wheather = preprocessing.LabelEncoder()

# Convert the labels to numbers. `le` is refit for each column, so after
# these lines it only remembers the `temp` classes.
wheather_encoded = le.fit_transform(weather)  # Overcast=0, Rainy=1, Sunny=2
temp_encoded = le.fit_transform(temp)         # Cool=0, Hot=1, Mild=2
label = wheather.fit_transform(play)          # No=0, Yes=1

# Combine both features (weather and temperature) into one list of tuples.
features = list(zip(wheather_encoded, temp_encoded))

# Create the Gaussian classifier.
model = GaussianNB()

# Train the model on the training set.
model.fit(features, label)

# Predict the answer for one observation.
predicted = model.predict([[0, 2]])  # 0: Overcast, 2: Mild
print("Predicted Value:", predicted)  # 1 means the players can "play".

Predicted Value: [1]
