In [0]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import math
import os
#os.chdir('./drive/My Drive/Project 3')

# IDEAS
- Lemmatization and Stemming (hard)
- TF-IDF

In [0]:
class Naive_Bayes():
    def __init__(self, stopwords=[]):
        self.stopwords = stopwords
        self._vocab_spam = {}
        self._vocab_ham = {}
        self._num_train_spam = 0
        self._num_train_ham = 0
        self._num_test_spam = 0
        self._num_test_ham = 0
        self._true_positive = 0
        self._true_negative = 0
    
  
  
    def fit(self, filenames, stopwords=[]):
        train_spam, self._num_train_spam = self._tokenizer(filenames[0])
        train_ham, self._num_train_ham = self._tokenizer(filenames[1])
        self._vocab_spam, self._vocab_ham = self._extend(train_spam, train_ham)
    
    
    def predict(self, filenames, stopwords=[], verbose=True):
        # Read in test files
        test_spam, self._num_test_spam = self._tokenizer(filenames[0], False)
        test_ham, self._num_test_ham = self._tokenizer(filenames[1], False)
        num_emails = self._num_train_spam + self._num_train_ham

        # Test spams emails
        for i in range(1, len(test_spam) + 1):
            # Calculate log probability of posteriors
            H_spam, match = self._total_log_proba(test_spam[i], self._vocab_spam, 
                                                  self._num_train_spam, num_emails)
            H_ham, match = self._total_log_proba(test_spam[i], self._vocab_ham, 
                                                 self._num_train_ham, num_emails)

            # Compare log probabilities
            if H_spam > H_ham:
                if verbose:
                    print('TEST {} {}/{} features true {:.3f} {:.3f} spam right'\
                         .format(i, match, len(self._vocab_spam), H_spam, H_ham))
                    self._true_positive += 1
            else:
                if verbose:
                    print('TEST {} {}/{} features true {:.3f} {:.3f} ham wrong'\
                         .format(i, match, len(self._vocab_spam), H_spam, H_ham))

        # Test ham emails
        for i in range(1, len(test_ham) + 1):
            # Calculate log probability of posteriors
            H_spam, match = self._total_log_proba(test_ham[i], self._vocab_spam, 
                                                  self._num_train_spam, num_emails)
            H_ham, match = self._total_log_proba(test_ham[i], self._vocab_ham, 
                                                 self._num_train_ham, num_emails)

            # Compare log probabilities
            if H_ham > H_spam:
                if verbose:
                    print('TEST {} {}/{} features true {:.3f} {:.3f} ham right'\
                         .format(i, match, len(self._vocab_spam), H_spam, H_ham))
                    self._true_negative += 1
            else:
                if verbose:
                    print('TEST {} {}/{} features true {:.3f} {:.3f} spam wrong'\
                         .format(i, match, len(self._vocab_spam), H_spam, H_ham))
        if verbose:
            print(f'Accuracy: {self._true_positive + self._true_negative}'
                  + f'/{self._num_test_ham + self._num_test_ham}')

  
    def accuracy_score(self):
        return (self._true_positive + self._true_negative)/(self._num_test_ham + self._num_test_ham)
  
  
    def recall(self):
        return self._true_positive / self._num_test_spam
  
  
    def precision(self):
        false_positive = self._num_test_ham - self._true_negative
        return self._true_positive / (self._true_positive + false_positive)
    
  
    def f_score(self, beta=1):
        return (1 + beta**2) * self.precision() * self.recall() / ((1 + beta**2) * self.precision() + self.recall())
  
  
  
    '''Input: 
    - path to a txt file 
    - is_train: optional, if we want to process a train or test set. 
                  If train, vocab = {word: frequency}
                  If test, vocab = {email_No: set_of_words}
    - stopwords: optional, prune out very common words
    
    Return: the Bag of Words of that file.
    '''
    def _tokenizer(self, filename, is_train=True):
        file = open(filename, "r")
        lines = file.readlines()
        file.close()

        vocab = {}
        num_emails = 0
        email = []
        for l in lines:
            # Start of the (new) email
            if l == '<SUBJECT>\n':
                email = []
                num_emails += 1
                continue

            # End of the email
            if l in ['</BODY>\n', '</BODY>']:
                # If test, vocab = {email_No: set_of_words}
                if not is_train:
                    vocab[num_emails] = set(email)
                    continue

                # If train, vocab = {word: frequency}
                for word in set(email):
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1

            # Process each line in the email
            # Ignore tags
            if l in ['</SUBJECT>\n', '<BODY>\n', ]:
                continue
          
            # Do simple cleaning
            l = l.replace('\n','')
            l = l.lower()

            # Get separated words for each email
            for word in l.split(' '):
                if word not in self.stopwords and word != '':
                    email.append(word)
                    
        return (vocab, num_emails)
    
    
    '''
    Combine spam and ham vocab_dict, and aslo transform both:
    if a word is in one dict but not the other, add the pair {word: 0} 
    to the other
    '''  
    def _extend(self, d1, d2):
        new_d1 = {}
        new_d2 = {}
        all_words = set(d1.keys()).union(set(d2.keys()))

        for word in all_words:
            if word not in d1:
                new_d1[word] = 0
            else:
                new_d1[word] = d1[word]

        for word in all_words:
            if word not in d2:
                new_d2[word] = 0
            else:
                new_d2[word] = d2[word]

        return new_d1, new_d2
  

    '''
    Calculate the log probability with Laplace smoothing
    '''
    def _total_log_proba(self, text, vocab, num_class, num_total):
        total_log_proba = math.log(num_class / num_total)
        match = 0
        
        for word in vocab:
            if word in text:
                match += 1
                total_log_proba = total_log_proba \
                                  + math.log((vocab[word] + 1) / (num_class + 2))
            else:
                total_log_proba = total_log_proba \
                                  + math.log((num_class - vocab[word] + 1) / (num_class + 2))
        
        return total_log_proba, match
  

In [24]:

train_spam = input('Name of spam training file: ')
train_ham = input('Name of ham training file: ')
test_spam = input('Name of spam testing file: ')
test_ham = input('Name of ham testing file: ')

with open('stopwords.txt', "r") as file:
    stopwords = file.read()
stopwords = stopwords.split('\n')

nb = Naive_Bayes()
nb.fit([train_spam, train_ham])
nb.predict([test_spam, test_ham])
print(f'Accuracy: {nb.accuracy_score()}\n'
     + f'F1: {nb.f_score()}')


Name of spam training file: ./data/train-spam.txt
Name of ham training file: ./data/train-ham.txt
Name of spam testing file: ./data/test-spam.txt
Name of ham testing file: ./data/test-ham.txt
TEST 1 257/78082 features true -1229.814 -1149.145 ham wrong
TEST 2 142/78082 features true -591.417 -557.911 ham wrong
TEST 3 55/78082 features true -206.369 -257.586 spam right
TEST 4 0/78082 features true -51.540 -131.653 spam right
TEST 5 859/78082 features true -4158.445 -3698.980 ham wrong
TEST 6 43/78082 features true -182.540 -226.682 spam right
TEST 7 10/78082 features true -73.022 -146.472 spam right
TEST 8 0/78082 features true -51.540 -131.653 spam right
TEST 9 14/78082 features true -84.320 -167.577 spam right
TEST 10 105/78082 features true -367.449 -441.174 spam right
TEST 11 43/78082 features true -153.265 -217.650 spam right
TEST 12 11/78082 features true -69.529 -139.552 spam right
TEST 13 142/78082 features true -526.630 -575.843 spam right
TEST 14 240/78082 features true -1151.

In [25]:
nb.f_score(0.5)

0.43916720884840604

In [26]:
nb2 = Naive_Bayes(stopwords=stopwords)
nb2.fit([train_spam, train_ham])
nb2.predict([test_spam, test_ham])

TEST 1 200/77955 features true -1101.828 -1070.968 ham wrong
TEST 2 97/77955 features true -494.347 -497.442 spam right
TEST 3 35/77955 features true -166.749 -213.960 spam right
TEST 4 0/77955 features true -37.337 -88.112 spam right
TEST 5 748/77955 features true -3859.830 -3518.053 ham wrong
TEST 6 30/77955 features true -152.172 -183.821 spam right
TEST 7 5/77955 features true -49.338 -98.789 spam right
TEST 8 0/77955 features true -37.337 -88.112 spam right
TEST 9 8/77955 features true -64.338 -124.987 spam right
TEST 10 72/77955 features true -306.833 -389.690 spam right
TEST 11 27/77955 features true -121.744 -176.394 spam right
TEST 12 7/77955 features true -52.472 -97.773 spam right
TEST 13 102/77955 features true -452.710 -521.325 spam right
TEST 14 197/77955 features true -1061.990 -1077.204 spam right
TEST 15 40/77955 features true -185.183 -230.956 spam right
TEST 16 29/77955 features true -138.716 -187.831 spam right
TEST 17 64/77955 features true -295.205 -357.193 spam r

In [27]:
nb2.f_score(0.5)

0.45264888102162093