In [1]:
import pandas as pd
import numpy as np
import math 
from sklearn import preprocessing

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Question 1

data_path = "./"

# 1. loads the data file;
training_data = pd.read_csv(data_path + 'training.csv') 
validation_data = pd.read_csv(data_path + 'validation.csv')
test1_data = pd.read_csv(data_path + 'test1.csv') 
test2_data = pd.read_csv(data_path + 'test2.csv') 

# 2. load txt file
text_file = open('censored_list_test1.txt', 'r')
list_censored_test1 = text_file.read().split()
text_file.close()

text_file = open('censored_list_test2.txt', 'r')
list_censored_test2 = text_file.read().split()
text_file.close()

In [3]:
#Question 2

def cleaning_sms(col):
    # convert to lower-case
    col = col.str.lower()
    #remove digits
    col= col.str.replace('\d+', ' ')
    #remove punctuation
    col = col.str.replace('[^\w\s]',' ')
    return col

training_data["sms"] = cleaning_sms(training_data["sms"])
validation_data["sms"] = cleaning_sms(validation_data["sms"])
test1_data["sms"]= cleaning_sms(test1_data["sms"])
test2_data["sms"] = cleaning_sms(test2_data["sms"])

In [4]:
#Question 3

class NaiveBayesForSpam:
#These two functions trains the  Naive Bayes classifier, i.e get the labeled SMS(messages for which we know whether they're spam or ham), 
#iterate over each individual message and identify

    def train (self, hamMessages, spamMessages):
        self.words = set (' '.join (hamMessages + spamMessages).split())
        self.priors = np.zeros (2)
        self.priors[0] = float (len (hamMessages)) / (len (hamMessages) + len (spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        for i, w in enumerate (self.words):
            prob1 = (1.0 + len ([m for m in hamMessages if w in m])) / len (hamMessages)
            prob2 = (1.0 + len ([m for m in spamMessages if w in m])) / len (spamMessages)
            self.likelihoods.append ([min (prob1, 0.95), min (prob2, 0.95)])
        self.likelihoods = np.array (self.likelihoods).T
        
    def train2 (self, hamMessages, spamMessages):
        self.words = set (' '.join (hamMessages + spamMessages).split())
        self.priors = np.zeros (2)
        self.priors[0] = float (len (hamMessages)) / (len (hamMessages) + len (spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        spamkeywords = []
        for i, w in enumerate (self.words):
            prob1 = (1.0 + len ([m for m in hamMessages if w in m])) / len (hamMessages)
            prob2 = (1.0 + len ([m for m in spamMessages if w in m])) / len (spamMessages)
            if prob1 * 20 < prob2:
                self.likelihoods.append ([min (prob1, 0.95), min (prob2, 0.95)])
                spamkeywords.append (w)
        self.words = spamkeywords
        self.likelihoods = np.array (self.likelihoods).T

#Returning predictions, bayes's theorem is applied in this function 
    def predict (self, message):
        posteriors = np.copy (self.priors)
        for i, w in enumerate (self.words):
            if w in message.lower():  # convert to lower-case
                posteriors *= self.likelihoods[:,i]
            else:                                   
                posteriors *= np.ones (2) - self.likelihoods[:,i]
            posteriors = posteriors / np.linalg.norm (posteriors)  # normalise
        if posteriors[0] > 0.5:
            return ['ham', posteriors[0]]
        return ['spam', posteriors[1]]    

#Calcauting accuracy score and confusion matrix of the classifiers
    def score (self, messages, labels):
        ## | TP | FP |
        ## | FN | TN |
        confusion = np.zeros(4).reshape (2,2)
        for m, l in zip (messages, labels):
            if self.predict(m)[0] == 'ham' and l == 'ham':
                confusion[0,0] += 1
            elif self.predict(m)[0] == 'ham' and l == 'spam':
                confusion[0,1] += 1
            elif self.predict(m)[0] == 'spam' and l == 'ham':
                confusion[1,0] += 1
            elif self.predict(m)[0] == 'spam' and l == 'spam':
                confusion[1,1] += 1
        return (confusion[0,0] + confusion[1,1]) / float (confusion.sum()), confusion

### Question 4

#### The functions  above are used to calculate prior probability based on Bayes theorem
__train function__ 
* creates a set of all the words present in the spam and ham messages
* calculates the frequency with each word is present in a sentence, this frequency represents the ratio of the times a spam/ham message contains such words. It is noted that this ratio is restricted to be lower than 0.95

__train2 function__
 the function performs the same procedures as train 1 with the following differences
* In train2 function a new list is generated named "spamkeywords", which contains letters with high probability to appear in spamMessage, actually those that appear more than 20 times more often in spam than in ham messages
* Also, the prior probability(likelihood function) only contains the letters with high probability

__predict function__
* Initially all the words in message are converted to lower case 
* Then the function uses the naives bayes method to calculate the posterior probability as a proportion of likelihood and prior probability
* After performing a normalisation, to deal with computational issues, the message is categorised as  spam or ham according to the value of the posterior probability calculated


__score function__
* The function aggregates the number the messages based on whether they have been categorised correctly or wrongly . Essentialy it calculates and returns the confusion matrix

In [5]:
#Question 5
training_ham = list(training_data[training_data.label =='ham'].sms)
training_spam = list(training_data[training_data.label =='spam'].sms)

naive_bayes_training1=NaiveBayesForSpam()
naive_bayes_training1.train(training_ham,training_spam)

naive_bayes_training2=NaiveBayesForSpam()
naive_bayes_training2.train2(training_ham,training_spam)

In [19]:
#Question 6
confusion_matrix1 = naive_bayes_training1.score(validation_data.sms, validation_data.label)
confusion_matrix2 = naive_bayes_training2.score(validation_data.sms, validation_data.label)

print('The accuracy rate of classifer train is',confusion_matrix1[0])
print('The accuracy rate of classifer train2 is',confusion_matrix2[0])
print('Therefore, classifer 2 has a higher accuracy rate, so it is better, while the difference is not significant')

The accuracy rate of classifer train is 0.958
The accuracy rate of classifer train2 is 0.962
Therefore, classifer 2 has a higher accuracy rate, so it is better, while the difference is not significant


# Question 7

The train2 is faster because it uses far less words in order to calculate the posterior probability, consequently the list of words we are comparing each word in the validation set is way shorter and far less calculations are taking place, for example comparisons and multiplications. 

The train2 is more accurate because it uses words that are 20+ times more often present more in spam than in ham messages. Consequently it is more probable to be more important than the classifiers that are ignored. 
Those "non-important" classifier might be misleading as it may cause noise. 

In [16]:
#Q8
false_positives1 = confusion_matrix1[1][0][1]
false_positives2 = confusion_matrix2[1][0][1]
print('If we use train classifier, there are',false_positives1,'false positive.')
print('If we use train2 classifier, there are',false_positives2,'false positive.')
print('In order to reduce the number of false positives at the expense of false negatives we could decrease the limit at the threshold in the last part of  "predict" function at the line "if posteriors[0] > 0.5:" from 0.5 to a smaller number.')

If we use train classifier, there are 27.0 false positive.
If we use train2 classifier, there are 35.0 false positive.
In order to reduce the number of false positives at the expense of flase negatives we could decrease the limit at the threshold in the last part of  "predict" function at the line "if posteriors[0] > 0.5:" from 0.5 to a smaller number.


# Question 9
We will ignore the missing variables when we predict the probability. Consequently this formula will become $P(Y = C_j | X_1 = x_1, ..., X_p = x_p) = P(Y = C_j | X_1 = x_1, ..., X_{j-1} = x_{j-1}, X_{k+1} = x_{k+1}, ..., X_p = x_p)$ 

# Question 10

In [38]:
#for list_censored_test1
class NaiveBayesForSpam_updated_censored1:
    #rewritten the class with changed the predict function:
    def train (self, hamMessages, spamMessages):
        self.words = set (' '.join (hamMessages + spamMessages).split())
        self.priors = np.zeros (2)
        self.priors[0] = float (len (hamMessages)) / (len (hamMessages) + len (spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        for i, w in enumerate (self.words):
            prob1 = (1.0 + len ([m for m in hamMessages if w in m])) / len (hamMessages)
            prob2 = (1.0 + len ([m for m in spamMessages if w in m])) / len (spamMessages)
            self.likelihoods.append ([min (prob1, 0.95), min (prob2, 0.95)])
        self.likelihoods = np.array (self.likelihoods).T
        
    def train2 (self, hamMessages, spamMessages):
        self.words = set (' '.join (hamMessages + spamMessages).split())
        self.priors = np.zeros (2)
        self.priors[0] = float (len (hamMessages)) / (len (hamMessages) + len (spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        spamkeywords = []
        for i, w in enumerate (self.words):
            prob1 = (1.0 + len ([m for m in hamMessages if w in m])) / len (hamMessages)
            prob2 = (1.0 + len ([m for m in spamMessages if w in m])) / len (spamMessages)
            if prob1 * 20 < prob2:
                self.likelihoods.append ([min (prob1, 0.95), min (prob2, 0.95)])
                spamkeywords.append (w)
        self.words = spamkeywords
        self.likelihoods = np.array (self.likelihoods).T

    def predict (self, message):
        posteriors = np.copy (self.priors)
        for i, w in enumerate (self.words) :
            if (w not in list_censored_test1): # This algorithm ignores the censored that are included in test1.txt
                if w in message.lower(): 
                    posteriors *= self.likelihoods[:,i]
                else:                                   
                    posteriors *= np.ones (2) - self.likelihoods[:,i]
                posteriors = posteriors / np.linalg.norm (posteriors)  # normalise
        if posteriors[0] > 0.5:
            return ['ham', posteriors[0]]
        return ['spam', posteriors[1]]    

    def score (self, messages, labels):
        ## | TP | FP |
        ## | FN | TN |
        confusion = np.zeros(4).reshape (2,2)
        for m, l in zip (messages, labels):
            if self.predict(m)[0] == 'ham' and l == 'ham':
                confusion[0,0] += 1
            elif self.predict(m)[0] == 'ham' and l == 'spam':
                confusion[0,1] += 1
            elif self.predict(m)[0] == 'spam' and l == 'ham':
                confusion[1,0] += 1
            elif self.predict(m)[0] == 'spam' and l == 'spam':
                confusion[1,1] += 1
        return (confusion[0,0] + confusion[1,1]) / float (confusion.sum()), confusion

In [39]:
training_ham = list(training_data[training_data.label =='ham'].sms)
training_spam = list(training_data[training_data.label =='spam'].sms)

#train test1:
naive_bayes_test_censored1 = NaiveBayesForSpam_updated_censored1()
naive_bayes_test_censored2 = NaiveBayesForSpam_updated_censored1()

naive_bayes_test_censored1.train(training_ham, training_spam)
naive_bayes_test_censored2.train2(training_ham, training_spam)

confusion_matrix_test_censored1 = naive_bayes_test_censored1.score(test1_data.sms, test1_data.label)
confusion_matrix_test_censored2 = naive_bayes_test_censored2.score(test1_data.sms, test1_data.label)

print('The accuracy rate of classifer train is',confusion_matrix_test_censored1[0])
print('The accuracy rate of classifer train2 is',confusion_matrix_test_censored2[0])

The accuracy rate of classifer train is 0.9680933852140078
The accuracy rate of classifer train2 is 0.9727626459143969


# Question 11

In [40]:
class NaiveBayesForSpam_updated_censored2:
    #rewritten the class with changed the predict function:
    def train (self, hamMessages, spamMessages):
        self.words = set (' '.join (hamMessages + spamMessages).split())
        self.priors = np.zeros (2)
        self.priors[0] = float (len (hamMessages)) / (len (hamMessages) + len (spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        for i, w in enumerate (self.words):
            prob1 = (1.0 + len ([m for m in hamMessages if w in m])) / len (hamMessages)
            prob2 = (1.0 + len ([m for m in spamMessages if w in m])) / len (spamMessages)
            self.likelihoods.append ([min (prob1, 0.95), min (prob2, 0.95)])
        self.likelihoods = np.array (self.likelihoods).T
        
    def train2 (self, hamMessages, spamMessages):
        self.words = set (' '.join (hamMessages + spamMessages).split())
        self.priors = np.zeros (2)
        self.priors[0] = float (len (hamMessages)) / (len (hamMessages) + len (spamMessages))
        self.priors[1] = 1.0 - self.priors[0]
        self.likelihoods = []
        spamkeywords = []
        for i, w in enumerate (self.words):
            prob1 = (1.0 + len ([m for m in hamMessages if w in m])) / len (hamMessages)
            prob2 = (1.0 + len ([m for m in spamMessages if w in m])) / len (spamMessages)
            if prob1 * 20 < prob2:
                self.likelihoods.append ([min (prob1, 0.95), min (prob2, 0.95)])
                spamkeywords.append (w)
        self.words = spamkeywords
        self.likelihoods = np.array (self.likelihoods).T

    def predict (self, message):
        posteriors = np.copy (self.priors)
        for i, w in enumerate (self.words):
            if (w not in list_censored_test2): # This algorithm ignores the censored that are included in test2.txt
                if w in message.lower(): 
                    posteriors *= self.likelihoods[:,i]
                else:                                   
                    posteriors *= np.ones (2) - self.likelihoods[:,i]
                posteriors = posteriors / np.linalg.norm (posteriors)  # normalise
        if posteriors[0] > 0.5:
            return ['ham', posteriors[0]]
        return ['spam', posteriors[1]]    

    def score (self, messages, labels):
        ## | TP | FP |
        ## | FN | TN |
        confusion = np.zeros(4).reshape (2,2)
        for m, l in zip (messages, labels):
            if self.predict(m)[0] == 'ham' and l == 'ham':
                confusion[0,0] += 1
            elif self.predict(m)[0] == 'ham' and l == 'spam':
                confusion[0,1] += 1
            elif self.predict(m)[0] == 'spam' and l == 'ham':
                confusion[1,0] += 1
            elif self.predict(m)[0] == 'spam' and l == 'spam':
                confusion[1,1] += 1
        return (confusion[0,0] + confusion[1,1]) / float (confusion.sum()), confusion

In [41]:
training_ham = list(training_data[training_data.label =='ham'].sms)
training_spam = list(training_data[training_data.label =='spam'].sms)

#train test1:
naive_bayes_test2_censored1 = NaiveBayesForSpam_updated_censored2()
naive_bayes_test2_censored2 = NaiveBayesForSpam_updated_censored2()

naive_bayes_test2_censored1.train(training_ham, training_spam)
naive_bayes_test2_censored2.train2(training_ham, training_spam)

confusion_matrix_test2_censored1 = naive_bayes_test2_censored1.score(test2_data.sms, test2_data.label)
confusion_matrix_test2_censored2 = naive_bayes_test2_censored2.score(test2_data.sms, test2_data.label)

print('The accuracy rate of classifer train is',confusion_matrix_test2_censored1[0])
print('The accuracy rate of classifer train2 is',confusion_matrix_test2_censored2[0])

The accuracy rate of classifer train is 0.9665629860031104
The accuracy rate of classifer train2 is 0.9611197511664075


In [42]:
print('The accuracy on test2 are slightly smaller than the accuracy on test1 but is sitll very high. This makes sense since 30% keywords for test2 are removed,while 10%keywords for test 1 are removed')

The accuracy on test2 are slightly smaller than the accuracy on test1 but is sitll very high. This makes sense since 30% keywords for test2 are removed
