In [1]:
# import the libraries and load the labelled messages into a DataFrame

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
# Read the tab-separated corpus. The column names are supplied through `names`,
# so the first line of the file is read as data, not as a header.
df = pd.read_csv(
    './messages.txt',
    sep='\t',
    names=["label", "message"],
)



In [2]:
# Divide the data into two groups: training and test examples.
#
# The split is done by hand (no sklearn helper): the first 80% of the rows are
# used for training and the remaining 20% for testing.
# NOTE: the rows are NOT shuffled - the shuffle below is commented out, so the
# split follows the order of the rows in the file.
# X_train / X_test hold the messages, y_train / y_test hold the labels.

# shuffled_df = df.sample(frac=1)
train_size = int(len(df) * 0.8)

# rows [0, train_size) -> training set, rows [train_size, end) -> test set
X_train, X_test = df['message'].iloc[:train_size], df['message'].iloc[train_size:]
y_train, y_test = df['label'].iloc[:train_size], df['label'].iloc[train_size:]



In [3]:
# Parse both the training and the test examples to build the spam and ham lists.
# `spam` collects every message labelled 'spam'; `ham` collects every other message.
# The training messages come first, followed by the test messages.
#
# NOTE(review): because these lists cover both splits, anything derived from
# their sizes (for instance class priors) also reflects the test labels.

spam = []
ham = []

# Pair each split's messages with that split's own labels. The test messages
# used to be looked up in X_train, which filed the wrong texts under each class.
for messages, labels in ((X_train, y_train), (X_test, y_test)):
    for message, label in zip(messages, labels):
        if label == 'spam':
            spam.append(message)
        else:
            ham.append(message)






In [4]:
# Generate a dictionary (vocabulary) from the training data

def make_Dictionary(data, max_words=3000):
    """Build the vocabulary of the most frequent words in ``data``.

    Each message is split on whitespace. Tokens that are not purely alphabetic
    or that are a single character long are ignored.

    Parameters
    ----------
    data : iterable of str
        The messages (a pandas Series, a list, ...).
    max_words : int or None, default 3000
        Maximum number of entries to return; ``None`` returns every kept word.

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs ordered from most to least frequent. Words with
        equal counts keep the order in which they were first seen.
    """
    # Filter while counting so that rejected tokens are never stored.
    word_counts = Counter(
        word
        for message in data
        for word in message.split()
        if len(word) > 1 and word.isalpha()
    )
    return word_counts.most_common(max_words)



In [5]:
# Build the vocabulary from the training messages only.
# The spelling `dictionnary` is the name that extract_features reads.
dictionnary = make_Dictionary(X_train)

In [6]:
# Extract features from both the training data and test data

def extract_features(data, dictionary=None, n_features=3000):
    """Turn messages into a matrix of word counts.

    Parameters
    ----------
    data : sized iterable of str
        The messages; one row is produced per message.
    dictionary : sequence of (word, count) pairs, optional
        Vocabulary whose position ``j`` defines column ``j``. Defaults to the
        notebook-level ``dictionnary``. Each word is assumed to appear once.
    n_features : int, default 3000
        Number of columns of the returned matrix.

    Returns
    -------
    numpy.ndarray of shape (len(data), n_features)
        Entry ``[i, j]`` is how many times vocabulary word ``j`` occurs in
        message ``i``; columns with no vocabulary word stay at zero.
    """
    if dictionary is None:
        dictionary = dictionnary  # vocabulary built earlier in the notebook

    # One lookup table replaces a scan of the whole vocabulary for every word.
    column_of = {entry[0]: col for col, entry in enumerate(dictionary)}

    features_matrix = np.zeros((len(data), n_features))
    for row, line in enumerate(data):
        # Count each distinct word of the message once.
        for word, occurrences in Counter(line.split()).items():
            col = column_of.get(word)
            if col is not None:
                features_matrix[row, col] = occurrences
    return features_matrix

In [7]:
# Turn both splits into word-count matrices: one row per message,
# one column per vocabulary position.
train_matrix = extract_features(X_train)
test_matrix = extract_features(X_test)

# Fitting the data

In [8]:
# Encode the labels numerically: 0.0 for 'ham', 1.0 for any other label ('spam').
train_labels = (y_train != 'ham').to_numpy().astype(float)
test_labels = (y_test != 'ham').to_numpy().astype(float)

# Prior probability of spam and ham, estimated from the TRAINING labels only.
# The sizes of the `spam` / `ham` lists must not be used here: those lists also
# contain the test messages, which would leak test information into the model.
prior_spam = float(train_labels.mean())
prior_ham = 1.0 - prior_spam

# train_matrix and test_matrix were already computed by the previous cell.
# They are reused as they are instead of re-running the feature extraction.

# Each row of train_matrix holds the word counts of one training message.
# Using the train labels, separate those rows into a spam matrix and a ham
# matrix, from which the per-class word likelihoods are computed.
spam_train_matrix = train_matrix[train_labels == 1]
ham_train_matrix = train_matrix[train_labels != 1]

# function that returns, for one class, the smoothed likelihood of every vocabulary word
def calculate_likelihood(train_matrix, alpha=1):
    """Estimate ``p(word | class)`` for every column of a word-count matrix.

    Under the Naive Bayes assumption
    ``p(f1, f2, ... | class) = p(f1 | class) * p(f2 | class) * ...`` with
    ``p(word | class) = (count of word in the class + alpha)
                        / (count of all words in the class + alpha * n_words)``.

    Adding ``alpha`` (Laplace smoothing for ``alpha = 1``) stops an unseen word
    from getting probability zero. ``n_words`` is the number of columns of the
    matrix, so the returned likelihoods always sum to 1.

    Parameters
    ----------
    train_matrix : array-like of shape (n_messages, n_words)
        Word counts of the messages belonging to one class.
    alpha : float, default 1
        Smoothing constant added to every word count.

    Returns
    -------
    numpy.ndarray of shape (n_words,)
    """
    counts = np.asarray(train_matrix, dtype=float)
    n_words = counts.shape[1]
    return (counts.sum(axis=0) + alpha) / (counts.sum() + alpha * n_words)
        
        

# The same estimate is computed once per class:
# p(f1,f2,f3,...|ham) = p(f1|ham) * p(f2|ham) * p(f3|ham)
# p(word|ham) = (number of times word appears in ham messages) / (number of words in ham messages)

# one likelihood per vocabulary column, for the spam rows and for the ham rows
spam_likelihoods = calculate_likelihood(spam_train_matrix)
ham_likelihoods = calculate_likelihood(ham_train_matrix)


# Predict data with test matrix

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def predict(matrix, likelihoods=None, priors=None):
    """Classify every row of a word-count matrix as spam (1) or ham (0).

    A message is labelled spam when
    ``p(words | spam) * p(spam) > p(words | ham) * p(ham)``.
    The comparison is done on logarithms: multiplying many small probabilities
    underflows to 0.0 for long messages, which made both sides equal and
    silently turned the prediction into ham.

    Parameters
    ----------
    matrix : array-like of shape (n_messages, n_words)
        Word counts, one row per message.
    likelihoods : (spam_likelihoods, ham_likelihoods), optional
        Per-word likelihoods of each class. Defaults to the notebook-level
        ``spam_likelihoods`` and ``ham_likelihoods``.
    priors : (prior_spam, prior_ham), optional
        Class priors. Defaults to the notebook-level ``prior_spam`` and
        ``prior_ham``.

    Returns
    -------
    list of int
        One label per row: 1 for spam, 0 for ham (ties go to ham).
    """
    if likelihoods is None:
        likelihoods = (spam_likelihoods, ham_likelihoods)
    if priors is None:
        priors = (prior_spam, prior_ham)
    spam_lik, ham_lik = likelihoods
    p_spam, p_ham = priors

    counts = np.asarray(matrix, dtype=float)
    if counts.size == 0:
        return []
    counts = np.atleast_2d(counts)

    # A prior of exactly 0 gives -inf, which can never win the comparison.
    with np.errstate(divide="ignore"):
        spam_scores = counts @ np.log(spam_lik) + np.log(p_spam)
        ham_scores = counts @ np.log(ham_lik) + np.log(p_ham)

    return (spam_scores > ham_scores).astype(int).tolist()


# classify the held-out test messages with the hand-written predictor
results2 = predict(test_matrix)

# compare the predictions with the true test labels (0 = ham, 1 = spam)
print(confusion_matrix(test_labels, results2))
print(accuracy_score(test_labels, results2))

[[865   9]
 [ 13 113]]
0.978


### Test our results by comparing with the sklearn methods 

In [10]:
# implement the Naive Bayes 
from sklearn.naive_bayes import MultinomialNB


# Reference model used to cross-check the hand-written implementation.
# NOTE: a later cell defines its own class called MultinomialNB; re-running
# this cell after that one would instantiate the local class instead.
model = MultinomialNB()


# train the model on the same word-count matrix and numeric labels
model.fit(train_matrix, train_labels)


# predict the labels of the test messages
result = model.predict(test_matrix)

# we can see that our results are the same as for sklearn
print(confusion_matrix(test_labels, result))

[[865   9]
 [ 13 113]]


# Class implementation (not finished)

In [11]:
# Class-based version of the Naive Bayes classifier built step by step above.
#
# NOTE: this class has the same name as sklearn's MultinomialNB. Once this cell
# has run, the name `MultinomialNB` refers to the class below.

class MultinomialNB:
    """Two-class multinomial Naive Bayes classifier (1 = spam, anything else = ham).

    Everything the model needs is learned in ``fit``; no notebook-level
    variable is read. After fitting, these attributes are available:

    - ``prior_spam`` / ``prior_ham``: class priors from the training labels
    - ``spam_train_matrix`` / ``ham_train_matrix``: training rows of each class
    - ``spam_likelihoods`` / ``ham_likelihoods``: smoothed ``p(word | class)``
    """

    def __init__(self):
        # Per-instance state. Class-level lists would be shared by every
        # instance and would keep growing across calls to fit().
        self.prior_spam = None
        self.prior_ham = None
        self.spam_train_matrix = None
        self.ham_train_matrix = None
        self.spam_likelihoods = None
        self.ham_likelihoods = None

    @staticmethod
    def _smoothed_likelihoods(class_counts, n_words):
        # Laplace smoothing: +1 per word, so +n_words in the denominator.
        return (class_counts.sum(axis=0) + 1) / (class_counts.sum() + n_words)

    def fit(self, X, y):
        """Learn class priors and per-word likelihoods.

        Parameters
        ----------
        X : array-like of shape (n_messages, n_words)
            Word counts, one row per training message.
        y : array-like of shape (n_messages,)
            Labels; 1 marks spam, every other value is treated as ham.

        Returns
        -------
        MultinomialNB
            The fitted instance itself.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or does not have one label per row.
        """
        counts = np.asarray(X, dtype=float)
        labels = np.asarray(y)
        if counts.ndim != 2:
            raise ValueError("X must be a 2-D matrix of word counts")
        if counts.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")
        if labels.shape[0] != counts.shape[0]:
            raise ValueError("X and y must contain the same number of messages")

        is_spam = labels == 1
        self.spam_train_matrix = counts[is_spam]
        self.ham_train_matrix = counts[~is_spam]

        self.prior_spam = float(is_spam.mean())
        self.prior_ham = 1.0 - self.prior_spam

        n_words = counts.shape[1]
        self.spam_likelihoods = self._smoothed_likelihoods(self.spam_train_matrix, n_words)
        self.ham_likelihoods = self._smoothed_likelihoods(self.ham_train_matrix, n_words)
        return self

    def predict(self, X):
        """Label every row of ``X`` as spam (1) or ham (0).

        Scores are compared in log space so that long messages do not
        underflow to 0.0. Ties go to ham.

        Parameters
        ----------
        X : array-like of shape (n_messages, n_words)
            Word counts, one row per message.

        Returns
        -------
        list of int

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        if self.spam_likelihoods is None or self.ham_likelihoods is None:
            raise AttributeError("this MultinomialNB instance is not fitted yet; call fit() first")

        counts = np.asarray(X, dtype=float)
        if counts.size == 0:
            return []
        counts = np.atleast_2d(counts)

        # A class absent from the training data has prior 0, hence log-prior
        # -inf, and can never be predicted.
        with np.errstate(divide="ignore"):
            spam_scores = counts @ np.log(self.spam_likelihoods) + np.log(self.prior_spam)
            ham_scores = counts @ np.log(self.ham_likelihoods) + np.log(self.prior_ham)

        return (spam_scores > ham_scores).astype(int).tolist()
        

    
