In [41]:
# library installation

!pip3 install numpy
!pip3 install matplotlib



In [42]:
# import necessary packages

import numpy as np
import matplotlib.pyplot as plt
import csv
import random

In [43]:
# function : extract data

def extract_data(path='spam.csv', encoding="ISO-8859-1"):
    """Read a comma-delimited dataset and return its rows without the header.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``'spam.csv'`` in the current
        working directory.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``"ISO-8859-1"``.

    Returns
    -------
    list of list of str
        One list of column values per row, in file order. The first row is
        treated as the header and discarded. An empty file yields ``[]``.
    """
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded inside quoted fields are read back unchanged.
    with open(path, encoding=encoding, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the header row, if there is one
        return list(csv_reader)

In [44]:
# function : data preprocessing
# returns training data, testing data, output, and word frequency
def pre_process_data(data, seed=None) :
    """Tokenise the messages and split them 70% / 30% into train and test sets.

    Every row of ``data`` is read as ``[label, text, *more_text]``: column 0
    is the label and all remaining columns are joined with spaces to form the
    message. Each message is reduced to its distinct lower-cased alphanumeric
    words, with stop words removed.

    Parameters
    ----------
    data : sequence of sequence of str
        Raw rows. Rows with no columns at all are skipped.
    seed : int, optional
        When given, the split is drawn from a private ``random.Random(seed)``
        so it is repeatable. When ``None`` (the default) the module-level
        ``random`` functions are used, as before.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test, word_freq)`` where the ``X``
        values are 1-D object arrays holding one list of words per message,
        the ``Y`` values are arrays of the matching labels, and ``word_freq``
        maps each word to the number of training messages that contain it.
    """

    # remove the stop words from the set, as they do not contribute to whether email is spam or not
    # (kept as a frozenset so each membership test is a hash lookup)
    stop_words = frozenset([
        "", "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
        "you", "your", "yours", "yourself", "yourselves", "he", "him",
        "his", "himself", "she", "her", "hers", "herself", "it", "its",
        "itself", "they", "them", "their", "theirs", "themselves", "what",
        "which", "who", "whom", "this", "that", "these", "those", "am", "is",
        "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because",
        "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
        "between", "into", "through", "during", "before", "after", "above", "below",
        "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
        "further", "then", "once", "here", "there", "when", "where", "why", "how", "all",
        "any", "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
        "can", "will", "just", "don", "should", "now",
    ])
    # [SOURCE] : https://gist.github.com/sebleier/554280

    # A row without any column has no label to read, so it is dropped.
    rows = [row for row in data if len(row) > 0]

    # (1) Create vector of spam, ham
    labels = np.array([row[0] for row in rows])

    # (2) Keep only alphanumeric characters, lower-case, drop stop words.
    # The messages have different lengths, so they are stored one list per
    # slot of an object array; np.array() on ragged lists is rejected by
    # recent NumPy releases.
    documents = np.empty(len(rows), dtype=object)
    for idx, row in enumerate(rows):
        text = " ".join(row[1:])
        cleaned = "".join(ch if ch.isalnum() else " " for ch in text)
        tokens = cleaned.lower().split()
        # dict.fromkeys removes duplicates while keeping first-seen order,
        # so the result does not depend on set iteration order.
        documents[idx] = list(
            dict.fromkeys(tok for tok in tokens if tok not in stop_words)
        )

    # (3) divide into training and testing data => 70% training and 30% test
    n_docs = documents.shape[0]
    train_size = (n_docs * 7) // 10
    rng = random if seed is None else random.Random(seed)
    train_indices = set(rng.sample(range(n_docs), train_size))
    train_mask = np.array([i in train_indices for i in range(n_docs)], dtype=bool)

    # Boolean masks keep the original row order within each partition.
    X_train, X_test = documents[train_mask], documents[~train_mask]
    Y_train, Y_test = labels[train_mask], labels[~train_mask]

    # (4) Count, for every word, the training messages it appears in.
    # Stop words were already filtered out in step (2).
    word_freq = {}
    for doc in X_train:
        for word in doc:
            word_freq[word] = word_freq.get(word, 0) + 1

    return (X_train, X_test, Y_train, Y_test, word_freq)

In [45]:
# transform the training and testing data
def X_transform(data, word_freq) :
    """Encode tokenised messages as a binary word-presence matrix.

    Parameters
    ----------
    data : sequence of sequences of str
        One collection of words per message.
    word_freq : dict
        Vocabulary. Only its keys are used; their iteration order fixes the
        column order of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), len(word_freq))`` whose entry
        ``[i, j]`` is ``1.0`` when vocabulary word ``j`` occurs in message
        ``i`` and ``0.0`` otherwise.
    """
    # Map each vocabulary word to its column once, so every token costs a
    # single dictionary lookup instead of scanning the message per word.
    column_of = {word: col for col, word in enumerate(word_freq)}

    new_data = np.zeros((len(data), len(column_of)), dtype=float)
    for row_idx, words in enumerate(data):
        for word in words:
            col = column_of.get(word)
            if col is not None:  # words outside the vocabulary are ignored
                new_data[row_idx, col] = 1.0
    return new_data

def Y_transform(data) :
    """Encode labels numerically: ``"spam"`` becomes 1.0, anything else 0.0.

    ``data`` must expose ``.shape`` and integer indexing (e.g. a 1-D numpy
    array of strings). Returns a numpy array with one float per label.
    """
    encoded = [float(data[idx] == "spam") for idx in range(data.shape[0])]
    return np.array(encoded)

In [46]:
# get all necessary data
# pre_process_data draws the train/test split from the module-level `random`
# generator, so seeding it first makes the split repeatable between runs.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

data = extract_data()
(X_train, X_test, Y_train, Y_test, word_freq) = pre_process_data(data)

# transform data: word lists -> 0/1 feature rows, labels -> 0/1 floats
X_train = X_transform(X_train, word_freq)
X_test = X_transform(X_test, word_freq)
Y_train = Y_transform(Y_train)
Y_test = Y_transform(Y_test)

In [47]:
# helper functions for the naive Bayes classifier
# the Bernoulli model is used: each feature records whether a dictionary word is present (1) or absent (0)

# prediction for y
def y_prediction(Y_train) :
    """Return the sum of the labels divided by their count.

    With 0/1 labels this is the fraction of training examples labelled 1.
    ``Y_train`` must expose ``.shape``; the result is a Python float.
    """
    sample_count = float(Y_train.shape[0])
    label_total = 0
    for label in Y_train:
        label_total = label_total + label
    return float(float(label_total) / sample_count)

# probability for some y value
def y_probability(Y_train, y_val) :
    """Return the Bernoulli probability of label ``y_val``.

    With ``phi = y_prediction(Y_train)`` the result is
    ``phi ** y_val * (1 - phi) ** (1 - y_val)``, i.e. ``phi`` for
    ``y_val == 1`` and ``1 - phi`` for ``y_val == 0``.
    """
    phi = y_prediction(Y_train)
    return (phi ** y_val) * ((float(1) - phi) ** (1 - y_val))

# prediction for x_j(jth word of dictionary) when y = 1
def x_prediction_1(X_train, Y_train, word_index) :
    """Estimate the probability that word ``word_index`` is present given y = 1.

    The estimate is ``(count + 1) / (total + 2)`` where ``count`` is the number
    of rows with label 1 whose feature ``word_index`` equals 1 and ``total``
    is the sum of the labels (the number of label-1 rows when labels are
    0/1). The +1 / +2 smoothing keeps the result strictly between 0 and 1.

    Parameters
    ----------
    X_train : array-like
        Feature rows; normally a 2-D 0/1 array of shape (samples, words).
    Y_train : array-like
        One 0/1 label per row of ``X_train``.
    word_index : int
        Column of the word being estimated.

    Returns
    -------
    float
    """
    labels = np.asarray(Y_train)
    features = np.asarray(X_train)

    if features.ndim == 2:
        word_column = features[:, word_index]
    else:
        # Rows stored as separate sequences, or no rows at all.
        word_column = np.array([row[word_index] for row in features])

    den = float(labels.sum())
    num = float(np.count_nonzero((labels == 1) & (word_column == 1)))

    return float((num + float(1)) / (den + float(2)))

# prediction for x when y = 0
def x_prediction_0(X_train, Y_train, word_index) :
    """Estimate the probability that word ``word_index`` is present given y = 0.

    The estimate is ``(count + 1) / (total + 2)`` where ``count`` is the number
    of rows with label 0 whose feature ``word_index`` equals 1 and ``total``
    is the sum of ``1 - label`` (the number of label-0 rows when labels are
    0/1). The +1 / +2 smoothing keeps the result strictly between 0 and 1.

    Parameters
    ----------
    X_train : array-like
        Feature rows; normally a 2-D 0/1 array of shape (samples, words).
    Y_train : array-like
        One 0/1 label per row of ``X_train``.
    word_index : int
        Column of the word being estimated.

    Returns
    -------
    float
    """
    labels = np.asarray(Y_train)
    features = np.asarray(X_train)

    if features.ndim == 2:
        word_column = features[:, word_index]
    else:
        # Rows stored as separate sequences, or no rows at all.
        word_column = np.array([row[word_index] for row in features])

    den = float((float(1) - labels).sum())
    num = float(np.count_nonzero((labels == 0) & (word_column == 1)))

    return float((num + float(1)) / (den + float(2)))

# Bernoulli probability of the feature value x_val for one dictionary word, given that word's parameter phi
def x_probability(X_train, Y_train, phi, x_val) :
    """Return ``phi ** x_val * (1 - phi) ** (1 - x_val)``.

    For ``x_val == 1`` this is ``phi``; for ``x_val == 0`` it is ``1 - phi``.
    ``X_train`` and ``Y_train`` are accepted but not used.
    """
    present_term = phi ** x_val
    absent_term = (1 - phi) ** (1 - x_val)
    return present_term * absent_term

In [48]:
# Compute the class priors and the per-word Bernoulli parameters once and keep
# them, so the prediction cells do not have to recompute them per message.
y_prob_1 = y_probability(Y_train, 1)
y_prob_0 = y_probability(Y_train, 0)

# x_phi_vals_1[j] and x_phi_vals_0[j] hold the values that
# x_prediction_1(X_train, Y_train, j) and x_prediction_0(X_train, Y_train, j)
# give for 0/1 data. Counting every column with array operations replaces one
# Python pass over the training set per dictionary word.
spam_rows = Y_train == 1
ham_rows = Y_train == 0

x_phi_vals_1 = (np.count_nonzero(X_train[spam_rows] == 1, axis=0) + 1.0) / (Y_train.sum() + 2.0)
x_phi_vals_0 = (np.count_nonzero(X_train[ham_rows] == 1, axis=0) + 1.0) / ((1.0 - Y_train).sum() + 2.0)

In [49]:
# check accuracy for training data
#
# For each message the two class scores are
#     log P(y) + sum_j [ x_j * log(phi_j) + (1 - x_j) * log(1 - phi_j) ]
# Sums of logarithms are compared instead of products of probabilities: a
# product over the whole dictionary can underflow to 0.0 for both classes, and
# the posterior would then be 0 / 0. Both posteriors share one denominator, so
# comparing the log numerators selects the same class.

with np.errstate(divide="ignore"):  # a prior of exactly 0 gives -inf, which never wins
    log_prior_spam = np.log(y_prob_1)
    log_prior_ham = np.log(y_prob_0)

log_phi_spam, log_not_phi_spam = np.log(x_phi_vals_1), np.log1p(-x_phi_vals_1)
log_phi_ham, log_not_phi_ham = np.log(x_phi_vals_0), np.log1p(-x_phi_vals_0)

total_correct = 0
for i in range(X_train.shape[0]) :
    if((i + 1) % 500 == 0) :
        print("Doing prediction for : ", i + 1)

    present = X_train[i]      # 1.0 where the word occurs in the message
    absent = 1.0 - present    # 1.0 where it does not

    # log-score for spam (y = 1)
    spam_score = log_prior_spam + present @ log_phi_spam + absent @ log_not_phi_spam
    # log-score for ham (y = 0)
    ham_score = log_prior_ham + present @ log_phi_ham + absent @ log_not_phi_ham

    # ties are resolved in favour of spam
    predicted_label = 1 if spam_score >= ham_score else 0
    if(predicted_label == Y_train[i]) :
        total_correct = total_correct + 1

percentage_accuracy = (float(total_correct) / float(X_train.shape[0])) * 100
print('Percentage accuracy on training data: ', percentage_accuracy)

Doing prediction for :  500
Doing prediction for :  1000
Doing prediction for :  1500
Doing prediction for :  2000
Doing prediction for :  2500
Doing prediction for :  3000
Doing prediction for :  3500
Percentage accuracy on training data:  98.64102564102564


In [50]:
# (correctly classified training messages, total training messages)
(total_correct, len(X_train))

(3847, 3900)

In [51]:
# check accuracy for testing data
#
# For each message the two class scores are
#     log P(y) + sum_j [ x_j * log(phi_j) + (1 - x_j) * log(1 - phi_j) ]
# Sums of logarithms are compared instead of products of probabilities: a
# product over the whole dictionary can underflow to 0.0 for both classes, and
# the posterior would then be 0 / 0. Both posteriors share one denominator, so
# comparing the log numerators selects the same class.

with np.errstate(divide="ignore"):  # a prior of exactly 0 gives -inf, which never wins
    log_prior_spam = np.log(y_prob_1)
    log_prior_ham = np.log(y_prob_0)

log_phi_spam, log_not_phi_spam = np.log(x_phi_vals_1), np.log1p(-x_phi_vals_1)
log_phi_ham, log_not_phi_ham = np.log(x_phi_vals_0), np.log1p(-x_phi_vals_0)

total_correct = 0
for i in range(X_test.shape[0]) :
    if((i + 1) % 300 == 0) :
        print("Doing prediction for : ", i + 1)

    present = X_test[i]       # 1.0 where the word occurs in the message
    absent = 1.0 - present    # 1.0 where it does not

    # log-score for spam (y = 1)
    spam_score = log_prior_spam + present @ log_phi_spam + absent @ log_not_phi_spam
    # log-score for ham (y = 0)
    ham_score = log_prior_ham + present @ log_phi_ham + absent @ log_not_phi_ham

    # ties are resolved in favour of spam
    predicted_label = 1 if spam_score >= ham_score else 0
    if(predicted_label == Y_test[i]) :
        total_correct = total_correct + 1

percentage_accuracy = (float(total_correct) / float(X_test.shape[0])) * 100
print('Percentage accuracy on testing data: ', percentage_accuracy)

Doing prediction for :  300
Doing prediction for :  600
Doing prediction for :  900
Doing prediction for :  1200
Doing prediction for :  1500
Percentage accuracy on testing data:  97.72727272727273


In [52]:
# (correctly classified test messages, total test messages)
(total_correct, len(X_test))

(1634, 1672)