In [1]:
# import libraries
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
# Read the CSV file and split it into labels (column v1) and message texts (column v2).
df = pd.read_csv('spam.csv', encoding="ISO-8859-1", usecols=[0, 1])
label_transform = {"ham": 0, 'spam': 1}
labels = [label_transform[label] for label in df['v1'].tolist()]
messages = df['v2'].tolist()

# Bag-of-words representation: one dense list of token counts per message.
bow_vectorizer = CountVectorizer(stop_words='english')
bow_messages = bow_vectorizer.fit_transform(messages).toarray().tolist()

# Separate the dataset into a training set and a validation set.
all_indexs = list(range(len(labels)))

# Choose 75% of the messages for training.
# A dedicated, seeded generator keeps the split identical across
# "Restart & Run All" without touching the global `random` state.
SPLIT_SEED = 42
training_indexs = random.Random(SPLIT_SEED).sample(all_indexs, k=int(len(labels) * 0.75))
# Everything not sampled for training goes to validation, in ascending index order.
training_index_set = set(training_indexs)
validation_indexs = [i for i in all_indexs if i not in training_index_set]
training_labels = [labels[i] for i in training_indexs]
training_messages = [bow_messages[i] for i in training_indexs]
validation_labels = [labels[i] for i in validation_indexs]
validation_messages = [bow_messages[i] for i in validation_indexs]

print('number of messages: ', len(bow_messages))
print('number of training mesaages: ', len(training_messages))
print('number of validation messages: ', len(validation_messages))

number of messages:  5572
number of training mesaages:  4179
number of validation messages:  1393


In [2]:
# naive bayes
# assumes each row holds only feature values; the labels are passed in as a separate list
import math

def trainModel(train_set, train_label):
    """Fit per-class feature statistics and class priors.

    Rows of ``train_set`` whose entry in ``train_label`` equals 1 form the
    spam class; rows whose entry equals 0 form the ham class.

    Returns a 6-tuple:
    (spam_set_mean, spam_set_stdev, ham_set_mean, ham_set_stdev,
     spam_prob, ham_prob).
    """
    spam_rows = [row for idx, row in enumerate(train_set) if train_label[idx] == 1]
    ham_rows = [row for idx, row in enumerate(train_set) if train_label[idx] == 0]

    # Prior of spam is its share of the training rows; the ham prior is
    # taken as the complement rather than counted separately.
    spam_prior = len(spam_rows) / float(len(train_set))
    ham_prior = 1.0 - spam_prior

    spam_stats = trainSet(spam_rows)
    ham_stats = trainSet(ham_rows)
    spam_means, spam_stdevs = spam_stats
    ham_means, ham_stdevs = ham_stats
    return (spam_means, spam_stdevs, ham_means, ham_stdevs, spam_prior, ham_prior)
    
def mean(data):
    """Return the arithmetic mean of ``data`` as a float."""
    total = sum(data)
    count = float(len(data))
    return total / count

def stdev(data):
    """Return the sample standard deviation (n - 1 denominator) of ``data``.

    A single observation has no spread to estimate, so 0.0 is returned
    instead of dividing by zero. Empty input still raises
    ``ZeroDivisionError`` while computing the average.
    """
    n = len(data)
    if n == 1:
        # The n - 1 denominator would be zero; report "no spread" instead.
        return 0.0
    # Average is computed inline so the function has no external dependency.
    avg = sum(data) / float(n)
    variance = sum([(x - avg) ** 2 for x in data]) / float(n - 1)
    return math.sqrt(variance)

def logGaussianProbability(x, mean, stdev):
    """Return the log of the Gaussian density at ``x``.

    ``mean`` and ``stdev`` are the distribution parameters. A ``stdev`` of
    exactly zero is replaced by 0.01 so the density stays finite.
    """
    sigma = 0.01 if stdev == .0 else stdev
    exponent = -((x - mean) ** 2 / (2 * sigma ** 2))
    log_normaliser = math.log(math.sqrt(2 * math.pi) * sigma)
    return exponent - log_normaliser
    
def trainSet(s):
    """Compute the per-column mean and standard deviation of the rows in ``s``.

    The number of columns is taken from the first row, and every column is
    treated as a feature.

    Returns a tuple ``(set_mean, set_stdev)`` of two lists, one entry per
    column.
    """
    width = len(s[0])
    columns = ([row[j] for row in s] for j in range(width))
    # Each column is materialised once and summarised before the next is built.
    stats = [(mean(col), stdev(col)) for col in columns]
    set_mean = [avg for avg, _ in stats]
    set_stdev = [spread for _, spread in stats]
    return (set_mean, set_stdev)

def probability(d, set_mean, set_stdev, prob):
    """Score one sample ``d`` against a class.

    Starts from ``log(prob)`` (the class prior) and adds the log Gaussian
    probability of each feature value under that class's per-feature
    mean and standard deviation. Returns the accumulated log score.
    """
    log_score = math.log(prob)
    for idx, value in enumerate(d):
        log_score += logGaussianProbability(value, set_mean[idx], set_stdev[idx])
    return log_score
        
def predictions(data, spam_set_mean, spam_set_stdev, ham_set_mean, ham_set_stdev, spam_prob, ham_prob):
    """Classify every row of ``data`` as spam (1) or ham (0).

    Each row is scored against both classes with ``probability``; the class
    with the higher score wins. Returns a list with one label per row.
    """
    def classify(row):
        spam_score = probability(row, spam_set_mean, spam_set_stdev, spam_prob)
        ham_score = probability(row, ham_set_mean, ham_set_stdev, ham_prob)
        # A tie is resolved in favour of spam.
        return 1 if spam_score >= ham_score else 0

    return [classify(row) for row in data]

In [3]:
# Fit the model on the training split and show the two class priors.
(
    spam_set_mean, spam_set_stdev,
    ham_set_mean, ham_set_stdev,
    spam_prob, ham_prob,
) = trainModel(training_messages, training_labels)
print(spam_prob, ham_prob, sep='\n')

0.1349605168700646
0.8650394831299354


In [81]:
# Sanity check: training rows, validation rows and the fitted statistics
# should all have the same number of features.
print(
    len(training_messages[0]),
    len(validation_messages[0]),
    len(spam_set_mean),
    sep='\n',
)

8404
8404
8404


In [84]:
# Classify the validation split; there should be one prediction per message.
preds = predictions(
    validation_messages,
    spam_set_mean, spam_set_stdev,
    ham_set_mean, ham_set_stdev,
    spam_prob, ham_prob,
)
print(len(validation_messages), len(preds), sep='\n')

1393
1393


In [87]:
# Collect every misclassified validation message as
# (position in validation set, true label, predicted label).
false_pred = [
    (idx, actual, preds[idx])
    for idx, actual in enumerate(validation_labels)
    if actual != preds[idx]
]
print(len(false_pred))
print(false_pred)

160
[(21, 1, 0), (24, 0, 1), (27, 0, 1), (40, 0, 1), (42, 0, 1), (54, 0, 1), (80, 0, 1), (107, 0, 1), (109, 0, 1), (114, 0, 1), (133, 0, 1), (150, 0, 1), (156, 0, 1), (193, 0, 1), (202, 0, 1), (215, 0, 1), (242, 0, 1), (253, 0, 1), (294, 0, 1), (321, 0, 1), (323, 0, 1), (327, 0, 1), (332, 0, 1), (347, 0, 1), (355, 0, 1), (367, 1, 0), (380, 1, 0), (383, 0, 1), (398, 0, 1), (403, 0, 1), (411, 0, 1), (413, 0, 1), (421, 0, 1), (431, 0, 1), (432, 0, 1), (433, 0, 1), (436, 0, 1), (437, 0, 1), (442, 0, 1), (453, 0, 1), (458, 0, 1), (472, 0, 1), (477, 0, 1), (487, 0, 1), (495, 0, 1), (497, 0, 1), (509, 0, 1), (525, 0, 1), (527, 0, 1), (528, 0, 1), (532, 0, 1), (535, 0, 1), (537, 0, 1), (542, 0, 1), (551, 0, 1), (562, 0, 1), (568, 0, 1), (575, 0, 1), (586, 0, 1), (590, 0, 1), (592, 0, 1), (602, 0, 1), (620, 0, 1), (628, 0, 1), (631, 0, 1), (646, 0, 1), (649, 0, 1), (658, 0, 1), (665, 0, 1), (668, 0, 1), (669, 1, 0), (670, 0, 1), (672, 0, 1), (676, 0, 1), (678, 1, 0), (701, 0, 1), (704, 0, 1), (