# Text Classification
*Complete and hand in this completed worksheet (including its outputs and any supporting code outside of the worksheet) with your assignment submission. Please check the pdf file for more details.*

In this exercise you will:
    
- implement a spam classifier using the **Naive Bayes method** for real-world email messages
- learn the **training and testing phases** of a Naive Bayes classifier
- get an idea of the **precision-recall** tradeoff

In [142]:
# some basic imports
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [143]:
def _load_sparse_counts(path, vocab_size):
    """Load a test-set word-count matrix from a triplet text file.

    Each line of the file holds three whitespace-separated numbers:
    a 1-based row (email) index, a 1-based column (word) index and the count.

    Args:
        path: Path of the triplet file to read.
        vocab_size: Number of columns of the returned matrix, so that it lines
            up with the training vocabulary even when the trailing words never
            occur in the test set.

    Returns:
        A scipy.sparse.csr_matrix with (largest row index) rows and
        ``vocab_size`` columns.

    Raises:
        ValueError: If a column index in the file exceeds ``vocab_size``.
    """
    # ndmin=2 keeps a single-line file as a 1-by-3 array instead of a flat vector.
    triplets = np.loadtxt(path, ndmin=2)
    # Builtin int replaces the np.int alias, which no longer exists in current NumPy.
    rows = triplets[:, 0].astype(int) - 1
    cols = triplets[:, 1].astype(int) - 1
    counts = triplets[:, 2]
    # Passing the final shape up front avoids padding the matrix afterwards via
    # slice assignment into an empty CSR matrix, which is slow.
    return scipy.sparse.csr_matrix(
        (counts, (rows, cols)), shape=(int(rows.max()) + 1, vocab_size)
    )


# ham_train contains the occurrences of each word in ham emails. 1-by-N vector
ham_train = np.loadtxt('ham_train.csv', delimiter=',')
# spam_train contains the occurrences of each word in spam emails. 1-by-N vector
spam_train = np.loadtxt('spam_train.csv', delimiter=',')
# N is the size of the vocabulary.
N = ham_train.shape[0]
# There are 9034 ham emails and 3372 spam emails in the training samples
num_ham_train = 9034
num_spam_train = 3372
# Add-one smoothing: every word count is incremented so no word has a zero count.
# Row 0 is ham, row 1 is spam.
x = np.vstack([ham_train, spam_train]) + 1

# ham_test contains the occurrences of each word in each ham test email.
# P-by-N sparse matrix, where P is the number of ham test emails.
ham_test = _load_sparse_counts('ham_test.txt', N)
# spam_test contains the occurrences of each word in each spam test email.
# Q-by-N sparse matrix, where Q is the number of spam test emails.
spam_test = _load_sparse_counts('spam_test.txt', N)


## Now let's implement a ham/spam email classifier. Please refer to the PDF file for details

In [144]:
from likelihood import likelihood
# Implement a ham/spam email classifier, and calculate the accuracy of your classifier

# Hint: you can directly do matrix multiply between scipy.sparse.coo_matrix and numpy.array.
# Specifically, you can use sparse_matrix * np_array to do this. Note that when you use the "*" operator
# between numpy arrays, this is typically an elementwise multiply.

# begin answer

# 0 - ham, 1 - spam
# likelihood() comes from the project's likelihood module; presumably it returns
# per-class word likelihoods with one row per class -- verify against that module.
l_train = likelihood(x)
# Words with the largest spam-to-ham likelihood ratio are the most indicative of spam.
ratio_train = l_train[1] / l_train[0]
top_10_index = np.argsort(ratio_train)[::-1][0:10]

print(top_10_index)

# Read the vocabulary: the first whitespace-separated token of each line is kept,
# and a word's position in the list is used as its index.
# The context manager guarantees the file handle is closed after reading.
with open("all_word_map.txt", "r") as word_map_file:
    word_map = [line.split()[0] for line in word_map_file]

top_10_word = [word_map[index] for index in top_10_index]

print(top_10_word)
    

[30032 75525 38175 45152  9493 65397 37567 13612 56929  9452]
['nbsp', 'viagra', 'pills', 'cialis', 'voip', 'php', 'meds', 'computron', 'sex', 'ooking']


In [145]:
# Class priors estimated from the training-set email counts.
# Index 0 - ham, index 1 - spam.
num_train_total = num_ham_train + num_spam_train
prior = np.array([num_ham_train, num_spam_train]) / num_train_total
log_prior = np.log(prior)

l_train_log = np.log(l_train)

num_ham_test = ham_test.shape[0]
num_spam_test = spam_test.shape[0]

# Unnormalised log-posteriors, one row per ham test email: [ham score, spam score].
ham_posterior_matrix = ham_test.dot(l_train_log.T) + log_prior
# A ham email is missed when its spam score is strictly higher than its ham score.
ham_miss = np.sum(ham_posterior_matrix[:, 1] > ham_posterior_matrix[:, 0])

print("ham true value: {}, miss: {}".format(num_ham_test, ham_miss))

# Same scores for the spam test emails: [ham score, spam score].
spam_posterior_matrix = spam_test.dot(l_train_log.T) + log_prior
# A spam email is missed when its ham score is strictly higher than its spam score.
spam_miss = np.sum(spam_posterior_matrix[:, 1] < spam_posterior_matrix[:, 0])

print("spam true value: {}, miss: {}".format(num_spam_test, spam_miss))

total_miss = spam_miss + ham_miss
accuracy = 1 - (total_miss / (num_ham_test + num_spam_test))

print("accuracy: {}".format(accuracy))

# Spam is the positive class: correctly flagged spam are the true positives,
# ham flagged as spam are the false positives.
spam_hits = num_spam_test - spam_miss
precision = spam_hits / (spam_hits + ham_miss)

recall = spam_hits / num_spam_test

print("precision: {}, recall: {}".format(precision, recall))

ham true value: 3011, miss: 28
spam true value: 1124, miss: 31
accuracy: 0.9857315598548972
precision: 0.9750223015165032, recall: 0.9724199288256228
