In [1]:
import pandas as pd
import numpy as np

%matplotlib inline

In [2]:
# --- Model parameters ---
# Prior probability of the spam class. The scoring cells use
# log(PROB_SPAM) for spam and log(1 - PROB_SPAM) for ham.
PROB_SPAM = 0.3318170331058883

# NOTE(review): not referenced by the cells visible here; presumably the
# number of tokens in the vocabulary -- confirm before relying on it.
VOCAB_SIZE = 2500

# --- Input files (read with np.loadtxt, space-delimited) ---
# Test set: feature matrix and target labels.
TEST_FEATURE_MATRIX = 'test-features.txt'
TEST_TARGET_FILE = 'test-target.txt'

# Token probability vectors: spam, ham, and all documents.
TOKEN_SPAM_PROB_FILE = 'prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'prob-ham.txt'
TOKEN_ALL_PROB_FILE = 'prob-all.txt'

In [3]:
# Every input is a space-delimited numeric text file.

# Features: ndmin=2 keeps one row per document even when the file holds a
# single row (np.loadtxt would otherwise squeeze it to a 1-D array, and
# x_test.shape[0] / len(x_test) would then count tokens, not documents).
x_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter=' ', ndmin=2)
# Target: ndmin=1 keeps a vector even when the file holds a single label.
y_test = np.loadtxt(TEST_TARGET_FILE, delimiter=' ', ndmin=1)
# Token Probabilities
prob_token_spam = np.loadtxt(TOKEN_SPAM_PROB_FILE, delimiter=' ')
prob_token_ham = np.loadtxt(TOKEN_HAM_PROB_FILE, delimiter=' ')
prob_all = np.loadtxt(TOKEN_ALL_PROB_FILE, delimiter=' ')

# Fail fast on inconsistent inputs instead of hitting an opaque dot-product
# or broadcasting error (or a silently wrong count) in a later cell.
assert y_test.shape == (x_test.shape[0],), (
    f'expected one target per document: x_test has {x_test.shape[0]} rows, '
    f'y_test has shape {y_test.shape}'
)
assert prob_token_spam.shape == prob_token_ham.shape == (x_test.shape[1],), (
    f'expected one probability per feature column ({x_test.shape[1]}): '
    f'spam {prob_token_spam.shape}, ham {prob_token_ham.shape}'
)

## Set the Prior

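$$P(Spam \, | \, X) = \frac{P(X \, | \, Spam) \, P(Spam)} {P(X)}$$

The denominator $P(X)$ is the same for the Spam and Ham classes, so it does not affect which class scores higher. The cells below therefore compare only the numerators, computed in log form as $\log P(X \, | \, Class) + \log P(Class)$.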

In [4]:
# Spam score per test document, in log space: the log prior plus the
# document's feature values weighted by the log spam-token probabilities.
joint_log_spam = np.log(PROB_SPAM) + np.dot(x_test, np.log(prob_token_spam))
joint_log_spam

array([-4722.14830944, -1305.58276457,  -309.50407213, ...,
        -286.45007326,  -186.3597443 , -3254.94876172])

In [5]:
# Ham score per test document, in log space: the log of the complementary
# prior (1 - PROB_SPAM) plus the document's feature values weighted by the
# log ham-token probabilities.
joint_log_ham = np.log(1 - PROB_SPAM) + np.dot(x_test, np.log(prob_token_ham))
joint_log_ham

array([-5523.48751036, -1407.80825526,  -364.67866321, ...,
        -259.63517403,  -157.12276535, -3123.74039585])

# Making Predictions

### Checking for the higher joint probability

$$P(Spam \, | \, X) \, > \, P(Ham \, | \, X)$$
<center><b>OR</b></center>
<br>
$$P(Spam \, | \, X) \, < \, P(Ham \, | \, X)$$

In [6]:
# True where the spam score is strictly greater than the ham score;
# ties therefore come out as False.
prediction = np.greater(joint_log_spam, joint_log_ham)

In [7]:
# Show the boolean prediction vector (True where joint_log_spam > joint_log_ham).
prediction

array([ True,  True,  True, ..., False, False, False])

# Metrics and Evaluation

In [8]:
# Count element-wise matches between labels and predictions; every
# remaining test document is a misclassification.
correct_pred = np.sum(y_test == prediction)
wrong_pred = len(x_test) - correct_pred

print('Docs classified correctly', correct_pred)
print('Docs classified incorrectly', wrong_pred)

Docs classified correctly 1638
Docs classified incorrectly 68


In [10]:
# Accuracy: correctly classified documents divided by the number of test documents
correct_pred / x_test.shape[0]

0.9601406799531067

In [11]:
# Error rate: misclassified documents over all test documents.
fraction_wrong = wrong_pred / x_test.shape[0]

# Both figures are reported as percentages rounded to two decimals;
# accuracy is derived as the complement of the error rate.
print(f'Accuracy of the model is {round(100*(1-fraction_wrong), 2)}%')
print(f'Fraction classified incorrectly is {round(100*fraction_wrong, 2)}%')

Accuracy of the model is 96.01%
Fraction classified incorrectly is 3.99%
