# Notebook imports

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [22]:
# Paths to the saved token-probability files loaded further down with np.loadtxt.
# They are relative, so they resolve against the notebook's working directory.
TOKEN_SPAM_PROB_FILE = "SpamData/03_Testing/prob-spam.txt"
TOKEN_HAM_PROB_FILE = "SpamData/03_Testing/prob-nonspam.txt"
TOKEN_ALL_PROB_FILE = "SpamData/03_Testing/prob-all-tokens.txt"

# Test-set feature matrix and target file.
TEST_FEATURE_MATRIX = "SpamData/03_Testing/test-features.txt"
TEST_TARGET_FILE = "SpamData/03_Testing/test-target.txt"

# NOTE(review): not referenced by any cell visible here; presumably the number
# of tokens in the vocabulary - confirm it is still needed.
VOCAB_SIZE = 2500

# Load data

In [23]:
# Test-set feature matrix and its target values
X_test, y_test = (
    np.loadtxt(path, delimiter=" ")
    for path in (TEST_FEATURE_MATRIX, TEST_TARGET_FILE)
)
# Token probability vectors: spam, ham, and all tokens
prob_token_spam, prob_token_ham, prob_all_tokens = (
    np.loadtxt(path, delimiter=" ")
    for path in (TOKEN_SPAM_PROB_FILE, TOKEN_HAM_PROB_FILE, TOKEN_ALL_PROB_FILE)
)

# Calculating the Joint Probability
### Using the dot product

In [24]:
# Dot product of the feature matrix with the spam token probabilities; only its shape is reported
dot_shape = np.dot(X_test, prob_token_spam).shape
print("shape of the dot product is:", dot_shape)

shape of the dot product is: (1724,)


## Set the Prior

In [25]:
# Prior probability of spam, P(Spam); the ham prior is taken as 1 - PROB_SPAM below.
# NOTE(review): hard-coded value - presumably estimated from the training data; confirm its source.
PROB_SPAM = 0.3116

In [26]:
# Element-wise natural log of the spam token probabilities, shown as the cell output.
np.log(prob_token_spam)

array([ -4.4075528 ,  -5.25362761,  -4.99003004, ..., -12.09417414,
       -10.30241467,  -9.0496517 ])

## Joint probability in log format

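$$P(Spam \, | \, X) = \frac{P(X \, | \, Spam \,) \, P(Spam)} {P(X)} $$

Taking logs turns the product and quotient into sums and differences, which is the form the next cell computes:

$$\log P(Spam \, | \, X) = \log P(X \, | \, Spam) - \log P(X) + \log P(Spam)$$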

In [27]:
# log P(Spam | X): per-token log ratio weighted by the feature matrix, plus the log prior
log_ratio_spam = np.log(prob_token_spam) - np.log(prob_all_tokens)
joint_log_spam = np.dot(X_test, log_ratio_spam) + np.log(PROB_SPAM)

$$P(Ham \, | \, X) = \frac{P(X \, | \, Ham \,) \, (1-P(Spam))} {P(X)} $$

In [28]:
# log P(Ham | X): same construction as for spam, using the ham token probabilities and 1 - P(Spam)
log_ratio_ham = np.log(prob_token_ham) - np.log(prob_all_tokens)
joint_log_ham = np.dot(X_test, log_ratio_ham) + np.log(1 - PROB_SPAM)

In [29]:
# Number of entries in the ham score array
np.size(joint_log_ham)

1724

# Making Predictions
### Check for the higher joint probability
$$P(Spam \, | \, X) \, > \, P(Ham \, | \, X)$$
### <center>OR</center>
<br>
$$P(Spam \, | \, X) \, < \, P(Ham \, | \, X)$$

In [30]:
# True where the spam score is strictly greater than the ham score
prediction = np.greater(joint_log_spam, joint_log_ham)

In [34]:
# Show the predictions twice: as 0/1 integers, then as the raw boolean array
print(prediction.astype(int))
print(prediction)

[1 1 1 ... 0 0 0]
[ True  True  True ... False False False]


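### Simplified calculation
$P(X)$ is the same for both classes, so dropping it changes the values but not which class scores higher:
$$P(X \, | \, Spam \,) \, P(Spam) \neq \frac{P(X \, | \, Spam \,) \, P(Spam)} {P(X)} $$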

In [None]:
# Simplified scores without the P(X) term: log likelihood plus log prior for each class.
# These reassign joint_log_spam / joint_log_ham from the earlier cells.
joint_log_spam = np.dot(X_test, np.log(prob_token_spam)) + np.log(PROB_SPAM)
joint_log_ham = np.dot(X_test, np.log(prob_token_ham)) + np.log(1 - PROB_SPAM)