In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 2500 matches the column count of the feature matrix loaded below; no cell
# visible in this part of the notebook references the constant directly.
VOCAB_SIZE = 2500

# Test split: feature matrix and target labels.
TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'

# Per-token probability vectors (presumably estimated on the training
# split -- confirm against the notebook that wrote these files).
TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

In [3]:
# Load the test split and the three per-token probability vectors from
# space-delimited text files.
X_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter=' ')
y_test = np.loadtxt(TEST_TARGET_FILE, delimiter=' ')
prob_token_spam = np.loadtxt(TOKEN_SPAM_PROB_FILE, delimiter=' ')
prob_token_ham = np.loadtxt(TOKEN_HAM_PROB_FILE, delimiter=' ')
prob_all_tokens = np.loadtxt(TOKEN_ALL_PROB_FILE, delimiter=' ')

# Fail fast on inconsistent inputs. Later cells take np.log of the probability
# vectors and multiply them with X_test, so a zero/NaN probability or a
# mismatched length would otherwise surface only as nan/-inf scores or a
# shape error far from its cause.
assert X_test.ndim == 2 and X_test.shape[1] == VOCAB_SIZE, (
    f'expected a feature matrix with {VOCAB_SIZE} columns, got shape {X_test.shape}'
)
assert y_test.shape == (X_test.shape[0],), (
    f'expected {X_test.shape[0]} labels, got shape {y_test.shape}'
)
for _name, _probs in (
    ('prob_token_spam', prob_token_spam),
    ('prob_token_ham', prob_token_ham),
    ('prob_all_tokens', prob_all_tokens),
):
    assert _probs.shape == (VOCAB_SIZE,), (
        f'{_name}: expected shape ({VOCAB_SIZE},), got {_probs.shape}'
    )
    # np.log(0) is -inf and 0 * -inf is nan, which would poison every score.
    assert np.all(_probs > 0), f'{_name} contains non-positive or NaN entries'

In [4]:
# Dimensions of the loaded test feature matrix.
X_test.shape

(1724, 2500)

In [5]:
# Length of the label vector; it should equal the number of rows in X_test.
y_test.shape

(1724,)

In [6]:
# Length of the spam token-probability vector.
prob_token_spam.shape

(2500,)

In [7]:
# Length of the non-spam (ham) token-probability vector.
prob_token_ham.shape

(2500,)

In [8]:
# Length of the overall token-probability vector.
prob_all_tokens.shape

(2500,)

In [9]:
# Peek at the feature matrix (NumPy abbreviates the display); the entries look
# like per-token occurrence counts -- confirm against the feature-building step.
X_test

array([[0., 0., 1., ..., 0., 0., 0.],
       [6., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [5., 5., 2., ..., 0., 0., 0.],
       [0., 4., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [10]:
# Peek at the labels: the values shown are 1.0 and 0.0 (presumably 1 = spam).
y_test

array([1., 1., 1., ..., 0., 0., 0.])

In [11]:
# First five entries of the spam token-probability vector.
prob_token_spam[:5]

array([0.0113343 , 0.00517291, 0.00699623, 0.01049435, 0.00706793])

In [12]:
# First five entries of the ham token-probability vector.
prob_token_ham[:5]

array([0.02112981, 0.01018445, 0.00805391, 0.00351968, 0.0063604 ])

In [13]:
# First five entries of the overall token-probability vector.
prob_all_tokens[:5]

array([0.0170787 , 0.00810264, 0.00767713, 0.00660439, 0.00673653])

In [14]:
# Shape check only: a (rows, tokens) matrix times a (tokens,) vector gives one
# value per row of X_test.
X_test.dot(prob_token_spam).shape

(1724,)

In [15]:
# Prior used for the spam class in the score cells below. The counts are
# hard-coded; presumably spam emails / all emails in the training split --
# TODO confirm, and ideally derive the value from the training data instead.
PROB_SPAM = 1250 / 4015
PROB_SPAM

0.31133250311332505

In [16]:
# Spam score per row of X_test, in log space: each feature value weights
# log(prob_token_spam) - log(prob_all_tokens), and log(PROB_SPAM) is added as
# the class prior. Working with logs turns the product of many small
# probabilities into a sum.
joint_log_spam = (
    np.dot(X_test, np.log(prob_token_spam) - np.log(prob_all_tokens))
    + np.log(PROB_SPAM)
)

In [17]:
# One spam score per row of X_test.
joint_log_spam.shape

(1724,)

In [18]:
# Spam scores for the first ten test rows.
joint_log_spam[:10]

array([  20.64804564,    1.12497813,   18.61296364,   16.31342042,
         19.77952803,  -13.8037317 ,   24.97109184,    1.12497813,
         11.32271694, -153.5320515 ])

In [19]:
# Ham counterpart of the spam score: same weighting with prob_token_ham, and
# log(1 - PROB_SPAM) as the prior of the complementary class.
joint_log_ham = (
    np.dot(X_test, np.log(prob_token_ham) - np.log(prob_all_tokens))
    + np.log(1 - PROB_SPAM)
)

In [20]:
# One ham score per row of X_test.
joint_log_ham.shape

(1724,)

In [21]:
# Ham scores for the first ten test rows.
joint_log_ham[:10]

array([-59.75328939, -10.81756237, -37.04275009, -59.17941671,
       -55.31708219,  -6.35442713, -53.22846275, -10.81756237,
       -37.66211755,   2.85143448])

In [22]:
# Classify each row: True where the spam score is strictly greater than the
# ham score (a tie therefore counts as not-spam).
prediction = np.greater(joint_log_spam, joint_log_ham)

In [23]:
# Boolean prediction per test row (display abbreviated by NumPy).
prediction

array([ True,  True,  True, ..., False, False, False])

In [24]:
# Predictions for the first ten test rows.
prediction[:10]

array([ True,  True,  True,  True,  True, False,  True,  True,  True,
       False])

In [25]:
# Multiplying by 1 turns the booleans into 0/1 so they line up with y_test.
prediction[:10]*1

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0])

In [26]:
# True labels for the same ten rows: all are 1, while the predictions above
# are 0 at positions 5 and 9.
y_test[:10]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [27]:
# Simplified spam score (overwrites the earlier joint_log_spam): the
# log(prob_all_tokens) term is dropped. The earlier cells subtracted that same
# term from both class scores, so removing it shifts both by an equal amount
# per row and cannot change which one is larger.
joint_log_spam = np.dot(X_test, np.log(prob_token_spam)) + np.log(PROB_SPAM)

In [28]:
# Simplified ham score (overwrites the earlier joint_log_ham): token
# log-probabilities weighted by the feature values, plus the log prior of the
# complementary class, without the log(prob_all_tokens) term.
joint_log_ham = np.dot(X_test, np.log(prob_token_ham)) + np.log(1 - PROB_SPAM)

In [29]:
# Simplified spam scores for the first ten rows.
joint_log_spam[:10]

array([-1224.46613544,  -264.43974711,  -413.91570187, -1470.49116113,
       -1242.85998161,  -358.72433176,  -544.10939281,  -264.43974711,
        -575.83306952, -2219.20779385])

In [30]:
# Simplified ham scores for the first ten rows.
joint_log_ham[:10]

array([-1304.86747047,  -276.38228761,  -469.5714156 , -1545.98399826,
       -1317.95659183,  -351.27502719,  -622.3089474 ,  -276.38228761,
        -624.81790401, -2062.82430787])

In [31]:
# Re-derive the predictions from the simplified scores: True where the spam
# score is strictly greater than the ham score.
prediction = np.greater(joint_log_spam, joint_log_ham)

In [32]:
# First ten predictions as 0/1; they match the ones obtained from the
# normalised scores earlier in the notebook.
prediction[:10]*1

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0])

In [33]:
# True labels for the first ten rows, for comparison with the predictions.
y_test[:10]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [34]:
# Both simplified class scores in a single cell (same formulas as the two
# separate cells above): feature-weighted token log-probabilities plus the log
# prior of the respective class.
joint_log_spam = np.dot(X_test, np.log(prob_token_spam)) + np.log(PROB_SPAM)
joint_log_ham = np.dot(X_test, np.log(prob_token_ham)) + np.log(1 - PROB_SPAM)
joint_log_spam[:10]

array([-1224.46613544,  -264.43974711,  -413.91570187, -1470.49116113,
       -1242.85998161,  -358.72433176,  -544.10939281,  -264.43974711,
        -575.83306952, -2219.20779385])

In [35]:
# Simplified ham scores for the first ten rows.
joint_log_ham[:10]

array([-1304.86747047,  -276.38228761,  -469.5714156 , -1545.98399826,
       -1317.95659183,  -351.27502719,  -622.3089474 ,  -276.38228761,
        -624.81790401, -2062.82430787])

In [36]:
# Predict spam wherever the spam score is strictly greater than the ham score,
# then show the first ten predictions as 0/1.
prediction = np.greater(joint_log_spam, joint_log_ham)
prediction[:10] * 1

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0])

In [37]:
# True labels for the first ten rows, for comparison with the predictions.
y_test[:10]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])