In [93]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [94]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [95]:
True_Positive = ((y_pred == 1) & (target == 1)).sum()
False_Negative = ((y_pred == 0) & (target == 1)).sum()
True_Negative = ((y_pred == 0) & (target == 0)).sum()
False_Positive = ((y_pred == 1) & (target == 0)).sum()

In [96]:
print("True positives are :  {}".format(True_Positive))
print("False negatives are :  {}".format(False_Negative))
print("True negatives are :  {}".format(True_Negative))
print("False positives are :  {}".format(False_Positive))

True positives are :  198
False negatives are :  549
True negatives are :  4770
False positives are :  55


In [97]:
confusion_matrix = np.array([[True_Negative,False_Positive],[False_Negative,True_Positive]])
print(confusion_matrix)

[[4770   55]
 [ 549  198]]


In [98]:
Sensitivity = round(198/(198 + 549),2)
Recall = Sensitivity
Specificity = round(4770/(4770 + 55),2)
Precision = round(198/(198 + 55),2)
F1_Score = 2*(Precision*Recall)/(Precision+Recall)

In [100]:
print("Sensitivity or True Positive Rate or Recall is :   {}".format(Recall))
print("Specificity or True Negative Rate is :             {}".format(Specificity))
print("Precision is :                                     {}".format(Precision))
print("F1_Score is :                                      {}".format(F1_Score))

Sensitivity or True Positive Rate or Recall is :   0.27
Specificity or True Negative Rate is :             0.99
Precision is :                                     0.78
F1_Score is :                                      0.40114285714285713


In [102]:
# Use sklearn's confusion_matrix
from sklearn.metrics import confusion_matrix
con_mat = confusion_matrix(target, y_pred)
print(con_mat)

[[4770   55]
 [ 549  198]]
