In [1]:
spam_text= ['Send us your password', 'review us', 'Send your password', 'Send us your account']
ham_text= ['Send us your review', 'review your password']

In [3]:
spam = dict()
ham = dict()
for phrase in spam_text:
    words = phrase.split()
    for word in words:
        if word.lower() in spam:
            spam[word.lower()] += 1
        else:
            spam[word.lower()] = 1
            
for phrase in ham_text:
    words = phrase.split()
    for word in words:
        if word.lower() in ham:
            ham[word.lower()] += 1
        else:
            ham[word.lower()] = 1
            
print(spam)
print(ham)

{'send': 3, 'us': 3, 'your': 3, 'password': 2, 'review': 1, 'account': 1}
{'send': 1, 'us': 1, 'your': 2, 'review': 2, 'password': 1}


In [10]:
### Prob for each word in spam email
for word in spam.keys():
    print(word + ": "+ str(spam[word]/sum(spam.values())))

send: 0.23076923076923078
us: 0.23076923076923078
your: 0.23076923076923078
password: 0.15384615384615385
review: 0.07692307692307693
account: 0.07692307692307693


In [11]:
### Prob for each word in ham email
for word in ham.keys():
    print(word + ": "+ str(ham[word]/sum(ham.values())))

send: 0.14285714285714285
us: 0.14285714285714285
your: 0.2857142857142857
review: 0.2857142857142857
password: 0.14285714285714285


In [12]:
### prob of spam if word is password, use bayes rule
(0.15384615384615385*(4/6))/((0.15384615384615385*(4/6)) + (0.14285714285714285*(2/6)))

0.6829268292682927

In [14]:
### prob of ham if word is password, use bayes rule
(0.14285714285714285*(2/6))/((0.14285714285714285*(2/6)) + (0.15384615384615385*(4/6)))

0.3170731707317073

3. which line(s) of the method calculates the probabilty of ham and spam?
    lines: 8-11
        self.num_messages['spam'] = sum(1 for label in Y if label == 1)
        self.num_messages['ham'] = sum(1 for label in Y if label == 0)
        self.log_class_priors['spam'] = math.log(self.num_messages['spam'] / n)
        self.log_class_priors['ham'] = math.log(self.num_messages['ham'] / n)

4. which line(s) of the method calculates the spam and ham dictionaries?
    lines: 15-24
        for x, y in zip(X, Y):
            c = 'spam' if y == 1 else 'ham'
            counts = self.get_word_counts(self.tokenize(x))
            for word, count in counts.items():
                if word not in self.vocab:
                    self.vocab.add(word)
                if word not in self.word_counts[c]:
                    self.word_counts[c][word] = 0.0

                self.word_counts[c][word] += count
            
5. which line(s) compares the scores of ham or spam based on log probabilities?
    lines 20-23
        if spam_score > ham_score:
            result.append(1)
        else:
            result.append(0)


In [38]:
import os
import re
import string
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

class SpamDetector(object):
    """Implementation of Naive Bayes for binary classification"""

    # clean up our string by removing punctuation
    def clean(self, s):
        translator = str.maketrans("", "", string.punctuation)
        return s.translate(translator)

    #  tokenize our string into words
    def tokenize(self, text):
        text = self.clean(text).lower()
        return re.split("\W+", text)

    # count up how many of each word appears in a list of words.
    def get_word_counts(self, words):
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts

    def fit(self, X, Y):
        """Fit our classifier
        Arguments:
            X {list} -- list of document contents
            y {list} -- correct labels
        """
        self.num_messages = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = set()

        # Compute log class priors (the probability that any given message is spam/ham),
        # by counting how many messages are spam/ham, 
        # dividing by the total number of messages, and taking the log.
        n = len(X)
        self.num_messages['spam'] = sum(1 for label in Y if label == 'spam')
        self.num_messages['ham'] = sum(1 for label in Y if label == 'ham')
        self.log_class_priors['spam'] = math.log(self.num_messages['spam'] / n )
        self.log_class_priors['ham'] = math.log(self.num_messages['ham'] / n )
        self.word_counts['spam'] = {}
        self.word_counts['ham'] = {}

        # for each (document, label) pair, tokenize the document into words.
        for x, y in zip(X, Y):
            c = 'spam' if y == 'spam' else 'ham'
            counts = self.get_word_counts(self.tokenize(x))
            # For each word, either add it to the vocabulary for spam/ham, 
            # if it isn’t already there, and update the number of counts. 
            for word, count in counts.items():
                # Add that word to the global vocabulary.
                if word not in self.vocab:
                    self.vocab.add(word)
                if word not in self.word_counts[c]:
                    self.word_counts[c][word] = 0.0

                self.word_counts[c][word] += count

    # function to actually output the class label for new data.
    def predict(self, X):
        result = []
        # Given a document...
        for x in X:
            counts = self.get_word_counts(self.tokenize(x))
            spam_score = 0
            ham_score = 0
            # We iterate through each of the words...
            for word, _ in counts.items():
                if word not in self.vocab: continue
                # ... and compute log p(w_i|Spam), and sum them all up. The same will happen for Ham
                # add Laplace smoothing
                # https://medium.com/syncedreview/applying-multinomial-naive-bayes-to-nlp-problems-a-practical-explanation-4f5271768ebf
                log_w_given_spam = math.log( (self.word_counts['spam'].get(word, 0.0) + 1) / (self.num_messages['spam'] + len(self.vocab)) )
                log_w_given_ham = math.log( (self.word_counts['ham'].get(word, 0.0) + 1) / (self.num_messages['ham'] + len(self.vocab)) )

                spam_score += log_w_given_spam
                ham_score += log_w_given_ham
            
            # Then we add the log class priors...
            spam_score += self.log_class_priors['spam']
            ham_score += self.log_class_priors['ham']

            # ... and check to see which score is bigger for that document.
            # Whichever is larger, that is the predicted label!
            if spam_score > ham_score:
                result.append('spam')
            else:
                result.append('ham')
        return result
        

# TODO: Fill in the below function to make a prediction, 
# your answer should match the final number in the below output (0.9641)
if __name__ == '__main__':
    data = pd.read_csv('Datasets/spam.csv',encoding='latin-1')
    data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
    data = data.rename(columns={"v1":'label', "v2":'text'})
    print(data.head())
    tags = data["label"]
    texts = data["text"]
    X, y = texts, tags
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    
    SD = SpamDetector()
    SD.fit(X_train,y_train)
    
    y_pred = SD.predict(X_test.values)
    
    print(len(X))
    
    print(SD.num_messages)
    
    print(sum(SD.num_messages.values())/len(X))
    
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5572
{'spam': 544, 'ham': 3635}
0.75
[[1190    0]
 [  72  131]]


In [34]:
(1224+117)/(1224+117+51+1)

0.9626704953338119

In [40]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Prepare the dataset
data = pd.read_csv('Datasets/spam.csv',encoding='latin-1')
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":'label', "v2":'text'})
print(data.head())
tags = data["label"]
texts = data["text"]

# create texts and tags
X, y = texts, tags

# split the data into train vs test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# transform text into numerical vectors
vectorizer = CountVectorizer()
X_train_dtm = vectorizer.fit_transform(X_train)
print(X_train_dtm)

# instantiate Multinomial Naive Bayes model
nb = MultinomialNB()
# fit to model, with the trained part of the dataset
nb.fit(X_train_dtm, y_train)
X_test_dtm = vectorizer.transform(X_test)
# make prediction
y_pred_class = nb.predict(X_test_dtm)
# test accurarcy of prediction
metrics.accuracy_score(y_test, y_pred_class)

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
  (0, 3286)	1
  (0, 4747)	2
  (0, 1896)	1
  (0, 875)	2
  (0, 6599)	2
  (0, 801)	1
  (0, 5258)	1
  (0, 7209)	3
  (0, 1559)	1
  (0, 913)	1
  (0, 6623)	3
  (0, 1050)	1
  (0, 5980)	1
  (0, 3530)	1
  (0, 919)	1
  (0, 802)	1
  (0, 819)	1
  (0, 5712)	1
  (0, 6727)	1
  (0, 2112)	1
  (0, 5065)	2
  (0, 7373)	1
  (0, 4176)	2
  (0, 1535)	2
  (0, 6604)	1
  :	:
  (4176, 4747)	1
  (4176, 3252)	1
  (4176, 3416)	1
  (4176, 2304)	1
  (4176, 6638)	1
  (4176, 4450)	1
  (4176, 7163)	1
  (4176, 4219)	1
  (4176, 1590)	1
  (4176, 3439)	1
  (4176, 4833)	1
  (4176, 4894)	1
  (4177, 3647)	1
  (4177, 3252)	1
  (4177, 6074)	1
  (4177, 4125)	1
  (4177, 3162)	1
  (4177

0.9856424982053122