# Spam Classifier using Bayes Theorem
This is a simple naive Bayes classifier: it uses Bayes' theorem to compute class probabilities for new data, and uses them to classify messages as either spam or not spam (ham).

In [None]:
import pandas as pd

In [None]:
# Load the message dataset from the CSV file and display it
csv_path = "spam.csv"
dataframe = pd.read_csv(csv_path)
print(dataframe)

In [None]:
# Message counts (spam and overall) and word-token counts per class
n_spam = 0
n_total = 0
n_total_spam_words = 0
n_total_ham_words = 0

# Dictionaries mapping each word to the number of times it occurs
# in spam and in not spam (ham) messages
word_freq_ham = {}
word_freq_spam = {}

label, text = (dataframe['v1'], dataframe['v2'])

# Walk the labels and messages in lockstep by position, so the loop
# does not depend on the dataframe having a default 0..n-1 index
for message_label, message in zip(label, text):
    is_spam = message_label == "spam"

    # Words are whitespace-separated tokens; case and punctuation are kept
    words = message.split()

    # Pick the table for this message's class once, then count its words.
    # dict.get supplies 0 for a word not seen before, so no exception
    # handling is needed for the first occurrence.
    class_freq = word_freq_spam if is_spam else word_freq_ham
    for word in words:
        class_freq[word] = class_freq.get(word, 0) + 1

    if is_spam:
        n_total_spam_words += len(words)
        n_spam += 1
    else:
        n_total_ham_words += len(words)

    n_total += 1

# Gets prior knowledge of the probability of a given
# message being spam and ham
spam_prior = n_spam/n_total
ham_prior = (n_total - n_spam)/n_total

print(spam_prior)
print(ham_prior)

In [146]:
def classify_spam(text):
    """Classify a message as spam (1) or ham (0) using naive Bayes scores.

    The function reads the module-level training state: the word-count
    tables ``word_freq_spam`` and ``word_freq_ham``, the per-class word
    totals ``n_total_spam_words`` and ``n_total_ham_words``, and the class
    priors ``spam_prior`` and ``ham_prior``.

    Args:
        text: The message to classify. It is split on whitespace, and each
            token is looked up exactly as written (case and punctuation
            are not normalised).

    Returns:
        1 if the spam score is strictly greater than the ham score,
        otherwise 0. For a message with no words the result depends only
        on the two priors.
    """
    spam = 1
    ham = 1

    # Accumulate the per-word factors for both classes
    for word in text.split():

        # Number of times the word was seen in each class. A word that was
        # never seen in a class falls back to a count of 1 so that a single
        # unknown word cannot force the whole product to zero. Counts of
        # seen words are used as-is (this is not full add-one smoothing).
        freq_in_spam = word_freq_spam.get(word, 1)
        freq_in_ham = word_freq_ham.get(word, 1)

        # Divide frequency by total to get probability

        # P(word|spam)
        prob_in_spam = freq_in_spam / n_total_spam_words

        # P(word|ham)
        prob_in_ham = freq_in_ham / n_total_ham_words

        # P(word), estimated over both classes together
        prob_in_message = (freq_in_ham + freq_in_spam) / (n_total_ham_words + n_total_spam_words)

        # Both scores are divided by the same P(word) for each word
        spam *= prob_in_spam / prob_in_message
        ham *= prob_in_ham / prob_in_message

    # Applying prior knowledge, multiplying by P(spam) and P(ham)
    spam *= spam_prior
    ham *= ham_prior

    # 1 means spam, 0 means ham
    return int(spam > ham)

In [147]:
# Sample messages to run through the classifier; the result for each
# one is printed on its own line, in the order listed here
sample_messages = [
    "Ever wanted to go on an Alaskan Cruise? Call 555-5555 to go for free!",
    "Hey check this cool vid out",
    "Click here to claim your reward",
    "Do you think water is wet?",
    "Hey, I was wondering if we were still planning to meet up today.",
    "Big awards await if you complete this week's challenges!",
]

for message in sample_messages:
    print(classify_spam(message))

1
0
1
0
0
1
