# Spam or Ham?

## About
A very simple Naive Bayes classifier that separates spam from legitimate ("ham") text messages. The dataset used for training is the [SMS Spam Collection Dataset](https://www.kaggle.com/uciml/sms-spam-collection-dataset).



In [1]:
# Imports
import kaggle

import numpy as np
import pandas as pd

import re
from sklearn.model_selection import train_test_split
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
# Downloading data
# Uses the `kaggle` command-line tool to fetch the uciml/sms-spam-collection-dataset
# archive into ../datasets and unzip it there (-p sets the target directory).
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -p ../datasets --unzip uciml/sms-spam-collection-dataset



Downloading sms-spam-collection-dataset.zip to ../datasets




  0%|          | 0.00/208k [00:00<?, ?B/s]
100%|##########| 208k/208k [00:00<00:00, 619kB/s]


In [3]:
# Loading data
# Read the raw CSV, discard the three unnamed filler columns and give the two
# remaining columns descriptive names, all in a single chained expression.
spam_df = (
    pd.read_csv('../datasets/spam.csv', encoding='latin-1')
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "target", "v2": "text"})
)

# Quick sanity check: overall shape and the first few rows.
print('data shape: ', spam_df.shape, '\n')
print('head: \n', spam_df.head())



data shape:  (5572, 2) 

head: 
   target                                               text
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# (Very) Brief exploration
# Class balance: number of messages per label.
print(spam_df['target'].value_counts(), '\n')
# Per-label summary statistics of the remaining columns.
print(spam_df.groupby(by='target').describe(), '\n')



ham     4825
spam     747
Name: target, dtype: int64 

        text                                                               
       count unique                                                top freq
target                                                                     
ham     4825   4516                             Sorry, I'll call later   30
spam     747    653  Please call our customer service representativ...    4 



In [5]:
# Parsing data
def get_words(text):
    """Convert the given text into an unordered, uncounted bag of words.

    A word is a maximal run of word characters (``\\w``); everything else acts
    as a separator. Case is preserved and duplicates collapse because the
    result is a set.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    set of str
        The distinct words found in ``text``; empty if there are none.
    """
    # Raw string on purpose: '\w' inside a plain literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    # Collecting runs of word characters yields the same words as splitting on
    # runs of non-word characters and discarding the empty strings.
    return set(re.findall(r'\w+', text))

# Tokenize every message once and keep the resulting word sets next to the raw text.
spam_df['bag_of_words'] = spam_df['text'].map(get_words)

# Splitting into train/test sets
# 80/20 split with a fixed seed; stratified on the label column.
X_train, X_test, y_train, y_test = train_test_split(
    spam_df['bag_of_words'],
    spam_df['target'],
    test_size=0.2,
    random_state=42,
    stratify=spam_df['target'],
)



In [6]:
# Creating Counters
# Number of messages seen per label.
target_counter = Counter()
# Per label: in how many messages of that label each word appeared.
word_counters = {label: Counter() for label in ('spam', 'ham')}
# Vocabulary: every word seen during training.
all_words = set()



In [7]:
# Defining Naive Bayes classifier
def prior_prob_of_target(target):
    """Return the prior probability of the given target ('spam' or 'ham').

    The prior is the count recorded for ``target`` in ``target_counter``
    divided by the total of all recorded counts.
    """
    total_messages = sum(target_counter.values())
    return target_counter[target] / total_messages

def word_prob_given_target(word, target, pi=0.5, alpha=1e-6):
    """Return the probability that `word` occurs in a text of class `target`.

    The empirical frequency (messages of the class containing the word divided
    by messages of the class) is blended with the constant `pi` using weight
    `alpha`, so that words never seen for a class still get a small non-zero
    probability.
    """
    empirical_freq = word_counters[target][word] / target_counter[target]
    return empirical_freq * (1 - alpha) + pi * alpha

def text_prob_given_target(text, target):
    """Return the likelihood of `text` under class `target`.

    `text` may be a raw string (tokenized with `get_words`) or an already
    prepared collection of words. Every vocabulary word in `all_words`
    contributes one factor: its probability when it appears in the text, and
    the complement of that probability when it does not.
    """
    words_in_text = get_words(text) if isinstance(text, str) else text

    likelihood = 1.0
    for vocab_word in all_words:
        p_word = word_prob_given_target(vocab_word, target)
        likelihood *= p_word if vocab_word in words_in_text else 1 - p_word
    return likelihood

def target_prob_given_text(text, target):
    """Return the posterior probability of `target` (spam or ham) given `text`.

    The joint probability prior(target) * P(text | target) is evaluated for
    every target present in `target_counter` and normalised over all of them.
    The work is done in log space: the likelihood is a product with one factor
    per vocabulary word, which underflows to 0.0 for long messages and would
    otherwise make the normalisation divide zero by zero.

    Parameters
    ----------
    text : str or collection of str
        A raw message (tokenized with `get_words`) or a prepared bag of words.
    target : str
        The label whose posterior probability is requested.

    Returns
    -------
    float
        P(target | text), a value between 0 and 1.

    Raises
    ------
    KeyError
        If `target` is not one of the labels in `target_counter`.
    """
    import math  # local import: the block needs nothing beyond the file's own names

    words = get_words(text) if isinstance(text, str) else text

    log_joint = {}
    for a_target in target_counter.keys():
        prior = prior_prob_of_target(a_target)
        if prior <= 0:
            # A label that was never observed cannot explain the text.
            log_joint[a_target] = -math.inf
            continue
        log_prob = math.log(prior)
        for word in all_words:
            p_word = word_prob_given_target(word, a_target)
            # Present words contribute p, absent words contribute 1 - p.
            log_prob += math.log(p_word if word in words else 1 - p_word)
        log_joint[a_target] = log_prob

    # Look the requested label up first so an unknown target raises KeyError.
    target_log = log_joint[target]

    # Log-sum-exp: shifting by the largest log makes the biggest weight exactly
    # 1.0, so the denominator can never underflow to zero.
    max_log = max(log_joint.values())
    total_weight = sum(math.exp(value - max_log) for value in log_joint.values())
    return math.exp(target_log - max_log) / total_weight



In [8]:
# Training
# One pass over the training messages: count the label, count each of the
# message's words under that label, and grow the vocabulary.
for words, target in zip(X_train, y_train):
    target_counter[target] += 1
    word_counters[target].update(words)
    all_words.update(words)



In [9]:
# Testing
threshold = 0.5

# Posterior spam probability for every held-out message.
test_spam_probabilities = [target_prob_given_text(words, 'spam') for words in X_test]
# A message is labelled spam only when its probability exceeds the threshold.
test_predictions = ['spam' if p_spam > threshold else 'ham' for p_spam in test_spam_probabilities]

# Accuracy: share of predictions that match the true label.
accuracy = sum(predicted == actual for predicted, actual in zip(test_predictions, y_test)) / len(y_test)
print('Test set accuracy: ', accuracy)



Test set accuracy:  0.9856502242152466
