In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so the
# column names are supplied explicitly.
column_names = ['Label', 'SMS']
spam_df = pd.read_csv(
    'SMSSpamCollection',
    names=column_names,
    header=None,
    sep='\t',
)

In [3]:
# Preview the first rows to confirm the two columns loaded as expected
spam_df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Cleaning 

In [4]:
# Removal of Punctuation
# Replace every non-word character with a space, then lower-case the text.
# `regex=True` is passed explicitly: from pandas 2.0 on, `str.replace` treats
# the pattern as a literal string by default, which would leave punctuation
# untouched. The raw string avoids Python's invalid-escape warning for `\W`.
spam_df["SMS"] = spam_df["SMS"].str.replace(r'\W', ' ', regex=True).str.lower()

In [5]:
# Tokenise each message into a list of whitespace-separated words
spam_df["SMS"] = spam_df["SMS"].str.split()

# Accumulator for every word seen across all messages
vocabulary = []

In [6]:
# Flatten every tokenised message into the vocabulary list (duplicates are kept here)
vocabulary.extend(word for row in spam_df["SMS"] for word in row)

In [7]:
# Getting Rid of Duplicates via Set
# Sorting gives the vocabulary a stable order. Iteration order of a set of
# strings can change between interpreter runs (hash randomisation), which
# would reshuffle everything that is built by iterating over this list.
vocabulary = sorted(set(vocabulary))

## Data Exploration

In [8]:
# (rows, columns) of the full dataset
spam_df.shape

(5572, 2)

In [9]:
# Share of each label in the full dataset (fractions, not counts)
spam_df["Label"].value_counts(normalize = True).to_frame()

Unnamed: 0,Label
ham,0.865937
spam,0.134063


In [10]:
# Shuffle all rows; the fixed seed keeps the shuffle reproducible
data_randomized = spam_df.sample(frac=1, random_state=1)

# Row position separating the first 80% (training) from the remaining 20% (test)
training_test_index = round(0.8 * len(data_randomized))

# Slice by position and renumber each part from zero
spam_training = data_randomized.iloc[:training_test_index].reset_index(drop=True)
spam_test = data_randomized.iloc[training_test_index:].reset_index(drop=True)

print(spam_training.shape, spam_test.shape, sep='\n')

(4458, 2)
(1114, 2)


## Testing if the percentage of spam and ham are the same 

In [11]:
# Share of each label in the training split
spam_training["Label"].value_counts(normalize = True).to_frame()

Unnamed: 0,Label
ham,0.86541
spam,0.13459


In [12]:
# Share of each label in the test split
spam_test["Label"].value_counts(normalize = True).to_frame()

Unnamed: 0,Label
ham,0.868043
spam,0.131957


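The percentages are nearly the same in both splits (about 87% ham and 13% spam) and match the full dataset, so the training and test sets are both representative.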

## Dictionary and Word Count 

### For the Training Dataset 

In [13]:
# One zero-filled counter list per vocabulary word, with one slot per training message
word_counts_per_sms = {
    unique_word: [0] * len(spam_training['SMS'])
    for unique_word in vocabulary
}

# Walk the training messages and bump the slot of each word occurrence
for message_position, tokens in enumerate(spam_training['SMS']):
    for token in tokens:
        word_counts_per_sms[token][message_position] += 1

In [14]:
# One row per training message, one column per vocabulary word; values are occurrence counts
df_word_count = pd.DataFrame(word_counts_per_sms)

In [15]:
# Attach the per-word count columns to the training labels and messages (column-wise join)
df_concat = pd.concat([spam_training, df_word_count], axis = 1)

In [16]:
# Preview the combined label / message / word-count table
df_concat.head()

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,0121,01223585236,...,zogtorius,zoom,zouk,zyada,èn,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


## Calculating N(Spam), N(Ham), N(Vocab)

In [17]:
# N(Vocab): number of unique words in `vocabulary`
# (the vocabulary was collected from the full dataset, before the train/test split)
n_vocab = len(vocabulary)

In [18]:
# Row selectors for each class in the training table
is_spam = df_concat["Label"] == 'spam'
is_ham = df_concat["Label"] == 'ham'

# N(Spam): total number of words over all spam training messages
n_spam_messages = df_concat.loc[is_spam, 'SMS'].to_frame()
n_spam_transition = n_spam_messages["SMS"].map(len)  # words per spam message
n_spam = n_spam_transition.sum()

# N(Ham): total number of words over all ham training messages
n_ham_messages = df_concat.loc[is_ham, 'SMS'].to_frame()
n_ham_transition = n_ham_messages["SMS"].map(len)  # words per ham message
n_ham = n_ham_transition.sum()

In [19]:
# alpha: additive (Laplace) smoothing constant used in the per-word probability formulas
alpha = 1

In [20]:
# p(ham), p(spam): class priors, i.e. the fraction of training messages carrying each label
p_spam = len(n_spam_messages ) / len(df_concat)
p_ham = len(n_ham_messages) / len(df_concat)

## Calculating the algorithm parameters

## Spam Calculation

In [21]:
# Split the training word-count table by label so per-class word totals can be summed
spam = df_concat[df_concat["Label"] == "spam"]
ham = df_concat[df_concat["Label"] == "ham"]

In [22]:
# Start every word at 0 in both lookup tables
parameters_spam = dict.fromkeys(vocabulary, 0)
parameters_ham = dict.fromkeys(vocabulary, 0)

# Smoothed estimate per word and class:
#   P(word | class) = (occurrences of word in class + alpha) / (N(class) + alpha * N(Vocab))
for word in vocabulary:
    # spam
    parameters_spam[word] = (spam[word].sum() + alpha) / (n_spam + alpha * n_vocab)

    # ham
    parameters_ham[word] = (ham[word].sum() + alpha) / (n_ham + alpha * n_vocab)

In [38]:
import math
import re


def _safe_log(probability):
    """Return log(probability), or -inf when the probability is not positive."""
    return math.log(probability) if probability > 0 else float('-inf')


def classify(message):
    """Classify one SMS as spam or ham and print the scores and the label.

    The message is cleaned the same way as the training data (non-word
    characters replaced by spaces, lower-cased, split on whitespace). Each
    class score starts from its prior (`p_spam` / `p_ham`) and is multiplied
    by the per-word probability from `parameters_spam` / `parameters_ham`;
    words missing from a table are skipped for that class.

    The printed values are the plain (unnormalised) products. The label is
    decided on the equivalent sums of logarithms, because the plain products
    underflow to 0.0 for long messages and would then tie.

    Parameters
    ----------
    message : str
        Raw text of the message to classify.

    Returns
    -------
    None
        The scores and the label are printed, not returned.
    """
    message = re.sub(r'\W', ' ', message)
    message = message.lower()
    message = message.split()

    # Products are kept for display only
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    # Log-space scores drive the decision and are immune to underflow
    log_spam_score = _safe_log(p_spam)
    log_ham_score = _safe_log(p_ham)

    for word in message:
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
            log_ham_score += _safe_log(parameters_ham[word])

        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            log_spam_score += _safe_log(parameters_spam[word])

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if log_ham_score > log_spam_score:
        print('Label: Ham')
    elif log_ham_score < log_spam_score:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')
    

In [39]:
classify("Sounds good, Tom, then see you there")

P(Spam|message): 3.5930933949011804e-25
P(Ham|message): 6.382909689630265e-21
Label: Ham


In [40]:
classify("WINNER!! This is the secret code to unlock the money: C3421.")

P(Spam|message): 9.291456143381514e-26
P(Ham|message): 1.6951393560046432e-27
Label: Spam
