In [1]:
import pandas as pd 

In [2]:
spam_df = pd.read_csv("Data\SMSSpamCollection.csv",sep='\t',header=None,names=['Label', 'SMS'])

In [3]:
spam_df.shape

(5572, 2)

In [4]:
spam_df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
spam_df["Label"].unique()

array(['ham', 'spam'], dtype=object)

In [6]:
spam_df["Label"].replace("ham","non-spam",inplace=True)

In [7]:
spam_df.describe()

Unnamed: 0,Label,SMS
count,5572,5572
unique,2,5169
top,non-spam,"Sorry, I'll call later"
freq,4825,30


In [8]:
randomized_df = spam_df.sample(frac=1,random_state=1)
training_index = round(randomized_df.shape[0] * 0.8)
training_df = randomized_df[:training_index].reset_index(drop=True)
test_df = randomized_df[training_index:].reset_index(drop=True)

In [9]:
#clean SMS column
training_df["SMS"].replace("\W"," ",regex=True,inplace=True)
training_df["SMS"] = training_df["SMS"].str.lower()

In [10]:
training_df.head()

Unnamed: 0,Label,SMS
0,non-spam,yep by the pretty sculpture
1,non-spam,yes princess are you going to make me moan
2,non-spam,welp apparently he retired
3,non-spam,havent
4,non-spam,i forgot 2 ask ü all smth there s a card on ...


In [11]:
training_df["SMS list"] = training_df["SMS"].str.split()

In [12]:
vocabulary =[]
for row in training_df["SMS list"]:
    for element in row:
        vocabulary.append(element)


In [13]:
vocabulary = list(set(vocabulary))

In [14]:
word_counts_per_sms = {unique_word: [0] * len(training_df['SMS list']) for unique_word in vocabulary}
for index, sms in enumerate(training_df['SMS list']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [15]:
word_df = pd.DataFrame(word_counts_per_sms)
word_df.head()

Unnamed: 0,bani,wrench,realised,method,darren,subscribe,only,atrocious,virtual,cheap,...,fishrman,exciting,havin,dorothy,reality,300p,tirunelvali,09095350301,passionate,brdget
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
merged_df = pd.concat([training_df, word_df], axis=1)

In [17]:
merged_df

Unnamed: 0,Label,SMS,SMS list,bani,wrench,realised,method,darren,subscribe,only,...,fishrman,exciting,havin,dorothy,reality,300p,tirunelvali,09095350301,passionate,brdget
0,non-spam,yep by the pretty sculpture,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,non-spam,yes princess are you going to make me moan,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,non-spam,welp apparently he retired,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,non-spam,havent,[havent],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,non-spam,i forgot 2 ask ü all smth there s a card on ...,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,non-spam,sorry i ll call later in meeting any thing re...,"[sorry, i, ll, call, later, in, meeting, any, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,non-spam,babe i fucking love you too you know fuck...,"[babe, i, fucking, love, you, too, you, know, ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,spam,u ve been selected to stay in 1 of 250 top bri...,"[u, ve, been, selected, to, stay, in, 1, of, 2...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,non-spam,hello my boytoy geeee i miss you already a...,"[hello, my, boytoy, geeee, i, miss, you, alrea...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
spam = merged_df[merged_df["Label"] == "spam"]
no_spam = merged_df[merged_df["Label"] == "non-spam"]

In [56]:
num_of_spam_words = sum((len(row) for row in spam["SMS list"]))
num_of_no_spam_words = sum((len(row) for row in no_spam["SMS list"]))
num_of_vocabulary_words = len(vocabulary)
laplace_smoothing = 1
spam_probability = len(spam) / len(merged_df)
no_spam_probability = len(no_spam) / len(merged_df)

In [57]:
spam_parameters = {unique_word:0 for unique_word in vocabulary}
no_spam_parameters = spam_parameters.copy()

In [58]:
for word in vocabulary:
    spam_word_count = spam[word].sum()
    spam_word_probability = (spam_word_count + laplace_smoothing) / (num_of_spam_words + laplace_smoothing * num_of_vocabulary_words)
    spam_parameters[word] = spam_word_probability
    
    no_spam_word_count = no_spam[word].sum()
    no_spam_word_probability = (no_spam_word_count + laplace_smoothing) / (num_of_no_spam_words + laplace_smoothing * num_of_vocabulary_words)
    no_spam_parameters[word] = no_spam_word_probability

In [62]:
import re

def new_message_classification_test(message:str):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    
    p_spam_given_message = spam_probability
    p_no_spam_given_message = no_spam_probability

    for word in message:
        if word in spam_parameters:
            p_spam_given_message *= spam_parameters[word]
        else:
            pass
        if word in no_spam_parameters:
            p_no_spam_given_message *= no_spam_parameters[word]
        else:
            pass

    print('Spam Probability is equal to:', p_spam_given_message)
    print('No Spam Probability is equal to:', p_no_spam_given_message)

    if p_no_spam_given_message > p_spam_given_message:
        print('Label: No Spam')
    elif p_spam_given_message > p_no_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, classification failed')

In [63]:
new_message_classification_test('WINNER!! This is the secret code to unlock the money: C3421.')

Spam Probability is equal to: 1.3481290211300841e-25
No Spam Probability is equal to: 1.9368049028589875e-27
Label: Spam


In [64]:
new_message_classification_test("Sounds good, Tom, then see u there")

Spam Probability is equal to: 2.4372375665888117e-25
No Spam Probability is equal to: 3.687530435009238e-21
Label: No Spam
