In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
messages = pd.read_csv('SMSSpamCollection.ppt', sep='\t', header=None, names=['Label','SMS'])
messages.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Label    5572 non-null object
SMS      5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
messages['Label'].value_counts(normalize=True)*100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [10]:
round(len(messages)*0.8)

4458

In [12]:
# Create a training set & a test set
random_dataset = messages.sample(frac=1, random_state=1)
training_set_index = round(len(random_dataset)*0.8)
training_set = random_dataset[:training_set_index].reset_index(drop=True)
test_set = random_dataset[training_set_index:].reset_index(drop=True)
print('There are {:,} data in Training set'.format(training_set.shape[0]))
print('There are {:,} data in Test set'.format(test_set.shape[0]))

There are 4,458 data in Training set
There are 1,114 data in Test set


In [15]:
# Split the SMS
training_set['SMS_split'] = training_set['SMS'].str.replace('\W',' ').str.lower().str.split()
training_set.head()

Unnamed: 0,Label,SMS,SMS_split
0,ham,"Yep, by the pretty sculpture","[yep, by, the, pretty, sculpture]"
1,ham,"Yes, princess. Are you going to make me moan?","[yes, princess, are, you, going, to, make, me,..."
2,ham,Welp apparently he retired,"[welp, apparently, he, retired]"
3,ham,Havent.,[havent]
4,ham,I forgot 2 ask ü all smth.. There's a card on ...,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."


In [17]:
vocabulary = []
for sms in training_set['SMS_split']:
    for word in sms:
#         vocabulary.append(word)
        if word not in vocabulary:
            vocabulary.append(word)
# vocabulary = list(set(vocabulary))        
len(vocabulary)

7783

In [53]:
# Transform training set into other format with each column is an unique word
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}
for index,sms in enumerate(training_set['SMS_split']):
    for word in sms:
        word_counts_per_sms[word][index] += 1
word_counts_per_sms['smth'][:5]

[0, 0, 0, 0, 2]

In [54]:
word_counts = pd.DataFrame(word_counts_per_sms)
training_set_final = pd.concat([training_set, word_counts], axis=1)
training_set_final.head()

Unnamed: 0,Label,SMS,SMS_split,yep,by,the,pretty,sculpture,yes,princess,...,beauty,hides,secrets,n8,jewelry,related,trade,arul,bx526,wherre
0,ham,"Yep, by the pretty sculpture","[yep, by, the, pretty, sculpture]",1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"Yes, princess. Are you going to make me moan?","[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,ham,Welp apparently he retired,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,Havent.,[havent],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,I forgot 2 ask ü all smth.. There's a card on ...,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
p_spam = training_set_final['Label'].value_counts(normalize=True)['spam']
p_ham = training_set_final['Label'].value_counts(normalize=True)['ham']
print('P(spam): {}'.format(p_spam))
print('P(ham): {}'.format(p_ham))

P(spam): 0.13458950201884254
P(ham): 0.8654104979811574


In [59]:
spam_sms = training_set_final[training_set_final['Label']=='spam']
ham_sms = training_set_final[training_set_final['Label']=='ham']

n_spam = spam_sms['SMS_split'].apply(len).sum()
n_ham = ham_sms['SMS_split'].apply(len).sum()

print('N(Spam): {:,}'.format(n_spam))
print('N(Ham): {:,}'.format(n_ham))

N(Spam): 15,190
N(Ham): 57,237


In [61]:
p_words_given_spam = {}
p_words_given_ham = {}

for word in vocabulary:
    n_word_given_spam = spam_sms[word].sum()
    p_words_given_spam[word] = (n_word_given_spam+1) / (n_spam+len(vocabulary))
    
    n_word_given_ham = ham_sms[word].sum()
    p_words_given_ham[word] = (n_word_given_ham+1) / (n_ham+len(vocabulary))

p_words_given_spam['smth']

4.3529360553693465e-05

In [104]:
import re
def classify(message):
    sms = re.sub('\W', ' ', message)
    sms = sms.lower().split()
    
    p_spam_given_sms = p_spam
    p_ham_given_sms = p_ham
    
    for word in sms:
        if word in vocabulary:
            p_spam_given_sms *= p_words_given_spam[word]
            p_ham_given_sms *= p_words_given_ham[word]
    print('P(Spam|Message): {}'.format(p_spam_given_sms))
    print('P(Ham|Message): {}'.format(p_ham_given_sms))
    if p_spam_given_sms > p_ham_given_sms:
        print('Label: Spam')
    elif p_spam_given_sms < p_ham_given_sms:
        print('Label: Ham')
    else:
        print('Need human clarification!')

In [105]:
classify(training_set_final[training_set_final['Label']=='ham']['SMS'].iloc[1])

P(Spam|Message): 1.2853873686328375e-29
P(Ham|Message): 1.9966832013159345e-24
Label: Ham


In [106]:
classify(training_set_final[training_set_final['Label']=='spam']['SMS'].iloc[1])

P(Spam|Message): 6.202754805579458e-86
P(Ham|Message): 1.125065771898147e-107
Label: Spam


In [82]:
test_set.head()

Unnamed: 0,Label,SMS
0,ham,Aight should I just plan to come up later toni...
1,ham,Die... I accidentally deleted e msg i suppose ...
2,spam,Welcome to UK-mobile-date this msg is FREE giv...
3,ham,This is wishing you a great day. Moji told me ...
4,ham,Thanks again for your reply today. When is ur ...


In [93]:
def classify_test(message):
    sms = re.sub('\W', ' ', message)
    sms = sms.lower().split()
    
    p_spam_given_sms = p_spam
    p_ham_given_sms = p_ham
    
    for word in sms:
        if word in vocabulary:
            p_spam_given_sms *= p_words_given_spam[word]
            p_ham_given_sms *= p_words_given_ham[word]
    
    if p_spam_given_sms > p_ham_given_sms:
        return 'spam'
    elif p_spam_given_sms < p_ham_given_sms:
        return 'ham'
    else:
        return 'Need human clarification!'

In [94]:
test_set['predict'] = test_set['SMS'].apply(classify_test)
test_set.tail()

Unnamed: 0,Label,SMS,predict
1109,spam,This is the 2nd time we have tried 2 contact u...,spam
1110,ham,Will ü b going to esplanade fr home?,ham
1111,ham,"Pity, * was in mood for that. So...any other s...",ham
1112,ham,The guy did some bitching but I acted like i'd...,ham
1113,ham,Rofl. Its true to its name,ham


In [99]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['predict']:
        correct += 1
print('Number of Correct: {:,}'.format(correct))
print('Accuracy Ratio: {:,.2f}%'.format(correct/total*100))

Number of Correct: 1,104
Accuracy Ratio: 99.10%


In [102]:
incorrect_set = test_set[test_set['Label'] != test_set['predict']]
incorrect_set

Unnamed: 0,Label,SMS,predict
56,spam,Money i have won wining number 946 wot do i do...,ham
99,ham,Gettin rdy to ship comp,spam
142,ham,Have you laid your airtel line to rest?,spam
218,spam,"Hi babe its Chloe, how r u? I was smashed on s...",ham
245,ham,Anytime...,spam
404,ham,Nokia phone is lovly..,spam
491,spam,"Hi this is Amy, we will be sending you a free ...",ham
588,ham,We have sent JD for Customer Service cum Accou...,spam
646,ham,A Boy loved a gal. He propsd bt she didnt mind...,Need human clarification!
912,spam,dating:i have had two of these. Only started a...,ham


In [119]:
def p_detail(message):
    
    print(message, '\n')
    sms = re.sub('\W', ' ', message)
    sms = sms.lower().split()
    
    for word in sms:
        print('P({w}|Spam): {p}'.format(w=word, p=p_words_given_spam[word]))
        print('P({w}|Ham): {p}'.format(w=word, p=p_words_given_ham[word]))
    print('\n')
    classify(message)

In [120]:
p_detail(incorrect_set['SMS'].iloc[0])

Money i have won wining number 946 wot do i do next 

P(money|Spam): 0.00021764680276846734
P(money|Ham): 0.0006920947400799754
P(i|Spam): 0.002219997388238367
P(i|Ham): 0.0369424792371578
P(have|Spam): 0.004875288382013668
P(have|Ham): 0.0055059981544140265
P(won|Spam): 0.0026988203543289947
P(won|Ham): 0.00024607812980621346
P(wining|Spam): 8.705872110738693e-05
P(wining|Ham): 1.537988311288834e-05
P(number|Spam): 0.0011752927349497236
P(number|Ham): 0.0008458935712088588
P(946|Spam): 8.705872110738693e-05
P(946|Ham): 1.537988311288834e-05
P(wot|Spam): 8.705872110738693e-05
P(wot|Ham): 0.0002922177791448785
P(do|Spam): 0.0010447046532886433
P(do|Ham): 0.004860043063672716
P(i|Spam): 0.002219997388238367
P(i|Ham): 0.0369424792371578
P(do|Spam): 0.0010447046532886433
P(do|Ham): 0.004860043063672716
P(next|Spam): 0.000565881687198015
P(next|Ham): 0.0004921562596124269


P(Spam|Message): 9.097839917029218e-40
P(Ham|Message): 7.527669451157541e-37
Label: Ham


In [123]:
classify('Winner! You got a big prize!!!')

P(Spam|Message): 7.78241441621437e-19
P(Ham|Message): 9.021395929282776e-20
Label: Spam


In [127]:
n_spam_word = {}
n_ham_word = {}
for word in vocabulary:
    n_spam_word[word] = spam_sms[word].sum()
    n_ham_word[word] = ham_sms[word].sum()
    
print(max(n_spam_word, key=n_spam_word.get), n_spam_word[max(n_spam_word, key=n_spam_word.get)])

to 546


In [128]:
print(max(n_ham_word, key=n_ham_word.get), n_ham_word[max(n_ham_word, key=n_ham_word.get)])

i 2401
