In [1]:
import pandas as pd
sms_spam = pd.read_csv("D:\Code Python\spam\SMSSpamCollection", sep= "\t", header= None, names=["Label", "SMS"])
print(sms_spam)

     Label                                                SMS
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [2]:
sms_spam["Label"].value_counts(normalize= True)

Label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64

In [3]:
sms_spam = sms_spam.sample(frac= 1, random_state= 1)
train_size = round(len(sms_spam) * 0.8)
train_set = sms_spam[ : train_size].reset_index(drop= True)
test_set = sms_spam[train_size :].reset_index(drop= True)
print(train_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


In [4]:
train_set['Label'].value_counts(normalize=True) 

Label
ham     0.86541
spam    0.13459
Name: proportion, dtype: float64

In [5]:
test_set['Label'].value_counts(normalize=True) 

Label
ham     0.868043
spam    0.131957
Name: proportion, dtype: float64

In [6]:
train_set["SMS"] = train_set["SMS"].str.replace("\W", " ")
train_set["SMS"] = train_set["SMS"].str.lower()
print(train_set.head(3))

  Label                                            SMS
0   ham                   yep, by the pretty sculpture
1   ham  yes, princess. are you going to make me moan?
2   ham                     welp apparently he retired


In [7]:
train_set["SMS"] = train_set["SMS"].str.split()
vocabulary = []
for sentence in train_set["SMS"]:
    for word in sentence:
        vocabulary.append(word)
vocabulary = list(set(vocabulary))
print(vocabulary)
print(len(vocabulary))

11860


In [8]:
word_counts = {unique_word : [0] * len(train_set) for unique_word in vocabulary}
for index, sentence in enumerate(train_set["SMS"]):
    for _word in sentence:
        word_counts[_word][index] += 1

In [9]:
word_counts = pd.DataFrame(word_counts)
word_counts.head()

Unnamed: 0,5wb,nosy,pan,huiming,bc,adventuring,each,07008009200,plans?,tease,...,still.maybe,mono,"fine.,",solved,evening...,mobile!,"green,",started.india,ettans,fren
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
train_set_final = pd.concat((train_set, word_counts), axis= 1)
train_set_final.head()

Unnamed: 0,Label,SMS,5wb,nosy,pan,huiming,bc,adventuring,each,07008009200,...,still.maybe,mono,"fine.,",solved,evening...,mobile!,"green,",started.india,ettans,fren
0,ham,"[yep,, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes,, princess., are, you, going, to, make, m...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent.],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth.., there's, a...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
spam_messages = train_set_final[train_set_final["Label"] == "spam"]
ham_messages = train_set_final[train_set_final["Label"] == "ham"]
p_spam = len(spam_messages) / len(train_set_final)
p_ham = len(ham_messages) / len(train_set_final)
n_words_spam = spam_messages["SMS"].apply(len)
#print(spam_messages)
#print(n_words_spam)
n_words_spam_sum = n_words_spam.sum()

n_words_ham = ham_messages["SMS"].apply(len)
n_words_ham_sum = n_words_ham.sum()
n_vocabulary = len(vocabulary)
#print(n_vocabulary)
alpha = 1

In [18]:
parameters_spam = {unique_words : 0 for unique_words in vocabulary}
parameters_ham = {unique_words : 0 for unique_words in vocabulary}
for word in vocabulary:
    n_unique_word_spam = spam_messages[word].sum()
    p_unique_word_spam = (n_unique_word_spam + alpha) / (n_words_spam_sum + alpha*n_vocabulary)
    parameters_spam[word] = p_unique_word_spam

    n_unique_word_ham = ham_messages[word].sum()
    p_unique_word_ham = (n_unique_word_ham + alpha) / (n_words_spam_sum +  alpha*n_vocabulary)
    parameters_ham[word] = p_unique_word_ham

"""print(parameters_spam)
print(parameters_ham)"""

'print(parameters_spam)\nprint(parameters_ham)'

In [24]:
import re
def classify(message):
    message = re.sub("\W", " ", message)
    message = message.lower().split()
    p_words_spam = p_spam
    p_words_ham = p_ham

    for word in message:
        if word in parameters_spam:
            p_words_spam *= parameters_spam[word]
        if word in parameters_ham:
            p_words_ham *= parameters_ham[word]
    
    if p_words_spam > p_words_ham:
        return "spam"
    if p_words_ham > p_words_spam:
        return "ham"

In [25]:
print(test_set)

     Label                                                SMS
0      ham          Later i guess. I needa do mcat study too.
1      ham             But i haf enuff space got like 4 mb...
2     spam  Had your mobile 10 mths? Update to latest Oran...
3      ham  All sounds good. Fingers . Makes it difficult ...
4      ham  All done, all handed in. Don't know if mega sh...
...    ...                                                ...
1109   ham  We're all getting worried over here, derek and...
1110   ham  Oh oh... Den muz change plan liao... Go back h...
1111   ham  CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
1112  spam  Text & meet someone sexy today. U can find a d...
1113   ham                            K k:) sms chat with me.

[1114 rows x 2 columns]


In [27]:
from sklearn.metrics import accuracy_score
predict = []
for message in test_set["SMS"]:
    predict.append(classify(message))

for index in range(len(test_set)):
    print("{} -->   real:   {}      predict:       {}".format(index, test_set["Label"][index], predict[index]))

print("accuracy_score: ", accuracy_score(predict, test_set["Label"]))

0 -->   real:   ham      predict:       ham
1 -->   real:   ham      predict:       ham
2 -->   real:   spam      predict:       spam
3 -->   real:   ham      predict:       ham
4 -->   real:   ham      predict:       ham
5 -->   real:   ham      predict:       ham
6 -->   real:   ham      predict:       ham
7 -->   real:   ham      predict:       ham
8 -->   real:   ham      predict:       ham
9 -->   real:   ham      predict:       ham
10 -->   real:   ham      predict:       ham
11 -->   real:   ham      predict:       ham
12 -->   real:   ham      predict:       ham
13 -->   real:   ham      predict:       ham
14 -->   real:   ham      predict:       ham
15 -->   real:   ham      predict:       ham
16 -->   real:   ham      predict:       ham
17 -->   real:   ham      predict:       ham
18 -->   real:   ham      predict:       ham
19 -->   real:   spam      predict:       spam
20 -->   real:   ham      predict:       ham
21 -->   real:   ham      predict:       ham
22 -->   real:  