In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import PorterStemmer
import numpy as np
from math import log
import math

In [2]:
df = pd.read_csv('SmsCollection_new.csv', delimiter=";")
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# lowercase text
df = df.apply(lambda x: x.str.lower())
df

Unnamed: 0,label,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ü b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i'd...


In [4]:
#remove every nonalphanumerical 
df['text'] = df['text'].apply(lambda x: re.sub('[^0-9a-zA-Z]+', ' ', x))
df

# remove every nonalphabetical
df['text'] = df['text'].apply(lambda x: re.sub('[^a-zA-Z]+', ' ', x))
df







Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...
...,...,...
5567,spam,this is the nd time we have tried contact u u ...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that so any other suggest...
5570,ham,the guy did some bitching but i acted like i d...


In [5]:
# divide data into training and test set
df = df.sample(frac=1).reset_index(drop=True)
train_set = df.head(int(len(df)*(80/100)))
test_set = df.tail(int(len(df)*(20/100)))
print(len(train_set))
print(len(test_set))

4457
1114


In [6]:
df = train_set
# calculate chance spam class and ham class
p_spam = df[df['label'] == 'spam'].count() / len(df)
p_ham = df[df['label'] == 'ham'].count() / len(df)
print(p_spam)
print(p_ham)

label    0.13821
text     0.13821
dtype: float64
label    0.86179
text     0.86179
dtype: float64


In [7]:
# create dictionaries to save word occurrence in
spam_dict = {}
ham_dict = {}

spam_df = df[df['label'] == 'spam'].text.values
ham_df = df[df['label'] == 'ham'].text.values
spam = ' '.join(spam_df).split()
ham = ' '.join(ham_df).split()



In [8]:
#removing stopwords
stop_words = set(stopwords.words('english'))

for word in spam:
    if word in stop_words:
        spam.remove(word)
for word in ham:
    if word in stop_words:
        ham.remove(word)


In [9]:
#stemming words
stemmer = PorterStemmer()

spam = [stemmer.stem(word) for word in spam]
ham = [stemmer.stem(word) for word in ham]   
    

In [10]:
# fill dictionaries of each class with word occurrence of said class
for word in spam:
    if word not in spam_dict:
        spam_dict[word] = 0 
    spam_dict[word] += 1

for word in ham:
    if word not in ham_dict:
        ham_dict[word] = 0 
    ham_dict[word] += 1
    
print(len(spam_dict))
print(len(ham_dict))

1691
4932


In [11]:
# create spam and ham scores for each word
spam_score_dict = {}
ham_score_dict = {}
temp_dict = {**spam_dict , **ham_dict}

for word in temp_dict:
    if word in spam_dict:
        x = spam_dict[word]
    else:
        x = 0
    if word in ham_dict:
        y = ham_dict[word]
    else:
        y = 0
    spam_score = x / (x + y)
    spam_score_dict[word] = spam_score
    
for word in temp_dict:
    if word in spam_dict:
        x = spam_dict[word]
    else:
        x = 0
    if word in ham_dict:
        y = ham_dict[word]
    else:
        y = 0
    ham_score = y / (x + y)
    ham_score_dict[word] = ham_score


In [12]:
# equalize based on vocabulary size
dif_spam = (sum(ham_dict.values()))/(sum(spam_dict.values()))
dif_ham = 1/dif_spam

print(dif_spam)
print(dif_ham)

for word in spam_dict:
   spam_dict[word] *= dif_ham
for word in ham_dict:
   ham_dict[word] *= dif_spam

3.5460867952522257
0.2820009936978635


In [13]:
# calculate probabilities of features given a certain class
# the k is for Laplace smoothing
def chance_word_in_class(word, spam_dict, ham_dict, k = 0):
    if word in spam_dict:
        x = spam_dict.get(word)
    else: 
        x = 0
    chance_spam = (x + k) / (sum(spam_dict.values()) + k)
    if word in ham_dict:
        x = ham_dict.get(word)
    else: 
        x = 0
    chance_ham = (x + k) / (sum(ham_dict.values()) + k)
    return chance_spam, chance_ham

In [14]:
# labeling when using the spam and ham score
def labeling(message):

    chances_spam = []
    chances_ham = []

    for word in message.split():
        chance_spam_word, chance_ham_word = chance_word_in_class(word, spam_dict, ham_dict)
        try:
           chance_spam_word, chance_ham_word = chance_spam_word * spam_score_dict[word], chance_ham_word * ham_score_dict[word]
        except KeyError:
           pass
        try:
            if spam_score_dict[word] >= 0.95:
                chance_spam_word, chance_ham_word = 1, 0
        except KeyError:
            pass
        try:
            if ham_score_dict[word] >= 0.95:
                chance_spam_word, chance_ham_word = 0, 1
        except KeyError:
            pass
        chances_spam.append(chance_spam_word)
        chances_ham.append(chance_ham_word)


    chances_spam = [i for i in chances_spam if i != 0]
    chances_ham = [i for i in chances_ham if i != 0]

    if 1 in chances_spam:
        label = 'spam'
    elif 1 in chances_ham:
        label = 'ham'
    else:

        chance_spam = np.prod(list(map(math.log10, chances_spam)))
        chance_ham = np.prod(list(map(math.log10, chances_ham)))

        if chance_spam > chance_ham:
            label = 'spam'
        elif chance_ham > chance_spam:
            label = 'ham'
        else:
            label ='neutral'

    return label

In [15]:
# labeling when not using the spam and ham score

# def labeling(message):

#     chances_spam = []
#     chances_ham = []

#     for word in message.split():
#         chance_spam_word, chance_ham_word = chance_word_in_class(word, spam_dict, ham_dict)
#         chances_spam.append(chance_spam_word)
#         chances_ham.append(chance_ham_word)


#     chances_spam = [i for i in chances_spam if i != 0]
#     chances_ham = [i for i in chances_ham if i != 0]

#     chance_spam = np.prod(list(map(math.log10, chances_spam)))
#     chance_ham = np.prod(list(map(math.log10, chances_ham)))

#     if chance_spam > chance_ham:
#         label = 'spam'
#     elif chance_ham > chance_spam:
#         label = 'ham'
#     else:
#         label ='neutral'

#     return label

In [16]:
# evaluation
golden_labels = []
labels = []

for label in test_set['label']:
    golden_labels.append(label)

for message in test_set['text']:
    labeled = labeling(message)
    labels.append(labeled)
    
accurates = 0
errors = 0
    
for i in range(0, len(labels)):
    if labels[i] == golden_labels[i]:
        accurates += 1
    if labels[i] != golden_labels[i]:
        errors += 1

accuracy = accurates / len(labels)

print('Accuracy:', accuracy)   

Accuracy: 0.9488330341113106
