### Make necessary imports

In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import string
import re
from nltk.stem.porter import PorterStemmer
import random
from sklearn.model_selection import train_test_split

### Import dataset

In [4]:
data=pd.read_csv("spam.csv", encoding='ISO-8859-1')

In [5]:
data.shape

(5572, 5)

In [6]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Remove undesired columns

In [7]:
data=data.drop(["Unnamed: 2","Unnamed: 3" ,"Unnamed: 4"], axis=1)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


### Create numrical equivalent to label categorical column

In [9]:
data['label'] = data['v1'].map({'ham': 0, 'spam': 1})
data.head()

Unnamed: 0,v1,v2,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Extract all the mails into a list of content, label pairs

In [10]:
texts = []
for index, row in data.iterrows():
    texts.append((row['v2'], row['label']))
texts[0:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  0),
 ('Ok lar... Joking wif u oni...', 0),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  1)]

### Remove white-spaces and punctuations
### Tokenize the strings by separation out individual words

In [11]:
tokenized = []
for t in texts:
    msg = t[0]
    msg = re.sub('[' + string.punctuation + ']', ' ', msg)
    msg = re.sub('[\n\t\r]', '', msg)
    words = msg.split()
    tokenized.append((words, t[1]))
tokenized[0:2] # First element

[(['Go',
   'until',
   'jurong',
   'point',
   'crazy',
   'Available',
   'only',
   'in',
   'bugis',
   'n',
   'great',
   'world',
   'la',
   'e',
   'buffet',
   'Cine',
   'there',
   'got',
   'amore',
   'wat'],
  0),
 (['Ok', 'lar', 'Joking', 'wif', 'u', 'oni'], 0)]

### Removing stopwords as defined in the NLTK library
### Also words of length less than 3 are removed, as they appear too often and do not contribute too much to help put a label on a text

In [12]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english')) 
filtered = []
for t in tokenized:
    words = t[0]
    stop_removed = []
    for word in words:
        if word not in stop_words and len(word) > 2:
            stop_removed.append(word)
    filtered.append((stop_removed, t[1]))

filtered[0:2]

[nltk_data] Error loading stopwords: <urlopen error Tunnel connection
[nltk_data]     failed: 503 Service Unavailable>


[(['jurong',
   'point',
   'crazy',
   'Available',
   'bugis',
   'great',
   'world',
   'buffet',
   'Cine',
   'got',
   'amore',
   'wat'],
  0),
 (['lar', 'Joking', 'wif', 'oni'], 0)]

### Perform Stemming, or converting morphologically similar words to root word, as defined in NLTK library

In [13]:
stemmer = PorterStemmer()
stemmed = []
for t in filtered:
    words = t[0]
    stemmed_words = []
    for word in words:
        stemmed_word = stemmer.stem(word.lower())
        stemmed_words.append(stemmed_word)
    stemmed.append((stemmed_words, t[1]))
stemmed[0:2]

[(['jurong',
   'point',
   'crazi',
   'avail',
   'bugi',
   'great',
   'world',
   'buffet',
   'cine',
   'got',
   'amor',
   'wat'],
  0),
 (['lar', 'joke', 'wif', 'oni'], 0)]

### Counting number of texts each word occurs in and eliminate those appearing only once

In [14]:
word_count = {}
for t in stemmed:
    words = t[0]
    occured = []
    for word in words:
        if word not in word_count:
            word_count[word] = 1
        elif word not in occured:
            word_count[word] += 1
            occured.append(word)

for i in range(len(stemmed)):
    stemmed[i] = (list(filter(lambda x: word_count[x] > 1, stemmed[i][0])), stemmed[i][1])

stemmed[0:2]

[(['point',
   'crazi',
   'avail',
   'bugi',
   'great',
   'world',
   'buffet',
   'cine',
   'got',
   'wat'],
  0),
 (['lar', 'joke', 'wif', 'oni'], 0)]

### Split the dataset into training and testing(70-30 split)

In [15]:
number_of_hams = data.label.value_counts()[0]
number_of_spams = data.label.value_counts()[1]
total = number_of_hams + number_of_spams

train_data = stemmed[:int(0.7*total)]
test_data = stemmed[int(0.7*total):]

### Train the Naive Nayes Chassifier i:e, computing P(Spam|Word)

In [16]:
# We need to train these 4 possibilities:
# 1) Probability that a word occurs in spam mails
# 2) Probability that a word occurs in ham mails
# 3) Probability that any given mail is spam
# 4) Probability that any given mail is ham

def prob_occurs_in_spam(word):
    count = 0
    total_spams = 0
    for t in train_data:
        msg = t[0]
        if t[1] == 1:
            total_spams += 1
            if word in msg:
                count += 1
    return count/total_spams
             

def prob_occurs_in_ham(word):
    count = 0
    total_hams = 0
    for t in train_data:
        msg = t[0]
        if t[1] == 0:
            total_hams += 1
            if word in msg:
                count += 1
    return count/total_hams

def total_spams_and_hams(train_data):
    total_spams = 0
    total_hams = 0
    for t in train_data:
        if t[1] == 1:
            total_spams += 1  
        else:
            total_hams += 1 
    return total_spams, total_hams


prob_spam = total_spams_and_hams(train_data)[0]/len(train_data)
prob_ham = total_spams_and_hams(train_data)[1]/len(train_data)

# Apply the Bayes Theorem
def prob_spam_given_word(word):
    return (prob_occurs_in_spam(word)*prob_spam)/((prob_occurs_in_spam(word)*prob_spam + prob_occurs_in_ham(word)*prob_ham))

word = 'free'
print('Probability that a message is spam given the word "{}" is: {}'.format(word, prob_spam_given_word(word)))

Probability that a message is spam given the word "free" is: 0.7531645569620252


### We store probabilities of a message being spam given a word in a dictionary for every word

In [17]:
probabilities = {}
for t in train_data:
    msg = t[0]
    for word in msg:
        if word not in probabilities:
            prob = prob_spam_given_word(word)
            if prob == 0:
                probabilities[word] = 0.001
            else:
                probabilities[word] = prob

### Testing the model on unseen mails

In [18]:
from functools import reduce

def prob_is_spam(words):
    probs_spam = []
    for word in words:
        if word in probabilities:
            probs_spam.append(probabilities[word])
        else:
            probs_spam.append(0.5) 
    probs_not_spam = list(map(lambda prob: 1-prob, probs_spam))
    product_spam = reduce(lambda x, y: x * y, probs_spam, 1) 
    product_not_spam = reduce(lambda x, y: x * y, probs_not_spam, 1)
    return product_spam/(product_spam + product_not_spam)


### Check accuracy on test data and compute confusion matrix to evaluate performance

In [19]:
total_correct = 0
true_spam_as_spam = 0
true_spam_as_ham = 0
true_ham_as_ham = 0
true_ham_as_spam = 0

predicted=[]
for t in test_data:
    guess = -1
    words = t[0]
    answer = t[1]
    prob_spam = prob_is_spam(words)
    guess = 1 if prob_spam > 0.95 else 0
    predicted.append(guess)
    if guess == answer:
        total_correct += 1
        if answer == 0: # true negative
            true_ham_as_ham += 1
        else: # true positive
            true_spam_as_spam += 1 
    else:
        if answer == 0: # false positive
            true_ham_as_spam += 1
        else: # true negative
            true_spam_as_ham += 1

            
true_spams = total_spams_and_hams(test_data)[0]
true_hams = total_spams_and_hams(test_data)[1]

print('Total test mails: ', len(test_data))
print('Correctly Predicted: ', total_correct)
print('Accuracy: ', total_correct*100/(true_spams+true_hams))
print('Ham precision: ', true_ham_as_ham/(true_ham_as_ham + true_spam_as_ham))
print('Ham recall: ', true_ham_as_ham/(true_ham_as_ham + true_ham_as_spam))
print('Spam precision: ', true_spam_as_spam/(true_spam_as_spam + true_ham_as_spam)) 
print('Spam recall: ', true_spam_as_spam/(true_spam_as_spam + true_spam_as_ham))
print('-------------------------------')
confusion_mat = pd.crosstab(data['label'][int(0.7*total):], np.array(predicted), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion_mat)

Total test mails:  1672
Correctly Predicted:  1595
Accuracy:  95.39473684210526
Ham precision:  0.9850958126330731
Ham recall:  0.961218836565097
Spam precision:  0.7870722433460076
Spam recall:  0.9078947368421053
-------------------------------
Predicted     0    1   All
Actual                    
0          1388   56  1444
1            21  207   228
All        1409  263  1672
