## Import libraries

In [80]:
import pandas as pd
import re
import random

## read the csv file

In [69]:
df = pd.read_csv('spam.csv', encoding = 'latin')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Number of emails

In [70]:
len(df)

5572

## Drop the NaN columns

In [71]:
column = df.columns
removed_col = []
for c in column:
    if df[c].isnull().values.any():
        removed_col.append(c)
        
removed_col

['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

In [72]:
df = df.drop(labels=removed_col, axis=1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Remove empty string

In [73]:
def clean_list(line):
    removed_words = []
    for i in range(len(line) - 1):
        if line[i] == '':
            removed_words.append(line[i])
        else:
            for s in line[i]:
                if s < 'a' and s > 'Z':
                    #print(w)
                    removed_words.append(line[i])
                    break
                elif s > 'z':
                    #print(w)
                    removed_words.append(line[i])
                    break
                elif s < 'A':
                    #print(w)
                    removed_words.append(line[i])
                    break
                    
    last_word = line[len(line)-1]
    last_word = last_word[:-1]
    line[-1] = last_word
    
    if last_word == '':
        removed_words.append(last_word)
    else:
        for s in last_word:
            if s < 'a' and s > 'Z':
                #print(w)
                removed_words.append(last_word)
                break
            elif s > 'z':
                #print(w)
                removed_words.append(last_word)
                break
            elif s < 'A':
                #print(w)
                removed_words.append(last_word)
                break

    
    for i in removed_words:
        line.remove(i)
        
    return line

## Split the email string into a list of words

In [74]:
def clean_email(msg, label):
    data = {}
    
    line = re.split(r"[:,. ]", msg)
    line = clean_list(line)
    
    data['email'] = line
    data['label'] = label
    
    return data

## Clean the dataset

In [75]:
email_list = []

# iterate through all emails
for i in range(len(df)):
    msg = df.iloc[i, 1]
    label = df.iloc[i, 0]
    email = clean_email(msg, label)
    
    email_list.append(email)

email_list[:2]

[{'email': ['Go',
   'until',
   'jurong',
   'point',
   'crazy',
   'Available',
   'only',
   'in',
   'bugis',
   'n',
   'great',
   'world',
   'la',
   'e',
   'buffet',
   'Cine',
   'there',
   'got',
   'amore',
   'wat'],
  'label': 'ham'},
 {'email': ['Ok', 'lar', 'Joking', 'wif', 'u', 'oni'], 'label': 'ham'}]

## Split the dataset

In [76]:
tot_email = len(email_list)
mid = int(0.7*tot_email)

# shuffle the dataset
random.shuffle(email_list)

train_data = email_list[:mid]
test_data = email_list[mid:]

print(f'train data: {len(train_data)}')
print(f'test data: {len(test_data)}')

train data: 3900
test data: 1672


## calculate frequency of each word in the email 

In [77]:
def find_freq(cur_dict, freq_list):
    for i in cur_dict['email']:
        if i not in freq_list:
            freq_list[i] = 1
        else:
            freq_list[i] += 1
            
    return freq_list

## Use naive bayes to find probabilty of each words

In [78]:
spam_freq = {}
ham_freq = {}

for i in train_data:
    if i['label'] == 'spam':
        spam_freq = find_freq(i, spam_freq)
    elif i['label'] == 'ham':
        ham_freq = find_freq(i, ham_freq)
        
ham_word_cnt = 0
spam_word_cnt = 0
for i in spam_freq:
    spam_word_cnt += spam_freq[i]
for i in ham_freq:
    ham_word_cnt += ham_freq[i]
    
spam_prob = {}
ham_prob = {}
for i in spam_freq:
    spam_prob[i] = spam_freq[i] / spam_word_cnt
for i in ham_freq:
    ham_prob[i] = ham_freq[i] / ham_word_cnt
    


In [79]:
num_right = 0
# iterate through each email
for email in test_data:
    check = ''
    prob_in_spam = 1
    prob_in_ham = 1
    for w in email['email']:
        if w in spam_prob and w in ham_prob:
            prob_in_spam *= spam_prob[w]
            prob_in_ham *= ham_prob[w]
        
    if prob_in_spam > prob_in_ham:
        check = 'spam'
    else:
        check = 'ham'
        
    if email['label'] == check:
        num_right += 1
        
accuracy = num_right / len(test_data)
print(f'accuracy is: {accuracy * 100:.3f} %')

accuracy is: 89.653 %
