# Homework 4: Naive Bayes Spam Filter
Sumbitted by: Christian Daniel P. Dy Quiangco Jr.

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import email as em
import re
import os
from sklearn.naive_bayes import GaussianNB

### Data Preprocessing
Load data labels and stop words

In [2]:
data_labels = pd.read_csv('Data/trec06p-cs280/labels', sep = ' ', names = ['class', 'file'])
data_labels['file'] = data_labels['file'].map(lambda x: x.lstrip('../data/'))

stop_words = list(open('stop_words.txt', 'r').read().split())

In [3]:
data_labels

Unnamed: 0,class,file
0,ham,000/000
1,spam,000/001
2,spam,000/002
3,ham,000/003
4,spam,000/004
...,...,...
37817,spam,126/017
37818,spam,126/018
37819,spam,126/019
37820,spam,126/020


Load email content to a dataframe for easier handling

In [50]:
def email_extractor(df, stop_w = True):
    content = []
    for i in df['file']:
        with open('Data/trec06p-cs280/data/' + i, 'rb') as file:
            data = file.read().decode('ISO-8859-1')
            msg = em.message_from_string(data)
            
        # If message is multipart
        if msg.is_multipart():
            for part in msg.walk():
                ctype = part.get_content_type()
                cdispo = str(part.get('Content-Disposition'))
                
                if ctype == 'text/plain' and 'attachment' not in cdispo:
                    payload = part.get_payload()  # decode
                    break
        else:
            payload = msg.get_payload()
        
        # add email
        email = cleaned_strings(payload.split())
        if stop_w:
            email = [word for word in email if word not in stop_words]
        content.append(email)
    return content
            
def cleaned_strings(word_list):
    words = []
    for word in word_list:
        if re.match('^[a-zA-Z]+$', word.rstrip(',').rstrip('.')):
            words.append(word.lower().rstrip(',').rstrip('.'))
    return words


In [5]:
df = data_labels.copy()
df['content'] = email_extractor(data_labels)
df.head(20)

Unnamed: 0,class,file,content
0,ham,000/000,"[mailing, list, queried, weeks, ago, running, ..."
1,spam,000/001,"[luxury, watches, buy, rolex, rolex, cartier, ..."
2,spam,000/002,"[academic, qualifications, prestigious, redite..."
3,ham,000/003,"[greetings, verify, subscription, list, charte..."
4,spam,000/004,"[chauncey, conferred, luscious, continued, ton..."
5,ham,000/005,"[quiet, quiet, well, straw, poll, running]"
6,ham,000/006,"[working, departed, totally, bell, labs, recom..."
7,spam,000/007,"[nbc, today, body, diet, beaches, magazines, h..."
8,spam,000/008,"[oil, sector, going, crazy, weekly, gift, kkpt..."
9,spam,000/009,"[magic, perfect, weekends]"


Separate into Testing and Training Sets

In [6]:
df_train = df.iloc[:df.loc[data_labels['file'] == '071/000'].index[0],:]
df_test = df.iloc[df.loc[data_labels['file'] == '071/000'].index[0]:,:]

df_train_ham = df_train[df_train['class'] == 'ham']
df_train_spam = df_train[df_train['class'] == 'spam']

In [7]:
df_train_spam

Unnamed: 0,class,file,content
1,spam,000/001,"[luxury, watches, buy, rolex, rolex, cartier, ..."
2,spam,000/002,"[academic, qualifications, prestigious, redite..."
4,spam,000/004,"[chauncey, conferred, luscious, continued, ton..."
7,spam,000/007,"[nbc, today, body, diet, beaches, magazines, h..."
8,spam,000/008,"[oil, sector, going, crazy, weekly, gift, kkpt..."
...,...,...,...
21294,spam,070/294,[]
21295,spam,070/295,[]
21296,spam,070/296,"[special, offer, adobe, video, collection, ado..."
21297,spam,070/297,"[html, public, html, body]"


Find the 10000 most reoccuring words

In [8]:
def generate_vocab(word_lists):
    vocabulary = {}
    for content in word_lists:
        for word in content:
            if word in vocabulary:
                vocabulary[word] += 1
            else:
                vocabulary[word] = 1
    sort_vocab = {r: vocabulary[r] for r in sorted(vocabulary, key=vocabulary.get, reverse=True)}
    return sort_vocab

In [9]:
vocab = generate_vocab(df_train['content'])
top_vocab = {A:N for (A,N) in [x for x in vocab.items()][:10000]}
top_vocab

{'will': 11625,
 'board': 4889,
 'company': 4270,
 'list': 4247,
 'gold': 3942,
 'time': 3729,
 'send': 3641,
 'adobe': 3560,
 'help': 3544,
 'message': 3184,
 'program': 3161,
 'nil': 3066,
 'studies': 2720,
 'professional': 2688,
 'work': 2685,
 'university': 2483,
 'good': 2482,
 'ms': 2372,
 'stock': 2292,
 'number': 2289,
 'well': 2189,
 'office': 2166,
 'file': 2150,
 'problem': 2140,
 'info': 2129,
 'de': 2111,
 'microsoft': 2099,
 'current': 2068,
 'hb': 2045,
 'corp': 1982,
 'windows': 1980,
 'development': 1976,
 'find': 1970,
 'handyboard': 1959,
 'pro': 1950,
 'china': 1949,
 'email': 1855,
 'great': 1842,
 'code': 1821,
 'people': 1790,
 'power': 1780,
 'read': 1778,
 'best': 1755,
 'system': 1709,
 'handy': 1659,
 'motor': 1631,
 'today': 1630,
 'call': 1610,
 'data': 1561,
 'ic': 1554,
 'market': 1535,
 'set': 1534,
 'free': 1528,
 'mail': 1518,
 'oil': 1493,
 'xp': 1474,
 'additional': 1461,
 'big': 1460,
 'text': 1442,
 'offer': 1428,
 'days': 1417,
 'address': 1408,
 

### Creating the feature matrices

In [10]:
def get_features(df):
    features_df = pd.DataFrame(df['content'], columns=top_vocab.keys())
    for index, row in df.iterrows():
        temp_set = set(row['content'])
        for word in temp_set:
            if word in top_vocab.keys():
                features_df.loc[index][word] = row['content'].count(word)
    return features_df.fillna(0)

In [11]:
train_spam_fm = get_features(df_train_spam)
train_ham_fm = get_features(df_train_ham)

In [12]:
train_spam_fm.head(10)

Unnamed: 0,will,board,company,list,gold,time,send,adobe,help,message,...,mailout,referencing,heaton,choke,simplest,remembering,tentatively,viewing,insist,tweak
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
train_ham_fm.head(10)

Unnamed: 0,will,board,company,list,gold,time,send,adobe,help,message,...,mailout,referencing,heaton,choke,simplest,remembering,tentatively,viewing,insist,tweak
0,2,0,0,7,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
21,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Computing the Priors

In [14]:
N_train = len(df_train)
N_train_ham = len(df_train_ham)
N_train_spam = len(df_train_spam)

prior_ham = N_train_ham/N_train
prior_spam = N_train_spam/N_train

print("The prior probabilities for ham is: ", prior_ham, "\nThe prior probabilities for spam is: ", prior_spam)

The prior probabilities for ham is:  0.3531924882629108 
The prior probabilities for spam is:  0.6468075117370892


### Computing the Likelihood of each word

In [15]:
def get_occurances(word_lists):
    vocabulary = dict((x,0) for x in top_vocab.keys())
    for content in word_lists:
        for word in content:
            if word in top_vocab.keys():
                vocabulary[word] += 1
    return vocabulary

Compute for base probabilities

In [16]:
count_train_spam = get_occurances(df_train_spam['content'])
count_train_ham = get_occurances(df_train_ham['content'])

In [17]:
probabilities = pd.DataFrame(top_vocab.items(), columns=["word", "count"])
probabilities['s_count'] = count_train_spam.values()
probabilities['h_count'] = count_train_ham.values()
probabilities['s_prob'] = probabilities['s_count']/(probabilities['count'].sum())
probabilities['h_prob'] = probabilities['h_count']/(probabilities['count'].sum())

probabilities

Unnamed: 0,word,count,s_count,h_count,s_prob,h_prob
0,will,11625,5244,6381,0.004832,0.005879
1,board,4889,626,4263,0.000577,0.003928
2,company,4270,4034,236,0.003717,0.000217
3,list,4247,846,3401,0.000779,0.003134
4,gold,3942,3852,90,0.003549,0.000083
...,...,...,...,...,...,...
9995,remembering,12,3,9,0.000003,0.000008
9996,tentatively,12,0,12,0.000000,0.000011
9997,viewing,12,0,12,0.000000,0.000011
9998,insist,12,8,4,0.000007,0.000004


Compute with laplace smoothing

In [18]:
laplace_prob = probabilities.copy()
laplace_prob['alpha_v'] = 0
alpha = 1
alpha_v = len(top_vocab)

laplace_prob.loc[(laplace_prob['s_count'] == 0 ), 'alpha_v'] = alpha_v
laplace_prob.loc[(laplace_prob['s_count'] == 0 ), 's_count'] = alpha
laplace_prob.loc[(laplace_prob['h_count'] == 0 ), 'alpha_v'] = alpha_v
laplace_prob.loc[(laplace_prob['h_count'] == 0 ), 'h_count'] = alpha
laplace_prob['s_prob'] = laplace_prob['s_count']/(laplace_prob['count'].sum() + laplace_prob['alpha_v'])
laplace_prob['h_prob'] = laplace_prob['h_count']/(laplace_prob['count'].sum() + laplace_prob['alpha_v'])

laplace_prob

Unnamed: 0,word,count,s_count,h_count,s_prob,h_prob,alpha_v
0,will,11625,5244,6381,4.831568e-03,0.005879,0
1,board,4889,626,4263,5.767661e-04,0.003928,0
2,company,4270,4034,236,3.716732e-03,0.000217,0
3,list,4247,846,3401,7.794634e-04,0.003134,0
4,gold,3942,3852,90,3.549046e-03,0.000083,0
...,...,...,...,...,...,...,...
9995,remembering,12,3,9,2.764055e-06,0.000008,0
9996,tentatively,12,1,12,9.129402e-07,0.000011,10000
9997,viewing,12,1,12,9.129402e-07,0.000011,10000
9998,insist,12,8,4,7.370813e-06,0.000004,0


### Classifying the emails 

In [19]:
df_train

Unnamed: 0,class,file,content
0,ham,000/000,"[mailing, list, queried, weeks, ago, running, ..."
1,spam,000/001,"[luxury, watches, buy, rolex, rolex, cartier, ..."
2,spam,000/002,"[academic, qualifications, prestigious, redite..."
3,ham,000/003,"[greetings, verify, subscription, list, charte..."
4,spam,000/004,"[chauncey, conferred, luscious, continued, ton..."
...,...,...,...
21295,spam,070/295,[]
21296,spam,070/296,"[special, offer, adobe, video, collection, ado..."
21297,spam,070/297,"[html, public, html, body]"
21298,ham,070/298,"[mounted, infrared, demodulator, hb, realised,..."


In [20]:
def classification(df):
    s_prob_list, h_prob_list = [], []
    for words in df['content']:
        s_prob, h_prob = 0, 0
        for word in set(words):
            if word in top_vocab.keys():
                s_prob += float(laplace_prob.loc[(laplace_prob['word'] == word ), 's_prob'])
                h_prob += float(laplace_prob.loc[(laplace_prob['word'] == word ), 'h_prob'])
        if (s_prob > 0) and (h_prob > 0):
            s_prob_list.append((s_prob*prior_spam)/((h_prob*prior_ham)+(s_prob*prior_spam)))
            h_prob_list.append((h_prob*prior_ham)/((h_prob*prior_ham)+(s_prob*prior_spam)))
        else:
            s_prob_list.append(s_prob)
            h_prob_list.append(h_prob)
    return s_prob_list, h_prob_list

s_prob_train, h_prob_train = classification(df_train)

In [21]:
predict_class = []
for index in range(len(s_prob_train)):
    if s_prob_train[index] < h_prob_train[index]:
        predict_class.append("ham")
    else:
        predict_class.append("spam")

df_train_class = df_train.copy()
df_train_class['s_prob'] = s_prob_train
df_train_class['h_prob'] = h_prob_train
df_train_class['predict_class'] = predict_class

In [22]:
df_train_class

Unnamed: 0,class,file,content,s_prob,h_prob,predict_class
0,ham,000/000,"[mailing, list, queried, weeks, ago, running, ...",0.433740,0.566260,ham
1,spam,000/001,"[luxury, watches, buy, rolex, rolex, cartier, ...",0.921098,0.078902,spam
2,spam,000/002,"[academic, qualifications, prestigious, redite...",0.717989,0.282011,spam
3,ham,000/003,"[greetings, verify, subscription, list, charte...",0.417616,0.582384,ham
4,spam,000/004,"[chauncey, conferred, luscious, continued, ton...",0.339959,0.660041,ham
...,...,...,...,...,...,...
21295,spam,070/295,[],0.000000,0.000000,spam
21296,spam,070/296,"[special, offer, adobe, video, collection, ado...",0.933234,0.066766,spam
21297,spam,070/297,"[html, public, html, body]",0.592156,0.407844,spam
21298,ham,070/298,"[mounted, infrared, demodulator, hb, realised,...",0.272841,0.727159,ham


Computing with log probabilities

In [23]:
def class_log(df):
    s_prob_list, h_prob_list = [], []
    for words in df['content']:
        s_prob, h_prob = 0, 0
        for word in set(words):
            if word in top_vocab.keys():
                s_prob += np.log10(float(laplace_prob.loc[(laplace_prob['word'] == word ), 's_prob'])) 
                h_prob += np.log10(float(laplace_prob.loc[(laplace_prob['word'] == word ), 'h_prob']))
        s_prob_list.append(s_prob+log_spam)
        h_prob_list.append(h_prob+log_ham)
    return s_prob_list, h_prob_list
    
log_ham = np.log10(prior_ham)
log_spam = np.log10(prior_spam)

s_prob_log, h_prob_log = class_log(df_train)

In [24]:
predict_class = []
for index in range(len(s_prob_log)):
    if s_prob_log[index] < h_prob_log[index]:
        predict_class.append("ham")
    else:
        predict_class.append("spam")
        
df_train_class_log = df_train.copy()
df_train_class_log['predict_class'] = predict_class
df_train_class_log

Unnamed: 0,class,file,content,predict_class
0,ham,000/000,"[mailing, list, queried, weeks, ago, running, ...",ham
1,spam,000/001,"[luxury, watches, buy, rolex, rolex, cartier, ...",spam
2,spam,000/002,"[academic, qualifications, prestigious, redite...",spam
3,ham,000/003,"[greetings, verify, subscription, list, charte...",ham
4,spam,000/004,"[chauncey, conferred, luscious, continued, ton...",ham
...,...,...,...,...
21295,spam,070/295,[],spam
21296,spam,070/296,"[special, offer, adobe, video, collection, ado...",spam
21297,spam,070/297,"[html, public, html, body]",spam
21298,ham,070/298,"[mounted, infrared, demodulator, hb, realised,...",ham


### Testing the Classifier

In [25]:
s_log_test, h_log_test = class_log(df_test)

In [26]:
predict_class = []
for index in range(len(s_log_test)):
    if s_log_test[index] < h_log_test[index]:
        predict_class.append("ham")
    else:
        predict_class.append("spam")

df_test_class_log = df_test.copy()
df_test_class_log['predict_class'] = predict_class
df_test_class_log

Unnamed: 0,class,file,content,predict_class
21300,spam,071/000,"[hesitantly, derive, perverse, satisfaction, c...",spam
21301,ham,071/001,"[things, perform, experiment, display, will, r...",ham
21302,spam,071/002,"[best, offer, viggra, ci, ialis, vaiium, xa, n...",spam
21303,spam,071/003,"[de, ar, wne, cr, matter, ow, real, st, mmed, ...",spam
21304,spam,071/004,"[special, offer, adobe, video, collection, ado...",spam
...,...,...,...,...
37817,spam,126/017,"[great, news, expec, infinex, ventures, starte...",spam
37818,spam,126/018,"[oil, sector, going, crazy, weekly, gift, kkpt...",spam
37819,spam,126/019,"[suffering, pain, depression, help, verified, ...",spam
37820,spam,126/020,"[prosperous, future, increased, money, earning...",spam


### Performance Evaluation

In [44]:
def performance(df):
    df['equal'] =np.where((df['class'] == df['predict_class']), 'true', 'false')
    FP = len(df[(df['class'] == 'ham') & (df['equal'] == 'false')])
    FN = len(df[(df['class'] == 'spam') & (df['equal'] == 'false')])
    TP = len(df[(df['class'] == 'ham') & (df['equal'] == 'true')])
    TN = len(df[(df['class'] == 'spam') & (df['equal'] == 'true')])
    
    Acc = (TN+TP)/(TN+TP+FN+FP)
    Rec = (TP)/(TP+FN)
    Prec = (TP)/(TP+FP)
    return Acc, Rec, Prec

In [49]:
Acc, Rec, Prec = performance(df_test_class_log)
print("The Accuracy value is:\t", Acc, "\nThe Recall value is:\t", Rec, "\nThe Precision value is:\t", Prec)

The Accuracy value is:	 0.9139934632611064 
The Recall value is:	 0.8114985862393967 
The Precision value is:	 0.9589753109337293
