In [11]:
import glob
import errno
import pandas as pd
import math

In [12]:
path= '/train'
hampath = path+'/ham/*.txt'
spampath = path+'/spam/*.txt'
print(hampath)
print(spampath)
test_path = '/test'
test_hampath = test_path+'/ham/*.txt'
test_spampath = test_path+'/spam/*.txt'
print(test_hampath)
print(test_spampath)

/Users/karankatiyar/Desktop/train/ham/*.txt
/Users/karankatiyar/Desktop/train/spam/*.txt
/Users/karankatiyar/Desktop/test/ham/*.txt
/Users/karankatiyar/Desktop/test/spam/*.txt


### Loading ham and spam into dataframe

In [13]:
def load_data(path, class_value):
    l=[]
    files = glob.glob(path)
    for name in files:
        try:
            with open(name) as f:
                data = f.read() 
                l.append(data)
        except:
            continue
    df=pd.DataFrame(l,columns=['content'])
    print(df.shape)
    df['label']=class_value
    print(df.head())
    return df

ham_df = load_data(hampath, 0)
spam_df = load_data(spampath, 1)

#Combining both ham and spam into 1 training dataframe
lcombined=pd.DataFrame()
lcombined= pd.concat([ham_df, spam_df], ignore_index=True, sort =False,axis=0)
print(lcombined.head())
print(lcombined.shape)

(340, 1)
                                             content  label
0  Subject: meter 1031 baytown exxon\ndaren - the...      0
1  Subject: re : allocation exceptions\ndaren - m...      0
2  Subject: enron / hpl actuals for january 29 , ...      0
3  Subject: re : issue\nfyi - see note below - al...      0
4  Subject: data validation\nbrenda ,\nwe met thi...      0
(122, 1)
                                             content  label
0  Subject: what up , , your cam babe\nwhat are y...      1
1  Subject: re : orderr your mdedicationns now\nm...      1
2  Subject: nb real vallum , x . anax , l . evitr...      1
3  Subject: account # 20367 s tue , 28 jun 2005 1...      1
4  Subject: system information - january 5 th\nch...      1
                                             content  label
0  Subject: meter 1031 baytown exxon\ndaren - the...      0
1  Subject: re : allocation exceptions\ndaren - m...      0
2  Subject: enron / hpl actuals for january 29 , ...      0
3  Subject: re : issue

# 1. Implement the multinomial Naive Bayes algorithm

### Creating Vocabulary

In [14]:
def get_set(df):
    set_hs = set()
    for index, row in df.iterrows():
        tempset = set(df.iloc[index,0].split())
        for setvalue in tempset:
            set_hs.add(setvalue)
    print(len(set_hs))
    return set_hs
def get_word_count(set_hs,df):
    hs_word_count = dict.fromkeys(set_hs,0)
    for index, row in df.iterrows():
        templist = list(df.iloc[index,0].split())
        for word in templist:
            if word in hs_word_count:
                hs_word_count[word] +=1
    print(hs_word_count)
    return hs_word_count
#Noting unique words from ham
set_ham = get_set(ham_df)
    
#Noting unique words from spam
set_spam = get_set(spam_df)


#counting unique words from ham
ham_word_count = get_word_count(set_ham,ham_df)

#counting unique words from spam
spam_word_count = get_word_count(set_spam,spam_df)

#Creating Vocabulary
vocabulary = set()
for word in set_spam:
    vocabulary.add(word)
for word in set_ham:
    vocabulary.add(word)
print(len(vocabulary))

5903
6205
10386


### Calculating Priors

In [15]:
def get_list(df):
    list_hs_docs = list()
    for index, row in df.iterrows():
        list_hs_docs.append(df.iloc[index,0])
    return list_hs_docs
list_ham_docs = get_list(ham_df)
list_spam_docs = get_list(spam_df)
prior_ham = len(list_ham_docs)/(len(list_ham_docs)+len(list_spam_docs))
prior_spam = len(list_spam_docs)/(len(list_ham_docs)+len(list_spam_docs))
print(prior_ham)
print(prior_spam)

0.7359307359307359
0.26406926406926406


### Getting conditional probabilities

In [16]:
def get_prob_word_given_class(sh_word_count):
    prob_word_given_sh = dict()
    total_count=0
    for key in sh_word_count.keys():
        total_count = total_count + sh_word_count[key]+1
    print(total_count)
    for key in sh_word_count.keys():
        prob_word_given_sh[key] = (sh_word_count[key]+1)/total_count
    print(prob_word_given_sh)
    return prob_word_given_sh
prob_word_given_ham = get_prob_word_given_class(ham_word_count)
prob_word_given_spam = get_prob_word_given_class(spam_word_count)

80316
29976


### Loading Test Data

In [17]:
ham_test_df = load_data(test_hampath, 0)
spam_test_df = load_data(test_spampath, 1)
test_combined=pd.DataFrame()
test_combined= pd.concat([ham_test_df, spam_test_df], ignore_index=True, sort =False,axis=0)
print(test_combined.shape)

(348, 1)
                                             content  label
0  Subject: enron / hpl actuals for august 16 , 2...      0
1  Subject: inactivations\ncheryl johnson\n08 / 3...      0
2  Subject: calpine daily gas nomination\nstill u...      0
3  Subject: enron / hpl actuals for sept . 14 , 2...      0
4  Subject: dear owner\nas of the 4 th april 2001...      0
(129, 1)
                                             content  label
0  Subject: my testimonial about skuper viakgra l...      1
1  Subject: learn to save on medications at disco...      1
2  Subject: just got out of school\nclick here to...      1
3  Subject: vlagra : discreet , no prescription ,...      1
4  Subject: small - cap market advisors\nhidden g...      1
(477, 2)


### Applying Multinomial NB

In [18]:
def apply_multinomialNB(test_combined, prior_ham,prior_spam,prob_word_given_ham,prob_word_given_spam):
    for index, row in test_combined.iterrows():
        comparision_score_class={'spam_score':0,'ham_score':0}
        test_set = set()
        comparision_score_class['ham_score']=math.log(prior_ham)
        comparision_score_class['spam_score']=math.log(prior_spam)
        tempset = set(test_combined.iloc[index,0].split())
        for setvalue in tempset:
            test_set.add(setvalue)
        for word in test_set:
            if word in prob_word_given_ham:
                comparision_score_class['ham_score']=comparision_score_class['ham_score'] + math.log(prob_word_given_ham[word])
            else:
                continue
            if word in prob_word_given_spam:
                comparision_score_class['spam_score']=comparision_score_class['spam_score'] + math.log(prob_word_given_spam[word])
            else:
                continue
        arg_max = max(comparision_score_class['ham_score'], comparision_score_class['spam_score'])
        if comparision_score_class['ham_score'] > comparision_score_class['spam_score']:
            test_combined.at[index,'test_label']= 0
        else:
            test_combined.at[index,'test_label']=1
    test_combined.test_label = test_combined.test_label.astype(int)
    return test_combined
test_df = apply_multinomialNB(test_combined, prior_ham,prior_spam,prob_word_given_ham,prob_word_given_spam)
print(test_df)
import numpy as np
test_df["accuracy"]=np.where(test_df["label"] == test_df["test_label"],1,0)
sum(test_combined["accuracy"])

                                               content  label  test_label
0    Subject: enron / hpl actuals for august 16 , 2...      0           1
1    Subject: inactivations\ncheryl johnson\n08 / 3...      0           1
2    Subject: calpine daily gas nomination\nstill u...      0           1
3    Subject: enron / hpl actuals for sept . 14 , 2...      0           1
4    Subject: dear owner\nas of the 4 th april 2001...      0           1
5    Subject: revised : eastrans nomination change ...      0           1
6    Subject: first deliveries - comstock oil & gas...      0           1
7    Subject: weekend noms\n- - - - - - - - - - - -...      0           1
8    Subject: hpl nom for march 24 - 26 , 2001\n( s...      0           1
9    Subject: ami , , , ,\ni agree ! !\nthanks .\n-...      0           1
10   Subject: hpl nom . revisions\n( see attached f...      0           1
11   Subject: enron / hpl actuals for january 11 , ...      0           1
12   Subject: teco _ gas _ issues . xl

124

In [19]:
stop_words = {'re', '?','/','.','fwd',',',':', 'Subject:', 'or', 'nor', "didn't", 'during', 'each', 'he', 'has', 'won', "hadn't", "shouldn't", "wouldn't", 'before', 'not', 'because', 'under', 'for', 'all', 'm', 'why', "you're", 'other', 'on', 'can', 'once', "you'll", "couldn't", 'between', "doesn't", 'after', 'very', "don't", 'am', 'mustn', 'the', 'now', 'weren', 'again', 'll', 'hadn', 'wouldn', 'your', 't', 'myself', 'down', 'too', 'to', 'where', 'with', 'they', 'as', 'isn', 'but', 'don', 'against', 'hasn', "won't", "hasn't", 'being', 'you', 've', 'if', 'and', 'more', 'when', 'there', 'of', 'any', 'ain', 'me', 'theirs', 'wasn', "haven't", "you'd", 'yourselves', 'below', 'o', "shan't", 'shouldn', 'until', 'she', 'my', "it's", 'do', 'hers', 'about', 'same', 'didn', 'themselves', "should've", 'were', 'this', 'its', 'his', 'having', 'y', 'so', 'further', 'them', 'then', 'doing', 'our', 'yourself', 'herself', 'their', 'from', 'over', 'in', 'ours', 'both', 'ma', 'was', 're', 'mightn', 'some', 'will', 's', 'while', "wasn't", 'itself', 'doesn', 'whom', 'it', 'shan', 'had', 'is', 'himself', 'into', "aren't", 'own', 'have', 'than', 'only', 'd', 'did', "mightn't", 'yours', 'haven', "isn't", "you've", 'a', 'been', 'up', "weren't", 'what', "mustn't", "needn't", 'those', 'that', 'few', 'we', 'i', 'who', 'needn', "that'll", 'here', 'be', 'an', 'does', 'above', 'which', 'couldn', 'ourselves', 'such', 'at', 'aren', 'most', 'her', 'off', 'should', 'through', "she's", 'by', 'him', 'how', 'just', 'out', 'no', 'these', 'are'}

In [20]:
for junk_word in stop_words:
    if junk_word in set_ham:
        set_ham.remove(junk_word)
    if junk_word in set_spam:
        set_spam.remove(junk_word)

In [23]:
ham_word_count = get_word_count(set_ham,ham_df)
spam_word_count = get_word_count(set_spam,spam_df)
test_df = apply_multinomialNB(test_combined, prior_ham,prior_spam,prob_word_given_ham,prob_word_given_spam)
print(test_df)
import numpy as np
test_df["accuracy"]=np.where(test_df["label"] == test_df["test_label"],1,0)
sum(test_combined["accuracy"])

                                               content  label  test_label  \
0    Subject: enron / hpl actuals for august 16 , 2...      0           1   
1    Subject: inactivations\ncheryl johnson\n08 / 3...      0           1   
2    Subject: calpine daily gas nomination\nstill u...      0           1   
3    Subject: enron / hpl actuals for sept . 14 , 2...      0           1   
4    Subject: dear owner\nas of the 4 th april 2001...      0           1   
5    Subject: revised : eastrans nomination change ...      0           1   
6    Subject: first deliveries - comstock oil & gas...      0           1   
7    Subject: weekend noms\n- - - - - - - - - - - -...      0           1   
8    Subject: hpl nom for march 24 - 26 , 2001\n( s...      0           1   
9    Subject: ami , , , ,\ni agree ! !\nthanks .\n-...      0           1   
10   Subject: hpl nom . revisions\n( see attached f...      0           1   
11   Subject: enron / hpl actuals for january 11 , ...      0           1   

124