In [13]:
from nltk import word_tokenize, NaiveBayesClassifier
from nltk import classify
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import random
import os, glob, re

In [14]:
wordlemmatizer = WordNetLemmatizer()
commonwords = stopwords.words('english')

In [15]:
hamtexts = []
spamtexts = []

for filename in glob.glob('/Users/migmikael/Downloads/enron1/ham/*.txt'):
    fin = open(filename, "r",encoding='utf-8', errors='ignore')
    hamtexts.append(fin.read())
    fin.close()
    
for filename in glob.glob('/Users/migmikael/Downloads/enron1/spam/*.txt'):
    fin = open(filename, "r",encoding='utf-8', errors='ignore')
    spamtexts.append(fin.read())
    fin.close()

In [16]:
spamtexts[:1]

["Subject: what up , , your cam babe\nwhat are you looking for ?\nif your looking for a companion for friendship , love , a date , or just good ole '\nfashioned * * * * * * , then try our brand new site ; it was developed and created\nto help anyone find what they ' re looking for . a quick bio form and you ' re\non the road to satisfaction in every sense of the word . . . . no matter what\nthat may be !\ntry it out and youll be amazed .\nhave a terrific time this evening\ncopy and pa ste the add . ress you see on the line below into your browser to come to the site .\nhttp : / / www . meganbang . biz / bld / acc /\nno more plz\nhttp : / / www . naturalgolden . com / retract /\ncounterattack aitken step preemptive shoehorn scaup . electrocardiograph movie honeycomb . monster war brandywine pietism byrne catatonia . encomia lookup intervenor skeleton turn catfish .\n"]

In [17]:
mixemail = [(email, 'spam') for email in spamtexts]
mixemail += [(email, 'ham') for email in hamtexts]

random.shuffle(mixemail)

In [18]:
def feature_extractor(sent):
    features = {}
    wordtokens = [wordlemmatizer.lemmatize(word.lower()) for word in word_tokenize(sent)]
    
    for word in wordtokens:
        if word not in commonwords:
            features[word] = True
            
    return features

In [19]:
feature_extractor("I am iron man.")

{'.': True, 'iron': True, 'man': True}

In [20]:
featuresets = [(feature_extractor(email), label) for (email, label) in mixemail]

In [21]:
featuresets[:1]

[({'#': True,
   '&': True,
   "'": True,
   ',': True,
   '-': True,
   '.': True,
   '/': True,
   '01': True,
   '02': True,
   '03': True,
   '04': True,
   '05': True,
   '06': True,
   '08': True,
   '09': True,
   '0985192': True,
   '0986757': True,
   '1': True,
   '102775': True,
   '11': True,
   '110502': True,
   '117848': True,
   '12': True,
   '14': True,
   '17': True,
   '18': True,
   '1999': True,
   '2000': True,
   '201': True,
   '215': True,
   '26': True,
   '266962': True,
   '269123': True,
   '30': True,
   '31': True,
   '4': True,
   '45': True,
   '5': True,
   '5192': True,
   '55': True,
   '6757': True,
   '7': True,
   '87426': True,
   '95072': True,
   '99': True,
   ':': True,
   '?': True,
   '@': True,
   'able': True,
   'accounting': True,
   'activity': True,
   'addition': True,
   'aimee': True,
   'allocated': True,
   'also': True,
   'april': True,
   'arrangement': True,
   'asap': True,
   'asking': True,
   'attach': True,
   'aug': Tr

In [22]:
size = int(len(featuresets) * 0.8)
train_set, test_set = featuresets[:size], featuresets[size:]
print("train_set size = %d, test_set size = %d" % (len(train_set), len(test_set)))

train_set size = 4137, test_set size = 1035


In [23]:
classifier = NaiveBayesClassifier.train(train_set)
print(classify.accuracy(classifier, test_set))

0.936231884057971


In [24]:
classifier.show_most_informative_features(20)

Most Informative Features
                     hou = True              ham : spam   =    189.0 : 1.0
                   meter = True              ham : spam   =    181.8 : 1.0
                    2004 = True             spam : ham    =    159.5 : 1.0
                     nom = True              ham : spam   =    127.2 : 1.0
                    pain = True             spam : ham    =     99.6 : 1.0
                    2005 = True             spam : ham    =     86.6 : 1.0
                  dealer = True             spam : ham    =     83.4 : 1.0
                featured = True             spam : ham    =     80.2 : 1.0
              medication = True             spam : ham    =     76.9 : 1.0
                  differ = True             spam : ham    =     76.9 : 1.0
                    2001 = True              ham : spam   =     73.7 : 1.0
                creative = True             spam : ham    =     68.8 : 1.0
                     ibm = True             spam : ham    =     67.2 : 1.0

In [30]:
num_folds = 10
subset_size = len(featuresets) // num_folds
accu_list = []
for i in range(num_folds):
    testing_this_round = featuresets[i*subset_size:][:subset_size]
    training_this_round = featuresets[:i*subset_size] + featuresets[(i+1)*subset_size:]
    
    classifier = NaiveBayesClassifier.train(training_this_round)
    accu = classify.accuracy(classifier, testing_this_round) 
    accu_list.append(accu)
    print(accu)
    
    #print(len(testing_this_round))
    #print(len(training_this_round))
    #print()
avg_accu = sum(accu_list) / len(accu_list)
print("Average Accuracy : ", avg_accu)

0.9671179883945842
0.9535783365570599
0.9381044487427466
0.9477756286266924
0.9477756286266924
0.9381044487427466
0.9361702127659575
0.9555125725338491
0.941972920696325
0.9303675048355899
Average Accuracy :  0.9456479690522244
