https://cambridgespark.com/content/tutorials/implementing-your-own-spam-filter/index.html

In [14]:
from nltk import word_tokenize, NaiveBayesClassifier
from nltk import classify
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import random
import os, glob, re

In [15]:
hamtexts = []
spamtexts = []

for filename in glob.glob('/Users/migmikael/Downloads/enron1/ham/*.txt'):
    fin = open(filename, "r",encoding='utf-8', errors='ignore')
    hamtexts.append(fin.read())
    fin.close()
    
for filename in glob.glob('/Users/migmikael/Downloads/enron1/spam/*.txt'):
    fin = open(filename, "r",encoding='utf-8', errors='ignore')
    spamtexts.append(fin.read())
    fin.close()

In [16]:
mixemail = [(email, 'spam') for email in spamtexts]
mixemail += [(email, 'ham') for email in hamtexts]

random.shuffle(mixemail)

In [17]:
wordlemmatizer = WordNetLemmatizer()
commonwords = stopwords.words('english')

In [18]:
def preprocess(sentence):
    tokens = word_tokenize(sentence)
    return [wordlemmatizer.lemmatize(word.lower()) for word in tokens]

In [19]:
from collections import Counter

In [20]:
def get_features(text, setting):
    if setting=='bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if not word in commonwords}
    else:
        return {word: True for word in preprocess(text) if not word in commonwords}

In [21]:
featuresets = [(get_features(email, 'bow'), label) for (email, label) in mixemail]

In [22]:
featuresets[0]

({'$': 9,
  "'": 3,
  ',': 14,
  '-': 1,
  '.': 13,
  '2003': 3,
  '210': 1,
  '270': 1,
  '50': 1,
  '520': 1,
  '580': 1,
  '60': 2,
  '800': 1,
  '860': 1,
  ':': 8,
  '?': 1,
  'ability': 1,
  'able': 1,
  'access': 1,
  'advanced': 1,
  'allow': 1,
  'analyze': 1,
  'anytime': 1,
  'anywhere': 1,
  'application': 2,
  'available': 1,
  'away': 2,
  'bell': 1,
  'brand': 1,
  'build': 2,
  'building': 1,
  'business': 1,
  'catalogue': 1,
  'cd': 1,
  'center': 1,
  'charge': 1,
  'click': 1,
  'code': 1,
  'collaboration': 1,
  'come': 1,
  'communication': 1,
  'competitionsabine': 1,
  'computing': 1,
  'concurred': 1,
  'connect': 1,
  'connected': 2,
  'context': 1,
  'data': 1,
  'database': 2,
  'delivers': 1,
  'demand': 1,
  'deploy': 1,
  'design': 1,
  'designed': 1,
  'digging': 1,
  'dollar': 1,
  'easy': 1,
  'edition': 1,
  'enhanced': 1,
  'experience': 1,
  'fancy': 1,
  'feature': 3,
  'fifty': 1,
  'fraction': 1,
  'get': 1,
  'give': 1,
  'great': 1,
  'hannibal

In [23]:
size = int(len(featuresets) * 0.8)
train_set, test_set = featuresets[:size], featuresets[size:]
print("train_set size = %d, test_set size = %d" % (len(train_set), len(test_set)))

train_set size = 4137, test_set size = 1035


In [24]:
classifier = NaiveBayesClassifier.train(train_set)
print(classify.accuracy(classifier, test_set))

0.936231884057971


In [25]:
classifier.show_most_informative_features(20)

Most Informative Features
               forwarded = 1                 ham : spam   =    145.4 : 1.0
            prescription = 1                spam : ham    =     97.3 : 1.0
                     nom = 1                 ham : spam   =     91.1 : 1.0
                    pain = 1                spam : ham    =     81.2 : 1.0
                   meter = 1                 ham : spam   =     72.0 : 1.0
                    2005 = 1                spam : ham    =     70.0 : 1.0
                    spam = 1                spam : ham    =     70.0 : 1.0
                      ex = 1                spam : ham    =     66.8 : 1.0
                     sex = 1                spam : ham    =     65.2 : 1.0
                creative = 1                spam : ham    =     63.6 : 1.0
                   cheap = 1                spam : ham    =     60.3 : 1.0
                featured = 1                spam : ham    =     58.7 : 1.0
            solicitation = 1                spam : ham    =     55.5 : 1.0

In [26]:
num_folds = 10
subset_size = len(featuresets) // num_folds
accu_list = []
for i in range(num_folds):
    testing_this_round = featuresets[i*subset_size:]
    training_this_round = featuresets[:i*subset_size] + featuresets[(i+1)*subset_size:]
    
    classifier = NaiveBayesClassifier.train(training_this_round)
    accu = classify.accuracy(classifier, testing_this_round) 
    accu_list.append(accu)
    print(accu)
    
    #print(len(testing_this_round))
    #print(len(training_this_round))
    #print()
avg_accu = sum(accu_list) / len(accu_list)
print("Average Accuracy : ", avg_accu)

0.9578499613302398
0.9583243823845328
0.9550507491541808
0.9541563104114885
0.9484536082474226
0.9516814843448009
0.9550724637681159
0.9568576947842885
0.944015444015444
0.9383429672447013
Average Accuracy :  0.9519805065685215


In [27]:
tagged = [mail[1] for mail in test_set]

In [28]:
ref = [classifier.classify(mail[0]) for mail in test_set]

In [29]:
from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(ref, tagged)
print(cm)

     |       s |
     |   h   p |
     |   a   a |
     |   m   m |
-----+---------+
 ham |<694>  5 |
spam |  52<284>|
-----+---------+
(row = reference; col = test)



In [30]:
labels = {'ham', 'spam'}
labels

{'ham', 'spam'}

In [31]:
from collections import Counter
true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i,j]
        else:
            false_negatives[i] += cm[i,j]
            false_positives[j] += cm[i,j]

print("TP:", sum(true_positives.values()), true_positives)
print("FN:", sum(false_negatives.values()), false_negatives)
print("FP:", sum(false_positives.values()), false_positives)

TP: 978 Counter({'ham': 694, 'spam': 284})
FN: 57 Counter({'spam': 52, 'ham': 5})
FP: 57 Counter({'ham': 52, 'spam': 5})
