https://www.data-blogger.com/2016/01/20/spam-detection/

In [56]:
import numpy as np
from nltk import word_tokenize, NaiveBayesClassifier
from nltk import classify
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import random
import os, glob, re

In [57]:
def extract_features(text):
    '''
    
    '''
    features = {}
    if "!" in text:
        features["!"] = True
    if "$" in text:
        features["$"] = True
        
    lowercase = list('abcdefghijklmnopqrstuvwxyz')
    uppercase = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    
    num_lowercase = 0
    num_uppercase = 0
    
    for char in text:
        if char in lowercase:
            num_lowercase += 1
        elif char in uppercase:
            num_uppercase += 1
    features["upper_ratio"] = num_uppercase / (num_lowercase + num_uppercase)
    return features

In [58]:
hamtexts = []
spamtexts = []

for filename in glob.glob('/Users/migmikael/Downloads/enron2/ham/*.txt'):
    fin = open(filename, "r",encoding='utf-8', errors='ignore')
    hamtexts.append(fin.read())
    fin.close()
    
for filename in glob.glob('/Users/migmikael/Downloads/enron2/spam/*.txt'):
    fin = open(filename, "r",encoding='utf-8', errors='ignore')
    spamtexts.append(fin.read())
    fin.close()

In [59]:
mixemail = [(email, 'spam') for email in spamtexts]
mixemail += [(email, 'ham') for email in hamtexts]

random.shuffle(mixemail)

In [60]:
featuresets = [(extract_features(email), label) for (email, label) in mixemail]

In [61]:
featuresets[1]

({'upper_ratio': 0.0006293266205160479}, 'ham')

In [62]:
size = int(len(featuresets) * 0.8)
train_set, test_set = featuresets[:size], featuresets[size:]
print("train_set size = %d, test_set size = %d" % (len(train_set), len(test_set)))

train_set size = 4685, test_set size = 1172


In [63]:
classifier = NaiveBayesClassifier.train(train_set)
print(classify.accuracy(classifier, test_set))

0.7013651877133106


In [64]:
classifier.show_most_informative_features(20)

Most Informative Features
             upper_ratio = 0.004016064257028112   spam : ham    =     26.4 : 1.0
             upper_ratio = 0.008333333333333333   spam : ham    =     11.5 : 1.0
             upper_ratio = 0.003067484662576687   spam : ham    =     10.2 : 1.0
             upper_ratio = 0.000999000999000999   spam : ham    =      6.9 : 1.0
             upper_ratio = 0.0035587188612099642   spam : ham    =      6.1 : 1.0
             upper_ratio = 0.0024390243902439024   spam : ham    =      6.1 : 1.0
             upper_ratio = 0.00684931506849315   spam : ham    =      6.1 : 1.0
             upper_ratio = 0.001669449081803005   spam : ham    =      6.1 : 1.0
             upper_ratio = 0.0016666666666666668   spam : ham    =      6.1 : 1.0
             upper_ratio = 0.002702702702702703   spam : ham    =      4.9 : 1.0
             upper_ratio = 0.00099601593625498   spam : ham    =      4.7 : 1.0
             upper_ratio = 0.002824858757062147   spam : ham    =      4.7 : 1.0
 

In [65]:
num_folds = 10
subset_size = len(featuresets) // num_folds
accu_list = []
for i in range(num_folds):
    testing_this_round = featuresets[i*subset_size:]
    training_this_round = featuresets[:i*subset_size] + featuresets[(i+1)*subset_size:]
    
    classifier = NaiveBayesClassifier.train(training_this_round)
    accu = classify.accuracy(classifier, testing_this_round) 
    accu_list.append(accu)
    print(accu)
    
    #print(len(testing_this_round))
    #print(len(training_this_round))
    #print()
avg_accu = sum(accu_list) / len(accu_list)
print("Average Accuracy : ", avg_accu)

0.85128905583063
0.8493930197268589
0.8497973117132495
0.8481228668941979
0.8413420528859824
0.8342428376534788
0.8146570089475926
0.7985244040862656
0.7757009345794392
0.6976351351351351
Average Accuracy :  0.816070462745283


---
## Confusion Matrix

In [66]:
test_set[0][1]

'ham'

In [67]:
tagged = [mail[1] for mail in test_set]

In [68]:
ref = [classifier.classify(mail[0]) for mail in test_set]

In [69]:
tagged[:10]

['ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham', 'spam']

In [70]:
ref[:10]

['ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham']

In [71]:
from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(ref, tagged)
print(cm)

     |       s |
     |   h   p |
     |   a   a |
     |   m   m |
-----+---------+
 ham |<748>127 |
spam | 135<162>|
-----+---------+
(row = reference; col = test)



In [72]:
labels = {'ham', 'spam'}
labels

{'ham', 'spam'}

In [73]:
from collections import Counter
true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i,j]
        else:
            false_negatives[i] += cm[i,j]
            false_positives[j] += cm[i,j]

print("TP:", sum(true_positives.values()), true_positives)
print("FN:", sum(false_negatives.values()), false_negatives)
print("FP:", sum(false_positives.values()), false_positives)

TP: 910 Counter({'ham': 748, 'spam': 162})
FN: 262 Counter({'spam': 135, 'ham': 127})
FP: 262 Counter({'ham': 135, 'spam': 127})
