# Import ไลบรารีที่จำเป็น

In [1]:
from nltk import word_tokenize, NaiveBayesClassifier
from nltk import classify
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import random
import os, glob, re

# เตรียม Lemmatizer และ Stop words ในภาษาอังกฤษ

In [2]:
# Shared lemmatizer instance, reused for every token that is processed.
wordlemmatizer = WordNetLemmatizer()
# English stop words. Stored as a set because the only thing done with this
# collection is a per-token membership test, which is O(1) on a set but a
# full scan on the list returned by `stopwords.words`.
commonwords = set(stopwords.words('english'))

# ลองโหลด Data set

In [3]:
def read_text_files(pattern):
    """Return the text of every file whose path matches *pattern*.

    Files are decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors='ignore'``) instead of raising.

    Args:
        pattern: A glob pattern, e.g. ``'/some/dir/*.txt'``.

    Returns:
        A list with one string per matching file, in the order returned by
        ``glob.glob``. The list is empty when nothing matches.
    """
    texts = []
    for filename in glob.glob(pattern):
        # `with` guarantees the handle is closed even if `read()` raises.
        with open(filename, "r", encoding='utf-8', errors='ignore') as fin:
            texts.append(fin.read())
    return texts


# Raw email bodies, one string per file, for each class of the data set.
hamtexts = read_text_files('/Users/migmikael/Downloads/enron1/ham/*.txt')
spamtexts = read_text_files('/Users/migmikael/Downloads/enron1/spam/*.txt')

In [4]:
# Preview the first loaded spam email (a one-element list slice); the
# notebook displays the value of this last expression.
spamtexts[:1]

["Subject: what up , , your cam babe\nwhat are you looking for ?\nif your looking for a companion for friendship , love , a date , or just good ole '\nfashioned * * * * * * , then try our brand new site ; it was developed and created\nto help anyone find what they ' re looking for . a quick bio form and you ' re\non the road to satisfaction in every sense of the word . . . . no matter what\nthat may be !\ntry it out and youll be amazed .\nhave a terrific time this evening\ncopy and pa ste the add . ress you see on the line below into your browser to come to the site .\nhttp : / / www . meganbang . biz / bld / acc /\nno more plz\nhttp : / / www . naturalgolden . com / retract /\ncounterattack aitken step preemptive shoehorn scaup . electrocardiograph movie honeycomb . monster war brandywine pietism byrne catatonia . encomia lookup intervenor skeleton turn catfish .\n"]

# นำข้อมูลทั้งสองชนิดมาผสมกัน

In [5]:
# Pair every email with its class label, spam first and then ham, and
# shuffle the combined list in place so the two classes are interleaved.
mixemail = (
    [(text, 'spam') for text in spamtexts]
    + [(text, 'ham') for text in hamtexts]
)
random.shuffle(mixemail)

---
# แกะ Feature 
- แบ่งประโยคที่รับมาด้วย Tokenizer แปลงเป็นพิมพ์เล็กด้วย .lower() และหารากศัพท์ด้วย wordlemmatizer
- สร้าง feature dictionary ด้วย wordtokens ที่ได้จากขั้นตอนก่อนหน้า โดยตัดเอา stop word ออก

In [6]:
def feature_extractor(sent):
    """Turn a piece of text into a bag-of-words feature dictionary.

    The text is split with ``word_tokenize``; each token is lower-cased and
    then lemmatised with the module-level ``wordlemmatizer``. Every
    resulting token that is not found in ``commonwords`` becomes a key
    mapped to ``True``.

    Args:
        sent: The text to extract features from.

    Returns:
        A dict mapping each kept token to ``True``.
    """
    lemmas = (
        wordlemmatizer.lemmatize(token.lower())
        for token in word_tokenize(sent)
    )
    return {lemma: True for lemma in lemmas if lemma not in commonwords}

In [7]:
# Try the extractor on a short sentence; the notebook displays the returned
# feature dictionary.
feature_extractor("I am the iron man.")

{'.': True, 'iron': True, 'man': True}

---
# สร้างข้อมูลจาก Data Set

In [8]:
# Convert every (text, label) pair into a (feature dict, label) pair.
featuresets = [(feature_extractor(text), tag) for text, tag in mixemail]

In [9]:
# Preview the first (feature dict, label) pair; the notebook displays the
# value of this last expression.
featuresets[:1]

[({',': True,
   '.': True,
   '/': True,
   '000': True,
   '120': True,
   '20': True,
   '2000': True,
   '28': True,
   ':': True,
   ';': True,
   'actuals': True,
   'august': True,
   'daily': True,
   'enron': True,
   'gas': True,
   'hpl': True,
   'ic': True,
   'l': True,
   'lsk': True,
   'subject': True,
   'tap': True,
   'teco': True},
  'ham')]

# แบ่งข้อมูลออกเป็น Train / Test ที่ 80:20

In [10]:
# Index of the split point: the first 80% of the feature sets are used for
# training and the remainder for testing.
size = int(len(featuresets) * 0.8)
train_set = featuresets[:size]
test_set = featuresets[size:]
print("train_set size = %d, test_set size = %d" % (len(train_set), len(test_set)))

train_set size = 4137, test_set size = 1035


# เทรนตัว classifier ด้วย Train set

In [11]:
# Fit a Naive Bayes model on the training portion, then report its accuracy
# on the held-out test portion.
classifier = NaiveBayesClassifier.train(train_set)
test_accuracy = classify.accuracy(classifier, test_set)
print(test_accuracy)

0.9458937198067633


In [12]:
# Ask the trained classifier to print up to 20 of its most informative
# features together with their spam/ham likelihood ratios.
classifier.show_most_informative_features(20)

Most Informative Features
            prescription = True             spam : ham    =    155.0 : 1.0
                    2004 = True             spam : ham    =    145.1 : 1.0
                     nom = True              ham : spam   =    119.4 : 1.0
                   meter = True              ham : spam   =    106.9 : 1.0
                    pain = True             spam : ham    =    103.6 : 1.0
              nomination = True              ham : spam   =     94.8 : 1.0
                   cheap = True             spam : ham    =     93.7 : 1.0
                    spam = True             spam : ham    =     90.4 : 1.0
                  dealer = True             spam : ham    =     87.0 : 1.0
                     sex = True             spam : ham    =     87.0 : 1.0
                     ect = True              ham : spam   =     83.0 : 1.0
                featured = True             spam : ham    =     80.4 : 1.0
                  differ = True             spam : ham    =     80.4 : 1.0

---
# ลองทำ 10 Fold Cross validation

In [13]:
# Manual k-fold cross-validation over `featuresets`.
# Fold size uses floor division, so any leftover items at the end of the
# list are never held out (they are always part of the training data).
num_folds = 10
subset_size = len(featuresets) // num_folds
accu_list = []
for fold in range(num_folds):
    # Bounds of the slice held out for testing in this fold.
    start = fold * subset_size
    stop = start + subset_size
    held_out = featuresets[start:stop]
    remaining = featuresets[:start] + featuresets[stop:]

    # NOTE: this rebinds the module-level `classifier` on every fold.
    classifier = NaiveBayesClassifier.train(remaining)
    accu = classify.accuracy(classifier, held_out)
    accu_list.append(accu)
    print(accu)

avg_accu = sum(accu_list) / len(accu_list)
print("Average Accuracy : ", avg_accu)

0.9458413926499033
0.9497098646034816
0.9439071566731141
0.9439071566731141
0.9497098646034816
0.9516441005802708
0.9477756286266924
0.9439071566731141
0.941972920696325
0.9439071566731141
Average Accuracy :  0.9462282398452612
