## Import แพคเกจที่จำเป็น

In [1]:
from nltk import word_tokenize, NaiveBayesClassifier
from nltk import classify
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
import random

wordlemmatizer = WordNetLemmatizer()

---
## โหลด Data Set

In [2]:
data = []

with open('/Users/migmikael/Downloads/SMSSpamCollection.tsv', "r",encoding='utf-8', errors='ignore') as data_file:
    for line in data_file:
        label, text = line.split("\t")
        data.append((text, label))

In [3]:
len(data)

5570

In [4]:
data[:5]

[("I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\n",
  'ham'),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n",
  'spam'),
 ("Nah I don't think he goes to usf, he lives around here though\n", 'ham'),
 ('Even my brother is not like to speak with me. They treat me like aids patent.\n',
  'ham'),
 ('I HAVE A DATE ON SUNDAY WITH WILL!!\n', 'ham')]

---
## แกะ Features
---
###  วิธีที่ 1
- แบ่งประโยคที่รับมาด้วย Tokenizer แปลงเป็นพิมพ์เล็กด้วย .lower และหารากศัพย์ด้วย wordlemmatizer
- สร้าง feature dictionary ด้วย wordtokens ที่ได้จากขั้นตอนก่อนหน้า โดยตัดเอา stopword ออก

In [5]:
stopword = stopwords.words('english')
def feature_extractor(sent):
    features = {}
    wordtokens = [wordlemmatizer.lemmatize(word.lower()) for word in word_tokenize(sent)]
    
    for word in wordtokens:
        if word not in stopword:
            features[word] = True
            
    return features

In [6]:
feature_extractor("I am the iron man.")

{'.': True, 'iron': True, 'man': True}

### วิธีที่ 2
- จากการสังเกตุพบว่าข้อความที่เป็นสแปมมักมีตัว ! $ ประกอบอยู่ด้วย
- นอกจากนี้ข้อความที่เป็นสแปมมักมีอัตราส่วนตัวอักษรพิมพ์ใหญ่ต่อตัวอักษรทั้งหมดมากเป็นพิเศษ

In [7]:
def feature_extractor2(text):
    features = {}
    if "!" in text:
        features["!"] = True
    if "$" in text:
        features["$"] = True
        
    lowercase = list('abcdefghijklmnopqrstuvwxyz')
    uppercase = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    
    num_lowercase = 0
    num_uppercase = 0
    
    for char in text:
        if char in lowercase:
            num_lowercase += 1
        elif char in uppercase:
            num_uppercase += 1
    if num_lowercase + num_uppercase == 0:
        features["upper_ratio"] = 0
    else:
        features["upper_ratio"] = num_uppercase / (num_lowercase + num_uppercase)
    return features

In [8]:
feature_extractor2("Buy Iron Man Figure just 40$ !!!")

{'!': True, '$': True, 'upper_ratio': 0.2}

---
### วิธีที่ 3 
- วิธีนี้ใช้การนับความถี่ของการปรากฏของคำต่างๆ 

In [9]:
def preprocess(sentence):
    tokens = word_tokenize(sentence)
    return [wordlemmatizer.lemmatize(word.lower()) for word in tokens]

def feature_extractor3(text):
    return {word: count for word, count in Counter(preprocess(text)).items() if not word in stopword}

In [10]:
feature_extractor3("Buy Iron Man Figure just 40$ !!!")

{'!': 3, '$': 1, '40': 1, 'buy': 1, 'figure': 1, 'iron': 1, 'man': 1}

---
## สร้าง Featureset จากวิธีสามวิธีด้านบน

In [11]:
featuresets1 = [(feature_extractor(sms), label) for (sms, label) in data]
featuresets1[:1]

[({"'ve": True,
   '.': True,
   'blessing': True,
   'breather': True,
   'fulfil': True,
   'granted': True,
   'help': True,
   'promise': True,
   'right': True,
   'searching': True,
   'take': True,
   'thank': True,
   'time': True,
   'wonderful': True,
   'wont': True,
   'word': True},
  'ham')]

In [12]:
featuresets2 = [(feature_extractor2(sms), label) for (sms, label) in data]
featuresets2[:1]

[({'upper_ratio': 0.019230769230769232}, 'ham')]

In [13]:
featuresets3 = [(feature_extractor3(sms), label) for (sms, label) in data]
featuresets3[:1]

[({"'ve": 1,
   '.': 3,
   'blessing': 1,
   'breather': 1,
   'fulfil': 1,
   'granted': 1,
   'help': 1,
   'promise': 2,
   'right': 1,
   'searching': 1,
   'take': 1,
   'thank': 1,
   'time': 1,
   'wonderful': 1,
   'wont': 1,
   'word': 1},
  'ham')]

---
## แบ่งข้อมูลออกเป็น Train และ Test Set
- ใช้อัตราส่วน Train : Test ที่ 80 : 20

In [14]:
size = int(len(featuresets1) * 0.8)
train_set1, test_set1 = featuresets1[:size], featuresets1[size:]
print("train_set1 size = %d, test_set1 size = %d" % (len(train_set1), len(test_set1)))

train_set1 size = 4456, test_set1 size = 1114


In [15]:
size = int(len(featuresets1) * 0.8)
train_set2, test_set2 = featuresets2[:size], featuresets2[size:]
print("train_set2 size = %d, test_set2 size = %d" % (len(train_set2), len(test_set2)))

train_set2 size = 4456, test_set2 size = 1114


In [16]:
size = int(len(featuresets3) * 0.8)
train_set3, test_set3 = featuresets3[:size], featuresets3[size:]
print("train_set3 size = %d, test_set3 size = %d" % (len(train_set3), len(test_set3)))

train_set3 size = 4456, test_set3 size = 1114


---
## Train ตัว Classify ด้วย NaiveBayesClassifier

In [17]:
classifier1 = NaiveBayesClassifier.train(train_set1)
print("Accuracy :", classify.accuracy(classifier1, test_set1))
classifier1.show_most_informative_features(10)

Accuracy : 0.8994614003590664
Most Informative Features
                 service = True             spam : ham    =    155.0 : 1.0
                   nokia = True             spam : ham    =    103.8 : 1.0
                     txt = True             spam : ham    =     93.8 : 1.0
                      uk = True             spam : ham    =     91.8 : 1.0
                  urgent = True             spam : ham    =     90.6 : 1.0
                    code = True             spam : ham    =     87.5 : 1.0
                      16 = True             spam : ham    =     87.5 : 1.0
                      po = True             spam : ham    =     70.5 : 1.0
                delivery = True             spam : ham    =     66.2 : 1.0
                   award = True             spam : ham    =     66.2 : 1.0


In [18]:
classifier2 = NaiveBayesClassifier.train(train_set2)
print("Accuracy :", classify.accuracy(classifier2, test_set2))
classifier2.show_most_informative_features(10)

Accuracy : 0.9039497307001796
Most Informative Features
             upper_ratio = 0.041666666666666664    ham : spam   =     14.8 : 1.0
             upper_ratio = 0.045454545454545456    ham : spam   =     14.0 : 1.0
             upper_ratio = 0.058823529411764705    ham : spam   =     13.1 : 1.0
             upper_ratio = 0.1568627450980392   spam : ham    =     12.6 : 1.0
             upper_ratio = 0.04              ham : spam   =     11.6 : 1.0
             upper_ratio = 0.034482758620689655    ham : spam   =     10.9 : 1.0
             upper_ratio = 0.03571428571428571    ham : spam   =     10.7 : 1.0
             upper_ratio = 0.038461538461538464    ham : spam   =     10.5 : 1.0
             upper_ratio = 0.15             spam : ham    =      9.8 : 1.0
             upper_ratio = 0.0784313725490196   spam : ham    =      9.8 : 1.0


In [19]:
classifier3 = NaiveBayesClassifier.train(train_set3)
print("Accuracy :", classify.accuracy(classifier3, test_set3))
classifier3.show_most_informative_features(10)

Accuracy : 0.8931777378815081
Most Informative Features
                 service = 1                spam : ham    =    152.3 : 1.0
                    free = 2                spam : ham    =     91.7 : 1.0
                     txt = 1                spam : ham    =     90.0 : 1.0
                  urgent = 1                spam : ham    =     88.7 : 1.0
                    code = 1                spam : ham    =     87.5 : 1.0
                      16 = 1                spam : ham    =     87.5 : 1.0
                      uk = 1                spam : ham    =     87.5 : 1.0
                      po = 1                spam : ham    =     70.5 : 1.0
                   nokia = 1                spam : ham    =     67.8 : 1.0
                landline = 1                spam : ham    =     65.3 : 1.0


---
## ทำ 10 Fold Cross Validation

---
### Featureset 1

In [20]:
num_folds = 10
subset_size = len(featuresets1) // num_folds
accu_list = []
for i in range(num_folds):
    testing_this_round = featuresets1[i*subset_size:]
    training_this_round = featuresets1[:i*subset_size] + featuresets1[(i+1)*subset_size:]
    
    classifier = NaiveBayesClassifier.train(training_this_round)
    accu = classify.accuracy(classifier, testing_this_round) 
    accu_list.append(accu)
    print(accu)
    
avg_accu = sum(accu_list) / len(accu_list)
print("Average Accuracy : ", avg_accu)

0.9095152603231598
0.9084380610412927
0.9138240574506283
0.9148499615285971
0.90754039497307
0.9095152603231598
0.9151705565529623
0.9072411729503291
0.9129263913824057
0.9102333931777379
Average Accuracy :  0.9109254509703343


---
### Featureset 2

In [21]:
num_folds = 10
subset_size = len(featuresets2) // num_folds
accu_list = []
for i in range(num_folds):
    testing_this_round = featuresets2[i*subset_size:]
    training_this_round = featuresets2[:i*subset_size] + featuresets2[(i+1)*subset_size:]
    
    classifier = NaiveBayesClassifier.train(training_this_round)
    accu = classify.accuracy(classifier, testing_this_round) 
    accu_list.append(accu)
    print(accu)
    
avg_accu = sum(accu_list) / len(accu_list)
print("Average Accuracy : ", avg_accu)

0.9481149012567325
0.9463395172551367
0.9481597845601436
0.9443447037701975
0.9437462597247157
0.9396768402154398
0.9344703770197487
0.9317773788150808
0.9192100538599641
0.8940754039497307
Average Accuracy :  0.934991522042689


---
### Featureset 3

In [22]:
num_folds = 10
subset_size = len(featuresets3) // num_folds
accu_list = []
for i in range(num_folds):
    testing_this_round = featuresets3[i*subset_size:]
    training_this_round = featuresets3[:i*subset_size] + featuresets3[(i+1)*subset_size:]
    
    classifier = NaiveBayesClassifier.train(training_this_round)
    accu = classify.accuracy(classifier, testing_this_round) 
    accu_list.append(accu)
    print(accu)
    
avg_accu = sum(accu_list) / len(accu_list)
print("Average Accuracy : ", avg_accu)

0.9025134649910234
0.9024536205864752
0.9077648114901257
0.908951013080277
0.9006582884500299
0.9041292639138241
0.9084380610412927
0.9036505086774387
0.9021543985637342
0.9120287253141831
Average Accuracy :  0.9052742156108403


## สรุปผลการทดลอง 
- จากการทำ cross validation จะเห็นได้ว่าวิธีการ Extract Feature ทั้งสามแบบให้ผลความแม่นยำที่ 90% ทั้งสามวิธี
- อย่างไรก็ตาม วิธีที่ให้ผลลัพธ์ที่แม่นยำมากที่สุดคือวิธีการ Extract Feature แบบที่ 2 ซึ่งเป็นวิธีการที่อาศัยการคำนวนอัตราส่วนระหว่างตัวอักษรพิมพ์ใหญ่ต่อตัวอักษรทั้งหมด โดยให้ความแม่นยำโดยเฉลี่ยที่ 93%

---
## คำนวน Confusion Matrix

In [35]:
tagged = [sms[1] for sms in test_set1]

In [36]:
tagged[:10]

['spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham']

In [37]:
ref = [classifier.classify(sms[0]) for sms in test_set1]

In [38]:
ref[:10]

['spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham']

In [39]:
from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(ref, tagged)
print(cm)

     |       s |
     |   h   p |
     |   a   a |
     |   m   m |
-----+---------+
 ham |<862>  . |
spam | 107<145>|
-----+---------+
(row = reference; col = test)



In [28]:
labels = {'ham', 'spam'}
labels

{'ham', 'spam'}

In [29]:
from collections import Counter
true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i,j]
        else:
            false_negatives[i] += cm[i,j]
            false_positives[j] += cm[i,j]

print("TP:", sum(true_positives.values()), true_positives)
print("FN:", sum(false_negatives.values()), false_negatives)
print("FP:", sum(false_positives.values()), false_positives)

TP: 1007 Counter({'ham': 862, 'spam': 145})
FN: 107 Counter({'spam': 107, 'ham': 0})
FP: 107 Counter({'ham': 107, 'spam': 0})
