In [14]:
import pandas
import nltk
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

from nltk.metrics import ConfusionMatrix
from nltk.metrics import precision
from nltk.metrics import recall
from nltk.metrics import f_measure

def vectorization(vectorizer,X_train,X_test):
    """Vectorize a train/test pair with a single vectorizer.

    The vectorizer is fitted on ``X_train`` only (``fit_transform``) and the
    already-fitted vectorizer is then applied to ``X_test`` (``transform``).

    Returns a ``(train_matrix, test_matrix)`` tuple, in that order.
    """
    train_matrix=vectorizer.fit_transform(X_train)
    test_matrix=vectorizer.transform(X_test)
    return train_matrix,test_matrix

def trainModel_MNB(X_train_vec,y_train):
    """Fit a MultinomialNB with default settings on the given features and labels.

    Returns the fitted classifier.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return MultinomialNB().fit(X_train_vec,y_train)

def trainModel_SVM(X_train_vec,y_train,C=1,class_weight=None):
    """Fit a LinearSVC on the given features and labels and return it.

    Parameters
    ----------
    X_train_vec : training feature matrix passed to ``LinearSVC.fit``.
    y_train : training labels passed to ``LinearSVC.fit``.
    C : value forwarded to ``LinearSVC(C=...)``. Defaults to 1, the value
        that used to be hard-coded here, so existing two-argument calls
        behave as before.
    class_weight : value forwarded to ``LinearSVC(class_weight=...)``.
        ``None`` (the default) leaves LinearSVC's own default in place.

    Returns
    -------
    The fitted LinearSVC instance.
    """
    svm_clf=LinearSVC(C=C,class_weight=class_weight)
    svm_clf.fit(X_train_vec,y_train)
    return svm_clf

# Load the tab-separated training file; "Text" is used as the input and "Label" as the target.
train=pandas.read_csv("train.tsv",sep='\t')
X,y=train["Text"].values,train["Label"].values

In [72]:
# default settings as baseline
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words=None)
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)
svm_clf=LinearSVC(C=8)
svm_clf.fit(X_train_vec,y_train)

print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))


print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))

Overall accuracy of NaiveBayes Model: 0.978333333333
Confusion Matrix:
[[514  11]
 [  2  73]]
             precision    recall  f1-score   support

        ham       1.00      0.98      0.99       525
       spam       0.87      0.97      0.92        75

avg / total       0.98      0.98      0.98       600


Overall accuracy of Support Vector Machine Model: 0.983333333333
Confusion Matrix:
[[522   3]
 [  7  68]]
             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       525
       spam       0.96      0.91      0.93        75

avg / total       0.98      0.98      0.98       600



In [73]:
# remove stopword
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words='english')
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)

print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))


svm_clf=LinearSVC(C=8)
svm_clf.fit(X_train_vec,y_train)
print i/10.0,"\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))

Overall accuracy of NaiveBayes Model: 0.985
Confusion Matrix:
[[518   7]
 [  2  73]]
             precision    recall  f1-score   support

        ham       1.00      0.99      0.99       525
       spam       0.91      0.97      0.94        75

avg / total       0.99      0.98      0.99       600

9.9 
Overall accuracy of Support Vector Machine Model: 0.983333333333
Confusion Matrix:
[[521   4]
 [  6  69]]
             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       525
       spam       0.95      0.92      0.93        75

avg / total       0.98      0.98      0.98       600



In [31]:
# different C values
svm_clf=LinearSVC(C=2)
svm_clf.fit(X_train_vec,y_train)
print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))


Overall accuracy of Support Vector Machine Model: 0.985
Confusion Matrix:
[[522   3]
 [  6  69]]
             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       525
       spam       0.96      0.92      0.94        75

avg / total       0.98      0.98      0.98       600



In [74]:
# N-grams
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',ngram_range=(1,3),binary=False,min_df=5,stop_words='english')
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)


print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))




Overall accuracy of NaiveBayes Model: 0.985
Confusion Matrix:
[[519   6]
 [  3  72]]
             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       525
       spam       0.92      0.96      0.94        75

avg / total       0.99      0.98      0.99       600



In [75]:
svm_clf=LinearSVC(C=3)
svm_clf.fit(X_train_vec,y_train)
print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))


Overall accuracy of Support Vector Machine Model: 0.986666666667
Confusion Matrix:
[[521   4]
 [  4  71]]
             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       525
       spam       0.95      0.95      0.95        75

avg / total       0.99      0.99      0.99       600



In [76]:
# money as feature
train=pandas.read_csv("money.tsv",delimiter='\t')
y=train["Label"].values
X=train["Text"].values

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words='english')
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)
svm_clf=LinearSVC(C=1)
svm_clf.fit(X_train_vec,y_train)

print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))


print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))

Overall accuracy of NaiveBayes Model: 0.981666666667
Confusion Matrix:
[[517   8]
 [  3  72]]
             precision    recall  f1-score   support

        ham       0.99      0.98      0.99       525
       spam       0.90      0.96      0.93        75

avg / total       0.98      0.98      0.98       600


Overall accuracy of Support Vector Machine Model: 0.986666666667
Confusion Matrix:
[[523   2]
 [  6  69]]
             precision    recall  f1-score   support

        ham       0.99      1.00      0.99       525
       spam       0.97      0.92      0.95        75

avg / total       0.99      0.99      0.99       600



In [77]:
# url as feature
train=pandas.read_csv("url.tsv",delimiter='\t')
y=train["Label"].values
X=train["Text"].values

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words='english')
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)
svm_clf=LinearSVC(C=1)
svm_clf.fit(X_train_vec,y_train)

print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))


print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))

Overall accuracy of NaiveBayes Model: 0.981666666667
Confusion Matrix:
[[517   8]
 [  3  72]]
             precision    recall  f1-score   support

        ham       0.99      0.98      0.99       525
       spam       0.90      0.96      0.93        75

avg / total       0.98      0.98      0.98       600


Overall accuracy of Support Vector Machine Model: 0.983333333333
Confusion Matrix:
[[521   4]
 [  6  69]]
             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       525
       spam       0.95      0.92      0.93        75

avg / total       0.98      0.98      0.98       600



In [78]:
# email as feature
train=pandas.read_csv("email.tsv",delimiter='\t')
y=train["Label"].values
X=train["Text"].values

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words='english')
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)
svm_clf=LinearSVC(C=1)
svm_clf.fit(X_train_vec,y_train)

print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))


print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))

Overall accuracy of NaiveBayes Model: 0.981666666667
Confusion Matrix:
[[517   8]
 [  3  72]]
             precision    recall  f1-score   support

        ham       0.99      0.98      0.99       525
       spam       0.90      0.96      0.93        75

avg / total       0.98      0.98      0.98       600


Overall accuracy of Support Vector Machine Model: 0.985
Confusion Matrix:
[[522   3]
 [  6  69]]
             precision    recall  f1-score   support

        ham       0.99      0.99      0.99       525
       spam       0.96      0.92      0.94        75

avg / total       0.98      0.98      0.98       600



In [81]:
# phone number as feature
train=pandas.read_csv("phone.tsv",delimiter='\t')
y=train["Label"].values
X=train["Text"].values

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words='english')
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)
svm_clf=LinearSVC(C=0.4)
svm_clf.fit(X_train_vec,y_train)

print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))


print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))

Overall accuracy of NaiveBayes Model: 0.985
Confusion Matrix:
[[518   7]
 [  2  73]]
             precision    recall  f1-score   support

        ham       1.00      0.99      0.99       525
       spam       0.91      0.97      0.94        75

avg / total       0.99      0.98      0.99       600


Overall accuracy of Support Vector Machine Model: 0.99
Confusion Matrix:
[[523   2]
 [  4  71]]
             precision    recall  f1-score   support

        ham       0.99      1.00      0.99       525
       spam       0.97      0.95      0.96        75

avg / total       0.99      0.99      0.99       600



In [82]:
# money and phone number as feature
train=pandas.read_csv("money+phone.tsv",delimiter='\t')
y=train["Label"].values
X=train["Text"].values

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words='english')
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)

print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))

Overall accuracy of NaiveBayes Model: 0.985
Confusion Matrix:
[[518   7]
 [  2  73]]
             precision    recall  f1-score   support

        ham       1.00      0.99      0.99       525
       spam       0.91      0.97      0.94        75

avg / total       0.99      0.98      0.99       600



In [83]:
svm_clf=LinearSVC(C=10,class_weight={"spam":0.13,"ham":0.87})
svm_clf.fit(X_train_vec,y_train)
print i/10.0,"\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))

9.9 
Overall accuracy of Support Vector Machine Model: 0.99
Confusion Matrix:
[[523   2]
 [  4  71]]
             precision    recall  f1-score   support

        ham       0.99      1.00      0.99       525
       spam       0.97      0.95      0.96        75

avg / total       0.99      0.99      0.99       600



In [84]:
# money, phone number and email as feature
train=pandas.read_csv("money+phone+email.tsv",delimiter='\t')
y=train["Label"].values
X=train["Text"].values

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words='english')
X_train_vec=unigram_count.fit_transform(X_train)
X_test_vec=unigram_count.transform(X_test)
nb_clf=MultinomialNB()
nb_clf.fit(X_train_vec,y_train)

print "Overall accuracy of NaiveBayes Model:",nb_clf.score(X_test_vec,y_test)
y_nb_pred=nb_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_nb_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
target_names = ["ham","spam"]
print(classification_report(y_test, y_nb_pred, target_names=target_names))

Overall accuracy of NaiveBayes Model: 0.985
Confusion Matrix:
[[518   7]
 [  2  73]]
             precision    recall  f1-score   support

        ham       1.00      0.99      0.99       525
       spam       0.91      0.97      0.94        75

avg / total       0.99      0.98      0.99       600



In [85]:
svm_clf=LinearSVC(C=0.4)
svm_clf.fit(X_train_vec,y_train)
print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))


Overall accuracy of Support Vector Machine Model: 0.991666666667
Confusion Matrix:
[[523   2]
 [  3  72]]
             precision    recall  f1-score   support

        ham       0.99      1.00      1.00       525
       spam       0.97      0.96      0.97        75

avg / total       0.99      0.99      0.99       600



In [86]:
# model with highest accuracy
# Reloads X / y from tr.tsv and rebuilds the stop-word-filtered unigram count matrices;
# the classifiers themselves are fitted in the following cells.
train=pandas.read_csv("tr.tsv",delimiter='\t')
X,y=train["Text"].values,train["Label"].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
unigram_count=CountVectorizer(encoding='latin-1',binary=False,min_df=5,stop_words='english')
X_train_vec,X_test_vec=vectorization(unigram_count,X_train,X_test)




In [88]:
svm_clf=LinearSVC(C=1)
svm_clf.fit(X_train_vec,y_train)


print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print("Confusion Matrix:")
print(cm)
print(classification_report(y_test, y_svm_pred, target_names=target_names))


Overall accuracy of Support Vector Machine Model: 0.993333333333
Confusion Matrix:
[[523   2]
 [  2  73]]
             precision    recall  f1-score   support

        ham       1.00      1.00      1.00       525
       spam       0.97      0.97      0.97        75

avg / total       0.99      0.99      0.99       600



In [89]:
# model with best ham recall
svm_clf=LinearSVC(C=1,class_weight={"spam":0.13,"ham":0.87})
svm_clf.fit(X_train_vec,y_train)

print "\nOverall accuracy of Support Vector Machine Model:",svm_clf.score(X_test_vec,y_test)
y_svm_pred=svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_svm_pred, labels=["ham","spam"])
print "Confusion Matrix:"
print cm
print classification_report(y_test, y_svm_pred, target_names=target_names)


Overall accuracy of Support Vector Machine Model: 0.99
Confusion Matrix:
[[525   0]
 [  6  69]]
             precision    recall  f1-score   support

        ham       0.99      1.00      0.99       525
       spam       1.00      0.92      0.96        75

avg / total       0.99      0.99      0.99       600



In [90]:
# trouble shooting, find out the falsely classified messages
for i in range(len(svm_clf.predict(X_test_vec))):
    if svm_clf.predict(X_test_vec)[i]!=y_test[i]:
        print X_test[i],y_test[i]


Your credits have been topped up for urlurlurl Your renewal Pin is tgxxrz spam
You are now unsubscribed all services. Get tons of sexy babes or hunks straight to your phone! go to urlurlurl No subscriptions. spam
Hi ya babe x you 4goten bout me?' scammers getting smart..Though this is a regular vodafone no, if you respond you get further prem rate msg/subscription. Other nos used also. Beware! spam
accordingly. I repeat, just text the word ok on your mobile phone and send spam
85233 FREE>Ringtone!Reply REAL spam
FreeMsg>FAV XMAS TONES!Reply REAL spam


In [93]:
# use nltk to build the model
# Pair each message's whitespace-separated tokens with its label.
documents=[(text.split(),label) for text,label in zip(X,y)]


In [94]:
def bag_of_words(words):
    """Return a dict mapping every distinct word in ``words`` to ``True``.

    Repeated words collapse into a single key, so the result records presence only.
    """
    return dict.fromkeys(words,True)

In [95]:
# Turn every (tokens, label) document into a (bag-of-words featureset, label) pair.
featuresets=[]
for tokens,tag in documents:
    featuresets.append((bag_of_words(tokens),tag))

In [96]:
# First 80% of the featuresets (in file order) for training, the remainder for testing.
# NOTE(review): this split is positional, not shuffled, so it is a different split from
# the train_test_split(random_state=0) used in the sklearn cells — confirm that is intended.
size=int(len(featuresets)*0.8)
train_set=featuresets[:size]
test_set=featuresets[size:]


In [97]:
# use nltk to build a mnb model
# Trains nltk's NaiveBayesClassifier on the bag-of-words training featuresets
# (every feature value is True, i.e. word presence rather than counts).
classifier=nltk.NaiveBayesClassifier.train(train_set)

In [99]:
# use nltk to build a svm model
# Wraps a LinearSVC with default parameters in nltk's SklearnClassifier and trains it
# on the same bag-of-words training featuresets.
svm_classifier=nltk.classify.SklearnClassifier(LinearSVC()).train(train_set)

In [101]:
def measure(classifier,test_set):
    groundTruth=[]
    predictionResult=[]

    for (features,label) in test_set:
        groundTruth.append(label)
        predictionResult.append(classifier.classify(features))
    
    cm=ConfusionMatrix(groundTruth,predictionResult)
    print "Confusion Matrix:"
    print cm

    groundTruth_pos=set([i for i,label in enumerate(groundTruth) if label=='ham'])
    groundTruth_neg=set([i for i,label in enumerate(groundTruth) if label=='spam'])
    predictionResult_pos=set([i for i,label in enumerate(predictionResult) if label=='ham'])
    predictionResult_neg=set([i for i,label in enumerate(predictionResult) if label=='spam'])    

    print "Overall Accuracy:",nltk.classify.accuracy(classifier,test_set)
    print "Result of Positive："
    printmeasures("ham",groundTruth_pos,predictionResult_pos)
    print "Result of Negative："
    printmeasures("spam",groundTruth_neg,predictionResult_neg)

In [102]:
def printmeasures(label, groundTruth_set, predictionResult_set):
    print label,'precision:',precision(groundTruth_set,predictionResult_set)
    print label,'recall:',recall(groundTruth_set,predictionResult_set)
    print label,'F-measure:',f_measure(groundTruth_set,predictionResult_set)

In [104]:
# evaluation of mnb model
# Prints the confusion matrix, overall accuracy and per-label precision/recall/F-measure
# of the nltk Naive Bayes classifier on test_set.
measure(classifier,test_set)

Confusion Matrix:
     |       s |
     |   h   p |
     |   a   a |
     |   m   m |
-----+---------+
 ham |<453> 71 |
spam |   . <76>|
-----+---------+
(row = reference; col = test)

Overall Accuracy: 0.881666666667
Result of Positive：
ham precision: 1.0
ham recall: 0.864503816794
ham F-measure: 0.927328556807
Result of Negative：
spam precision: 0.517006802721
spam recall: 1.0
spam F-measure: 0.681614349776


In [105]:
# evaluation of svm model
# Prints the confusion matrix, overall accuracy and per-label precision/recall/F-measure
# of the nltk-wrapped LinearSVC on test_set.
measure(svm_classifier,test_set)

Confusion Matrix:
     |       s |
     |   h   p |
     |   a   a |
     |   m   m |
-----+---------+
 ham |<524>  . |
spam |  11 <65>|
-----+---------+
(row = reference; col = test)

Overall Accuracy: 0.981666666667
Result of Positive：
ham precision: 0.979439252336
ham recall: 1.0
ham F-measure: 0.989612842304
Result of Negative：
spam precision: 1.0
spam recall: 0.855263157895
spam F-measure: 0.921985815603
