# 2 Spam Classification

In [78]:
import numpy as np
import scipy.io as scio
from sklearn.svm import SVC
import re
from nltk.stem.porter import PorterStemmer #词干提取算法
%matplotlib inline

## 2.1 Preprocessing Emails
• Lower-casing: The entire email is converted into lower case, so
that capitalization is ignored (e.g., IndIcaTE is treated the same as
Indicate).

• Stripping HTML: All HTML tags are removed from the emails.
Many emails often come with HTML formatting; we remove all the
HTML tags, so that only the content remains.

• Normalizing URLs: All URLs are replaced with the text “httpaddr”.

• Normalizing Email Addresses: All email addresses are replaced
with the text “emailaddr”.

• Normalizing Numbers: All numbers are replaced with the text
“number”.

• Normalizing Dollars: All dollar signs ($) are replaced with the text
“dollar”.

• Word Stemming: Words are reduced to their stemmed form. For
example, “discount”, “discounts”, “discounted” and “discounting” are all
replaced with “discount”. Sometimes, the Stemmer actually strips off
additional characters from the end, so “include”, “includes”, “included”,
and “including” are all replaced with “includ”.

• Removal of non-words: Non-words and punctuation have been
removed. All white spaces (tabs, newlines, spaces) have been trimmed
to a single space character.

### 2.1.1 Vocabulary List

In [79]:
#getVocabList returns the contents of the vocabulary file as an ndarray of strings
def getVocabList(filename='vocab.txt'):
    '''Load the vocabulary list from a whitespace-separated text file.

    Parameters
    ----------
    filename : str, optional
        Path of the vocabulary file. Defaults to 'vocab.txt', the file this
        notebook has always read, so existing calls without arguments are
        unaffected.

    Returns
    -------
    numpy.ndarray
        Array of strings as produced by np.loadtxt(..., dtype='str'), one row
        per line of the file. The callers in this notebook read column 0 as
        the word index and column 1 as the word.
    '''
    data=np.loadtxt(filename,dtype='str')
    return data

In [80]:
#readFile returns the whole text content of the given file
def readFile(filename):
    '''Read a text file and return its contents as a single string.

    Parameters
    ----------
    filename : str
        Path of the file to read. It is opened in text mode with the
        platform default encoding.

    Returns
    -------
    str
        The complete contents of the file.
    '''
    #The context manager closes the handle even when read() raises,
    #which the previous explicit open/read/close sequence did not guarantee
    with open(filename) as f:
        return f.read()

In [81]:
#processEmail normalizes the email text, splits it into single words, reduces every word to its
#stem with the Porter stemmer and looks each stem up in the vocabulary list. For every stem that
#is found, the vocabulary index of that stem is appended to word_indices (a list), which is
#returned
def processEmail(email_contents):
    '''Convert the raw text of an email into a list of vocabulary indices.

    The text is lower-cased, HTML tags are stripped, numbers, URLs, email
    addresses and dollar signs are replaced by the placeholder words
    "number", "httpaddr", "emailaddr" and "dollar", and every remaining run of
    non-word characters becomes a single space. Each resulting word is stemmed
    and looked up in the vocabulary returned by getVocabList().

    Parameters
    ----------
    email_contents : str
        Raw text of the email.

    Returns
    -------
    list
        The index entries (column 0 of the vocabulary array, i.e. strings such
        as '86') of every stem found in the vocabulary, in the order in which
        the words occur in the email. A word that occurs several times
        contributes its index several times.
    '''
    #Load Vocabulary
    vocabList=getVocabList()

    #Build a word -> [index, ...] lookup once, covering the whole vocabulary instead of a
    #hard-coded 1899 rows, so that each stem is resolved with one dict access rather than
    #a scan of the vocabulary
    vocab_lookup={}
    for entry in vocabList:
        vocab_lookup.setdefault(str(entry[1]),[]).append(entry[0])

    #Preprocess Email

    #Lower case
    email_contents=email_contents.lower()

    #Strip all HTML
    #Look for any expression that starts with <, ends with > and has no < or >
    #inside the tag, and replace it with a space
    email_contents=re.sub(r'<[^<>]+>',' ',email_contents)

    #Handle Numbers
    #Look for one or more characters between 0-9
    email_contents=re.sub(r'[0-9]+','number',email_contents)

    #Handle URLS
    #Look for strings starting with http:// or https://
    email_contents=re.sub(r'(http|https)://[^\s]*','httpaddr',email_contents)

    #Handle Email Address
    #Look for strings with @ in the middle
    email_contents=re.sub(r'[^\s]+@[^\s]+','emailaddr',email_contents)

    #Handle $ sign
    email_contents=re.sub(r'[$]+','dollar',email_contents)

    #Replace every run of characters that are not word characters with a single space
    email_contents=re.sub(r'[\W]+',' ',email_contents)

    #Split into words; split() without arguments also drops the empty strings that a
    #leading or trailing space would otherwise produce
    words=email_contents.split()

    #Tokenize Email
    #One stemmer instance is enough for all words
    stemmer=PorterStemmer()
    word_indices=[]
    for word in words:
        stem=stemmer.stem(word)
        #Stems that are not in the vocabulary contribute nothing
        word_indices.extend(vocab_lookup.get(stem,()))
    return word_indices
        


In [82]:
#Extract Features: read the sample email and map it to vocabulary indices
file_contents=readFile('emailSample1.txt')
word_indices=processEmail(file_contents)

#Print Stats: the header and the list of indices, each on its own line
print('word indices:',word_indices,sep='\n')

word indices:
['86', '916', '794', '1077', '883', '370', '1699', '790', '1822', '1831', '883', '431', '1171', '794', '1002', '1893', '1364', '592', '1676', '238', '162', '89', '688', '945', '1663', '1120', '1062', '1699', '375', '1162', '479', '1893', '1510', '799', '1182', '1237', '810', '1895', '1440', '1547', '181', '1699', '1758', '1896', '688', '1676', '992', '961', '1477', '71', '530', '1699', '531']


## 2.2 Extracting Features from Emails

In [83]:
def emailFeatures(word_indices,n=1899):
    '''Return the binary feature vector for the given email (word_indices).

    Parameters
    ----------
    word_indices : sequence
        1-based vocabulary indices of the words in the email. Each item is
        converted with int(), so both the string indices returned by
        processEmail and plain integers are accepted. Repeated indices are
        allowed.
    n : int, optional
        Size of the vocabulary, i.e. the length of the feature vector.
        Defaults to 1899.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n, 1) with x[i-1, 0] == 1 for every index i in
        word_indices and 0 everywhere else.

    Raises
    ------
    IndexError
        If an index lies outside 1..n. Previously an index of 0 or below was
        silently written to the end of the vector through negative indexing.
    '''
    x=np.zeros((n,1))
    #dtype=int keeps the array usable as an index even when word_indices is empty
    positions=np.asarray([int(index) for index in word_indices],dtype=int)
    if positions.size and (positions.min()<1 or positions.max()>n):
        raise IndexError('word index out of range, expected values in 1..'+str(n))
    #Vocabulary indices are 1-based, array rows are 0-based
    x[positions-1,0]=1
    return x

In [84]:
#Feature Extraction
#Recompute the word indices of the sample email and turn them into a feature vector
file_contents=readFile('emailSample1.txt')
word_indices=processEmail(file_contents)
features=emailFeatures(word_indices)

#Print Stats: vector length and number of entries that are set
print('length of feature vector:',len(features),',it should be 1899')
print('Number of non-zero entries:',features.sum(),',it should be 45')

length of feature vector: 1899 ,it should be 1899
Number of non-zero entries: 45.0 ,it should be 45


## 2.3 Training SVM for Spam Classification

In [85]:
#Train Linear SVM for Spam Classification
#Load the training set from the MATLAB file
dataFile='spamTrain.mat'
data=scio.loadmat(dataFile)
X=data['X']
#The labels are stored as a column of shape (m,1); flatten them to a 1-D vector, which is
#what scikit-learn expects for y and avoids its DataConversionWarning during fit
y=data['y'].ravel()
print(X.shape)

(4000, 1899)


In [86]:
#train
#Regularization parameter of the linear SVM; pass the variable to SVC so that changing it
#here actually changes the model
C=0.1
model=SVC(C=C,kernel='linear').fit(X,y)
#Mean accuracy on the same data the model was fitted on
accuracy=model.score(X,y)
print('Training Accuracy:'+str(accuracy*100)+'%')
print('It should be about 99.8%')

  y = column_or_1d(y, warn=True)


Training Accuracy:99.825%
It should be about 99.8%


In [87]:
#Test Spam Classification
#load the test dataset
dataFile='spamTest.mat'
data=scio.loadmat(dataFile)
Xtest=data['Xtest']
#Flatten the label column to a 1-D vector, the label shape scikit-learn expects
ytest=data['ytest'].ravel()
accuracy_test=model.score(Xtest,ytest)
#This is the accuracy on the held-out test set, so label it as such
print('Test Accuracy:'+str(accuracy_test*100)+'%')
print('It should be about 98.5%')

Training Accuracy:98.9%
It should be about 98.5%


## 2.4 Top Predictors for Spam

In [88]:
print(model.coef_) #coef_ is an attribute of the fitted SVC object: the weights (theta). It is
#only available when the kernel is 'linear'
weights=model.coef_[0] #note that the shape of model.coef_ is 1*1899
#Pair every weight with the position of its feature and sort by weight, largest first.
#The pairs are built directly as (weight, index) tuples rather than through a dict keyed by
#weight, because in such a dict two features with the same weight overwrite each other
list_weight=sorted(((weights[i],i) for i in range(weights.shape[0])),reverse=True)
vocabList=getVocabList()
#Look up the words for the 15 largest weights: these stems are the strongest predictors of
#spam because they were assigned the largest weights
for weight,index in list_weight[:15]:
    print(vocabList[index][1],weight)


[[ 0.00793208  0.01563324  0.05546492 ..., -0.08670606 -0.00661274
   0.06506632]]
our 0.500613736175
click 0.465916390689
remov 0.422869117061
guarante 0.383621601794
visit 0.367710398246
basenumb 0.345064097946
dollar 0.323632035796
will 0.269724106037
price 0.267297714618
pleas 0.2611688867
most 0.257298197952
nbsp 0.25394145516
lo 0.253466524314
ga 0.248296990456
hour 0.246404357832
