# 2 Spam Classification

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from sklearn import svm
import re
import nltk, nltk.stem.porter

## 2.1 Preprocessing Emails

In [2]:
print("emailSample1.txt:")
with open('./data/emailSample1.txt', 'r') as f:
    print(f.read())

emailSample1.txt:
> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




In [3]:
def preProcess(email):
    """
    Function to do some pre processing (simplification of e-mails).
    Comments throughout implementation describe what it does.
    Input = raw e-mail
    Output = processed (simplified) email
    """
    email = email.lower()
    email = re.sub('<[^<>]+>', ' ', email)
    email = re.sub('[0-9]+', 'number', email)
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)
    email = re.sub('[$]+', 'dollar', email)
    
    return email

In [4]:
def email2TokenList(raw_email):
    """
    Function that takes in preprocessed (simplified) email, tokenizes it,
    stems each word, and returns an (ordered) list of tokens in the e-mail
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    email = preProcess(raw_email)
    token = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    tokenlist = []
    for itoken in token:
        itoken = re.sub('[^a-zA-Z0-9]', '', itoken)
        stemmed = stemmer.stem(itoken)
        if len(itoken) == 0:
            continue
        tokenlist.append(stemmed)
        
    return tokenlist

### 2.1.1 Vocabulary List

In [5]:
def getVocabDict(reverse = False):
    """
    Function to read in the supplied vocab list text file into a dictionary.
    If "reverse", the keys and values are switched.
    """
    vocab_dict = {}
    with open('./data/vocab.txt', 'r') as f:
        for line in f:
            (val, key) = line.split()
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key
            
    return vocab_dict

In [6]:
def email2VocabIndices(raw_email, vocab_dict):
    """
    Function that takes in a raw email and returns a list of indices corresponding
    to the location in vocab_dict for each stemmed word in the email.
    """
    tokenlist = email2TokenList(raw_email)
    index_list = [vocab_dict[token] for token in tokenlist if token in vocab_dict]
    
    return index_list

## 2.2 Extracting Features from Emails

In [7]:
def email2FeatureVector(raw_email, vocab_dict):
    """
    Function that takes as input a raw email, and returns a vector of shape
    (n,1) where n is the size of the vocab_dict.
    The first element in this vector is 1 if the vocab word with index == 1
    is in the raw_email, 0 otherwise.
    """
    n = len(vocab_dict)
    result = np.zeros((n, 1))
    vocab_indices = email2VocabIndices(raw_email, vocab_dict)
    for i in vocab_indices:
        result[i] = 1
    
    return result

In [8]:
vocab_dict = getVocabDict()
email_content = open('./data/emailSample1.txt', 'r').read()
test_fv = email2FeatureVector(email_content, vocab_dict)

print("Length of feature vector is {}".format(len(test_fv)))
print("Number of non-zero entries is: {}".format(sum(test_fv == 1)))

Length of feature vector is 1899
Number of non-zero entries is: [45]


## 2.3 Training SVM for Spam Classification

In [9]:
datafile = 'data/spamTrain.mat'
mat = scipy.io.loadmat(datafile)
X, y = mat['X'], mat['y']

datafile = 'data/spamTest.mat'
mat = scipy.io.loadmat(datafile)
Xtest, ytest = mat['Xtest'], mat['ytest']

In [10]:
pos = np.array([X[i] for i in range(X.shape[0]) if y[i] == 1])
neg = np.array([X[i] for i in range(X.shape[0]) if y[i] == 0])

print('Total number of training emails = ',X.shape[0])
print('Number of training spam emails = ',pos.shape[0])
print('Number of training nonspam emails = ',neg.shape[0])

Total number of training emails =  4000
Number of training spam emails =  1277
Number of training nonspam emails =  2723


In [11]:
linear_svm = svm.SVC(C = 0.1, kernel = 'linear')
linear_svm.fit(X, y.flatten())

SVC(C=0.1, kernel='linear')

In [16]:
train_pred = linear_svm.predict(X).reshape((y.shape[0], 1))
train_acc = float(sum(train_pred == y)) / y.shape[0]
print("Training accuracy = {:.2%}".format(train_acc))

Training accuracy = 99.83%


In [20]:
test_pred = linear_svm.predict(Xtest).reshape((ytest.shape[0], 1))
test_acc = float(sum(test_pred == ytest)) / ytest.shape[0]
print("Test accuracy = {:.2%}".format(test_acc))

Test accuracy = 98.90%


## 2.4 Top Predictors for Spam

In [21]:
vocab_dict_flipped = getVocabDict(reverse=True)

#Sort indicies from most important to least-important (high to low weight)
sorted_indices = np.argsort( linear_svm.coef_, axis=None )[::-1]
print("The 15 most important words to classify a spam e-mail are:")
print([ vocab_dict_flipped[x] for x in sorted_indices[:15] ])
print()
print("The 15 least important words to classify a spam e-mail are:")
print([ vocab_dict_flipped[x] for x in sorted_indices[-15:] ])
print()

# Most common word (mostly to debug):
most_common_word = vocab_dict_flipped[sorted_indices[0]]
print('# of spam containing \"%s\" = %d/%d = %0.2f%%'% \
    (most_common_word, sum(pos[:,1190]),pos.shape[0],  \
     100.*float(sum(pos[:,1190]))/pos.shape[0]))
print('# of NON spam containing \"%s\" = %d/%d = %0.2f%%'% \
    (most_common_word, sum(neg[:,1190]),neg.shape[0],      \
     100.*float(sum(neg[:,1190]))/neg.shape[0]))

The 15 most important words to classify a spam e-mail are:
['otherwis', 'clearli', 'remot', 'gt', 'visa', 'base', 'doesn', 'wife', 'previous', 'player', 'mortgag', 'natur', 'll', 'futur', 'hot']

The 15 least important words to classify a spam e-mail are:
['http', 'toll', 'xp', 'ratio', 'august', 'unsubscrib', 'useless', 'numberth', 'round', 'linux', 'datapow', 'wrong', 'urgent', 'that', 'spam']

# of spam containing "otherwis" = 804/1277 = 62.96%
# of NON spam containing "otherwis" = 301/2723 = 11.05%
