In [1]:
import numpy as np
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from scipy import io
import sklearn.svm as svm
import re

## 1. email to features

In [2]:
# Read the whole sample email into memory. The context manager closes the
# file even if read() raises, which the manual open()/close() pair did not.
with open('emailSample1.txt', 'r') as f:
    file_content = f.read()

In [3]:
def processEmail(email_contents, vocabList=None):
    """Convert raw email text into a list of vocabulary indices.

    The text is lower-cased, HTML tags are stripped, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder tokens
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. Every remaining
    non-letter character is treated as a word separator. Each word that is
    a key of the vocabulary contributes its vocabulary value to the result;
    unknown words and one-character words are skipped.

    Parameters
    ----------
    email_contents : str
        Raw text of the email.
    vocabList : dict, optional
        Mapping ``word -> index``. When omitted it is loaded with
        ``getVocablist()``; pass it explicitly to avoid re-reading the
        vocabulary file on every call.

    Returns
    -------
    list
        Vocabulary values of the recognised words, in order of appearance
        (repeated words yield repeated entries). The values are returned
        exactly as stored in the vocabulary, without any re-basing.
    """
    if vocabList is None:
        vocabList = getVocablist()

    text = email_contents.lower()

    # Normalisation passes, applied in this order (pattern, replacement).
    substitutions = (
        (r'<[^<>]+>', ''),                       # strip HTML tags
        (r'[0-9]+', 'number'),                   # any run of digits
        (r'(http|https)://[^\s]*', 'httpaddr'),  # URLs
        (r'[^\s]+@[^\s]+', 'emailaddr'),         # email addresses
        (r'[$]+', 'dollar'),                     # dollar signs
        # Punctuation becomes a space so that it separates words. Deleting
        # it outright would glue neighbours together ("hello,world" ->
        # "helloworld") and the merged token would miss the vocabulary.
        (r'[^\sA-Za-z]', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # NOTE(review): no stemming is applied. The top-word output in this
    # notebook shows stemmed entries such as "remov" and "guarante", so the
    # vocabulary presumably holds stemmed words and inflected forms like
    # "removed" will not match -- consider adding a stemmer.
    return [
        vocabList[word]
        for word in text.split()
        if len(word) > 1 and word in vocabList
    ]

In [4]:
def getVocablist(path='vocab.txt'):
    """Load the vocabulary file into a ``{word: index}`` dictionary.

    The file is read as a whitespace-separated sequence of alternating
    ``index word`` tokens.

    Parameters
    ----------
    path : str, optional
        Location of the vocabulary file (defaults to ``'vocab.txt'``).

    Returns
    -------
    dict
        Maps each word to its integer index, in file order.

    Raises
    ------
    ValueError
        If the file does not contain an even number of tokens, or an index
        token is not an integer.
    """
    # The context manager guarantees the handle is closed even on error.
    with open(path, 'r') as f:
        tokens = f.read().split()

    if len(tokens) % 2:
        raise ValueError(
            'vocabulary file must contain "index word" pairs, '
            'got an odd number of tokens'
        )

    # int() instead of eval(): the indices are plain integers, and eval()
    # would execute arbitrary expressions found in the file.
    return {word: int(index) for index, word in zip(tokens[0::2], tokens[1::2])}

In [5]:
# Build the binary bag-of-words feature vector for the sample email:
# features[k] == 1 when the k-th vocabulary word occurs in the email.
word_indices = processEmail(file_content)
vocabList = getVocablist()
features = np.zeros(len(vocabList))

# NumPy positions are 0-based, and the top-words cell of this notebook pairs
# coefficient position k with the k-th vocabulary word (counting from 0).
# The vocabulary values must therefore be shifted so that the smallest index
# lands on position 0. Presumably vocab.txt numbers its words from 1 (verify);
# used unshifted, every feature would land one column too far and the last
# word would raise IndexError. With a 0-based vocabulary the shift is 0.
index_base = min(vocabList.values(), default=0)
features[np.asarray(word_indices, dtype=int) - index_base] = 1

## 2. load svm data

In [6]:
# Load the training set from the MATLAB file: 'X' holds the feature matrix
# and 'y' the labels, flattened to a 1-D array with ravel().
data = io.loadmat('spamTrain.mat')
X, y = data['X'], data['y'].ravel()

In [7]:
# Support vector classifier with a linear kernel and C=0.1, trained on the
# full training set. The bare fit() call is kept as the cell's last
# expression so the fitted estimator is echoed as the cell output.
model = svm.SVC(kernel='linear', C=0.1)
model.fit(X, y)

SVC(C=0.1, kernel='linear')

In [8]:
# Accuracy on the data the model was fitted on: the percentage of training
# examples whose predicted label equals the true label.
p = model.predict(X)
print('Training Accuracy: {:.2f}%'.format(100*np.mean(p==y)))

Traning Accuracy: 99.83%


In [9]:
# Load the held-out test set from the MATLAB file: 'Xtest' holds the feature
# matrix and 'ytest' the labels, flattened to a 1-D array with ravel().
testdata = io.loadmat('spamTest.mat')
Xtest, ytest = testdata['Xtest'], testdata['ytest'].ravel()

In [10]:
# Accuracy on the held-out test set: the percentage of test examples whose
# predicted label equals the true label.
p = model.predict(Xtest)
test_accuracy = 100 * np.mean(p == ytest)
print('Test Accuracy: {:.2f}%'.format(test_accuracy))

Test Accuracy: 98.90%


## 3. display top words

In [11]:
# Show the vocabulary words with the largest positive weights in the linear
# model, i.e. the words that push a prediction most strongly towards the
# positive class.
w = model.coef_.ravel()

# Positions of the weights in descending order, truncated to the top n_top.
# A descriptive name is used instead of `id`, which would shadow the builtin
# id() for the rest of the notebook session.
n_top = 15
top_idx = np.argsort(w)[::-1][:n_top]

# Relies on the dict preserving the vocabulary file order, so that position k
# in wordList corresponds to weight w[k].
wordList = list(vocabList.keys())
for k in top_idx:
    print(wordList[k] + '{:>20.5f}'.format(w[k]))

our             0.50061
click             0.46592
remov             0.42287
guarante             0.38362
visit             0.36771
basenumb             0.34506
dollar             0.32363
will             0.26972
price             0.26730
pleas             0.26117
most             0.25730
nbsp             0.25394
lo             0.25347
ga             0.24830
hour             0.24640


## 4. make my own prediction

In [12]:
# Classify the sample email. predict() expects a 2-D array with one row per
# sample; reshape(1, -1) produces that single row without hard-coding the
# vocabulary size.
prediction = model.predict(features.reshape(1, -1))
print(prediction)
# The recorded output is [0]; presumably label 0 means "not spam" and 1 means
# "spam" -- confirm against the dataset's label convention.

[0]
