In [70]:
import numpy as np
import matplotlib.pyplot as plt
import re
import utils
from sklearn.svm import SVC
from jupyterthemes import jtplot
jtplot.style()

In [79]:
data_path = ['emailSample1.txt', 'emailSample2.txt', 
             'spamSample1.txt', 'spamSample2.txt']
data = []
for path in data_path:
    with open(path, 'r', newline='') as file:
        data.append(file.read())
        
for sample in data:
    print(sample)
    print("-"*100, end="\n\n")

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com


----------------------------------------------------------------------------------------------------

Folks,
 
my first time posting - have a bit of Unix experience, but am new to Linux.

 
Just got a new PC at home - Dell box with Windows XP. Added a second hard disk
for Linux. Partitioned the disk and have installed Suse 7.2 from CD, which went
fine except it didn't pick up my monitor.
 
I have a Dell branded E151FPp 15" LCD flat panel monitor and a nVidia GeForce4
Ti4200 video card, both of which are probably too new to feature in Suse's default
set. I downloaded a driver from the nVidia website 

In [16]:
def get_vocab_list():
    with open('vocab.txt', 'r', newline='') as file:
        data = file.read()
    return data.split()[1::2]

In [82]:
def process_email(email_contents, verbose=True):
    vocab_list = get_vocab_list()
    word_indices = []
    # Lower case
    email_contents = email_contents.lower()
    # Looks for any expression that starts with < and ends with > and replace it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)
    # Handle Numbers
    email_contents = re.sub('[0-9]+', 'number', email_contents)
    # Handle URLS
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr',
                            email_contents)
    # Handle Email Addresses
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)
    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)
    email_contents = re.split('[ @$/#.-:&*+=\[\]?!(){},''">_<;%\n\r]', email_contents)
    email_contents = [word for word in email_contents if len(word) > 0]
    stemmer = utils.PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        if len(word) < 1:
            continue
        
        for i in range(len(vocab_list)):
            if word == vocab_list[i]:
                word_indices.append(i)
        
    if verbose:
        print('-'*100)
        print('Processed email:')
        print('-'*100)
        print(' '.join(processed_email))
        
    return word_indices

In [83]:
word_indices = process_email(data[0])

----------------------------------------------------------------------------------------------------
Processed email:
----------------------------------------------------------------------------------------------------
anyon know how much it cost to host a web portal well it depend on how mani visitor your expect thi can be anywher from less than number buck a month to a coupl of dollarnumb you should checkout httpaddr or perhap amazon ecnumb if your run someth big to unsubscrib yourself from thi mail list send an email to emailaddr


In [84]:
def email_features(word_indices):
    # Total number of words in the dictionary
    x = [0] * 1899
    for i in word_indices:
        x[i] = 1
    return x

In [90]:
features_of_letters = []
for sample in data:
    word_indices = process_email(sample, False)
    features_of_letters.append(email_features(word_indices))

In [87]:
import scipy.io as sio
train_data_path = 'spamTrain.mat'
test_data_path = 'spamTest.mat'
train_data = sio.loadmat(train_data_path)
test_data = sio.loadmat(test_data_path)
print(train_data.keys())
print(test_data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])
dict_keys(['__header__', '__version__', '__globals__', 'Xtest', 'ytest'])


In [88]:
X_train, y_train = train_data['X'], train_data['y']
X_test, y_test = test_data['Xtest'], test_data['ytest']
print(X_train.shape, X_test.shape)

(4000, 1899) (1000, 1899)


In [95]:
clf = SVC(kernel='linear', C=0.1)
clf.fit(X_train, y_train.ravel())
pred = clf.predict(X_test)
print("Accuracy: ", np.mean(pred == y_test.ravel()))

Accuracy:  0.989


In [96]:
print(clf.predict(features_of_letters))

[0 0 1 1]


In [144]:
most_important = np.argsort(clf.coef_.ravel())[-1:-16:-1]
vocab_list = get_vocab_list()
print("Top spam predictors:")
print("{0: <12} \t{1}".format("WORD", "WEIGHT"), end="\n\n")
for i in most_important:
    print("{0: <12}-\t{1}".format(vocab_list[i], clf.coef_.ravel()[i]))

Top spam predictors:
WORD         	WEIGHT

our         -	0.5006137361746403
click       -	0.465916390688888
remov       -	0.42286911706104086
guarante    -	0.38362160179406524
visit       -	0.367710398245535
basenumb    -	0.3450640979461706
dollar      -	0.3236320357963838
will        -	0.2697241060374008
price       -	0.2672977146177071
pleas       -	0.2611688867001495
most        -	0.2572981979518164
nbsp        -	0.2539414551595324
lo          -	0.25346652431419925
ga          -	0.24829699045568662
hour        -	0.24640435783158998
