In [2]:
% Select the Qt graphics toolkit for figures created in this session.
% NOTE(review): none of the cells visible here draw a figure -- confirm this
% call is still needed.
graphics_toolkit('qt')

In [3]:
% Add the helper directory to the load path (relative to the current working
% directory). Presumably this is where readFile, processEmail, emailFeatures,
% svmTrain and friends are defined -- confirm.
addpath('../libs/');

# Preprocessing emails

In [14]:
% Load the vocabulary file. vocabList is handed to processEmail in later
% cells; vocabListReversed is looked up by word index when the top spam
% predictors are printed. The concrete container types are decided by
% getVocabList, which is not shown here.
[vocabList, vocabListReversed] = getVocabList('../../data/vocab.txt');

In [15]:
% Read the sample email from disk.
file_contents = readFile('../../data/emailSample1.txt');
% Convert the raw text to word indices using the loaded vocabulary. The next
% cell prints the result with %d, so it is expected to be a numeric array.
word_indices = processEmail(file_contents, vocabList);

In [16]:
% Print the word indices on a single line, each one preceded by a space.
fprintf('Word Indices: \n');
% fprintf reuses the ' %d' template for every element of word_indices.
fprintf(' %d', word_indices);
fprintf('\n\n');

Word Indices: 
 86 916 794 1077 883 370 1699 790 1822 1831 883 431 1171 794 1002 1893 1364 592 1676 238 162 89 688 945 1663 1120 1062 1699 375 1162 479 1893 1510 799 1182 1237 810 1895 1440 1547 181 1699 1758 1896 688 1676 992 961 1477 71 530 1699 531



## Feature extraction

In [17]:
% Build the feature vector for the sample email.
% NOTE(review): the first two statements repeat the read/process steps of the
% preprocessing cell with the same file and vocabulary. Harmless, but
% word_indices could simply be reused if the cells always run in order.
file_contents = readFile('../../data/emailSample1.txt');
word_indices = processEmail(file_contents, vocabList);
% Turn the list of word indices into the feature vector consumed by the SVM.
features = emailFeatures(word_indices);

In [18]:
% Report the size of the feature vector and how many of its entries are set.
fprintf('Length of feature vector: %d\n', length(features));
% nnz counts every entry that differs from zero, which is exactly what the
% label promises (a `> 0` test would silently skip negative values).
fprintf('Number of non-zero entries: %d\n', nnz(features));

Length of feature vector: 1899
Number of non-zero entries: 45


## Train Linear SVM for Spam Classification

In [19]:
% Load the training data into the workspace. X and y, used by the cells
% below, are not assigned anywhere else in view, so they presumably come
% from this file -- confirm.
load('../../data/spamTrain.mat');

In [21]:
% Train an SVM with the linear kernel on the training set.
% C is passed as svmTrain's third argument -- presumably the regularization
% parameter; confirm against svmTrain.
C = 0.1;
model = svmTrain(X, y, C, @linearKernel);


Training ......................................................................
...............................................................................
...............................................................................
........................................ Done! 



In [22]:
% Classify the training examples with the fitted model and report the
% percentage of predictions that agree with the labels.
p = svmPredict(model, X);
train_hits = double(p == y);
fprintf('Training Accuracy: %f\n', mean(train_hits) * 100);

Training Accuracy: 99.825000


### Test Spam Classification

In [23]:
% Evaluate the trained model on the test set.
% Xtest and ytest are presumably defined by spamTest.mat -- confirm.
load('../../data/spamTest.mat');
p = svmPredict(model, Xtest);
test_hits = double(p == ytest);
fprintf('Test Accuracy: %f\n', mean(test_hits) * 100);

Test Accuracy: 98.700000


### Top predictors of Spam

In [24]:
% List the vocabulary entries with the largest weights in the trained linear
% model, strongest first.
[weight, idx] = sort(model.w, 'descend');

% Show at most 15 entries; clamping keeps the loop within bounds when the
% model has fewer weights than that.
num_top = min(15, numel(idx));

fprintf('\nTop predictors of spam: \n');
for i = 1:num_top
    % idx(i) is the position in model.w of the i-th largest weight and is
    % used as the lookup key into vocabListReversed.
    fprintf(' %-15s (%f) \n', vocabListReversed(idx(i)), weight(i));
end


Top predictors of spam: 
 our             (0.503158) 
 click           (0.462220) 
 remov           (0.416805) 
 guarante        (0.389823) 
 visit           (0.369379) 
 basenumb        (0.341222) 
 dollar          (0.320755) 
 will            (0.270624) 
 price           (0.266652) 
 pleas           (0.263620) 
 lo              (0.257981) 
 most            (0.252696) 
 nbsp            (0.251416) 
 ga              (0.245294) 
 se              (0.240361) 


### Try your own email

In [25]:
% Classify a handful of sample messages with the trained model.
filenames = {'../../data/spamSample1.txt', ...
             '../../data/spamSample2.txt', ...
             '../../data/emailSample1.txt', ...
             '../../data/emailSample2.txt'};

for i = 1:numel(filenames)
    filename = filenames{i};

    % Read the message, map it to word indices, then to a feature vector.
    file_contents = readFile(filename);
    word_indices  = processEmail(file_contents, vocabList);
    x             = emailFeatures(word_indices);

    % Ask the model for its label for this message.
    p = svmPredict(model, x);

    fprintf('\nProcessed %s\n\nSpam Classification: %d\n', filename, p);
    fprintf('(1 indicates spam, 0 indicates not spam)\n\n');
end


Processed ../../data/spamSample1.txt

Spam Classification: 1
(1 indicates spam, 0 indicates not spam)


Processed ../../data/spamSample2.txt

Spam Classification: 1
(1 indicates spam, 0 indicates not spam)


Processed ../../data/emailSample1.txt

Spam Classification: 0
(1 indicates spam, 0 indicates not spam)


Processed ../../data/emailSample2.txt

Spam Classification: 0
(1 indicates spam, 0 indicates not spam)

