In [5]:
import os
import numpy as np
from collections import Counter
from sklearn import svm
from sklearn.metrics import accuracy_score


import _pickle as cPickle
import gzip

def load(file_name):
    """Load a pickled object from a gzip-compressed file.

    Args:
        file_name: Path of the gzip file to read.

    Returns:
        The object unpickled from the file.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        Exception: Whatever the unpickler raises for malformed pickle data.

    Security note: unpickling can execute arbitrary code, so only use this
    on files you produced yourself.
    """
    # The context manager closes the stream even when unpickling raises;
    # a bare open()/close() pair would leak the handle in that case.
    with gzip.open(file_name, "rb") as stream:
        return cPickle.load(stream)


def save(file_name, model):
    """Pickle ``model`` into a gzip-compressed file.

    Args:
        file_name: Destination path; an existing file is overwritten
            (the file is opened in "wb" mode).
        model: Any picklable object.

    Raises:
        OSError: If the destination cannot be opened for writing.
        Exception: Whatever the pickler raises for unpicklable objects.
    """
    # The context manager flushes and closes the gzip stream even when
    # pickling raises, so no handle is leaked.
    with gzip.open(file_name, "wb") as stream:
        cPickle.dump(model, stream)


def make_Dictionary(root_dir, max_words=3000):
    """Build a vocabulary of the most frequent words in a directory of mails.

    Every regular file directly inside ``root_dir`` is read line by line and
    split on whitespace. Tokens that are not purely alphabetic, or that are a
    single character long, are discarded.

    Args:
        root_dir: Directory containing the mail files.
        max_words: Maximum number of vocabulary entries to keep (default
            3000). ``None`` keeps every word.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    word_counts = Counter()

    # os.listdir returns entries in arbitrary order; sorting makes the
    # tie-breaking between equally frequent words reproducible across runs.
    for name in sorted(os.listdir(root_dir)):
        path = os.path.join(root_dir, name)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
        # would make open() fail.
        if not os.path.isfile(path):
            continue
        with open(path) as mail:
            for line in mail:
                # Count incrementally instead of first materialising one
                # list holding every token of the corpus.
                word_counts.update(line.split())

    # isalpha() is False for the empty string, so len > 1 is the only extra
    # length check needed to drop single-character tokens.
    vocabulary = Counter({
        word: count
        for word, count in word_counts.items()
        if word.isalpha() and len(word) > 1
    })
    return vocabulary.most_common(max_words)



def extract_features(mail_dir, vocabulary=None, n_features=3000):
    """Turn a directory of mails into a word-count matrix and a label vector.

    Only the third line of each file (line index 2) is used as the text of
    the mail. For every vocabulary word found on that line, the matching
    column of the document's row is set to the number of times the word
    occurs on the line.

    Args:
        mail_dir: Directory containing one mail per file.
        vocabulary: Sequence of entries whose first element is the word
            (e.g. the ``(word, count)`` list returned by
            ``make_Dictionary``); the position of an entry is its column
            index. Words are assumed to be unique. When omitted, the
            module-level ``dictionary`` is used.
        n_features: Number of columns of the returned matrix (default 3000).

    Returns:
        ``(features_matrix, train_labels)`` where ``features_matrix`` has
        shape ``(n_files, n_features)`` and ``train_labels`` has shape
        ``(n_files,)`` with 1.0 for files whose name starts with ``"spmsg"``
        and 0.0 otherwise. Rows follow ``os.listdir`` order.

    Raises:
        ValueError: If the vocabulary has more entries than ``n_features``.
    """
    if vocabulary is None:
        # Backward-compatible fallback for callers that rely on the
        # notebook-level global.
        vocabulary = dictionary
    if len(vocabulary) > n_features:
        raise ValueError(
            "vocabulary has %d entries but the feature matrix only has %d columns"
            % (len(vocabulary), n_features)
        )

    # One-off word -> column lookup; replaces a linear scan of the whole
    # vocabulary for every word of every mail.
    column_of = {entry[0]: column for column, entry in enumerate(vocabulary)}

    # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be opened
    # as mails. The listing is intentionally left in os.listdir order.
    candidates = (os.path.join(mail_dir, name) for name in os.listdir(mail_dir))
    files = [path for path in candidates if os.path.isfile(path)]

    features_matrix = np.zeros((len(files), n_features))
    train_labels = np.zeros(len(files))

    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_no, line in enumerate(mail):
                if line_no != 2:
                    continue
                for word, occurrences in Counter(line.split()).items():
                    column = column_of.get(word)
                    if column is not None:
                        features_matrix[doc_id, column] = occurrences
                # Nothing after the third line is used.
                break

        # basename() handles the platform's path separator; splitting on
        # '/' never isolates the file name on Windows.
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1

    return features_matrix, train_labels



# Corpus locations, relative to the notebook's working directory.
TRAIN_DIR, TEST_DIR = "train-mails", "test-mails"

# The vocabulary is built from the training mails only; extract_features
# looks up the module-level name `dictionary`, so it must exist before the
# calls below.
dictionary = make_Dictionary(TRAIN_DIR)

print("reading and processing emails from file.")

# Same vocabulary for both splits so the feature columns line up.
features_matrix, labels = extract_features(TRAIN_DIR)
test_feature_matrix, test_labels = extract_features(TEST_DIR)


reading and processing emails from file.


TypeError: slice indices must be integers or None or have an __index__ method

In [7]:

# Keep only the first tenth of the training rows (presumably to keep SVM
# training fast -- confirm).
# NOTE(review): this rebinds features_matrix / labels, so running the cell a
# second time shrinks them again; run it once per fresh kernel.
features_matrix = features_matrix[:len(features_matrix) // 10]
labels = labels[:len(labels) // 10]

# Persist the (already sub-sampled) training data and the full test data.
for _path, _array in (
    ("/tmp/features_matrix", features_matrix),
    ("/tmp/labels", labels),
    ("/tmp/test_feature_matrix", test_feature_matrix),
    ("/tmp/test_labels", test_labels),
):
    save(_path, _array)

# Baseline: SVC with library defaults.
model = svm.SVC()

print("Training model.")
# fit() returns the estimator itself, so training and prediction chain.
predicted_labels = model.fit(features_matrix, labels).predict(test_feature_matrix)

print("FINISHED classifying. accuracy score : ")
print(accuracy_score(test_labels, predicted_labels))


Training model.
FINISHED classifying. accuracy score : 
0.5538461538461539




In [8]:
# Same experiment with the kernel and regularisation strength spelled out.
model = svm.SVC(C=1, kernel="rbf")

print("Training model.")
# fit() returns the estimator itself, so training and prediction chain.
predicted_labels = model.fit(features_matrix, labels).predict(test_feature_matrix)

print("FINISHED classifying. accuracy score : ")
print(accuracy_score(test_labels, predicted_labels))


Training model.
FINISHED classifying. accuracy score : 
0.5538461538461539




In [32]:
# tuning c value
for c in [10,100,1000,10000,100000]:
    model = svm.SVC(kernel="rbf", C = c)

    print ("c value:",c )
    model.fit(features_matrix, labels)

    predicted_labels = model.predict(test_feature_matrix)

    print ("Accuracy score : ",accuracy_score(test_labels, predicted_labels))
    print ()


c value: 10
Accuracy score :  0.7115384615384616

c value: 100
Accuracy score :  0.9038461538461539

c value: 1000
Accuracy score :  0.9192307692307692

c value: 10000




Accuracy score :  0.9192307692307692

c value: 100000
Accuracy score :  0.9192307692307692





In [31]:
# Sweep the RBF kernel width gamma with C fixed at 1000 and report test-set
# accuracy for each value.
# NOTE(review): as with the C sweep, accuracy is measured on the test set.
for g in (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100):
    print("gamma value:", g)
    model = svm.SVC(C=1000, gamma=g, kernel="rbf")
    predicted_labels = model.fit(features_matrix, labels).predict(test_feature_matrix)

    print("Accuracy score : ", accuracy_score(test_labels, predicted_labels))
    print()

gamma value: 1e-05
Accuracy score :  0.8153846153846154

gamma value: 0.0001
Accuracy score :  0.9230769230769231

gamma value: 0.001
Accuracy score :  0.8884615384615384

gamma value: 0.01
Accuracy score :  0.7307692307692307

gamma value: 0.1
Accuracy score :  0.5307692307692308

gamma value: 1
Accuracy score :  0.5

gamma value: 10
Accuracy score :  0.5

gamma value: 100
Accuracy score :  0.5



In [12]:
# Render a small HTML/JavaScript snippet as this cell's output.
# The script defines code_toggle_err(), which shows or hides every
# `div.output_stderr` element via jQuery (`$`) and flips the code_show_err
# flag on each call. It is registered to run once when the document is
# ready, and the rendered link calls it again on every click.
# NOTE(review): relies on jQuery being available in the page -- presumably
# the classic Notebook front end; confirm for other front ends.
from IPython.display import HTML
HTML('''<script>
code_show_err=false; 
function code_toggle_err() {
 if (code_show_err){
 $('div.output_stderr').hide();
 } else {
 $('div.output_stderr').show();
 }
 code_show_err = !code_show_err
} 
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')