In [13]:
#!/usr/bin/python

"""
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use an SVM to identify emails from the Enron corpus by their authors:
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time

# Relative path appended to sys.path before email_preprocess is imported
# (presumably the directory that module lives in -- verify locally); the
# same path is also passed on to preprocess() below.
path_prefix = '../../tools'
sys.path.append(path_prefix)
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(path_prefix=path_prefix)

no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [2]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# SVM Author ID

In [3]:
# Fit a linear-kernel SVM on the full training set.
clf = svm.SVC(kernel='linear')
clf.fit(features_train, labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [5]:
# Predict a label for every item in the test set with the current classifier.
pred = clf.predict(features_test)

In [8]:
# Fraction of test items whose predicted label equals the true label.
accuracy_score(labels_test, pred)

0.98407281001137659

# A Smaller Training Set

In [14]:
# Keep only the first 1% of the training features and of their labels.
sub_features_train = features_train[:len(features_train) // 100]
sub_labels_train = labels_train[:len(labels_train) // 100]

In [16]:
# Fit a fresh linear-kernel SVM, this time on the 1% training subset only.
clf = svm.SVC(kernel='linear')
clf.fit(sub_features_train, sub_labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
# Predict labels for the whole test set with the current classifier.
pred = clf.predict(features_test)

In [18]:
# Fraction of test items whose predicted label equals the true label.
accuracy_score(labels_test, pred)

0.88452787258248011

# Deploy an RBF Kernel

In [19]:
# Fit an RBF-kernel SVM (default C) on the 1% training subset.
clf = svm.SVC(kernel='rbf')
clf.fit(sub_features_train, sub_labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
# Predict labels for the whole test set with the current classifier.
pred = clf.predict(features_test)

In [21]:
# Fraction of test items whose predicted label equals the true label.
accuracy_score(labels_test, pred)

0.61604095563139927

# Optimize C Parameter

In [22]:
def test_svm_c(values):
    max_accuracy = 0
    best_c = None
    for c in values:
        clf = svm.SVC(kernel='rbf', C=c)
        clf.fit(sub_features_train, sub_labels_train)
        pred = clf.predict(features_test)
        accuracy = accuracy_score(pred, labels_test)
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            best_c = c
    return (max_accuracy, best_c)

In [23]:
# Sweep C over 10, 100, 1000 and 10000 (one candidate per power of ten).
test_svm_c([10, 100, 1000, 10000])

(0.89249146757679176, 10000)

# Extracting Predictions from an SVM

In [26]:
# Refit the RBF-kernel SVM on the 1% training subset with C=10000, the best
# value reported by the C sweep above.
clf = svm.SVC(kernel='rbf', C=10000)
clf.fit(sub_features_train, sub_labels_train)

SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
# Pick the test-set items at indices 10, 26 and 50 for individual predictions.
test_data = [features_test[10], features_test[26], features_test[50]]

In [36]:
# Predicted labels for the three selected items (0 = Sara, 1 = Chris).
clf.predict(test_data)

array([1, 0, 1])

# How Many Emails Are Predicted to Be From Chris

In [37]:
# Train the RBF-kernel SVM with C=10000 on the full training set.
clf = svm.SVC(kernel='rbf', C=10000)
clf.fit(features_train, labels_train)

SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
# Predict labels for the whole test set with the current classifier.
pred = clf.predict(features_test)

In [45]:
# Number of test items predicted as label 1 (Chris).
int((pred == 1).sum())

877