# Initialization

## Load Input Data

In [1]:
#!/usr/bin/python

"""
    Code accompanying the email author-identification mini-projects.

    Classify emails by their authors with a Naive Bayes classifier (L1),
    a support vector machine (L2) and a decision tree (L3).

    authors and labels:
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time
from tools.email_preprocess import preprocess


# features_train and features_test are the features for the training
# and testing datasets, respectively;
# labels_train and labels_test are the corresponding item labels.
# NOTE(review): path_prefix='tools' presumably tells preprocess where its
# data files live -- confirm against tools/email_preprocess.py.
features_train, features_test, labels_train, labels_test = preprocess(path_prefix='tools')

no. of Chris training emails: 7936
no. of Sara training emails: 7884


## Load Libraries

In [2]:
from sklearn.metrics import accuracy_score

---

# L1 - Naive Bayes

In [3]:
from sklearn.naive_bayes import GaussianNB

In [4]:
# Fit a Gaussian Naive Bayes classifier on the training set.
clf = GaussianNB()

# Take the timestamp immediately before fit() so that only training is timed.
t_start_train = time()
clf.fit(features_train, labels_train)
# Elapsed wall-clock time in seconds.
print("training time: ", time() - t_start_train)

training time:  1.4886889457702637


In [5]:
# Time only the prediction step on the test set (wall-clock seconds).
t_start_predict = time()
pred = clf.predict(features_test)
print("prediction time: ", time() - t_start_predict)

prediction time:  0.3462941646575928


In [6]:
# Fraction of test emails whose predicted label equals the true label.
accuracy_score(labels_test, pred)

0.97326507394766781

---

# L2 - SVM
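- An SVM chooses the decision boundary that maximizes the margin, i.e. the distance to the nearest points of either class, which makes the classifier more robust.
- An SVM first prioritizes classifying the training points correctly and only then maximizes the margin.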

In [7]:
from sklearn import svm

## SVM Author ID

In [8]:
# Fit a linear-kernel SVM on the full training set.
clf = svm.SVC(kernel='linear')
clf.fit(features_train, labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
# Score the linear SVM on the test set.
pred = clf.predict(features_test)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric in its two
# arguments, so the value is unchanged by using the documented order.
accuracy_score(labels_test, pred)

0.98407281001137659

## A smaller Training Set

In [10]:
# Keep only the first 1% of the training examples and of their labels.
# Floor division gives the integer slice bound directly, without going
# through a float.
sub_features_train = features_train[:len(features_train) // 100]
sub_labels_train = labels_train[:len(labels_train) // 100]

In [11]:
# Refit the linear-kernel SVM, this time on the reduced training subset.
clf = svm.SVC(kernel='linear')
clf.fit(sub_features_train, sub_labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Score the subset-trained linear SVM on the full test set.
pred = clf.predict(features_test)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric in its two
# arguments, so the value is unchanged by using the documented order.
accuracy_score(labels_test, pred)

0.88452787258248011

## Deploy an RBF Kernel

In [13]:
# Same reduced training subset, but with an RBF kernel (C left at its default).
clf = svm.SVC(kernel='rbf')
clf.fit(sub_features_train, sub_labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
# Score the subset-trained RBF SVM on the full test set.
pred = clf.predict(features_test)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric in its two
# arguments, so the value is unchanged by using the documented order.
accuracy_score(labels_test, pred)

0.61604095563139927

## Optimize C Parameter

In [15]:
def test_svm_c(values):
    """Find the SVC penalty parameter C that gives the best accuracy.

    For every candidate an RBF-kernel ``svm.SVC`` is fitted on the notebook
    globals ``sub_features_train`` / ``sub_labels_train`` and scored on
    ``features_test`` / ``labels_test``.

    Args:
        values: Iterable of candidate C values, tried in order.

    Returns:
        Tuple ``(max_accuracy, best_c)``. When several candidates tie, the
        earliest one is kept. For empty ``values`` the result is
        ``(0, None)``.

    NOTE(review): candidates are compared on the test set, so the returned
    accuracy is an optimistic estimate; a separate validation split would
    be needed for an unbiased one.
    """
    max_accuracy = 0
    best_c = None
    for c in values:
        model = svm.SVC(kernel='rbf', C=c)
        model.fit(sub_features_train, sub_labels_train)
        predictions = model.predict(features_test)
        # accuracy_score takes (y_true, y_pred).
        accuracy = accuracy_score(labels_test, predictions)
        # `best_c is None` makes sure the first candidate is always
        # recorded, even if it scores exactly 0.
        if best_c is None or accuracy > max_accuracy:
            max_accuracy = accuracy
            best_c = c
    return (max_accuracy, best_c)

In [16]:
# Sweep C over four orders of magnitude, one candidate per decade.
test_svm_c([10, 100, 1000, 10000])

(0.89249146757679176, 10000)

## Extracting Predictions from an SVM

In [17]:
# Refit the RBF SVM on the reduced training subset with C=10000, the value
# returned by the test_svm_c sweep.
clf = svm.SVC(kernel='rbf', C=10000)
clf.fit(sub_features_train, sub_labels_train)

SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
# Pick the test emails at indices 10, 26 and 50 for individual predictions.
test_data = [features_test[10], features_test[26], features_test[50]]

In [19]:
# Predicted author label for each selected email (0 = Sara, 1 = Chris).
clf.predict(test_data)

array([1, 0, 1])

## How Many Chris Emails Predicted

In [20]:
# Train the RBF SVM with C=10000 on the full training set this time.
clf = svm.SVC(kernel='rbf', C=10000)
clf.fit(features_train, labels_train)

SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Predict the author label of every test email.
pred = clf.predict(features_test)

In [22]:
# Number of test emails predicted to be written by Chris (label 1).
len(pred[pred == 1])

877

# L3 - Decision Tree

In [23]:
from sklearn import tree
from sklearn.metrics import accuracy_score

In [24]:
# Fit a decision tree on the full training set. min_samples_split=40 sets the
# minimum number of samples a node must hold before it may be split.
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=40, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [25]:
# Score the decision tree on the test set.
pred = clf.predict(features_test)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric in its two
# arguments, so the value is unchanged by using the documented order.
accuracy_score(labels_test, pred)

0.9766780432309442