In [15]:
"""
In this project, we will again try to identify the authors in a body of emails, this time using a decision tree. The starter 
code is in decision_tree/dt_author_id.py.
"""
#!/usr/bin/python

import pickle
import cPickle
import numpy

from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif


def preprocess(words_file = "word_data.pkl", authors_file="email_authors.pkl", perc=10):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=perc)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Chris training emails:", sum(labels_train)
    print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test

In [18]:
#!/usr/bin/python

""" 
    This is the code to accompany the Lesson 3 (decision tree) mini-project.

    Use a Decision Tree to identify emails from the Enron corpus by author:    
    Sara has label 0
    Chris has label 1
"""
### The starter script imports preprocess from ../tools/email_preprocess.py
### (after adding ../tools/ to sys.path). In this notebook preprocess() is
### defined in an earlier cell instead, so no import is needed here; that
### cell has to be run before this one.


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
### perc=10 keeps the top 10 percent of the tf-idf features
features_train, features_test, labels_train, labels_test = preprocess(perc=10)


no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [19]:
"""
Using the starter code in decision_tree/dt_author_id.py, get a decision tree up and running as a classifier, setting 
min_samples_split=40. It will probably take a while to train. What's the accuracy?
"""
from sklearn import tree
from sklearn.metrics import accuracy_score

clf =  tree.DecisionTreeClassifier(min_samples_split=40)

### train step
clf.fit(features_train, labels_train)

### use the trained classifier to predict labels for the test features
pred = clf.predict(features_test)

### calculate and return the accuracy on the test data
accuracy = accuracy_score(pred, labels_test)

print 'DTree Accuracy:', accuracy

DTree Accuracy: 0.966439135381


In [5]:
"""
You found in the SVM mini-project that parameter tuning can significantly speed up the training time of a machine learning 
algorithm. A general rule is that the parameters can tune the complexity of the algorithm, with more complex algorithms 
generally running more slowly.

Another way to control the complexity of an algorithm is via the number of features that you use in training/testing. The more 
features the algorithm has available, the more potential there is for a complex fit. We will explore this in detail in the 
"Feature Selection" lesson, but you'll get a sneak preview now.

What's the number of features in your data? (Hint: the data is organized into a numpy array where the number of rows is the 
number of data points and the number of columns is the number of features; so to extract this number, use a line of code 
like len(features_train[0]).)
"""
### the length of the first row is the number of columns, i.e. the number of
### features; the value depends on which preprocess(perc=...) call most
### recently assigned features_train
print len(features_train[0])

3785


In [20]:
"""
go into ../tools/email_preprocess.py, and find the line of code that looks like this: 

selector = SelectPercentile(f_classif, percentile=10) 

Change percentile from 10 to 1, and rerun dt_author_id.py. What's the number of features now?
"""
### these two imports are not used in this cell itself
from sklearn import tree
from sklearn.metrics import accuracy_score

### instead of editing email_preprocess.py, the percentile is passed in
### through the perc argument of the notebook's own preprocess()
### NOTE: this rebinds features_train/features_test/labels_train/labels_test,
### replacing the perc=10 data loaded earlier; any cell run after this one
### sees the 1% feature set
features_train, features_test, labels_train, labels_test = preprocess(perc=1)
### number of columns (features) left after keeping the top 1 percent
print len(features_train[0])

no. of Chris training emails: 7936
no. of Sara training emails: 7884
379


In [21]:
"""
What's the accuracy of your decision tree when you use only 1% of your available features (i.e. percentile=1)?
"""
from sklearn import tree
from sklearn.metrics import accuracy_score

clf =  tree.DecisionTreeClassifier(min_samples_split=40)

### train step
clf.fit(features_train, labels_train)

### use the trained classifier to predict labels for the test features
pred = clf.predict(features_test)

### calculate and return the accuracy on the test data
accuracy = accuracy_score(pred, labels_test)

print 'DTree Accuracy:', accuracy

DTree Accuracy: 0.967007963595
