In [1]:
import pickle

import numpy as np
from sklearn import cross_validation
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

### Seed NumPy's global random number generator so that repeated runs of this
### notebook start from the same random state.
np.random.seed(42)

In [2]:
### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
### NOTE: pickle.load can execute arbitrary code while deserializing; only
### load pickle files that you generated yourself or otherwise trust.
words_file = "tools/your_word_data.pkl"
authors_file = "tools/your_email_authors.pkl"

### Open each file in a context manager so the handle is closed as soon as
### the data has been read, rather than being left open until it happens to
### be garbage collected.
with open(words_file, "rb") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "rb") as authors_handle:
    authors = pickle.load(authors_handle)

In [3]:
### Split the events into a training set and a test set.  test_size is the
### fraction of events assigned to the test set (here 10%); the remainder go
### into training.  random_state pins the shuffle used for the split.
### NOTE(review): sklearn.cross_validation is deprecated in newer scikit-learn
### releases in favour of sklearn.model_selection -- confirm against the
### installed version before upgrading.
(features_train, features_test,
 labels_train, labels_test) = cross_validation.train_test_split(
    word_data, authors, random_state=42, test_size=0.1)

### Build TF-IDF features.  The vocabulary and IDF weights are learned from
### the training text only and then re-used, unchanged, on the test text.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5,
                             sublinear_tf=True)
features_train = vectorizer.fit_transform(features_train)

### The test matrix is changed to a dense representation for compatibility
### with classifier functions in versions 0.15.2 and earlier; the training
### matrix is left as returned by fit_transform at this point.
features_test = vectorizer.transform(features_test).toarray()

In [4]:
### A classic way to overfit is to use a small number of data points and a
### large number of features; train on only N_TRAIN_EVENTS events to put
### ourselves in this regime.
N_TRAIN_EVENTS = 150

features_train = features_train[:N_TRAIN_EVENTS]
### Densify only while the matrix still offers .toarray().  Without this
### guard, running the cell a second time fails with AttributeError because
### features_train has already been replaced by a dense ndarray.
if hasattr(features_train, "toarray"):
    features_train = features_train.toarray()
labels_train = labels_train[:N_TRAIN_EVENTS]

In [5]:
### Fit a decision tree with default settings on the reduced training set
### (fit returns the fitted estimator, so it can be chained), then predict a
### label for every event in the held-out test set.
clf = tree.DecisionTreeClassifier().fit(features_train, labels_train)
pred = clf.predict(features_test)

In [6]:
### Fraction of test events whose predicted label matches the true label.
### accuracy_score's signature is (y_true, y_pred), so the true labels are
### passed first; accuracy is symmetric, so the value itself is unaffected.
accuracy_score(labels_test, pred)

0.81683731513083047

In [7]:
### Display the training feature matrix (dense after .toarray() and limited
### to the first 150 rows); NumPy abbreviates the printout of large arrays.
features_train

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [8]:
### List every feature with a non-zero importance in the fitted tree as:
### column index, importance value, vocabulary term.
### The vocabulary list is fetched once up front; the vectorizer call does
### not depend on the loop variable, so there is no need to repeat it for
### every printed row.
feature_names = vectorizer.get_feature_names()
for index, importance in enumerate(clf.feature_importances_):
    if importance:
        print(index, importance, feature_names[index])

11975 0.105378579003 attach
13080 0.0262801932367 bond
15434 0.0137142857143 copi
16267 0.0474074074074 deal
18095 0.0426666666667 enron
18849 0.186927243449 fax
19196 0.02 floor
21323 0.363636363636 houectect
21327 0.012 hour
22546 0.0840692099229 isda
24320 0.0248101945003 leav
25675 0.0255293305728 master
29690 0.0475805258904 pleas
