## Enron Data POI Classifier 
### Jo Anna Capp

In [1]:
#set working directory
import os
os.chdir('D:/Documents/Udacity/IntroMachineLearning/ud120projectsmaster/ud120projectsmaster/UdacityP5')

In [2]:
#import all packages and modules here
import sys
import pickle
sys.path.append("../tools/")
import pandas
import numpy
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from ggplot import *
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import pipeline
from sklearn.grid_search import GridSearchCV

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

In [3]:
features_list = ['poi']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

Lets look at the structure of the dataset and check for missing values.

In [4]:
#total individuals
print "There are ", len(data_dict.keys()), "executives of interest in the Enron dataset"
#number of pois
num_poi = 0
for dic in data_dict.values():
    if dic['poi'] == 1: 
        num_poi += 1
print "There are ", num_poi, "identified persons of interest within the dataset"
print "Data Dictionary Keys:"
print(data_dict.keys())
#data dictionary format
print "A typical key:value list: ", data_dict["SKILLING JEFFREY K"]


There are  146 executives of interest in the Enron dataset
There are  18 identified persons of interest within the dataset
Data Dictionary Keys:
['METTS MARK', 'BAXTER JOHN C', 'ELLIOTT STEVEN', 'CORDES WILLIAM R', 'HANNON KEVIN P', 'MORDAUNT KRISTINA M', 'MEYER ROCKFORD G', 'MCMAHON JEFFREY', 'HORTON STANLEY C', 'PIPER GREGORY F', 'HUMPHREY GENE E', 'UMANOFF ADAM S', 'BLACHMAN JEREMY M', 'SUNDE MARTIN', 'GIBBS DANA R', 'LOWRY CHARLES P', 'COLWELL WESLEY', 'MULLER MARK S', 'JACKSON CHARLENE R', 'WESTFAHL RICHARD K', 'WALTERS GARETH W', 'WALLS JR ROBERT H', 'KITCHEN LOUISE', 'CHAN RONNIE', 'BELFER ROBERT', 'SHANKMAN JEFFREY A', 'WODRASKA JOHN', 'BERGSIEKER RICHARD P', 'URQUHART JOHN A', 'BIBI PHILIPPE A', 'RIEKER PAULA H', 'WHALEY DAVID A', 'BECK SALLY W', 'HAUG DAVID L', 'ECHOLS JOHN B', 'MENDELSOHN JOHN', 'HICKERSON GARY J', 'CLINE KENNETH W', 'LEWIS RICHARD', 'HAYES ROBERT E', 'MCCARTY DANNY J', 'KOPPER MICHAEL J', 'LEFF DANIEL P', 'LAVORATO JOHN J', 'BERBERIAN DAVID', 'DETMERING TIM

I can see from this brief exploration that there are 146 exectives in the dataset, 18 identified POIs, and 22 features, for a total of 3088 observations. There are also a number of missing values. I'll investigate those in the next section.

### EDA and Outlier Removal

In [5]:
#change dataset to pandas dataframe
df = pandas.DataFrame.from_records(list(data_dict.values()))
employees = pandas.Series(list(data_dict.keys()))

#count number of NA values
df.replace(to_replace='NaN', value=numpy.nan, inplace=True)
print "Number of NaN values for each feature:"
print df.isnull().sum()
print "Shape of the dataframe: ", df.shape

Number of NaN values for each feature:
bonus                         64
deferral_payments            107
deferred_income               97
director_fees                129
email_address                 35
exercised_stock_options       44
expenses                      51
from_messages                 60
from_poi_to_this_person       60
from_this_person_to_poi       60
loan_advances                142
long_term_incentive           80
other                         53
poi                            0
restricted_stock              36
restricted_stock_deferred    128
salary                        51
shared_receipt_with_poi       60
to_messages                   60
total_payments                21
total_stock_value             20
dtype: int64
Shape of the dataframe:  (146, 21)


There are quite a lot of NaN values for some of the features. Particularly loan advances, director fees, restricted stock deferred, and deferral payments. However, when I look at the data schema provided (enron61702insiderpay.pdf), I see that these "missing values" are actually 0, so I will convert them to 0.

In [6]:
#replace missing values with 0
df.replace(to_replace=numpy.nan, value=0, inplace=True)
#drop email address column
del df['email_address']

df.describe()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
count,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,1333474.0,438796.5,-382762.2,19422.49,4182736.0,70748.27,358.60274,38.226027,24.287671,1149658.0,664683.9,585431.8,1749257.0,20516.37,365811.4,692.986301,1221.589041,4350622.0,5846018.0
std,8094029.0,2741325.0,2378250.0,119054.3,26070400.0,432716.3,1441.259868,73.901124,79.278206,9649342.0,4046072.0,3682345.0,10899950.0,1439661.0,2203575.0,1072.969492,2226.770637,26934480.0,36246810.0
min,0.0,-102500.0,-27992890.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2604490.0,-7576788.0,0.0,0.0,0.0,0.0,-44093.0
25%,0.0,0.0,-37926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8115.0,0.0,0.0,0.0,0.0,93944.75,228869.5
50%,300000.0,0.0,0.0,0.0,608293.5,20182.0,16.5,2.5,0.0,0.0,0.0,959.5,360528.0,0.0,210596.0,102.5,289.0,941359.5,965955.0
75%,800000.0,9684.5,0.0,0.0,1714221.0,53740.75,51.25,40.75,13.75,0.0,375064.8,150606.5,814528.0,0.0,270850.5,893.5,1585.75,1968287.0,2319991.0
max,97343620.0,32083400.0,0.0,1398517.0,311764000.0,5235198.0,14368.0,528.0,609.0,83925000.0,48521930.0,42667590.0,130322300.0,15456290.0,26704230.0,5521.0,15149.0,309886600.0,434509500.0


In [7]:
#pairplot to visualize feature distributions
def splom_viz(df, labels=None):
    ax = sns.pairplot(df, hue="poi", diag_kind='kde', size=2, vars=['poi','salary', 'total_payments', 'bonus', 
                 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                 'restricted_stock', 'to_messages', 'from_poi_to_this_person', 'from_messages',
                 'from_this_person_to_poi', 'shared_receipt_with_poi'])
    plt.show()

#splom_viz(df)

Looking at this data, there definitely appears to be outliers. If I look at the data dictionary keys again, I see that there are two that are not names: Total and travel agency in the park. I'll remove these then look again at the pairplot.

#### Outlier Removal

In [8]:
#outlier removal
df= df.drop(df.index[[data_dict.keys().index("TOTAL"), data_dict.keys().index("THE TRAVEL AGENCY IN THE PARK")]])
df.describe()

#pairplot to visualize distributions and correllations
splom_viz(df)

After these outliers are removed, I can see on the pairplot that the most of the remainder of the outliers are classified as POI, so these "outliers" are in fact real data. The exception to this are the features 'from_poi_to_this_person', 'from_this_person_to_poi', and 'from_messages'. Looking at the statistics above, we see that the max 'from_messages' is 14368, which is one order of magnitude higher than the 75%. The same is true for the outliers in the other two categories.  Who are these people?

In [9]:
#identify keys for potential outliers
for key, value in data_dict.items():
    if value['from_poi_to_this_person'] != 'NaN' and value['from_poi_to_this_person'] > 500: 
        print "Max from_poi_to_this_person: ", key

for key, value in data_dict.items():
    if value['from_this_person_to_poi'] != 'NaN' and value['from_this_person_to_poi'] > 500: 
        print "Max from_this_person_to_poi: ", key
        
for key, value in data_dict.items():
    if value['from_messages'] != 'NaN' and value['from_messages'] > 14000: 
        print "Max from_messages: ", key

Max from_poi_to_this_person:  LAVORATO JOHN J
Max from_this_person_to_poi:  DELAINEY DAVID W
Max from_messages:  KAMINSKI WINCENTY J


Since these keys are all different people, I will keep the email data and assume it is real. Finally, looking at the pairplots and statistics for each feature, there are negative values for deferred_income, defferal_payments, restricted_stock, and restricted_stock deferred. Are these outliers, real data, or errors in the dataset?

#### Checking financial features

One final check we can do is make sure there aren't any mistakes in the financial data. In the data schema, we can see that two features: total_payments and total_stock_value are linear combinations of the other financial features. If the negative values observed above are real, then total_payments and total_stock_value should equal the sum of the values these features. If there is an error in the dataset, then these values will not be equal.

In [10]:
#Checking total pay
df_totalPay = pandas.DataFrame()
df_totalPay['name'] = data_dict.keys()
df_totalPay['total'] = (
    df['bonus'] + 
    df['director_fees'] +
    df['deferral_payments'] +
    df['deferred_income'] +
    df['loan_advances'] +
    df['long_term_incentive'] +
    df['expenses'] +
    df['other'] +
    df['salary']
)
df_totalPay['total_payments'] = df['total_payments']
df_totalPay['equals?'] = (df_totalPay['total'] == df_totalPay['total_payments'])
df_totalPay['poi'] = df['poi']
print numpy.sum(df_totalPay['equals?']), " out of ", len(df_totalPay)
print "Summed Totals Different than Total Payments"
print df_totalPay[df_totalPay['equals?'] == False]

142  out of  146
Summed Totals Different than Total Payments
                              name     total  total_payments equals?    poi
24                   BELFER ROBERT  -99215.0        102500.0   False  False
101  THE TRAVEL AGENCY IN THE PARK       NaN             NaN   False    NaN
104                          TOTAL       NaN             NaN   False    NaN
118               BHATNAGAR SANJAY  275728.0      15456290.0   False  False


In [11]:
#Checking total stock value
df_totalStock = pandas.DataFrame()
df_totalStock['name'] = data_dict.keys()
df_totalStock['total'] = (
    df['restricted_stock'] +
    df['exercised_stock_options'] +
    df['restricted_stock_deferred']
)
df_totalStock['total_stock_value'] = df['total_stock_value']
df_totalStock['equals?'] = (df_totalStock['total'] ==df_totalStock['total_stock_value'])
df_totalStock['poi'] = df['poi']
print numpy.sum(df_totalPay['equals?']), " out of ", len(df_totalPay)
print "Summed Totals Different from Total Stock Value"
print df_totalStock[df_totalStock['equals?'] == False]

142  out of  146
Summed Totals Different from Total Stock Value
                              name       total  total_stock_value equals?  \
24                   BELFER ROBERT     47378.0           -44093.0   False   
101  THE TRAVEL AGENCY IN THE PARK         NaN                NaN   False   
104                          TOTAL         NaN                NaN   False   
118               BHATNAGAR SANJAY  15456290.0                0.0   False   

       poi  
24   False  
101    NaN  
104    NaN  
118  False  


In [12]:
#check 
print data_dict["BELFER ROBERT"]
print"/n"
print data_dict["BHATNAGAR SANJAY"]

{'salary': 'NaN', 'to_messages': 'NaN', 'deferral_payments': -102500, 'total_payments': 102500, 'exercised_stock_options': 3285, 'bonus': 'NaN', 'restricted_stock': 'NaN', 'shared_receipt_with_poi': 'NaN', 'restricted_stock_deferred': 44093, 'total_stock_value': -44093, 'expenses': 'NaN', 'loan_advances': 'NaN', 'from_messages': 'NaN', 'other': 'NaN', 'from_this_person_to_poi': 'NaN', 'poi': False, 'director_fees': 3285, 'deferred_income': 'NaN', 'long_term_incentive': 'NaN', 'email_address': 'NaN', 'from_poi_to_this_person': 'NaN'}
/n
{'salary': 'NaN', 'to_messages': 523, 'deferral_payments': 'NaN', 'total_payments': 15456290, 'exercised_stock_options': 2604490, 'bonus': 'NaN', 'restricted_stock': -2604490, 'shared_receipt_with_poi': 463, 'restricted_stock_deferred': 15456290, 'total_stock_value': 'NaN', 'expenses': 'NaN', 'loan_advances': 'NaN', 'from_messages': 29, 'other': 137864, 'from_this_person_to_poi': 1, 'poi': False, 'director_fees': 137864, 'deferred_income': 'NaN', 'long_t

It appears that the values for these two employees are incorrect when I compare the data schema to the data dictionary. I will drop these two individuals from the dataframe.

In [13]:
#remove two outliers
df = df.drop(df.index[[data_dict.keys().index("BELFER ROBERT"), data_dict.keys().index("BHATNAGAR SANJAY")]])
df.describe()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
count,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
mean,681997.3,225939.4,-196411.2,10097.753521,2099202.0,35869.359155,367.823944,39.302817,24.971831,591021.1,339557.2,298888.3,879437.9,74141.44,186311.4,712.34507,1248.338028,2280176.0,2943935.0
std,1240462.0,758698.3,609851.7,31505.3104,4825043.0,45435.17055,1460.462179,74.657135,80.287775,6842133.0,691446.8,1138775.0,2028670.0,1311168.0,197747.0,1081.732338,2250.905721,8907260.0,6225651.0
min,0.0,0.0,-3504386.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2604490.0,-1787380.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,-37926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35368.25,0.0,0.0,0.0,0.0,88392.25,253495.5
50%,300000.0,0.0,0.0,0.0,608293.5,21937.0,17.5,5.5,0.0,0.0,0.0,919.0,361978.0,0.0,210596.0,115.5,347.5,941359.5,965955.0
75%,800000.0,9684.5,0.0,0.0,1714221.0,53740.75,51.25,41.75,14.0,0.0,375064.8,149831.0,814528.0,0.0,270850.5,1001.25,1655.0,1968287.0,2319991.0
max,8000000.0,6426990.0,0.0,137864.0,34348380.0,228763.0,14368.0,528.0,609.0,81525000.0,5145434.0,10359730.0,14761690.0,15456290.0,1111258.0,5521.0,15149.0,103559800.0,49110080.0


### Feature Selection

#### Create New Feature

Before I look at selecting the features for my classifier, I'll first create some new features.  I want to see if the ratio of total payments/total compensation is an indicator of POIs. I'll also look at the ratio of emails involving POIs to all other emails as a new feature. Then, I'll let feature selection decide if I use these features in the model.

In [14]:
#create new feature: ratio of messages involving POI/total
df['poi_email_ratio'] = (df['from_poi_to_this_person'] + df['from_this_person_to_poi']) / (df['from_messages'] + df['to_messages'])
df['poi_email_ratio'].replace(to_replace='NaN', value=0, inplace=True)

#create new feature: payments/total compensation ratio
df['payment_ratio'] = df['total_payments']/(df['total_stock_value'] + df['total_payments'])
df.describe()

df.replace(to_replace='NaN', value=0, inplace=True)

#### Visualize Feature Separation/Correlation

In [15]:
#radviz plot
from pandas.tools.plotting import radviz
radviz(df, 'poi')
plt.show()

#correlation table
print "Pearson correlation between features:"
print df.corr(method='pearson')

Pearson correlation between features:
                              bonus  deferral_payments  deferred_income  \
bonus                      1.000000           0.173090        -0.330798   
deferral_payments          0.173090           1.000000        -0.542689   
deferred_income           -0.330798          -0.542689         1.000000   
director_fees             -0.177464          -0.096124         0.077079   
exercised_stock_options    0.415674           0.116331        -0.267636   
expenses                   0.243604          -0.024248        -0.038320   
from_messages              0.173714           0.028069        -0.013916   
from_poi_to_this_person    0.640544           0.215175        -0.193366   
from_this_person_to_poi    0.448018           0.000789        -0.003598   
loan_advances              0.432479           0.014256        -0.024998   
long_term_incentive        0.497020           0.118576        -0.295811   
other                      0.383004           0.368232        

In this radviz plot, there is some separation between non-POIs and POIs. With the more predictive features for non-POIs being deferral payments, deferred income, and bonus. This is a multi-dimensional dataset though, and radviz is not great at visualizing high dimensional data. The correlation table also shows that there is no strong correlation between POI and other features. Because of this, I'll use PCA and/or Select K best for feature selection.

#### Train/Test Split

In [16]:
#convert pandas df to pickled dictionary
#drop rows in index corresponding to df
employees = employees = pandas.Series(list(data_dict.keys()))
employees = employees.drop(employees.index[[24, 101, 104, 120]])

#print df.index.values
#create new feature list
new_features_list = df.columns.values

# set the index of df to be the employees series:
df.set_index(employees, inplace=True)

# create a dictionary from the dataframe
df_dict = df.to_dict('index')

# Store to my_dataset for easy export below.
my_dataset = df_dict
my_feature_list = ['poi','bonus', 'deferral_payments', 'deferred_income', 'director_fees',
 'exercised_stock_options', 'expenses', 'from_messages',
 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances',
 'long_term_incentive', 'other', 'restricted_stock',
 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi',
 'to_messages', 'total_payments', 'total_stock_value', 'poi_email_ratio',
 'payment_ratio']
# Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, my_feature_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

Since this data is very inbalanced, I'm going to use a stratified shuffle split to split my training and testing data.

In [17]:
#SSS to split into train/test data (code borrowed from tester.py)
cv = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.75, random_state = 42)
for train_idx, test_idx in cv: 
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for ii in train_idx:
        features_train.append( features[ii] )
        labels_train.append( labels[ii] )
    for jj in test_idx:
        features_test.append( features[jj] )
        labels_test.append( labels[jj] )

### Create/tune classifiers

##### Feature Scaling
Since there is a large range in the data from binary data (poi), to 10s-100s of emails, to a salary range averaging 10^6, I'm going to standardize all the data using sklearn's Standard Scaler. This converts all the features into a normal distribution with a mean of 0 and a standard deviation of 1.  I chose to use standardization instead of normalization (used in sklearn's MinMaxScaler) because normalization will suppress the effects of outliers, whereas standardization won't. I've seen during the data exploration above that there are still significant outliers in the data and standardization will keep information about these outliers instead of suppressing them.

##### Feature Selection
Before building the classifiers, I'll go ahead and use SelectKBest and/or PCA to select the best features and reduce the dimensionality of my dataset.  In tuning my classifiers, I'll explore the effects of using one or the other technique, and both combined.

##### Classifier Selection
I'll build GridSearchCV into the pipeline of each of the three classifiers I'm exploring, and select the best classifier for my training set after tuning the parameters.

In [18]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

In [19]:
#create a pipeline for analysis - GaussianNB
scaler = preprocessing.StandardScaler()
select = SelectKBest()
pca = PCA()
feature_selection = FeatureUnion([('select', select), ('pca', pca)],
                    transformer_weights={'pca': 10})
clf = GaussianNB()

steps = [('scale', scaler),('feature_selection', feature_selection),
        ('naive_bayes', clf)]

pipeline = sklearn.pipeline.Pipeline(steps)

#search for best parameters
parameters = dict(feature_selection__select__k=[5, 10, 15, 20], 
              feature_selection__pca__n_components=[2, 5, 10])

cv = sklearn.grid_search.GridSearchCV(pipeline, param_grid=parameters)

cv.fit(features_train, labels_train)
pred = cv.predict(features_test)

print cv.best_params_

#pipeline.fit(features_train, labels_train)
#pred = pipeline.predict(features_test)
report = sklearn.metrics.classification_report(labels_test, pred)
print report




{'feature_selection__select__k': 10, 'feature_selection__pca__n_components': 5}
             precision    recall  f1-score   support

        0.0       0.89      0.88      0.89        92
        1.0       0.27      0.29      0.28        14

avg / total       0.81      0.80      0.80       106



In [20]:
#create a pipeline for analysis - AdaBoost
scaler = preprocessing.StandardScaler()
select = SelectKBest()
pca = PCA()
feature_selection = FeatureUnion([('select', select), ('pca', pca)],
                    transformer_weights={'pca': 10})
clf = AdaBoostClassifier()

steps = [('scale', scaler),('feature_selection', feature_selection),
        ('adaboost', clf)]

pipeline = sklearn.pipeline.Pipeline(steps)

#search for best parameters
parameters = dict(feature_selection__select__k=[5, 10, 15, 20], 
              feature_selection__pca__n_components=[2, 5, 10],
                 adaboost__n_estimators=[5, 10, 20, 30, 40, 50],
                 adaboost__learning_rate=[0.1, 0.5, 0.8, 1, 1.2, 1.5])

cv = sklearn.grid_search.GridSearchCV(pipeline, param_grid=parameters)

cv.fit(features_train, labels_train)
pred = cv.predict(features_test)

print cv.best_params_

#pipeline.fit(features_train, labels_train)
#pred = pipeline.predict(features_test)
report = sklearn.metrics.classification_report(labels_test, pred)
print report

{'feature_selection__select__k': 5, 'adaboost__learning_rate': 0.1, 'feature_selection__pca__n_components': 2, 'adaboost__n_estimators': 10}
             precision    recall  f1-score   support

        0.0       0.86      0.96      0.91        92
        1.0       0.00      0.00      0.00        14

avg / total       0.75      0.83      0.79       106



In [21]:
#create a pipeline for analysis - Random Forest
scaler = preprocessing.StandardScaler()
select = SelectKBest()
pca = PCA()
feature_selection = FeatureUnion([('select', select), ('pca', pca)],
                    transformer_weights={'pca': 10})
clf = RandomForestClassifier()

steps = [('scale', scaler),('feature_selection', feature_selection),
        ('random_forest', clf)]

pipeline = sklearn.pipeline.Pipeline(steps)

#search for best parameters
parameters = dict(feature_selection__select__k=[5, 10, 20], 
                 feature_selection__pca__n_components=[2, 5, 10],
                 random_forest__n_estimators=[25, 50, 100],
                 random_forest__min_samples_split=[1, 3, 5, 10])

cv = sklearn.grid_search.GridSearchCV(pipeline, param_grid=parameters)

cv.fit(features_train, labels_train)
pred = cv.predict(features_test)

print select.get_support()
print cv.best_params_

#pipeline.fit(features_train, labels_train)
#pred = pipeline.predict(features_test)
report = sklearn.metrics.classification_report(labels_test, pred)
print report

<bound method SelectKBest.get_params of SelectKBest(k=10, score_func=<function f_classif at 0x000000000CCE5A58>)>
{'random_forest__n_estimators': 50, 'feature_selection__select__k': 5, 'random_forest__min_samples_split': 1, 'feature_selection__pca__n_components': 2}
             precision    recall  f1-score   support

        0.0       0.87      0.98      0.92        92
        1.0       0.33      0.07      0.12        14

avg / total       0.80      0.86      0.82       106



In [25]:
#take selectKbest out of the pipeline to look at top features
k_best = SelectKBest(k=10)
k_best.fit(features, labels)

results_list = zip(k_best.get_support(), my_feature_list[1:], k_best.scores_)
results_list = sorted(results_list, key=lambda x: x[2], reverse=True)
print "K-best features:", results_list

K-best features: [(True, 'exercised_stock_options', 24.29484881566545), (True, 'total_stock_value', 23.651174232774022), (True, 'bonus', 20.352782168409945), (True, 'salary', 18.008236859583267), (True, 'deferred_income', 11.184580251839124), (True, 'long_term_incentive', 9.7014319140157106), (True, 'restricted_stock', 8.9601824650818251), (True, 'total_payments', 8.594044043070511), (True, 'shared_receipt_with_poi', 8.2786814027836879), (True, 'loan_advances', 7.0667108613197493), (False, 'expenses', 5.8166004823459376), (False, 'poi_email_ratio', 5.1621999121024418), (False, 'from_poi_to_this_person', 5.0412573786693846), (False, 'other', 4.0704701087327813), (False, 'from_this_person_to_poi', 2.2951831957380029), (False, 'director_fees', 2.1537689442003294), (False, 'to_messages', 1.5784680483230906), (False, 'deferral_payments', 0.2447738510610257), (False, 'from_messages', 0.17884036938016448), (False, 'restricted_stock_deferred', 0.065999756426723316), (False, 'payment_ratio', 0.

The random forest classifier appears to produce the best precision and recall scores. I further tuned this classifier by adjusting the parameters in the grid search and testing whether both feature selection and dimensionality reduction were needed to produce the highest precision and recall scores. They were.

In [22]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
#from sklearn.cross_validation import train_test_split
#features_train, features_test, labels_train, labels_test = 
    #train_test_split(features, labels, test_size=0.3, random_state=42)

In [23]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, my_feature_list)