# Enron Project

In [1]:
#!/usr/bin/python
from time import time
import sys
import pickle
import pprint
pp = pprint.PrettyPrinter(indent=4)

from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV


sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data




In [2]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [3]:
### Explore features of data

print "Number of  people: %d" % (len(data_dict))
print "Number of features: %d" % (len(data_dict['METTS MARK']))
print
poi_names = open("../final_project/poi_names.txt").read().split('\n')
poi_y = [name for name in poi_names if "(y)" in name]
poi_n = [name for name in poi_names if "(n)" in name]
print "Number of POI in poi_names.txt: %d" % len(poi_y + poi_n)

poi_count = 0
for person in data_dict:
	if data_dict[person]["poi"]==1:
		poi_count +=1 
print "Number of POI in dataset: %d" % (poi_count)

Number of  people: 146
Number of features: 21

Number of POI in poi_names.txt: 35
Number of POI in dataset: 18


In [4]:
def neg_nan_count(data_dict):
    '''returns a dictionary containing the number of NaNs for each feature and the number of negative numbers for each feature'''
    keys_w_nans = dict((key, 0) for key, value in data_dict['METTS MARK'].iteritems())
    keys_w_negs = dict((key, 0) for key, value in data_dict['METTS MARK'].iteritems())
    for person in data_dict:
        for key, value in data_dict[person].iteritems():
            if value == "NaN":
                keys_w_nans[key] += 1
            elif value < 0:
                keys_w_negs[key] += 1
    return keys_w_nans, keys_w_negs


In [5]:
keys_w_nans, keys_w_negs = neg_nan_count(data_dict)

print "Number of NaNs:"          
pp.pprint(keys_w_nans)
print
print "Number of Negative Values"
pp.pprint(keys_w_negs) 


Number of NaNs:
{   'bonus': 64,
    'deferral_payments': 107,
    'deferred_income': 97,
    'director_fees': 129,
    'email_address': 35,
    'exercised_stock_options': 44,
    'expenses': 51,
    'from_messages': 60,
    'from_poi_to_this_person': 60,
    'from_this_person_to_poi': 60,
    'loan_advances': 142,
    'long_term_incentive': 80,
    'other': 53,
    'poi': 0,
    'restricted_stock': 36,
    'restricted_stock_deferred': 128,
    'salary': 51,
    'shared_receipt_with_poi': 60,
    'to_messages': 60,
    'total_payments': 21,
    'total_stock_value': 20}

Number of Negative Values
{   'bonus': 0,
    'deferral_payments': 1,
    'deferred_income': 49,
    'director_fees': 0,
    'email_address': 0,
    'exercised_stock_options': 0,
    'expenses': 0,
    'from_messages': 0,
    'from_poi_to_this_person': 0,
    'from_this_person_to_poi': 0,
    'loan_advances': 0,
    'long_term_incentive': 0,
    'other': 0,
    'poi': 0,
    'restricted_stock': 1,
    'restricted_stock_

In [6]:
for person in data_dict:
    if data_dict[person]['deferral_payments'] < 0:
        print data_dict[person]['deferral_payments']
        print person

-102500
BELFER ROBERT


This must be a mis-entry into the dataset. According to the pdf with financial values, this should be his 'deferred income' value, which makes sense. Deferral payments should be positive not negative.  Taking a look at all of his values (below) shows some other errors.


In [7]:
pp.pprint (data_dict['BELFER ROBERT'])

{   'bonus': 'NaN',
    'deferral_payments': -102500,
    'deferred_income': 'NaN',
    'director_fees': 3285,
    'email_address': 'NaN',
    'exercised_stock_options': 3285,
    'expenses': 'NaN',
    'from_messages': 'NaN',
    'from_poi_to_this_person': 'NaN',
    'from_this_person_to_poi': 'NaN',
    'loan_advances': 'NaN',
    'long_term_incentive': 'NaN',
    'other': 'NaN',
    'poi': False,
    'restricted_stock': 'NaN',
    'restricted_stock_deferred': 44093,
    'salary': 'NaN',
    'shared_receipt_with_poi': 'NaN',
    'to_messages': 'NaN',
    'total_payments': 102500,
    'total_stock_value': -44093}


In [8]:
data_dict['BELFER ROBERT']['director_fees'] = 102500
data_dict['BELFER ROBERT']['deferred_income'] = -102500
data_dict['BELFER ROBERT']['deferral_payments'] = 'NaN'
data_dict['BELFER ROBERT']['expenses'] = 3285 
data_dict['BELFER ROBERT']['total_payments'] = 102500
data_dict['BELFER ROBERT']['restricted_stock'] = 44093
data_dict['BELFER ROBERT']['restricted_stock_deferred'] = -44093
data_dict['BELFER ROBERT']['total_stock_value'] = "NaN"


In [9]:
for person in data_dict:
    if data_dict[person]['restricted_stock'] < 0:
        print data_dict[person]['restricted_stock']
        print person

-2604490
BHATNAGAR SANJAY


This too is a mis-entry according to the pdf. This negative value is meant to be his 'restricted_stock_deferred' and he is mean to have a positive value here.

In [10]:
pp.pprint (data_dict['BHATNAGAR SANJAY'])

{   'bonus': 'NaN',
    'deferral_payments': 'NaN',
    'deferred_income': 'NaN',
    'director_fees': 137864,
    'email_address': 'sanjay.bhatnagar@enron.com',
    'exercised_stock_options': 2604490,
    'expenses': 'NaN',
    'from_messages': 29,
    'from_poi_to_this_person': 0,
    'from_this_person_to_poi': 1,
    'loan_advances': 'NaN',
    'long_term_incentive': 'NaN',
    'other': 137864,
    'poi': False,
    'restricted_stock': -2604490,
    'restricted_stock_deferred': 15456290,
    'salary': 'NaN',
    'shared_receipt_with_poi': 463,
    'to_messages': 523,
    'total_payments': 15456290,
    'total_stock_value': 'NaN'}


There are a few corrections to be made here. This occured because there is a blank space instead of dash for 'other' meaning that all of the values got slid one to the left.

In [11]:
data_dict['BHATNAGAR SANJAY']['exercised_stock_options'] = 15456290
data_dict['BHATNAGAR SANJAY']['restricted_stock'] = 2604490
data_dict['BHATNAGAR SANJAY']['restricted_stock_deferred'] = -2604490
data_dict['BHATNAGAR SANJAY']['total_stock_value'] = 15456290
data_dict['BHATNAGAR SANJAY']['total_payments'] = 137864
data_dict['BHATNAGAR SANJAY']['other'] = 'NaN'

In [12]:
keys_w_nans, keys_w_negs = neg_nan_count(data_dict)

print "Number of Negative Values"
pp.pprint(keys_w_negs) 



Number of Negative Values
{   'bonus': 0,
    'deferral_payments': 0,
    'deferred_income': 50,
    'director_fees': 0,
    'email_address': 0,
    'exercised_stock_options': 0,
    'expenses': 0,
    'from_messages': 0,
    'from_poi_to_this_person': 0,
    'from_this_person_to_poi': 0,
    'loan_advances': 0,
    'long_term_incentive': 0,
    'other': 0,
    'poi': 0,
    'restricted_stock': 0,
    'restricted_stock_deferred': 18,
    'salary': 0,
    'shared_receipt_with_poi': 0,
    'to_messages': 0,
    'total_payments': 0,
    'total_stock_value': 0}


This looks a lot cleaner now.  Let's make sure there are no negative values in either of the categories where everything should be postiive.

In [13]:
for person in data_dict:
    if data_dict[person]['deferred_income'] > 0 and data_dict[person]['deferred_income'] != "NaN":
        print person, data_dict[person]['deferred_income']

In [14]:
for person in data_dict:
    if data_dict[person]['restricted_stock_deferred'] > 0 and data_dict[person]['restricted_stock_deferred'] != "NaN":
        print person, data_dict[person]['restricted_stock_deferred']

Phew, all clear.  Let's turn those negative values into positive values now, since some classifiers (e.g. SelectKbest with chi-squared) can have issues with negative values.

In [15]:
for person in data_dict:
    if data_dict[person]['deferred_income'] < 0 and data_dict[person]['deferred_income'] != "NaN":
        data_dict[person]['deferred_income'] = - data_dict[person]['deferred_income']
        
for person in data_dict:
    if data_dict[person]['restricted_stock_deferred'] < 0 and data_dict[person]['restricted_stock_deferred'] != "NaN":
        data_dict[person]['restricted_stock_deferred'] = - data_dict[person]['restricted_stock_deferred']

In [16]:
keys_w_nans, keys_w_negs = neg_nan_count(data_dict)

print "Number of Negative Values"
pp.pprint(keys_w_negs) 

Number of Negative Values
{   'bonus': 0,
    'deferral_payments': 0,
    'deferred_income': 0,
    'director_fees': 0,
    'email_address': 0,
    'exercised_stock_options': 0,
    'expenses': 0,
    'from_messages': 0,
    'from_poi_to_this_person': 0,
    'from_this_person_to_poi': 0,
    'loan_advances': 0,
    'long_term_incentive': 0,
    'other': 0,
    'poi': 0,
    'restricted_stock': 0,
    'restricted_stock_deferred': 0,
    'salary': 0,
    'shared_receipt_with_poi': 0,
    'to_messages': 0,
    'total_payments': 0,
    'total_stock_value': 0}


In [17]:
#Remove these values from the dictionary
data_dict.pop('TOTAL')
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [18]:
### Feature selection attempts

#Create a numerical dataframe
import pandas as pd

df = pd.DataFrame.from_dict(data_dict)
df = df.transpose()
df = df.drop('email_address', 1)

In [19]:
import numpy as np

def iqr_outliers(column):
    '''Takes a panda series, identifies number of NaNs, removes NaNs, calculates IQR, 
    identifies low outliers (< Q1 - 1.5*IQR), high outliers (> Q3 + 1.5*IQR )
    and returns a dictionary of the number of high, low, non-outlier values and NaNs'''
    
    nan_count = 0
    neg_count = 0
    for value in column:
        if value == 'NaN':
            nan_count += 1
        elif value < 0:
            neg_count += 1
    cleaned_column = [x for x in column if str(x) != 'NaN']
    iqr = np.subtract(*np.percentile(cleaned_column, [75, 25]))
    upper =  np.percentile(cleaned_column, 75) + 1.5 * iqr
    lower =  np.percentile(cleaned_column, 25) - 1.5 * iqr
    outliers = []
    lows = 0
    highs = 0
    non_outliers= 0
    for value in cleaned_column:
        if value < lower:
            lows += 1
            outliers.append(value)
        elif value > upper:
            highs += 1
            outliers.append(value)
        else:
            non_outliers += 1
                
    
    return ({"Low_outliers": lows, 
             "High_outliers": highs, 
             "NaNs": nan_count,
             "Negative values": neg_count,
            "Non_outliers": non_outliers
            })


In [20]:
outlier_count = dict((name, {}) for name in df.columns)

for key, value in outlier_count.iteritems():
    value = iqr_outliers(df[key])
    outlier_count[key] = value

outlier_df = pd.DataFrame.from_dict(outlier_count)
outlier_df = outlier_df.transpose()

print outlier_df

                           High_outliers  Low_outliers  NaNs  Negative values  \
bonus                                 10             0    63                0   
deferral_payments                      6             0   107                0   
deferred_income                        5             0    95                0   
director_fees                          1             3   128                0   
exercised_stock_options               12             0    43                0   
expenses                               3             0    49                0   
from_messages                         17             0    58                0   
from_poi_to_this_person               11             0    58                0   
from_this_person_to_poi               13             0    58                0   
loan_advances                          0             0   141                0   
long_term_incentive                    7             0    79                0   
other                       

## Feature Creation and Selection

In [21]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
### Currently, I have all of them selected

features_list = ['poi','salary', 'to_messages', 'deferral_payments',
                 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock',
                 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value',
                 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi',
                 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']

Create the following features:
>- Percent salary/total payments
>- Percent bonus/total payments
>- Ratio of salary:bonus
>- Ratio of total stock value:total payments
>- Percent excercised stock/total stock value

In [22]:
def create_ratio(data_dict, ratio_name, numerator, denominator):
    '''Calcultes the ratio between a given numerator and denominator
    Names the ratio "ratio_name"
    Returns the the updated dictionary with the ratio values'''
    
    for person in data_dict:
        if data_dict[person][numerator] == 'NaN' or data_dict[person][denominator] == 'NaN':
                data_dict[person][ratio_name] = 'NaN'
        else:
            data_dict[person][ratio_name] = float(data_dict[person][numerator])/float(data_dict[person][denominator])
    return data_dict

In [23]:
data_dict = create_ratio(data_dict, 'sal_total', 'salary', 'total_payments')
data_dict = create_ratio(data_dict, 'bon_total', 'bonus', 'total_payments')
data_dict = create_ratio(data_dict, 'sal_bon', 'salary', 'bonus')
data_dict = create_ratio(data_dict, 'stock_pay', 'total_stock_value', 'total_payments')
data_dict = create_ratio(data_dict, 'excer_stock', 'exercised_stock_options', 'total_stock_value')

In [24]:
features_list.append('sal_total')
features_list.append('bon_total')
features_list.append('sal_bon')
features_list.append('stock_pay')
features_list.append('excer_stock')

### Feature Selection

In [25]:
enron_df = pd.DataFrame.from_dict(data_dict)
enron_df = enron_df.transpose()
enron_df = enron_df.drop('email_address', 1)

#Right now NaNs are string, not actual NaNs -- change this
enron_df.replace(['NaN'], [None], inplace=True)

#Creates an alternate dataframe with NaNs replaced by 0s
no_nans = enron_df.fillna(0)

In [26]:
from sklearn.feature_selection import VarianceThreshold

#Use Variance Threshold to select features with the highest variance

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(no_nans)
indices = sel.get_support()

discarded = []
for i in range(0, len(indices)):
    if indices[i] == False:
        discarded.append(list(no_nans.columns.values)[i])

In [27]:
discarded

['excer_stock', 'poi', 'sal_total']

Oh, this is rich.  After all that work (it was short code, but my basic programming skills made this take a while), the only feature with low variance is poi, which we would remove anyway and two of the combined features.  Let's see what other columns might have relatively low variance when we change the threshold.

In [28]:
def usevariance_t(thresh, df):
    '''Takes a numerical threshold and a dataframe
    Implements the VarianceThreshold method for feature selection
    Returns a list of features which were removed because their variance was lower than the threshold'''
    sel = VarianceThreshold(threshold=thresh)
    reduced = sel.fit(df)
    indices = sel.get_support()
    features = []
    for i in range(0, len(indices)):
        if indices[i] == False:
            features.append(list(no_nans.columns.values)[i])
    return features

print usevariance_t(10000000, no_nans)

['bon_total', 'excer_stock', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'poi', 'sal_bon', 'sal_total', 'shared_receipt_with_poi', 'stock_pay', 'to_messages']


It makes sense that the other columns with relatively low variance are those dealing with e-mails and the other ratios.  That makes sense because those numbers IN GENERAL will be lower than salaries and thus variance will be lower. More and more it seems to me that the whole variance threshold thing will only really work with normalized features in each column.

In [29]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
df_scaled = min_max_scaler.fit_transform(no_nans)
df_normalized = pd.DataFrame(df_scaled)

print usevariance_t(.01, df_normalized)


['loan_advances', 'stock_pay', 'total_payments']


Ok, so this might be more meaningful.  It looks like there is relatively low variance (< 0.01) within these categories.  Not sure if that means I should eliminate them yet. 

In [30]:
#Splits the dataframe into a series (the poi column) and the rest of the dataframe
poi_labels = enron_df['poi']
no_pois = enron_df.drop('poi', 1)
no_pois = no_pois.replace("NaN", 0)

In [31]:
poi_labels.head()

ALLEN PHILLIP K       False
BADUM JAMES P         False
BANNANTINE JAMES M    False
BAXTER JOHN C         False
BAY FRANKLIN R        False
Name: poi, dtype: bool

In [32]:
#Check to make sure no poi or e-mail columns and no NaNs
no_pois.head()

Unnamed: 0,bon_total,bonus,deferral_payments,deferred_income,director_fees,excer_stock,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,restricted_stock,restricted_stock_deferred,sal_bon,sal_total,salary,shared_receipt_with_poi,stock_pay,to_messages,total_payments,total_stock_value
ALLEN PHILLIP K,0.930997,4175000.0,2869717.0,3081055.0,0.0,1.0,1729541.0,13868.0,2195.0,47.0,...,126027.0,126027.0,0.048372,0.045035,201955.0,1407.0,0.385676,2902.0,4484442.0,1729541.0
BADUM JAMES P,0.0,0.0,178980.0,0.0,0.0,1.0,257817.0,3486.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.412959,0.0,182466.0,257817.0
BANNANTINE JAMES M,0.0,0.0,0.0,5104.0,0.0,0.771654,4046157.0,56301.0,29.0,39.0,...,1757552.0,560222.0,0.0,0.000521,477.0,465.0,5.7231,566.0,916197.0,5243487.0
BAXTER JOHN C,0.21298,1200000.0,1295738.0,1386055.0,0.0,0.62886,6680544.0,11200.0,0.0,0.0,...,3942714.0,0.0,0.222585,0.047406,267102.0,0.0,1.885448,0.0,5634343.0,10623258.0
BAY FRANKLIN R,0.483269,400000.0,260455.0,201641.0,0.0,0.0,0.0,129142.0,0.0,0.0,...,145796.0,82782.0,0.599178,0.289564,239671.0,0.0,0.076132,0.0,827696.0,63014.0


In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

clf = DecisionTreeClassifier()
clf = clf.fit(no_pois, poi_labels)
imp_dic = {}
for feature, importance in zip(list(no_pois.columns.values), clf.feature_importances_):
    imp_dic[feature] = importance

In [34]:
imp_dic

{'bon_total': 0.0,
 'bonus': 0.10945152803076318,
 'deferral_payments': 0.0,
 'deferred_income': 0.0,
 'director_fees': 0.042328042328042326,
 'excer_stock': 0.0,
 'exercised_stock_options': 0.19999999999999979,
 'expenses': 0.064965420713307881,
 'from_messages': 0.057720057720057727,
 'from_poi_to_this_person': 0.097883597883597878,
 'from_this_person_to_poi': 0.0073375262054505997,
 'loan_advances': 0.0,
 'long_term_incentive': 0.10884353741496598,
 'other': 0.029993455569975496,
 'restricted_stock': 0.10991999591033426,
 'restricted_stock_deferred': 0.0,
 'sal_bon': 0.0,
 'sal_total': 0.12194507432602666,
 'salary': 0.0,
 'shared_receipt_with_poi': 0.0,
 'stock_pay': 0.04961176389747822,
 'to_messages': 0.0,
 'total_payments': 0.0,
 'total_stock_value': 0.0}

In [35]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

def use_KBest(non_neg_df, score, feat_num):
    '''Uses the KBest feature selection
    Takes as input: a dataframe with non-negative values, which score you would like to use and number of features
    Returns a dictionary k_tops with the most to least important features, (key = place, value = feature name)
    Returns a dictionary k_scores of actualy scores '''
    #Iterates through selecting 1st best, 2nd best, etc. feature to create a dictionary 
    #where key is a number and value is the feature

    k_tops = {}
    for i in range(1, feat_num+1):
        ch2 = SelectKBest(score, k = i)
        ch2.fit(non_neg_df, poi_labels)
        klist = ch2.get_support()
        for truth, name in zip(klist, list(non_neg_df.columns.values)):
            if truth and name not in k_tops.values():
                k_tops[i] = (name)

    ch2 = SelectKBest(score, k = 'all')
    ch2.fit(non_neg_df, poi_labels)
    scores = ch2.scores_

    k_scores = {}
    for score, name in zip(scores, list(non_neg_df.columns.values)):
        k_scores[name] = score
    
    return k_tops, k_scores

In [36]:
#Calculates chi-squared scores in KBest
chi_k_tops, chi_k_scores = use_KBest(no_pois, chi2, len(no_pois.columns))

#Calculates mutual-information score 
mut_k_tops, mut_k_scores = use_KBest(no_pois, mutual_info_classif, len(no_pois.columns))

In [37]:
import operator
ord_imp_dic = sorted(imp_dic.items(), key=operator.itemgetter(1), reverse = True)    
ord_chi_scores = sorted(chi_k_scores.items(), key=operator.itemgetter(1), reverse = True)
ord_mut_scores = sorted(mut_k_scores.items(), key=operator.itemgetter(1), reverse = True)

In [38]:
ord_chi_scores

[('loan_advances', 549702499.04251242),
 ('total_payments', 317800121.982463),
 ('total_stock_value', 257709136.55456081),
 ('exercised_stock_options', 219904009.58211353),
 ('bonus', 41546794.079927064),
 ('restricted_stock', 37520984.091158733),
 ('deferred_income', 20328959.706663936),
 ('other', 18156107.284525376),
 ('long_term_incentive', 13273623.898099067),
 ('salary', 3463395.416699328),
 ('restricted_stock_deferred', 1082398.2857142857),
 ('deferral_payments', 567318.57663510973),
 ('expenses', 348139.58355496643),
 ('director_fees', 219483.0),
 ('shared_receipt_with_poi', 13704.817630381005),
 ('to_messages', 6833.8754052980303),
 ('from_messages', 955.78800082948601),
 ('from_poi_to_this_person', 738.41454424450308),
 ('from_this_person_to_poi', 620.96027717347522),
 ('stock_pay', 98.333051732246957),
 ('bon_total', 18.308612008608346),
 ('sal_total', 0.74596712680387356),
 ('excer_stock', 0.019507424303642693),
 ('sal_bon', 1.3332479328604013e-06)]

In [39]:
ord_mut_scores

[('bonus', 0.080438187398327976),
 ('expenses', 0.072312562666069224),
 ('other', 0.068519472897583533),
 ('excer_stock', 0.057737907628071294),
 ('shared_receipt_with_poi', 0.050474381241929045),
 ('restricted_stock_deferred', 0.044686386888540985),
 ('deferred_income', 0.042554784384563948),
 ('total_stock_value', 0.03792731667962812),
 ('sal_bon', 0.032803777834637726),
 ('from_this_person_to_poi', 0.032172151249011627),
 ('salary', 0.027706413998022539),
 ('restricted_stock', 0.027402313295444625),
 ('loan_advances', 0.025678543623489869),
 ('director_fees', 0.024465338593110175),
 ('from_poi_to_this_person', 0.01901148207318526),
 ('stock_pay', 0.015994350541096658),
 ('total_payments', 0.014427953983081387),
 ('exercised_stock_options', 0.012091285776731242),
 ('long_term_incentive', 0.0074101030719704308),
 ('from_messages', 0.00034702616094617333),
 ('to_messages', 0.0),
 ('deferral_payments', 0.0),
 ('sal_total', 0.0),
 ('bon_total', 0.0)]

In [40]:
### Store to my_dataset for easy export below. (This is after outlier removal ("TOTAL") and ("TRAVEL AGENCY IN THE PARK"))
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Classifier Selection

In [41]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Out of the box algorithms
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier


In [42]:
def run_clf(clf, features_train, features_test, labels_train, labels_test):
    ''' takes a classifier and training and test data
    prints performance time and metrics'''
    t0 = time()
    clf.fit(features_train, labels_train)
    print "training time:", round(time()-t0, 3), "s"
    t0 = time()
    labels_prediction = clf.predict(features_test)
    print "prediction time:", round(time()-t0, 3), "s"
    report = classification_report(labels_test, labels_prediction)
    print report

In [43]:
print "Naive Bayes Classifier:"
nb_clf = GaussianNB()
run_clf(nb_clf, features_train, features_test, labels_train, labels_test)

Naive Bayes Classifier:
training time: 0.003 s
prediction time: 0.0 s
             precision    recall  f1-score   support

        0.0       0.92      0.95      0.94        38
        1.0       0.50      0.40      0.44         5

avg / total       0.87      0.88      0.88        43



In [44]:
svm_clf = SVC(kernel="rbf", C = 10000)
print "Support Vector Classifier"
run_clf(svm_clf, features_train, features_test, labels_train, labels_test)

Support Vector Classifier
training time: 0.004 s
prediction time: 0.001 s
             precision    recall  f1-score   support

        0.0       0.88      1.00      0.94        38
        1.0       0.00      0.00      0.00         5

avg / total       0.78      0.88      0.83        43



  'precision', 'predicted', average, warn_for)


In [45]:
print "Decision Tree"
split = tree.DecisionTreeClassifier(min_samples_split = 10)
run_clf(split, features_train, features_test, labels_train, labels_test)

Decision Tree
training time: 0.003 s
prediction time: 0.001 s
             precision    recall  f1-score   support

        0.0       0.90      0.97      0.94        38
        1.0       0.50      0.20      0.29         5

avg / total       0.86      0.88      0.86        43



In [46]:
neigh_clf = KNeighborsClassifier(n_neighbors = 3)
print "K Nearest Neighbors"
run_clf(neigh_clf, features_train, features_test, labels_train, labels_test)

K Nearest Neighbors
training time: 0.002 s
prediction time: 0.005 s
             precision    recall  f1-score   support

        0.0       0.93      0.97      0.95        38
        1.0       0.67      0.40      0.50         5

avg / total       0.89      0.91      0.90        43



In [47]:
neigh_clf = KNeighborsClassifier(n_neighbors = 3)
print "K Nearest Neighbors w Scaling"
run_clf(neigh_clf, preprocessing.MinMaxScaler().fit_transform(features_train), preprocessing.MinMaxScaler().fit_transform(features_test), labels_train, labels_test)

K Nearest Neighbors w Scaling
training time: 0.001 s
prediction time: 0.005 s
             precision    recall  f1-score   support

        0.0       0.92      0.95      0.94        38
        1.0       0.50      0.40      0.44         5

avg / total       0.87      0.88      0.88        43



In [48]:
sgd_clf = SGDClassifier(loss="log")
print "Stochastic Gradient Descent"
run_clf(sgd_clf,(features_train), (features_test), labels_train, labels_test)

Stochastic Gradient Descent
training time: 0.004 s
prediction time: 0.001 s
             precision    recall  f1-score   support

        0.0       0.87      0.89      0.88        38
        1.0       0.00      0.00      0.00         5

avg / total       0.77      0.79      0.78        43



In [49]:
sgd_clf = SGDClassifier(loss="log")
print "Stochastic Gradient Descent w scaling"
run_clf(sgd_clf, preprocessing.MinMaxScaler().fit_transform(features_train), preprocessing.MinMaxScaler().fit_transform(features_test), labels_train, labels_test)

Stochastic Gradient Descent w scaling
training time: 0.002 s
prediction time: 0.0 s
             precision    recall  f1-score   support

        0.0       0.91      0.82      0.86        38
        1.0       0.22      0.40      0.29         5

avg / total       0.83      0.77      0.79        43



In [50]:
from sklearn.ensemble import RandomForestClassifier
rando = RandomForestClassifier(n_estimators=10)
run_clf(rando, features_train, features_test, labels_train, labels_test)

training time: 0.206 s
prediction time: 0.044 s
             precision    recall  f1-score   support

        0.0       0.90      1.00      0.95        38
        1.0       1.00      0.20      0.33         5

avg / total       0.92      0.91      0.88        43



In [51]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(n_estimators=100)
run_clf(ada_clf, features_train, features_test, labels_train, labels_test)

training time: 0.976 s
prediction time: 0.0 s
             precision    recall  f1-score   support

        0.0       0.93      0.97      0.95        38
        1.0       0.67      0.40      0.50         5

avg / total       0.89      0.91      0.90        43



In [52]:
import sklearn.pipeline
from sklearn.decomposition import PCA

scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
select = SelectKBest(score_func = chi2, k = 10)
pca = PCA(n_components = 5)
kneighs = KNeighborsClassifier(n_neighbors = 3, n_jobs = -1)

steps = [('scaling', scaler),
        ('feature_selection', select),
        ('reduce_dim', pca),
        ('k_neighbors', kneighs)]

kNN_pipeline = sklearn.pipeline.Pipeline(steps)
kNN_pipeline.fit(features_train, labels_train)
labels_prediction = kNN_pipeline.predict(features_test)
report = classification_report(labels_test, labels_prediction)
print(report)

             precision    recall  f1-score   support

        0.0       0.88      0.95      0.91        38
        1.0       0.00      0.00      0.00         5

avg / total       0.78      0.84      0.81        43



Accuracy is not a good evaluation metric here because of the small number of POIs.  For example, in the testing set, there are 15 values, only one of which is a POI. Accuracy is defined at the number of items labeled correctly/total number of items. So if we were simply to make a rule to always predict non-POI, in this small testing set and with this skewed data, our accuracy would be 14/15 or .93. Edit: I updated this to show precision, recall, and f1-score instead. Also by changing the testing data to be 30% rather than 10% of the data, this allowed for more variation in the evaluation metrics.


In [53]:
ada = AdaBoostClassifier(n_estimators=100)

steps = [('feature_selection', select),
         ('reduce_dim', pca),
        ('adaboost', ada)]

ada_pipeline = sklearn.pipeline.Pipeline(steps)
ada_pipeline.fit(features_train, labels_train)
labels_prediction = ada_pipeline.predict(features_test)
report = classification_report(labels_test, labels_prediction)
print(report)

             precision    recall  f1-score   support

        0.0       0.89      0.87      0.88        38
        1.0       0.17      0.20      0.18         5

avg / total       0.81      0.79      0.80        43



### Parameter Tuning

In [None]:
from sklearn.cross_validation import StratifiedShuffleSplit

parameters = dict(feature_selection__k = [5, 10, 15, 20],
                  feature_selection__score_func = [chi2, mutual_info_classif, f_classif],
                  reduce_dim__n_components = [1, 2, 3, 4],
                  k_neighbors__n_neighbors = [3, 5, 7, 9],
                  k_neighbors__n_jobs = [-1],
                  k_neighbors__algorithm = ['ball_tree', 'kd_tree'],
                  k_neighbors__weights = ['uniform', 'distance'],
                  k_neighbors__leaf_size =[30, 10, 60, 100]
               )


cv = StratifiedShuffleSplit(labels, 100, random_state = 42)

kNN_gs = GridSearchCV(kNN_pipeline, param_grid = parameters, cv = cv, scoring = 'f1')
t0 = time()
kNN_gs.fit(features, labels)
print "training time:", round(time()-t0, 3), "s"
t0 = time()
labels_predictions = kNN_gs.predict(features)
print "prediction time:", round(time()-t0, 3), "s"
kNN_clf = kNN_gs.best_estimator_
report = classification_report(labels, labels_predictions)
print(report)
print kNN_clf

In [None]:
parameters = dict(feature_selection__k = [5, 10, 15, 20],
                  feature_selection__score_func = [chi2, mutual_info_classif, f_classif],
                  reduce_dim__n_components = [1, 2, 3, 4],
                  adaboost__n_estimators = [50, 75, 100, 200], )

ada_gs = GridSearchCV(ada_pipeline, param_grid = parameters, cv = cv, scoring = 'f1')
t0 = time()
ada_gs.fit(features, labels)
print "training time:", round(time()-t0, 3), "s"
t0 = time()
labels_predictions = ada_gs.predict(features)
print "prediction time:", round(time()-t0, 3), "s"
ada_clf = gs.best_estimator_
report = classification_report(labels, labels_predictions)
accuracy = accuracy_score(labels, labels_predictions)
print(report)
print "Accuracy: ", accuracy
print ada_clf

Ok, the best score in the end is from the parameterized adaboost:
- SelectKbest works best chosing 5 features with a chi-square score.
- PCA works best creating just one component.
- Adaboost works best using 75 estimators.




In [None]:
features_k = ada_clf.best_params_['selectkbest__k']

In [None]:
SKB_k = SelectKBest(score_func = chi2, k = 5)
SKB_k.fit_transform(features, labels)   
features_selected = [features_list[1:][i]for i in SKB_k.get_support(indices=True)]
features_scores_selected=[feature_scores[i]for i in SKB_k.get_support(indices=True)]
print ' '
print 'Selected Features', features_selected
print 'Feature Scores', features_scores_selected

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

clf = ada_clf
dump_classifier_and_data(clf, my_dataset, features_list)

In [None]:
from tester import test_classifier

test_classifier(clf, my_dataset, features_list, folds = 1000)