In [1]:
#!/usr/bin/python
from time import time
import sys
import pickle
import pprint
pp = pprint.PrettyPrinter(indent=4)
from sklearn.metrics import accuracy_score
sys.path.append("../tools/")

#from feature_format import featureFormat, targetFeatureSplit
#from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# 'poi' is the label; the 19 names after it are the raw fields used as inputs.
# 'email_address' is deliberately absent from this list.
# Later cells append the engineered ratio features to this same list object.
features_list = ['poi','salary', 'to_messages', 'deferral_payments',
                 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock',
                 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value',
                 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi',
                 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']

### Load the dictionary containing the dataset
# data_dict maps a person name to a dict of field name -> value; missing values
# are stored as the string "NaN".
# NOTE(review): the file is opened in text mode ("r"); pickle normally expects
# binary mode ("rb") - confirm which one this copy of the .pkl needs on your platform.
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)


In [2]:
import numpy as np

def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """ convert dictionary to numpy array of features
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0 (or missing)
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0 (or missing)
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).
        NOTE: first feature is assumed to be 'poi' and is not checked for
            removal for zero or missing values.

        Returns a 2-D numpy array with one row per retained data point and one
        column per requested feature, or None (after printing an error) when a
        requested key is not present.
    """

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        # Close the key file deterministically instead of leaking the handle.
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    return_list = []

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                # Same text as the original print statement, written so it
                # also parses on Python 3.
                print("error: key  %s  not present" % feature)
                return
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # Logic for deciding whether or not to add the data point.
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list

        # Every entry is a float by now, so a missing value that survived
        # (remove_NaN=False) is a float nan, never the string "NaN".
        is_empty = [item == 0 or np.isnan(item) for item in test_list]

        append = True
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes and all(is_empty):
            append = False
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes and any(is_empty):
            append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)

def targetFeatureSplit( data ):
    """
        Split rows shaped like the output of featureFormat into the
        quantity to predict and the remaining inputs.

        For every row, element 0 becomes the target and the slice from
        element 1 onward becomes that row's feature vector.

        Returns (targets, features) as two parallel Python lists
        (sklearn generally accepts lists as well as numpy arrays).
    """

    # Single pass over data, taking the label before the remainder of each row.
    pairs = [(row[0], row[1:]) for row in data]
    target = [label for label, _ in pairs]
    features = [rest for _, rest in pairs]

    return target, features



In [3]:
### Explore features of data
# print(...) with a single pre-formatted string emits the same text as the
# original print statements and also parses on Python 3.
print("Number of  people: %d" % (len(data_dict)))
print("Number of features: %d" % (len(data_dict['METTS MARK'])))
print("Features: %s" % list(data_dict['METTS MARK'].keys()))
print("")

# Read the hand-labelled name list; the with-block closes the file handle.
with open("../final_project/poi_names.txt") as poi_file:
    poi_names = poi_file.read().split('\n')
poi_y = [name for name in poi_names if "(y)" in name]
poi_n = [name for name in poi_names if "(n)" in name]
print("Number of POI in poi_names.txt: %d" % len(poi_y + poi_n))

# Count the records flagged as POI in the dataset itself.
poi_count = 0
for person in data_dict:
    if data_dict[person]["poi"]==1:
        poi_count +=1
print("Number of POI in dataset: %d" % (poi_count))
print("")
print("")

# One counter per field, seeded from a single record's field names.
keys_w_nans = dict((key, 0) for key in data_dict['METTS MARK'])
keys_w_negs = dict((key, 0) for key in data_dict['METTS MARK'])

for person in data_dict:
    for key, value in data_dict[person].items():
        if value == "NaN":
            keys_w_nans[key] += 1
        # Skip strings explicitly: ordering a string against 0 only "works"
        # through Python 2's cross-type ordering and raises on Python 3.
        elif not isinstance(value, str) and value < 0:
            keys_w_negs[key] += 1

print("Number of NaNs:")
pp.pprint(keys_w_nans)

print("")
print("Number of Negative Values")
pp.pprint(keys_w_negs)
print("")


Number of  people: 146
Number of features: 21
Features: ['salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'email_address', 'from_poi_to_this_person']

Number of POI in poi_names.txt: 35
Number of POI in dataset: 18


Number of NaNs:
{   'bonus': 64,
    'deferral_payments': 107,
    'deferred_income': 97,
    'director_fees': 129,
    'email_address': 35,
    'exercised_stock_options': 44,
    'expenses': 51,
    'from_messages': 60,
    'from_poi_to_this_person': 60,
    'from_this_person_to_poi': 60,
    'loan_advances': 142,
    'long_term_incentive': 80,
    'other': 53,
    'poi': 0,
    'restricted_stock': 36,
    'restricted_stock_deferred': 128,
    'salary': 51,
    'shared_receipt_with

In [4]:
#Remove these values from the dictionary
# NOTE(review): presumably these two keys are not individual people (a totals
# row and a business entry) - confirm against the source spreadsheet.
# Passing a default keeps the cell re-runnable: a second run returns None
# instead of raising KeyError once the keys are gone.
data_dict.pop('TOTAL', None)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', None)

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

Create the following features:
>- Percent salary/total payments
>- Percent bonus/total payments
>- Ratio of salary:bonus
>- Ratio of total stock value:total payments
>- Percent exercised stock/total stock value

In [5]:
def create_ratio(data_dict, ratio_name, numerator, denominator):
    """Add the feature ``ratio_name`` = numerator / denominator to every record.

    data_dict   : dict mapping a person to that person's dict of field values;
                  it is modified in place.
    ratio_name  : key under which the new value is stored in each record.
    numerator   : key of the field used as the dividend.
    denominator : key of the field used as the divisor.

    The ratio is stored as a float. It is stored as the string 'NaN' when
    either input is 'NaN' or when the denominator is zero, so that a zero
    divisor no longer raises ZeroDivisionError.

    Returns the same ``data_dict`` object that was passed in.
    """
    for person in data_dict:
        record = data_dict[person]
        if record[numerator] == 'NaN' or record[denominator] == 'NaN':
            ratio = 'NaN'
        else:
            divisor = float(record[denominator])
            # An undefined ratio is recorded the same way as a missing input.
            ratio = float(record[numerator]) / divisor if divisor != 0 else 'NaN'
        record[ratio_name] = ratio
    return data_dict

In [6]:
# Engineered ratios, one row each: (new feature name, numerator field, denominator field).
ratio_specs = [
    ('sal_total', 'salary', 'total_payments'),
    ('bon_total', 'bonus', 'total_payments'),
    ('sal_bon', 'salary', 'bonus'),
    ('stock_pay', 'total_stock_value', 'total_payments'),
    ('excer_stock', 'exercised_stock_options', 'total_stock_value'),
]
for ratio_name, numerator, denominator in ratio_specs:
    data_dict = create_ratio(data_dict, ratio_name, numerator, denominator)

In [7]:
# Add the engineered ratio features to the feature list.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a second copy of each name.
for ratio_feature in ('sal_total', 'bon_total', 'sal_bon', 'stock_pay', 'excer_stock'):
    if ratio_feature not in features_list:
        features_list.append(ratio_feature)

In [8]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
# my_dataset is another name for the same dict object as data_dict, not a copy.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
# With the default flags featureFormat turns "NaN" into 0.0 and drops people
# whose selected features are all zero; sort_keys=True fixes the row order.
data = featureFormat(my_dataset, features_list, sort_keys = True)
# labels is column 0 ('poi'); features holds the remaining columns of each row.
labels, features = targetFeatureSplit(data)

from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the split is not stratified, and the recorded results below show
# a test set of 14 non-POI and 1 POI, so the POI precision/recall rest on a
# single sample - consider a stratified split or cross-validation.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.1, random_state=42)

In [9]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Out of the box algorithms
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier


def report_classifier(title, classifier, accuracy_label="accuracy: ", trailing_blank=True):
    """Fit one classifier, predict the held-out rows and print a short report.

    Replaces four pasted copies of the same fit/predict/report sequence.
    Reads the module-level features_train, labels_train, features_test and
    labels_test produced by the train/test split.

    title          : heading printed before the timings.
    classifier     : unfitted estimator exposing fit() and predict().
    accuracy_label : text printed in front of the accuracy value. It is a
                     parameter only because the original cells used
                     "accuracy:" for one model and "accuracy: " for the
                     others, and the report text is kept unchanged.
    trailing_blank : print an empty line after the report.

    Returns (predictions, accuracy).
    """
    print(title)
    t0 = time()
    classifier.fit(features_train, labels_train)
    print("training time: %s s" % round(time()-t0, 3))
    t0 = time()
    predictions = classifier.predict(features_test)
    print("prediction time: %s s" % round(time()-t0, 3))
    accuracy = accuracy_score(labels_test, predictions)
    print("%s %s" % (accuracy_label, round(accuracy, 4)))
    print("precision, recall, fscore, totals [0, 1]")
    pp.pprint(precision_recall_fscore_support(labels_test, predictions))
    if trailing_blank:
        print("")
    return predictions, accuracy


nbclf = GaussianNB()
nbpred, nb_acc = report_classifier("Naive Bayes Classifier:", nbclf, accuracy_label="accuracy:")

svm_clf = SVC(kernel="rbf", C = 10000)
svm_pred, svm_acc = report_classifier("Support Vector Machine", svm_clf)

# NOTE(review): no random_state is set on the tree, so its results may vary between runs.
split = tree.DecisionTreeClassifier(min_samples_split = 10)
split_pred, acc_split = report_classifier("Decision Tree", split)

neigh_clf = KNeighborsClassifier(n_neighbors=3)
neigh_pred, neigh_accuracy = report_classifier("K Nearest Neighbors", neigh_clf, trailing_blank=False)


Naive Bayes Classifier:
training time: 0.001 s
prediction time: 0.001 s
accuracy: 0.6667
precision, recall, fscore, totals [0, 1]
(   array([ 1.        ,  0.16666667]),
    array([ 0.64285714,  1.        ]),
    array([ 0.7826087 ,  0.28571429]),
    array([14,  1], dtype=int64))

Support Vector Machine
training time: 0.003 s
prediction time: 0.001 s
accuracy:  0.9333
precision, recall, fscore, totals [0, 1]
(   array([ 0.93333333,  0.        ]),
    array([ 1.,  0.]),
    array([ 0.96551724,  0.        ]),
    array([14,  1], dtype=int64))

Decision Tree
training time: 0.001 s
prediction time: 0.0 s
accuracy:  0.8
precision, recall, fscore, totals [0, 1]
(   array([ 0.92307692,  0.        ]),
    array([ 0.85714286,  0.        ]),
    array([ 0.88888889,  0.        ]),
    array([14,  1], dtype=int64))

K Nearest Neighbors
training time: 0.001 s
prediction time: 0.001 s
accuracy:  0.9333
precision, recall, fscore, totals [0, 1]
(   array([ 1. ,  0.5]),
    array([ 0.92857143,  1.     

  'precision', 'predicted', average, warn_for)


In [10]:
### Feature selection attempts

#Create a numerical dataframe
import pandas as pd

# Rows are people, columns are fields. The axis is passed by keyword because
# current pandas no longer accepts it positionally in drop().
df = pd.DataFrame.from_dict(data_dict)
df = df.transpose()
df = df.drop('email_address', axis=1)
# Same table with every "NaN" marker replaced by 0.
num_df = df.replace("NaN", 0)

from sklearn.feature_selection import VarianceThreshold
import pprint
pp = pprint.PrettyPrinter(indent=4)

sel_8 = VarianceThreshold(threshold=(.8 * (1 - .8)))
reduced_8 = sel_8.fit_transform(num_df)
indices_8 = sel_8.get_support()

# Print every column the selector did not keep.
for column, kept in zip(num_df.columns, indices_8):
    if not kept:
        print(column)

excer_stock
poi
sal_total


Oh, this is rich.  After all that work (it was short code, but my basic programming skills made this take a while), the only original feature with low variance is poi, which we would remove anyway (the engineered ratios excer_stock and sal_total also fall below the threshold).  Let's see what other columns might have relatively low variance when we change the threshold.

In [11]:
def usevariance_t(thresh, df):
    """List the columns of ``df`` that a variance threshold rejects.

    thresh : value passed to VarianceThreshold(threshold=...).
    df     : DataFrame of numeric columns to test.

    Returns the labels of the columns that the fitted selector does not keep,
    in column order. The labels come from ``df`` itself rather than from the
    global num_df, so the result stays correct for any frame that is passed
    in; a frame built without column names reports its positional labels.
    """
    sel = VarianceThreshold(threshold=thresh)
    sel.fit(df)
    kept_mask = sel.get_support()
    return [column for column, kept in zip(df.columns, kept_mask) if not kept]

print(usevariance_t(10000000, num_df))

['bon_total', 'excer_stock', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'poi', 'sal_bon', 'sal_total', 'shared_receipt_with_poi', 'stock_pay', 'to_messages']


It makes sense that the other columns with relatively low variance are the e-mail counts (along with the engineered ratios): those numbers IN GENERAL will be much smaller than salaries, so their variance will be lower too. More and more it seems to me that a variance threshold only really works when the features in each column are normalized to a common scale.

In [12]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
df_scaled = min_max_scaler.fit_transform(num_df)
# fit_transform hands back a bare array; re-attach the row and column labels
# so the scaled frame can be read and matched to features on its own.
df_normalized = pd.DataFrame(df_scaled, index=num_df.index, columns=num_df.columns)

print(usevariance_t(.01, df_normalized))


['loan_advances', 'restricted_stock_deferred', 'stock_pay', 'total_payments']


Ok, so this might be more meaningful.  It looks like there is relatively low variance (< 0.01) within these categories.  Not sure if that means I should eliminate them yet. 

In [13]:
# Move the label column out of the numeric frame.
# The guard makes the cell re-runnable: once 'poi' has been removed a second
# run leaves poi_labels and num_df as they are instead of raising KeyError.
if 'poi' in num_df.columns:
    poi_labels = num_df['poi']
    num_df = num_df.drop('poi', axis=1)

In [14]:
def iqr_outliers(df, column):
    """Count missing values and 1.5*IQR outliers in one column.

    df     : unused; kept so existing calls keep working.
    column : iterable of values in which missing entries are the string 'NaN'.

    Entries whose string form is "NaN" are excluded before the quartiles are
    computed. A remaining value is a low outlier when it is below
    Q1 - 1.5*IQR and a high outlier when it is above Q3 + 1.5*IQR.

    Returns a dict with the keys "Low_outliers", "High_outliers", "NaNs" and
    "Non_outliers". When nothing is left after dropping the missing entries
    the three value counts are 0 and no percentile is computed.
    """
    values = list(column)
    nan_count = sum(1 for value in values if value == 'NaN')
    cleaned_column = [x for x in values if str(x) != "NaN"]

    counts = {"Low_outliers": 0,
              "High_outliers": 0,
              "NaNs": nan_count,
              "Non_outliers": 0}
    # np.percentile has nothing to work with on an empty column.
    if not cleaned_column:
        return counts

    # Both quartiles from one call instead of three separate percentile passes.
    q75, q25 = np.percentile(cleaned_column, [75, 25])
    iqr = q75 - q25
    upper = q75 + 1.5 * iqr
    lower = q25 - 1.5 * iqr

    for value in cleaned_column:
        if value < lower:
            counts["Low_outliers"] += 1
        elif value > upper:
            counts["High_outliers"] += 1
        else:
            counts["Non_outliers"] += 1

    return counts

# One summary dict per column of df, keyed by column name.
outlier_count = {}
for column_name in df.columns:
    outlier_count[column_name] = iqr_outliers(df, df[column_name])

# Turn the nested dict into a table with one row per column of df.
outlier_df = pd.DataFrame.from_dict(outlier_count).transpose()

In [15]:
print(outlier_df)

                           High_outliers  Low_outliers  NaNs  Non_outliers
bon_total                              4             0    63            77
bonus                                 10             0    63            71
deferral_payments                      6             0   106            32
deferred_income                        0             5    96            43
director_fees                          0             4   128            12
excer_stock                            0             1    44            99
exercised_stock_options               11             0    43            90
expenses                               3             0    50            91
from_messages                         17             0    58            69
from_poi_to_this_person               11             0    58            75
from_this_person_to_poi               13             0    58            73
loan_advances                          0             0   141             3
long_term_incentive      

In [16]:
# Input table for the selectors below: df without the label column and with
# every "NaN" marker replaced by 0. The axis is passed by keyword because
# current pandas no longer accepts it positionally in drop().
no_poi = df.drop('poi', axis=1).replace("NaN", 0)

In [17]:
list(no_poi.columns.values)

['bon_total',
 'bonus',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'excer_stock',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'loan_advances',
 'long_term_incentive',
 'other',
 'restricted_stock',
 'restricted_stock_deferred',
 'sal_bon',
 'sal_total',
 'salary',
 'shared_receipt_with_poi',
 'stock_pay',
 'to_messages',
 'total_payments',
 'total_stock_value']

In [37]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

# The forest is randomized; pinning random_state (same seed as the train/test
# split) makes the importances, and every ranking derived from them, repeat
# from run to run.
clf = ExtraTreesClassifier(random_state=42)
clf = clf.fit(no_poi, poi_labels)
# Feature name -> importance assigned by the fitted forest.
imp_dic = dict(zip(no_poi.columns, clf.feature_importances_))


In [38]:
imp_dic

{'bon_total': 0.041959184371542189,
 'bonus': 0.083194102634256487,
 'deferral_payments': 0.019912228458090802,
 'deferred_income': 0.09355854423354773,
 'director_fees': 0.00057566797788414137,
 'excer_stock': 0.034295743514794863,
 'exercised_stock_options': 0.051637191394683965,
 'expenses': 0.04384394779105364,
 'from_messages': 0.050745556314803431,
 'from_poi_to_this_person': 0.027700376765628899,
 'from_this_person_to_poi': 0.040367553272505526,
 'loan_advances': 0.0035303006608425349,
 'long_term_incentive': 0.051880630391807112,
 'other': 0.054656707231666789,
 'restricted_stock': 0.038377335506411356,
 'restricted_stock_deferred': 0.0050050066514957629,
 'sal_bon': 0.037110398228410654,
 'sal_total': 0.036245668012236491,
 'salary': 0.05939896193731934,
 'shared_receipt_with_poi': 0.038290522190466844,
 'stock_pay': 0.045476735100144758,
 'to_messages': 0.039731909342403317,
 'total_payments': 0.046024097164250065,
 'total_stock_value': 0.056481630853753295}

In [28]:
# Element-wise: entries that are > 0 are kept, every other entry is negated,
# so the result contains no negative values (zeros stay zero) and the sign of
# any negative field is discarded.
# NOTE(review): presumably needed because the frame is fed to chi2 in the next
# cell, which is expected to reject negative inputs - confirm.
non_neg_df = no_poi.where(no_poi > 0, -no_poi)

In [32]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

feature_names = list(no_poi.columns.values)

# k_tops[k] records the feature that newly enters the selection when the
# selector is asked for the k best features.
k_tops = {}

# Walk k over every possible size; derived from the column count instead of a
# hard-coded 25 so the loop follows the feature set if it changes.
for i in range(1, len(feature_names) + 1):
    ch2 = SelectKBest(chi2, k = i)
    ch2.fit(non_neg_df, poi_labels)
    klist = ch2.get_support()
    for truth, name in zip(klist, feature_names):
        if truth and name not in k_tops.values():
            k_tops[i] = (name)

# One more fit with every feature kept, to read the raw chi2 scores.
ch2 = SelectKBest(chi2, k = 'all')
ch2.fit(non_neg_df, poi_labels)
scores = ch2.scores_

# Feature name -> chi2 score.
k_scores = dict(zip(feature_names, scores))

In [33]:
k_scores

{'bon_total': 18.308612008608346,
 'bonus': 41546794.079927064,
 'deferral_payments': 575829.04306187853,
 'deferred_income': 20469996.744445831,
 'director_fees': 205309.42857142858,
 'excer_stock': 0.0098678733905965335,
 'exercised_stock_options': 237947643.76971057,
 'expenses': 349013.77484335151,
 'from_messages': 955.78800082948601,
 'from_poi_to_this_person': 738.41454424450308,
 'from_this_person_to_poi': 620.96027717347522,
 'loan_advances': 549702499.04251242,
 'long_term_incentive': 13273623.898099067,
 'other': 18029146.221745741,
 'restricted_stock': 37551575.915919423,
 'restricted_stock_deferred': 2918369.7142857141,
 'sal_bon': 1.3332479328604013e-06,
 'sal_total': 0.74596712680387356,
 'salary': 3463395.416699328,
 'shared_receipt_with_poi': 13704.817630381005,
 'stock_pay': 89.214421366573902,
 'to_messages': 6833.8754052980303,
 'total_payments': 291743055.52305174,
 'total_stock_value': 276569697.03888297}

In [41]:
import operator
# Both score tables as (feature, score) pairs, highest score first.
ord_imp_dic = sorted(imp_dic.items(), key=lambda item: item[1], reverse=True)
ord_k_scores = sorted(k_scores.items(), key=lambda item: item[1], reverse=True)

In [45]:
ord_imp_dic

[('deferred_income', 0.09355854423354773),
 ('bonus', 0.083194102634256487),
 ('salary', 0.05939896193731934),
 ('total_stock_value', 0.056481630853753295),
 ('other', 0.054656707231666789),
 ('long_term_incentive', 0.051880630391807112),
 ('exercised_stock_options', 0.051637191394683965),
 ('from_messages', 0.050745556314803431),
 ('total_payments', 0.046024097164250065),
 ('stock_pay', 0.045476735100144758),
 ('expenses', 0.04384394779105364),
 ('bon_total', 0.041959184371542189),
 ('from_this_person_to_poi', 0.040367553272505526),
 ('to_messages', 0.039731909342403317),
 ('restricted_stock', 0.038377335506411356),
 ('shared_receipt_with_poi', 0.038290522190466844),
 ('sal_bon', 0.037110398228410654),
 ('sal_total', 0.036245668012236491),
 ('excer_stock', 0.034295743514794863),
 ('from_poi_to_this_person', 0.027700376765628899),
 ('deferral_payments', 0.019912228458090802),
 ('restricted_stock_deferred', 0.0050050066514957629),
 ('loan_advances', 0.0035303006608425349),
 ('director_f

In [43]:
# Combined score per feature: tree importance multiplied by chi2 score.
final_rank = dict((name, imp_dic[name] * k_scores[name]) for name in k_scores)

In [44]:
# (feature, combined score) pairs, highest combined score first.
ord_final_rank = sorted(final_rank.items(), key=lambda item: item[1], reverse=True)
ord_final_rank

[('total_stock_value', 15621107.533484574),
 ('total_payments', 13427210.734388135),
 ('exercised_stock_options', 12286948.023250625),
 ('bonus', 3456448.2508097719),
 ('loan_advances', 1940615.0956365746),
 ('deferred_income', 1915143.0958758132),
 ('restricted_stock', 1441129.427719716),
 ('other', 985413.76667886833),
 ('long_term_incentive', 688643.97541713563),
 ('salary', 205722.09253040963),
 ('expenses', 15302.141722590453),
 ('restricted_stock_deferred', 14606.459831523789),
 ('deferral_payments', 11466.039458251931),
 ('shared_receipt_with_poi', 524.76462359240509),
 ('to_messages', 271.52291806058105),
 ('director_fees', 118.19006358626285),
 ('from_messages', 48.501993821106069),
 ('from_this_person_to_poi', 25.066647068910058),
 ('from_poi_to_this_person', 20.454361084792886),
 ('stock_pay', 4.057180607600376),
 ('bon_total', 0.76821442685622898),
 ('sal_total', 0.027038076826175123),
 ('excer_stock', 0.00033842605484036789),
 ('sal_bon', 4.94773617256548e-08)]