In [1]:
#!/usr/bin/python
from time import time
import sys
import pickle
import pprint
pp = pprint.PrettyPrinter(indent=4)
from sklearn.metrics import accuracy_score
sys.path.append("../tools/")

#from feature_format import featureFormat, targetFeatureSplit
#from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person'] # You will need to use more features

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)


In [2]:
import numpy as np

def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """ convert dictionary to numpy array of features
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).
        NOTE: first feature is assumed to be 'poi' and is not checked for
            removal for zero or missing values.
    """

    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        keys = pickle.load(open(sort_keys, "rb"))
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                dictionary[key][feature]
            except KeyError:
                print "error: key ", feature, " not present"
                return
            value = dictionary[key][feature]
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # Logic for deciding whether or not to add the data point.
        append = True
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = False
            for item in test_list:
                if item != 0 and item != "NaN":
                    append = True
                    break
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list or "NaN" in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)

def targetFeatureSplit( data ):
    """ 
        given a numpy array like the one returned from
        featureFormat, separate out the first feature
        and put it into its own list (this should be the 
        quantity you want to predict)

        return targets and features as separate lists

        (sklearn can generally handle both lists and numpy arrays as 
        input formats when training/predicting)
    """

    target = []
    features = []
    for item in data:
        target.append( item[0] )
        features.append( item[1:] )

    return target, features



In [3]:
### Explore features of data
print "Number of  people: %d" % (len(data_dict))
print "Number of features: %d" % (len(data_dict['METTS MARK']))
print "Features: %s" % data_dict['METTS MARK'].keys()
print
poi_names = open("../final_project/poi_names.txt").read().split('\n')
poi_y = [name for name in poi_names if "(y)" in name]
poi_n = [name for name in poi_names if "(n)" in name]
print "Number of POI in poi_names.txt: %d" % len(poi_y + poi_n)

poi_count = 0
for person in data_dict:
	if data_dict[person]["poi"]==1:
		poi_count +=1 
print "Number of POI in dataset: %d" % (poi_count)
print
print

keys_w_nans = dict((key, 0) for key, value in data_dict['METTS MARK'].iteritems())
keys_w_negs = dict((key, 0) for key, value in data_dict['METTS MARK'].iteritems())

for person in data_dict:
    for key, value in data_dict[person].iteritems():
        if value == "NaN":
            keys_w_nans[key] += 1
        elif value < 0:
            keys_w_negs[key] += 1

print "Number of NaNs:"          
pp.pprint(keys_w_nans)

print
print "Number of Negative Values"
pp.pprint(keys_w_negs) 
print


Number of  people: 146
Number of features: 21
Features: ['salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'email_address', 'from_poi_to_this_person']

Number of POI in poi_names.txt: 35
Number of POI in dataset: 18


Number of NaNs:
{   'bonus': 64,
    'deferral_payments': 107,
    'deferred_income': 97,
    'director_fees': 129,
    'email_address': 35,
    'exercised_stock_options': 44,
    'expenses': 51,
    'from_messages': 60,
    'from_poi_to_this_person': 60,
    'from_this_person_to_poi': 60,
    'loan_advances': 142,
    'long_term_incentive': 80,
    'other': 53,
    'poi': 0,
    'restricted_stock': 36,
    'restricted_stock_deferred': 128,
    'salary': 51,
    'shared_receipt_with

In [4]:
#Remove these values from the dictionary
data_dict.pop('TOTAL')
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [5]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.1, random_state=42)

In [6]:
print len(features_train)
print len(features_test)


128
15


In [15]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Example starting point. Try investigating other evaluation techniques!

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
nbclf = GaussianNB()
print "Naive Bayes Classifier:"
t0 = time()
nbclf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"
t0 = time()
nbpred = nbclf.predict(features_test)
print "prediction time:", round(time()-t0, 3), "s"
nb_acc = accuracy_score(labels_test, nbpred)
print "accuracy: %d " % nb_acc
print

from sklearn.svm import SVC
svm_clf = SVC(kernel="rbf", C = 10000)
print "Support Vector Machine"
t0 = time()
svm_clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"
t0 = time()
svm_pred = svm_clf.predict(features_test)
print "prediction time:", round(time()-t0, 3), "s"
svm_acc = accuracy_score(labels_test, svm_pred)
print "accuracy: %d " % svm_acc
print

from sklearn import tree
print "Decision Tree"
split = tree.DecisionTreeClassifier(min_samples_split = 2)
t0 = time()
split.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"
t0 = time()
split_pred = split.predict(features_test)
print "prediction time:", round(time()-t0, 3), "s"
acc_split = accuracy_score(labels_test, split_pred)
print "accuracy: %d " % acc_split
print

Naive Bayes Classifier:
training time: 0.004 s
prediction time: 0.0 s
accuracy: 0 

Support Vector Machine
training time: 0.004 s
prediction time: 0.0 s
accuracy: 0 

Decision Tree
training time: 0.0 s
prediction time: 0.0 s
accuracy: 1 



In [8]:
### Feature selection attempts

#Create a numerical dataframe
import pandas as pd

df = pd.DataFrame.from_dict(data_dict)
df = df.transpose()
df = df.drop('email_address', 1)
num_df = df.replace("NaN", 0)

from sklearn.feature_selection import VarianceThreshold
import pprint
pp = pprint.PrettyPrinter(indent=4)

sel_8 = VarianceThreshold(threshold=(.8 * (1 - .8)))
reduced_8 = sel_8.fit_transform(num_df)
indices_8 = sel_8.get_support()

for i in range(0, len(indices_8)):
    if indices_8[i] == False:
        print list(num_df.columns.values)[i]

poi


Oh, this is rich.  After all that work (it was short code, but my basic programming skills made this take a while), the only feature with low variance is poi, which we would remove anyway.  Let's see what other columns might have relatively low variance when we change the threshold.

In [9]:
def usevariance_t(thresh, df):
    sel = VarianceThreshold(threshold=thresh)
    reduced = sel.fit_transform(df)
    indices = sel.get_support()
    features = []
    for i in range(0, len(indices)):
        if indices[i] == False:
            features.append(list(num_df.columns.values)[i])
    return features

print usevariance_t(10000000, num_df)

['from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'poi', 'shared_receipt_with_poi', 'to_messages']


It makes sense that the other columns with relatively low variance are those dealing with e-mails.  That makes sense because those numbers IN GENERAL will be lower than salaries and thus variance will be lower. More and more it seems to me that the whole variance threshold thing will only really work with normalized features in each column.

In [10]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
df_scaled = min_max_scaler.fit_transform(num_df)
df_normalized = pd.DataFrame(df_scaled)

print usevariance_t(.01, df_normalized)


['loan_advances', 'restricted_stock_deferred', 'total_payments']


Ok, so this might be more meaningful.  It looks like there is relatively low variance (< 0.01) within these categories.  Not sure if that means I should eliminate them yet. 

In [11]:
poi_labels = num_df['poi']
num_df = num_df.drop('poi', 1)


In [12]:
non_neg_df = num_df.where(num_df > 0, -num_df)

In [35]:
def iqr_outliers(df, column):
    nan_count = 0
    for value in column:
        if value == 'NaN':
            nan_count += 1
    cleaned_column = [x for x in column if str(x) != "NaN"]
    iqr = np.subtract(*np.percentile(cleaned_column, [75, 25]))
    upper =  np.percentile(cleaned_column, 75) + 1.5 * iqr
    lower =  np.percentile(cleaned_column, 25) - 1.5 * iqr
    outliers = []
    lows = 0
    highs = 0
    non_outliers= 0
    for value in cleaned_column:
        if value < lower:
            lows += 1
            outliers.append(value)
        elif value > upper:
            highs += 1
            outliers.append(value)
        else:
            non_outliers += 1
                
    
    return ({"Low_outliers": lows, 
             "High_outliers": highs, 
             "NaNs": nan_count,
            "Non_outliers": non_outliers
            })


In [52]:
outlier_count = dict((name, {}) for name in df.columns)

for key, value in outlier_count.iteritems():
    value = iqr_outliers(df, df[key])
    outlier_count[key] = value

outlier_df = pd.DataFrame.from_dict(outlier_count)
outlier_df = outlier_df.transpose()

In [53]:
outlier_df

Unnamed: 0,High_outliers,Low_outliers,NaNs,Non_outliers
bonus,10,0,63,71
deferral_payments,6,0,106,32
deferred_income,0,5,96,43
director_fees,0,4,128,12
exercised_stock_options,11,0,43,90
expenses,3,0,50,91
from_messages,17,0,58,69
from_poi_to_this_person,11,0,58,75
from_this_person_to_poi,13,0,58,73
loan_advances,0,0,141,3


In [54]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X, y = non_neg_df, poi_labels
reduced_dim = SelectKBest(chi2, k=10).fit_transform(X, y)
SelectKBest(chi2, k=10).get_support(X, y)



TypeError: get_support() takes at most 2 arguments (3 given)