In [18]:
#!/usr/bin/python

import sys
sys.path.append("../tools/")

import pprint
pp = pprint.PrettyPrinter(indent=4)

import csv
import pickle
import numpy as np
from time import time
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data


with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

# fix Sanjay Bhatnagar's data
data_dict['BHATNAGAR SANJAY']['director_fees'] = 0
data_dict['BHATNAGAR SANJAY']['exercised_stock_options'] = 15456290
data_dict['BHATNAGAR SANJAY']['expenses'] = 137864
data_dict['BHATNAGAR SANJAY']['other'] = 0
data_dict['BHATNAGAR SANJAY']['restricted_stock'] = 2604490
data_dict['BHATNAGAR SANJAY']['restricted_stock_deferred'] = -2604490
data_dict['BHATNAGAR SANJAY']['total_payments'] = 137864
data_dict['BHATNAGAR SANJAY']['total_stock_value'] = 15456290

# fix Robert Belfer's data
data_dict['BELFER ROBERT']['deferral_payments'] = 0
data_dict['BELFER ROBERT']['deferred_income'] = -102500
data_dict['BELFER ROBERT']['director_fees'] = 102500
data_dict['BELFER ROBERT']['exercised_stock_options'] = 0
data_dict['BELFER ROBERT']['expenses'] = 3285
data_dict['BELFER ROBERT']['restricted_stock'] = 44093
data_dict['BELFER ROBERT']['restricted_stock_deferred'] = -44093
data_dict['BELFER ROBERT']['total_payments'] = 3285
data_dict['BELFER ROBERT']['total_stock_value'] = 0

### Task 1: Identify outliers
# The details of identifying outliers are presented in identify_outliers.html
non_employee = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK']
outliers_1 = ['FREVERT MARK A', 'ALLEN PHILLIP K']
outliers_2 = ['BECK SALLY W', 'KITCHEN LOUISE', 'PAI LOU L', 'SHAPIRO RICHARD S', 'URQUHART JOHN A']

for i in non_employee + outliers_1:
    data_dict.pop(i) # remove 4 outliers

### Task 2: Design new features
# list all the numeric features
all_features = ['poi', 'bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'exercised_stock_options', 'expenses', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages', 'total_payments', 'total_stock_value']

# introduce two new features
new_features = ['NaN_num', 'poi_message_over_total_message']

# return the number of 'NaN' in a dictionary
def count_NaN(dic):
    count = 0
    for value in dic.values():
        if value == 'NaN':
            count += 1
    return count

# insert new features into dataset
if new_features != []:
    for value in data_dict.values():
        value[new_features[0]] = count_NaN(value)
        if value['from_poi_to_this_person'] != 'NaN' and value['from_this_person_to_poi'] != 'NaN' and value['from_messages'] != 'NaN' and value['to_messages'] != 'NaN':
            value[new_features[1]] = ( float(value['from_poi_to_this_person']) + float(value['from_this_person_to_poi']) ) / ( float(value['from_messages']) + float(value['to_messages']) )
        else: value[new_features[1]] = 'NaN'

features_list = ['poi', 'bonus', 'deferred_income', 'expenses', 'total_payments', 'total_stock_value']
features_list = ['poi', 'bonus', 'deferred_income', 'expenses', 'total_payments', 'total_stock_value'] + [new_features[0]]
features_list = ['poi', 'bonus', 'deferred_income', 'expenses', 'total_payments', 'total_stock_value'] + [new_features[1]]

### Task 4&5: Try a varity of classifiers and Tune classifier parameters
# Extract features and labels from the dataset
my_dataset = data_dict
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

# Feature scaling
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
features_scaled = min_max_scaler.fit_transform(features)

# split train set and test set
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# try different classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

classifiers = [GaussianNB, svm.SVC,tree.DecisionTreeClassifier]
clf_names = ['Gaussion Naive Bayes', 'SVM', 'Decision Tree']
results = {}
for name, clf_cls in zip(clf_names, classifiers):
    tmp_dict = {}
    # use KFold for cross validation
    tmp_dict['precision'] = cross_val_score( clf_cls(), features, labels, cv=StratifiedKFold(n_splits=3, shuffle=True), scoring='precision').mean()
    tmp_dict['recall'] = cross_val_score( clf_cls(), features, labels, cv=StratifiedKFold(n_splits=3, shuffle=True), scoring='recall').mean()
    tmp_dict['F1 Score'] = cross_val_score( clf_cls(), features, labels, cv=StratifiedKFold(n_splits=3, shuffle=True), scoring='f1').mean()
    results[name] = tmp_dict

# validation and metrics
pp.pprint(results)

{   'Decision Tree': {   'F1 Score': 0.43915343915343907,
                         'precision': 0.52380952380952384,
                         'recall': 0.38888888888888884},
    'Gaussion Naive Bayes': {   'F1 Score': 0.25108225108225107,
                                'precision': 0.43888888888888888,
                                'recall': 0.33333333333333331},
    'SVM': {   'F1 Score': 0.0, 'precision': 0.0, 'recall': 0.0}}
