### Objective:

The goal of this project is to build a classifier that identifies persons of interest (POIs), i.e. people who may have been involved in illegal activities, in the Enron fraud investigation. The Enron dataset contains financial and email features for each employee that may help answer this question.

In [1]:
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn import tree
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

sys.path.append("C:/Users/Jenn/ud120-projects/tools")
sys.path.append("C:/Users/Jenn/ud120-projects/final_project")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data

%matplotlib inline 

In [2]:
# Load the pickled dictionary that holds the dataset.
# NOTE(review): this path lives under C:/Users/princess, while the sys.path
# entries added in the import cell use C:/Users/Jenn -- confirm which user
# directory is correct.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
dataset_path = "C:/Users/princess/ud120-projects/final_project/final_project_dataset.pkl"
with open(dataset_path, "r") as data_file:
    data_dict = pickle.load(data_file)

In [3]:
# After checking the source PDF, these two keys do not look like employee
# names, so drop them. The default of 0 keeps pop() from raising KeyError
# when a key is already absent (for example if this cell is run twice).
data_dict.pop('TOTAL', 0)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0)

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [4]:
# Add derived key-value pairs to the dict of dicts (approach taken from the
# discussion forum).

# New feature: ratio of salary to total payments
def compute_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0 when it is undefined.

    The result is 0 when either argument equals the string 'NaN' or when the
    denominator equals 0; otherwise both arguments are converted to float and
    divided.
    """
    if numerator == 'NaN' or denominator == 'NaN':
        return 0
    if denominator == 0:
        return 0
    return float(numerator) / float(denominator)

def add_feature(dict):
    """Add a 'salary_to_total_payments' entry to every record, in place.

    `dict` maps each key to a record (itself a dictionary) that provides
    'salary' and 'total_payments'; the new entry is their ratio as returned
    by compute_ratio(). Nothing is returned.

    Note: the parameter name shadows the builtin `dict` inside this function.
    """
    for record in dict.values():
        record['salary_to_total_payments'] = compute_ratio(
            record['salary'], record['total_payments'])

add_feature(data_dict)

In [5]:
# 'poi' comes first; later cells treat features_list[1:] as the feature names.
features_list = [
    'poi',
    # financial features
    'salary',
    'deferral_payments',
    'total_payments',
    'bonus',
    'restricted_stock_deferred',
    'deferred_income',
    'total_stock_value',
    'expenses',
    'exercised_stock_options',
    'long_term_incentive',
    'restricted_stock',
    # engineered feature added by add_feature()
    'salary_to_total_payments',
    # email features
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
]

In [6]:
# Pull the features_list entries out of the dataset, then split the result
# into labels and features for local testing.
data = featureFormat(data_dict, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)


In [7]:
# Hold out 30% of the samples as a test set; the fixed random_state makes the
# split repeatable.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features, labels, test_size=0.3, random_state=42)


In [8]:
# StratifiedShuffleSplit repeatedly splits the training labels into train and
# test groups, each with a class (POI) proportion roughly equal to the whole.
# Here: 100 splits, 20% held out in each, fixed seed.
sss = StratifiedShuffleSplit(labels_train, n_iter=100, test_size=0.2, random_state=0)

K-nearest neighbors (kNN) algorithm

In [11]:
# Build the kNN model as a pipeline of chained steps:
# SelectKBest feature selection -> robust scaling -> k-nearest-neighbours classifier.
SKB = SelectKBest(f_classif)
scale = RobustScaler()
nbrs = KNeighborsClassifier()

pipe = Pipeline(steps=[
    ("SKB", SKB),
    ("scale", scale),
    ("nbrs", nbrs),
])

# Values to search; keys use the "<step name>__<parameter>" form.
# NOTE(review): nbrs__algorithm multiplies the grid size by four -- confirm it
# is worth searching over.
param_grid = {
    "SKB__k": list(range(4, 15)),
    "nbrs__n_neighbors": [2, 4, 6, 8],
    "nbrs__weights": ["uniform", "distance"],
    "nbrs__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "nbrs__p": [1, 2],
}

In [13]:
# GridSearchCV tries every combination in param_grid to find the best one,
# scoring each by F1 on the stratified shuffle splits (n_jobs=-1 runs the
# fits in parallel).
gs = GridSearchCV(pipe, param_grid=param_grid, cv=sss, scoring='f1', n_jobs=-1)

In [14]:
# Run the grid search on the training split.
gs.fit(features_train, labels_train)

GridSearchCV(cv=StratifiedShuffleSplit(labels=[ 0.  0. ...,  0.  0.], n_iter=100, test_size=0.2, random_state=0),
       error_score='raise',
       estimator=Pipeline(steps=[('SKB', SelectKBest(k=10, score_func=<function f_classif at 0x0578FF70>)), ('scale', RobustScaler(copy=True, with_centering=True, with_scaling=True)), ('nbrs', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'nbrs__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'nbrs__n_neighbors': [2, 4, 6, 8], 'nbrs__weights': ['uniform', 'distance'], 'nbrs__p': [1, 2], 'SKB__k': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [15]:
# Predict labels for the held-out test split with the fitted grid search.
test_predict = gs.predict(features_test)

In [16]:
# Keep the best pipeline found by the grid search as the final classifier.
clf = gs.best_estimator_

In [17]:
# Recover the names of the features kept by SelectKBest (approach from the
# discussion forum): pair each entry of features_list[1:] with the boolean
# from get_support() and keep the names whose flag is true.
features_selected_bool = gs.best_estimator_.named_steps['SKB'].get_support()
features_selected_list = [
    name
    for name, keep in zip(features_list[1:], features_selected_bool)
    if keep
]

In [18]:
# Display the names of the features selected by the best pipeline.
features_selected_list

['salary', 'bonus', 'total_stock_value', 'shared_receipt_with_poi']

In [26]:
# Find the scores associated with the features: print NumPy arrays with four
# decimal places so the score array is easy to read.
# (numpy is imported as np with the other dependencies in the first cell.)
np.set_printoptions(precision=4)

In [27]:
# f_classif scores from the fitted SelectKBest step; presumably one per entry
# of features_list[1:], in that order.
# NOTE(review): in the saved outputs exercised_stock_options scores 13.714
# here but is missing from the selected-feature list shown earlier, while
# shared_receipt_with_poi (5.4953) is included. The two outputs probably come
# from different fits -- re-run the notebook top to bottom to confirm.
gs.best_estimator_.named_steps['SKB'].scores_

array([ 11.1963,   0.2594,   2.7679,  11.1295,   0.7635,   5.3041,
        14.6913,   5.9062,  13.714 ,   2.6113,   6.5769,   1.4965,
         3.5907,   2.1438,   5.4953])

In [20]:
# Score the held-out predictions. scikit-learn metrics take (y_true, y_pred):
# accuracy happens to be symmetric in its arguments, but precision and recall
# are not, so every call uses the same (labels_test, test_predict) order.
acc = accuracy_score(labels_test, test_predict)
prec = precision_score(labels_test, test_predict)
rec = recall_score(labels_test, test_predict)

In [21]:
print "Accuracy score:", acc
print "Precision score:", prec
print "Recall Score:", rec

Accuracy score: 0.860465116279
Precision score: 0.333333333333
Recall Score: 0.2


In [22]:
# Run the test_classifier script, which takes the average of a bunch of
# stratified splits. test_classifier is already imported in the first cell,
# so it is not imported again here.
test_classifier(clf, data_dict, features_list)

Pipeline(steps=[('SKB', SelectKBest(k=4, score_func=<function f_classif at 0x0578FF70>)), ('scale', RobustScaler(copy=True, with_centering=True, with_scaling=True)), ('nbrs', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=1,
           weights='distance'))])
	Accuracy: 0.83707	Precision: 0.38377	Recall: 0.36650	F1: 0.37494	F2: 0.36983
	Total predictions: 15000	True positives:  733	False positives: 1177	False negatives: 1267	True negatives: 11823

