In [2]:
#!/usr/bin/python

import sys
import pickle
from pprint import pprint

# The search path has to be extended BEFORE the project-local imports below;
# appending it after them (as was done previously) cannot help resolve them.
# Presumably feature_format (and possibly tester) live in ../tools/ -- confirm.
sys.path.append("../tools/")

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.feature_selection import SelectKBest, chi2

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data


### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# NOTE: the order matters. Positions 1-4 (salary, bonus, deferral_payments,
# total_payments) are sliced as features_list[1:5] further down to build the
# 'ave_earnings' feature, so keep those four entries right after 'poi'.
features_list = [
    'poi',
    # payment-related features
    'salary',
    'bonus',
    'deferral_payments',
    'total_payments',
    'ave_earnings',
    'deferred_income',
    # stock-related features
    'total_stock_value',
    'exercised_stock_options',
    'restricted_stock',
    'restricted_stock_deferred',
    # remaining financial features
    'expenses',
    'long_term_incentive',
    # e-mail features
    'shared_receipt_with_poi',
    'from_this_person_to_poi',
    'from_poi_to_this_person',
    'to_messages',
    'from_messages',
]

### Load the dictionary containing the dataset
# data_dict maps an entry name (e.g. 'TOTAL') to a dict of feature values;
# the code below indexes it as data_dict[name]['salary'].
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load a dataset that comes from a trusted source.
# NOTE(review): the file is opened in text mode ("r"); presumably this relies
# on an ASCII (protocol 0) pickle read under Python 2 -- confirm before
# porting to another interpreter or platform.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
# List of all keys of the data_dict for  salary value > 1 million and 
# bonus > 5 million dollars
outliers = []
for e in data_dict.keys():
    if data_dict[e]["salary"] != 'NaN' and data_dict[e]['salary'] > 1000000 and data_dict[e]['bonus'] > 5000000:
        outliers.append(e)
        
print "Outliers Before Removal of TOTAL :",outliers

data_dict.pop('TOTAL',0)


### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.

my_dataset = data_dict

#### I will add a new feature that shows average value of total earning called ave_earnings 
#### by calculating the mean value of 'salary', 'bonus', 'deferral_payments', and 'total_payments' for each person.

for ele in my_dataset:
    earnings = []
    for e in features_list[1:5]:
        earn = my_dataset[ele][e]
        if earn =='NaN':
            earn = 0
            earnings.append(earn)
        earnings.append(earn)
    ave_earnings = np.mean(earnings)
    my_dataset[ele].update({'ave_earnings': ave_earnings})

print 'ave_earnings is the average value of:', features_list[1:5]

      
       
###Extract features and labels from dataset for local testing
# I removed entries with all 'NaN' values or all '0' in order to clean up data and avoid any problem on calcultions.
data = featureFormat(my_dataset, features_list, sort_keys = True, remove_NaN=True,
                     remove_all_zeroes=True, remove_any_zeroes=False)
labels, features = targetFeatureSplit(data)
print "\n Features List:\n"
pprint (features_list)



### Task 4: Try a variety of classifiers

# I tried several different classifiers; the results that the tester.py script
# reported for each of them are in the ML_project_varity_of_classifiers.ipny file
# Provided to give you a starting point. Try a variety of classifiers.
#from sklearn.naive_bayes import GaussianNB
#from sklearn import cross_validation



### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Below is the classifier with the best precision result above .3




# Re-extract features and labels, this time with featureFormat's default
# options, then hold out 30% of the rows with a fixed seed.
# NOTE(review): nothing in the visible code fits the pipeline or uses the
# train/test split; the classifier is dumped unfitted for the tester to use.
data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
split = train_test_split(features, labels, test_size=0.3, random_state=42)
feature_train, feature_test, label_train, label_test = split

# Pipeline stages: scale the features, keep the 2 best-scoring ones, then
# classify with a linear-kernel SVM.
# NOTE(review): gamma is presumably ignored for kernel='linear' -- confirm
# against the SVC documentation.
scaler = MinMaxScaler()
skb = SelectKBest(k=2)
svc = SVC(C=1000, gamma=.001, kernel='linear')

pipeline_steps = [
    ('scaling', scaler),
    ("skb", skb),
    ("svc", svc),
]
clf = Pipeline(steps=pipeline_steps)


### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.


dump_classifier_and_data(clf, my_dataset, features_list)

Outliers Before Removal of TOTAL : ['LAY KENNETH L', 'SKILLING JEFFREY K', 'TOTAL']
ave_earnings is the average value of: ['salary', 'bonus', 'deferral_payments', 'total_payments']

 Features List:

['poi',
 'salary',
 'bonus',
 'deferral_payments',
 'total_payments',
 'ave_earnings',
 'deferred_income',
 'total_stock_value',
 'exercised_stock_options',
 'restricted_stock',
 'restricted_stock_deferred',
 'expenses',
 'long_term_incentive',
 'shared_receipt_with_poi',
 'from_this_person_to_poi',
 'from_poi_to_this_person',
 'to_messages',
 'from_messages']
