In [8]:
#!/usr/bin/pickle

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."

CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    with open(CLF_PICKLE_FILENAME, "w") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "w") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "w") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

def load_classifier_and_data():
    with open(CLF_PICKLE_FILENAME, "r") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "r") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "r") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list

def main():
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    ### Run testing script
    test_classifier(clf, dataset, feature_list)

if __name__ == '__main__':
    main()


Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('skb', SelectKBest(k=2, score_func=<function f_classif at 0x05B0A8F0>)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
	Accuracy: 0.87080	Precision: 0.84444	Recall: 0.03800	F1: 0.07273	F2: 0.04697
	Total predictions: 15000	True positives:   76	False positives:   14	False negatives: 1924	True negatives: 12986



# Pipeline (Scaling, SKB=2, SVC) -> Precision: 0.63060¶

Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('SKB', SelectKBest(k=6, score_func=)), ('MultinomialNB', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]) Accuracy: 0.87273 Precision: 0.92523 Recall: 0.04950 F1: 0.09397 F2: 0.06106 Total predictions: 15000 True positives: 99 False positives: 8 False negatives: 1901 True negatives: 12992


In [9]:
#!/usr/bin/python

import sys
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.feature_selection import SelectKBest, chi2
from pprint import pprint
sys.path.append("../tools/")


### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".


features_list = ['poi', 'salary', 'bonus', 'deferral_payments', 'total_payments', 'ave_earnings', 
                 'deferred_income','total_stock_value', 'exercised_stock_options', 
                'restricted_stock', 'restricted_stock_deferred', 'expenses',  
                 'long_term_incentive', 'shared_receipt_with_poi', 
                 'from_this_person_to_poi','from_poi_to_this_person',
                'to_messages','from_messages'] 

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
# List of all keys of the data_dict for  salary value > 1 million and 
# bonus > 5 million dollars
outliers = []
for e in data_dict.keys():
    if data_dict[e]["salary"] != 'NaN' and data_dict[e]['salary'] > 1000000 and data_dict[e]['bonus'] > 5000000:
        outliers.append(e)
        
print "Outliers Before Removal of TOTAL :",outliers

data_dict.pop('TOTAL',0)


### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.

my_dataset = data_dict

#### I will add a new feature that shows average value of total earning called ave_earnings 
#### by calculating the mean value of 'salary', 'bonus', 'deferral_payments', and 'total_payments' for each person.

for ele in my_dataset:
    earnings = []
    for e in features_list[1:5]:
        earn = my_dataset[ele][e]
        if earn =='NaN':
            earn = 0
            earnings.append(earn)
        earnings.append(earn)
    ave_earnings = np.mean(earnings)
    my_dataset[ele].update({'ave_earnings': ave_earnings})

print 'ave_earnings is the average value of:', features_list[1:5]

      
       
###Extract features and labels from dataset for local testing
# I removed entries with all 'NaN' values or all '0' in order to clean up data and avoid any problem on calcultions.
data = featureFormat(my_dataset, features_list, sort_keys = True, remove_NaN=True,
                     remove_all_zeroes=True, remove_any_zeroes=False)
labels, features = targetFeatureSplit(data)
print "\n Features List:\n"
pprint (features_list)



### Task 4: Try a varity of classifiers

# I tried several different classifiers and their output results of the tester.py script
# in the ML_project_varity_of_classifiers.ipny file
# Provided to give you a starting point. Try a variety of classifiers.
#from sklearn.naive_bayes import GaussianNB
#from sklearn import cross_validation



### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Below is the clasifier with best precision result above .3




data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
feature_train, feature_test, label_train, label_test = train_test_split( 
features, labels, test_size=0.3,random_state=42)
skb = SelectKBest()
svc = SVC()
scaler = MinMaxScaler()
clfi = Pipeline(steps=[('scaling',scaler),("skb", skb), ("svc", svc)])
print clfi








print(__doc__)

# Set the parameters by cross-validation
tuned_parameters =  {
    'skb__k': [2, 4, 5, 6],
    'svc__C': [1000,10000],
    'svc__kernel': ['rbf'],
    'svc__gamma': [0.001, 0.0001],
    }

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    clf = GridSearchCV(clfi, tuned_parameters, cv=3,scoring='%s_weighted' % score)
    clf.fit(feature_train, label_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    label_true, label_pred = label_test, clf.predict(feature_test)
    print(classification_report(label_true, label_pred))
    print()




### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

#test_classifier(clf, my_dataset, features_list) 

Outliers Before Removal of TOTAL : ['LAY KENNETH L', 'SKILLING JEFFREY K', 'TOTAL']
ave_earnings is the average value of: ['salary', 'bonus', 'deferral_payments', 'total_payments']

 Features List:

['poi',
 'salary',
 'bonus',
 'deferral_payments',
 'total_payments',
 'ave_earnings',
 'deferred_income',
 'total_stock_value',
 'exercised_stock_options',
 'restricted_stock',
 'restricted_stock_deferred',
 'expenses',
 'long_term_incentive',
 'shared_receipt_with_poi',
 'from_this_person_to_poi',
 'from_poi_to_this_person',
 'to_messages',
 'from_messages']
Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('skb', SelectKBest(k=10, score_func=<function f_classif at 0x05B0A8F0>)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
 a basic script for importing student's POI identifi

In [10]:
#!/usr/bin/python

import sys
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.feature_selection import SelectKBest, chi2
from pprint import pprint
sys.path.append("../tools/")


### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".


features_list = ['poi', 'salary', 'bonus', 'deferral_payments', 'total_payments', 'ave_earnings', 
                 'deferred_income','total_stock_value', 'exercised_stock_options', 
                'restricted_stock', 'restricted_stock_deferred', 'expenses',  
                 'long_term_incentive', 'shared_receipt_with_poi', 
                 'from_this_person_to_poi','from_poi_to_this_person',
                'to_messages','from_messages'] 

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
# List of all keys of the data_dict for  salary value > 1 million and 
# bonus > 5 million dollars
outliers = []
for e in data_dict.keys():
    if data_dict[e]["salary"] != 'NaN' and data_dict[e]['salary'] > 1000000 and data_dict[e]['bonus'] > 5000000:
        outliers.append(e)
        
print "Outliers Before Removal of TOTAL :",outliers

data_dict.pop('TOTAL',0)


### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.

my_dataset = data_dict

#### I will add a new feature that shows average value of total earning called ave_earnings 
#### by calculating the mean value of 'salary', 'bonus', 'deferral_payments', and 'total_payments' for each person.

for ele in my_dataset:
    earnings = []
    for e in features_list[1:5]:
        earn = my_dataset[ele][e]
        if earn =='NaN':
            earn = 0
            earnings.append(earn)
        earnings.append(earn)
    ave_earnings = np.mean(earnings)
    my_dataset[ele].update({'ave_earnings': ave_earnings})

print 'ave_earnings is the average value of:', features_list[1:5]

      
       
###Extract features and labels from dataset for local testing
# I removed entries with all 'NaN' values or all '0' in order to clean up data and avoid any problem on calcultions.
data = featureFormat(my_dataset, features_list, sort_keys = True, remove_NaN=True,
                     remove_all_zeroes=True, remove_any_zeroes=False)
labels, features = targetFeatureSplit(data)
print "\n Features List:\n"
pprint (features_list)



### Task 4: Try a varity of classifiers

# I tried several different classifiers and their output results of the tester.py script
# in the ML_project_varity_of_classifiers.ipny file
# Provided to give you a starting point. Try a variety of classifiers.
#from sklearn.naive_bayes import GaussianNB
#from sklearn import cross_validation



### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Below is the clasifier with best precision result above .3




data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
feature_train, feature_test, label_train, label_test = train_test_split( 
features, labels, test_size=0.3,random_state=42)
skb = SelectKBest()
svc = SVC()
scaler = MinMaxScaler()
clfi = Pipeline(steps=[('scaling',scaler),("skb", skb), ("svc", svc)])
print clfi

print(__doc__)

# Set the parameters by cross-validation
tuned_parameters =  {
    'skb__k': [2, 4, 5, 6],
    'svc__C': [1000,10000],
    'svc__kernel': ['linear'],
    'svc__gamma': [0.001, 0.0001],
    }

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    clf = GridSearchCV(clfi, tuned_parameters, cv=5,scoring='%s_weighted' % score)
    clf.fit(feature_train, label_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    label_true, label_pred = label_test, clf.predict(feature_test)
    print(classification_report(label_true, label_pred))
    print()




Outliers Before Removal of TOTAL : ['LAY KENNETH L', 'SKILLING JEFFREY K', 'TOTAL']
ave_earnings is the average value of: ['salary', 'bonus', 'deferral_payments', 'total_payments']

 Features List:

['poi',
 'salary',
 'bonus',
 'deferral_payments',
 'total_payments',
 'ave_earnings',
 'deferred_income',
 'total_stock_value',
 'exercised_stock_options',
 'restricted_stock',
 'restricted_stock_deferred',
 'expenses',
 'long_term_incentive',
 'shared_receipt_with_poi',
 'from_this_person_to_poi',
 'from_poi_to_this_person',
 'to_messages',
 'from_messages']
Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('skb', SelectKBest(k=10, score_func=<function f_classif at 0x05B0A8F0>)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
 a basic script for importing student's POI identifi

# Best parameters set found on development set:

{'svc__gamma': 0.001, 'svc__kernel': 'linear', 'svc__C': 1000, 'skb__k': 2}

In [11]:
skb = SelectKBest(k=2)
svc = SVC(C=1000, gamma=0.001, kernel='linear')
scaler = MinMaxScaler()

clf = Pipeline(steps=[('scaling',scaler),("skb", skb), ("svc", svc)])


### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.


dump_classifier_and_data(clf, my_dataset, features_list)

test_classifier(clf, my_dataset, features_list) 

Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('skb', SelectKBest(k=2, score_func=<function f_classif at 0x05B0A8F0>)), ('svc', SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
	Accuracy: 0.87133	Precision: 0.63060	Recall: 0.08450	F1: 0.14903	F2: 0.10220
	Total predictions: 15000	True positives:  169	False positives:   99	False negatives: 1831	True negatives: 12901

