In [None]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

### Load the dictionary containing the dataset.
### This has to happen before Task 1, because the feature list is derived
### from the keys of one of its records.
### NOTE: pickle.load can execute arbitrary code stored in the file; only load
### a dataset file you trust.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# list(...) yields a real, mutable list even where dict.keys() is only a view.
features_list_complete = list(data_dict['ALLEN PHILLIP K'].keys())
# Both names refer to the same list object, so the edits below show up in both.
features_list = features_list_complete
features_list.remove('poi')
# Remove the email address
features_list.remove('email_address')
# Add poi at the first position
features_list.insert(0, 'poi')

### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
#clf = ensemble.RandomForestClassifier()
clf = ensemble.AdaBoostClassifier(algorithm = 'SAMME', learning_rate=0.1, n_estimators=3)

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# Imported from sklearn.model_selection, the same package GridSearchCV is
# taken from at the top of this cell.
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

In [None]:
""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Template for the one-line metrics summary: five positional floats
# (accuracy, precision, recall, F1, F2), all rendered with the number of
# decimals given by the `display_precision` keyword.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
# Template for the raw confusion-matrix counts: five positional integers.
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)

def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Cross-validate ``clf`` and print its aggregate classification metrics.

    The dataset is converted with featureFormat / targetFeatureSplit, then
    ``clf`` is re-fitted and evaluated on every stratified shuffle split.
    Confusion-matrix counts are accumulated over all splits, and accuracy,
    precision, recall, F1 and F2 are computed from the totals and printed.

    Parameters:
    - clf: classifier exposing ``fit(features, labels)`` and ``predict(features)``
    - dataset: the dataset dictionary handed to featureFormat
    - feature_list: list of feature names handed to featureFormat
    - folds: number of shuffle-split iterations (default 1000)

    Returns nothing; the results are only printed. If a metric's denominator
    is zero, a "divide by zero" notice is printed instead of the metrics.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # NOTE: this is the sklearn.cross_validation call form (labels and the
    # iteration count are passed to the constructor and the object is iterated
    # directly), matching the import used by this cell.
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train   = [labels[ii] for ii in train_idx]
        features_test  = [features[jj] for jj in test_idx]
        labels_test    = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stops scoring the current split only; later splits still run.
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        # Only a zero denominator is reported here; any other error is a real
        # bug and is allowed to propagate instead of being mislabelled.
        print("Got a divide by zero when trying out: {0}".format(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")

# Names of the pickle files exchanged between poi_id.py and this tester.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their fixed files.

    Parameters:
    - clf: written to CLF_PICKLE_FILENAME
    - dataset: written to DATASET_PICKLE_FILENAME
    - feature_list: written to FEATURE_LIST_FILENAME

    The files are created (or overwritten) in the current working directory.
    """
    # Pickle data is bytes, so the files must be opened in binary mode.
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

def load_classifier_and_data():
    """Load the three pickle files written by dump_classifier_and_data.

    Returns:
    - the tuple (clf, dataset, feature_list)

    NOTE: unpickling can execute arbitrary code stored in the file; only load
    pickle files you produced yourself or otherwise trust.
    """
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list

def check_classifier():
    """Load the pickled classifier, dataset and feature list, then evaluate them.

    Takes no arguments and returns nothing; it chains
    load_classifier_and_data() into test_classifier().
    """
    loaded = load_classifier_and_data()
    clf, dataset, feature_list = loaded
    # Run the cross-validated evaluation on exactly what was loaded.
    test_classifier(clf, dataset, feature_list)



In [None]:
# Load the pickled classifier, dataset and feature list and run test_classifier on them.
check_classifier()

In [None]:
# Feature names are taken from one record; list(...) guarantees a mutable list
# (dict.keys() is only a view on Python 3), so it can be edited afterwards.
features_list_complete = list(data_dict['ALLEN PHILLIP K'].keys())

In [None]:
# Put 'poi' at the front of the feature list.
# NOTE(review): 'poi' is itself one of the record keys, so unless it was removed
# first this leaves 'poi' in the list twice - confirm that is intended.
features_list_complete.insert(0, 'poi')

In [None]:
features_list_complete

# Backup

In [78]:
import pickle
import pandas as pd
import numpy as np
import sys
from sklearn.preprocessing import MinMaxScaler
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit

# Pickle data is bytes, so the file is opened in binary mode.
# NOTE: unpickling can execute arbitrary code; only load a dataset file you trust.
with open("final_project_dataset.pkl", "rb") as data_file:
    my_dataset = pickle.load(data_file)

In [79]:
my_dataset

{'ALLEN PHILLIP K': {'bonus': 4175000,
  'deferral_payments': 2869717,
  'deferred_income': -3081055,
  'director_fees': 'NaN',
  'email_address': 'phillip.allen@enron.com',
  'exercised_stock_options': 1729541,
  'expenses': 13868,
  'from_messages': 2195,
  'from_poi_to_this_person': 47,
  'from_this_person_to_poi': 65,
  'loan_advances': 'NaN',
  'long_term_incentive': 304805,
  'other': 152,
  'poi': False,
  'restricted_stock': 126027,
  'restricted_stock_deferred': -126027,
  'salary': 201955,
  'shared_receipt_with_poi': 1407,
  'to_messages': 2902,
  'total_payments': 4484442,
  'total_stock_value': 1729541},
 'BADUM JAMES P': {'bonus': 'NaN',
  'deferral_payments': 178980,
  'deferred_income': 'NaN',
  'director_fees': 'NaN',
  'email_address': 'NaN',
  'exercised_stock_options': 257817,
  'expenses': 3486,
  'from_messages': 'NaN',
  'from_poi_to_this_person': 'NaN',
  'from_this_person_to_poi': 'NaN',
  'loan_advances': 'NaN',
  'long_term_incentive': 'NaN',
  'other': 'NaN'

In [80]:
def datasetSplit(features, labels, test_size=0.3, random_state=42):
    '''
    Split features and labels into one stratified train/test partition.

    parameter:
    - features: sequence of feature rows
    - labels: sequence of labels, one per feature row
    - test_size: share of the samples placed in the test set (default 0.3)
    - random_state: seed that makes the shuffle reproducible (default 42)

    return:
    - features_train, features_test, labels_train, labels_test as numpy arrays
    '''
    # Local import: the n_splits / .split() interface used here is the
    # sklearn.model_selection one, which this cell does not import.
    from sklearn.model_selection import StratifiedShuffleSplit

    features = np.array(features)
    labels = np.array(labels)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(sss.split(features, labels))
    return (features[train_index], features[test_index],
            labels[train_index], labels[test_index])

In [81]:
# Features used in this experiment. 'poi' comes first; 'fraction_to_poi' and
# 'total_benefit' are derived features added to each record in this notebook.
features_list = [
    'poi',
    'deferred_income',
    'long_term_incentive',
    'bonus',
    'total_stock_value',
    'salary',
    'fraction_to_poi',
    'exercised_stock_options',
    'total_benefit',
]

In [82]:
def check_value(v):
    """Return v unchanged, unless it is the string 'NaN', which maps to 0."""
    return 0 if v == 'NaN' else v
    
# total_benefit: total_payments plus total_stock_value, with 'NaN' counted as 0.
for person in my_dataset.values():
    payments = check_value(person['total_payments'])
    stock_value = check_value(person['total_stock_value'])
    person['total_benefit'] = payments + stock_value

# fraction_from_poi / fraction_to_poi: share of a person's received / sent
# messages that involve a POI; 0 when the message count is missing or zero.
for person in my_dataset.values():
    received = check_value(person['to_messages'])
    sent = check_value(person['from_messages'])
    received_from_poi = check_value(person['from_poi_to_this_person'])
    sent_to_poi = check_value(person['from_this_person_to_poi'])

    # float(...) forces true division even for integer counts.
    person['fraction_from_poi'] = received_from_poi / float(received) if received != 0 else 0
    person['fraction_to_poi'] = sent_to_poi / float(sent) if sent != 0 else 0

In [83]:
# Convert the dataset dictionary into array form with featureFormat, using the
# columns named in features_list (presumably in that order - confirm in featureFormat).
data = featureFormat(my_dataset, features_list, sort_keys = True)

In [84]:
data

array([[  0.00000000e+00,  -3.08105500e+06,   3.04805000e+05, ...,
          2.96127563e-02,   1.72954100e+06,   6.21398300e+06],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   2.57817000e+05,   4.40283000e+05],
       [  0.00000000e+00,  -5.10400000e+03,   0.00000000e+00, ...,
          0.00000000e+00,   4.04615700e+06,   6.15968400e+06],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   1.39130000e+05,   1.39130000e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   8.30855200e+06,   1.22450580e+07],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   1.92758000e+05,   2.47855000e+05]])

In [47]:
# Classifier for this experiment: Gaussian naive Bayes with default settings.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [None]:
# datasetSplit requires the feature rows and their labels, so separate them
# from the formatted array first and pass both in.
labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = datasetSplit(features, labels)

In [136]:
def scaleDict(dict_):
    '''
    Min-max scale a dataset dictionary.

    The 'email_address' column is dropped when present and every 'NaN'
    string is replaced by 0 before all remaining columns are rescaled.
    The input dictionary itself is not modified.

    parameter:
    - dict_: the dataset dictionary, one entry per person mapping to a
      dictionary of feature values

    return:
    - a new dictionary of the same layout holding the scaled values
    '''
    # One row per person, one column per feature.
    frame = pd.DataFrame.from_dict(dict_, orient='index')

    # The e-mail address is plain text that duplicates the person's name.
    if 'email_address' in frame.columns:
        frame = frame.drop('email_address', axis=1)

    # Missing values are stored as the string 'NaN', not as a float NaN.
    frame = frame.replace('NaN', 0)

    scaled_values = MinMaxScaler().fit_transform(frame)

    # Re-attach the row and column labels, then go back to nested dictionaries.
    scaled_frame = pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns)
    return scaled_frame.to_dict(orient='index')

In [137]:
scaleDict(my_dataset)

{'ALLEN PHILLIP K': {'bonus': 0.042889303303999822,
  'deferral_payments': 0.092345324175533275,
  'deferred_income': 0.88993437655296126,
  'director_fees': 0.0,
  'exercised_stock_options': 0.0055475969002193965,
  'expenses': 0.0026489924545356259,
  'fraction_from_poi': 0.07451757408683668,
  'fraction_to_poi': 0.029612756264236904,
  'from_messages': 0.15277004454342985,
  'from_poi_to_this_person': 0.089015151515151519,
  'from_this_person_to_poi': 0.10673234811165845,
  'loan_advances': 0.0,
  'long_term_incentive': 0.0062817990249686706,
  'other': 3.5624229904342617e-06,
  'poi': 0.0,
  'restricted_stock': 0.020541510259455675,
  'restricted_stock_deferred': 0.32348090863062245,
  'salary': 0.0075626598318940426,
  'shared_receipt_with_poi': 0.25484513675058867,
  'to_messages': 0.19156379959073208,
  'total_benefit': 0.0083476834891944417,
  'total_payments': 0.014471236307309012,
  'total_stock_value': 0.0040815079743303662},
 'BADUM JAMES P': {'bonus': 0.0,
  'deferral_paym

In [103]:
dataset

Unnamed: 0,to_messages,deferral_payments,expenses,poi,deferred_income,long_term_incentive,fraction_from_poi,restricted_stock_deferred,shared_receipt_with_poi,loan_advances,...,bonus,total_stock_value,from_poi_to_this_person,from_this_person_to_poi,restricted_stock,salary,total_payments,fraction_to_poi,exercised_stock_options,total_benefit
ALLEN PHILLIP K,2902,2869717,13868,False,-3081055,304805,0.016196,-126027,1407,0,...,4175000,1729541,47,65,126027,201955,4484442,0.029613,1729541,6213983
BADUM JAMES P,0,178980,3486,False,0,0,0.000000,0,0,0,...,0,257817,0,0,0,0,182466,0.000000,257817,440283
BANNANTINE JAMES M,566,0,56301,False,-5104,0,0.068905,-560222,465,0,...,0,5243487,39,0,1757552,477,916197,0.000000,4046157,6159684
BAXTER JOHN C,0,1295738,11200,False,-1386055,1586055,0.000000,0,0,0,...,1200000,10623258,0,0,3942714,267102,5634343,0.000000,6680544,16257601
BAY FRANKLIN R,0,260455,129142,False,-201641,0,0.000000,-82782,0,0,...,400000,63014,0,0,145796,239671,827696,0.000000,0,890710
BAZELIDES PHILIP J,0,684694,0,False,0,93750,0.000000,0,0,0,...,0,1599641,0,0,0,80818,860136,0.000000,1599641,2459777
BECK SALLY W,7315,0,37172,False,0,0,0.019686,0,2639,0,...,700000,126027,144,386,126027,231330,969068,0.088879,0,1095095
BELDEN TIMOTHY N,7991,2144013,17355,True,-2334434,0,0.028532,0,5521,0,...,5249999,1110705,228,108,157569,213999,5501630,0.223140,953136,6612335
BELFER ROBERT,0,-102500,0,False,0,0,0.000000,44093,0,0,...,0,-44093,0,0,0,0,102500,0.000000,3285,58407
BERBERIAN DAVID,0,0,11892,False,0,0,0.000000,0,0,0,...,0,2493616,0,0,869220,216582,228474,0.000000,1624396,2722090


In [96]:
type(dataset)

pandas.core.frame.DataFrame

In [100]:
scaler = MinMaxScaler()

In [115]:
dataset.index

Index([u'ALLEN PHILLIP K', u'BADUM JAMES P', u'BANNANTINE JAMES M',
       u'BAXTER JOHN C', u'BAY FRANKLIN R', u'BAZELIDES PHILIP J',
       u'BECK SALLY W', u'BELDEN TIMOTHY N', u'BELFER ROBERT',
       u'BERBERIAN DAVID',
       ...
       u'WASAFF GEORGE', u'WESTFAHL RICHARD K', u'WHALEY DAVID A',
       u'WHALLEY LAWRENCE G', u'WHITE JR THOMAS E', u'WINOKUR JR. HERBERT S',
       u'WODRASKA JOHN', u'WROBEL BRUCE', u'YEAGER F SCOTT', u'YEAP SOON'],
      dtype='object', length=146)

In [116]:
dataset.columns

Index([u'to_messages', u'deferral_payments', u'expenses', u'poi',
       u'deferred_income', u'long_term_incentive', u'fraction_from_poi',
       u'restricted_stock_deferred', u'shared_receipt_with_poi',
       u'loan_advances', u'from_messages', u'other', u'director_fees',
       u'bonus', u'total_stock_value', u'from_poi_to_this_person',
       u'from_this_person_to_poi', u'restricted_stock', u'salary',
       u'total_payments', u'fraction_to_poi', u'exercised_stock_options',
       u'total_benefit'],
      dtype='object')

In [112]:
features_rescaled = scaler.fit_transform(dataset)

In [113]:
features_rescaled

array([[  1.91563800e-01,   9.23453242e-02,   2.64899245e-03, ...,
          2.96127563e-02,   5.54759690e-03,   8.34768349e-03],
       [  0.00000000e+00,   8.74544552e-03,   6.65877394e-04, ...,
          0.00000000e+00,   8.26962061e-04,   5.91463338e-04],
       [  3.73622021e-02,   3.18462472e-03,   1.07543210e-02, ...,
          0.00000000e+00,   1.29782688e-02,   8.27473980e-03],
       ..., 
       [  0.00000000e+00,   3.18462472e-03,   0.00000000e+00, ...,
          0.00000000e+00,   4.46267048e-04,   1.86903183e-04],
       [  0.00000000e+00,   3.18462472e-03,   1.03046723e-02, ...,
          0.00000000e+00,   2.66501328e-02,   1.64496537e-02],
       [  0.00000000e+00,   3.18462472e-03,   1.05243393e-02, ...,
          0.00000000e+00,   6.18281777e-04,   3.32961177e-04]])

In [117]:
pd.DataFrame(features_rescaled, index=dataset.index, columns=dataset.columns)

Unnamed: 0,to_messages,deferral_payments,expenses,poi,deferred_income,long_term_incentive,fraction_from_poi,restricted_stock_deferred,shared_receipt_with_poi,loan_advances,...,bonus,total_stock_value,from_poi_to_this_person,from_this_person_to_poi,restricted_stock,salary,total_payments,fraction_to_poi,exercised_stock_options,total_benefit
ALLEN PHILLIP K,0.191564,0.092345,0.002649,0.0,0.889934,0.006282,0.074518,0.323481,0.254845,0.0,...,0.042889,0.004082,0.089015,0.106732,0.020542,0.007563,0.014471,0.029613,0.005548,0.008348
BADUM JAMES P,0.000000,0.008745,0.000666,0.0,1.000000,0.000000,0.000000,0.328952,0.000000,0.0,...,0.000000,0.000695,0.000000,0.000000,0.019593,0.000000,0.000589,0.000000,0.000827,0.000591
BANNANTINE JAMES M,0.037362,0.003185,0.010754,0.0,0.999818,0.000000,0.317034,0.304630,0.084224,0.0,...,0.000000,0.012168,0.073864,0.000000,0.032815,0.000018,0.002957,0.000000,0.012978,0.008275
BAXTER JOHN C,0.000000,0.043443,0.002139,0.0,0.950485,0.032687,0.000000,0.328952,0.000000,0.0,...,0.012327,0.024548,0.000000,0.000000,0.049254,0.010002,0.018182,0.000000,0.021428,0.021840
BAY FRANKLIN R,0.000000,0.011277,0.024668,0.0,0.992797,0.000000,0.000000,0.325358,0.000000,0.0,...,0.004109,0.000246,0.000000,0.000000,0.020690,0.008975,0.002671,0.000000,0.000000,0.001197
BAZELIDES PHILIP J,0.000000,0.024458,0.000000,0.0,1.000000,0.001932,0.000000,0.328952,0.000000,0.0,...,0.000000,0.003783,0.000000,0.000000,0.019593,0.003026,0.002776,0.000000,0.005131,0.003304
BECK SALLY W,0.482870,0.003185,0.007100,0.0,1.000000,0.000000,0.090575,0.328952,0.477993,0.0,...,0.007191,0.000391,0.272727,0.633826,0.020542,0.008663,0.003127,0.088879,0.000000,0.001471
BELDEN TIMOTHY N,0.527494,0.069798,0.003315,1.0,0.916606,0.000000,0.131278,0.328952,1.000000,0.0,...,0.053933,0.002657,0.431818,0.177340,0.020779,0.008014,0.017754,0.223140,0.003057,0.008883
BELFER ROBERT,0.000000,0.000000,0.000000,0.0,1.000000,0.000000,0.000000,0.330867,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.019593,0.000000,0.000331,0.000000,0.000011,0.000078
BERBERIAN DAVID,0.000000,0.003185,0.002272,0.0,1.000000,0.000000,0.000000,0.328952,0.000000,0.0,...,0.000000,0.005840,0.000000,0.000000,0.026133,0.008110,0.000737,0.000000,0.005210,0.003657
