In [572]:
#!/usr/bin/python
import sys
import pickle
sys.path.append("../tools/")
import numpy as np
import pandas as pd
import matplotlib.pyplot
from sklearn.preprocessing import Imputer
from feature_format import featureFormat, targetFeatureSplit
import tester
from tester import dump_classifier_and_data

### Needed for Features selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import PCA

### Needed for Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

### Needed for tuning the model
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [573]:
### Define the various feature lists needed

# all the intital featurs in dataset except for email_address and shared_receipt_with_poi
initial_features_list = ['poi',
                'salary',
                'bonus', 
                'long_term_incentive', 
                'deferred_income', 
                'deferral_payments',
                'loan_advances', 
                'other',
                'expenses', 
                'director_fees',
                'total_payments',
                'exercised_stock_options',
                'restricted_stock',
                'restricted_stock_deferred',
                'total_stock_value',
                'to_messages',
                'from_messages',
                'from_this_person_to_poi',
                'from_poi_to_this_person']

payment_feature_list = ['salary',
            'bonus', 
            'long_term_incentive', 
            'deferred_income', 
            'deferral_payments',
            'loan_advances', 
            'other',
            'expenses', 
            'director_fees']

stock_feature_list = ['exercised_stock_options',
                'restricted_stock',
                'restricted_stock_deferred']

email_feature_list = [  'to_messages',
                'from_messages',
                'from_this_person_to_poi',
                'from_poi_to_this_person']

features_list = ['poi', 'salary']

In [574]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

my_dataset = data_dict

### Exploring the dataset: 146 people, 21 features
print "Number of people in the dataset: {0}".format(len(my_dataset))
print "Number of features per person in the dataset: {0}".format(len(my_dataset.values()[0]))
print "features included in the dataset: " 
my_dataset.values()[0].keys()

Number of people in the dataset: 146
Number of features per person in the dataset: 21
features included in the dataset: 


['salary',
 'to_messages',
 'deferral_payments',
 'total_payments',
 'exercised_stock_options',
 'bonus',
 'restricted_stock',
 'shared_receipt_with_poi',
 'restricted_stock_deferred',
 'total_stock_value',
 'expenses',
 'loan_advances',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'poi',
 'director_fees',
 'deferred_income',
 'long_term_incentive',
 'email_address',
 'from_poi_to_this_person']

In [575]:
### Transform data from dictionary to the Pandas DataFrame
df = pd.DataFrame.from_dict(data_dict, orient = 'index')

#Order columns in DataFrame, excluding email
df = df[initial_features_list]
df = df.replace('NaN', np.nan)

print "total NaN values in the dataset: {0}" .format (df.isnull().sum().sum())
print "total values in the datase: {0}" .format (sum (df.count () + df.isnull().sum().sum()))
print " total number of rows with NaN values: {0}"\
.format (sum([True for idx,row in df.iterrows() if any(row.isnull())]))
df.info()

total NaN values in the dataset: 1263
total values in the datase: 25508
 total number of rows with NaN values: 146
<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 19 columns):
poi                          146 non-null bool
salary                       95 non-null float64
bonus                        82 non-null float64
long_term_incentive          66 non-null float64
deferred_income              49 non-null float64
deferral_payments            39 non-null float64
loan_advances                4 non-null float64
other                        93 non-null float64
expenses                     95 non-null float64
director_fees                17 non-null float64
total_payments               125 non-null float64
exercised_stock_options      102 non-null float64
restricted_stock             110 non-null float64
restricted_stock_deferred    18 non-null float64
total_stock_value            126 non-null float64
to_messages                  

In [582]:
### impute the "NaN" values

# Replace "NaN" values in financial dataset with 0
df.iloc [:,:15] = df.iloc [:,:15].fillna(0)

# Replace "NaN" values in email feautre set with median values for two categories POI = 1 and POI = 0
email_features = ['to_messages', 'from_messages', 'from_this_person_to_poi', 'from_poi_to_this_person']
imp = Imputer(missing_values='NaN', strategy='median', axis=0)

df.loc[df[df.poi == 1].index,email_features] = imp.fit_transform(df[email_features][df.poi == 1])
df.loc[df[df.poi == 0].index,email_features] = imp.fit_transform(df[email_features][df.poi == 0])

# Reviewing to see if we still have NaN values
print "total NaN values in the dataset is {0}" .format (df.isnull().sum().sum())
df.info()

total NaN values in the dataset is 0
<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 19 columns):
poi                          146 non-null bool
salary                       146 non-null float64
bonus                        146 non-null float64
long_term_incentive          146 non-null float64
deferred_income              146 non-null float64
deferral_payments            146 non-null float64
loan_advances                146 non-null float64
other                        146 non-null float64
expenses                     146 non-null float64
director_fees                146 non-null float64
total_payments               146 non-null float64
exercised_stock_options      146 non-null float64
restricted_stock             146 non-null float64
restricted_stock_deferred    146 non-null float64
total_stock_value            146 non-null float64
to_messages                  146 non-null float64
from_messages                146 non-null floa

In [584]:
### Task 2: Data cleansing: check for typos and miscalculations

# Task 2.1: check for the accuracy of money payments data: summing payments features and comparing with total_payments
df[df[payment_feature_list].sum(axis='columns') != df.total_payments]

# Task 2.2: check for the accuracy of stock payments: summing stock payments and comparing with total_stock_value
df[df[stock_feature_list].sum(axis='columns') != df.total_stock_value]

# Task 2.3: Correct the errors for total_payments and total_stock_value based on the data in the PDF file
df.loc['BELFER ROBERT','total_payments'] = 3285
df.loc['BELFER ROBERT','deferral_payments'] = 0
df.loc['BELFER ROBERT','restricted_stock'] = 44093
df.loc['BELFER ROBERT','restricted_stock_deferred'] = -44093
df.loc['BELFER ROBERT','total_stock_value'] = 0
df.loc['BELFER ROBERT','director_fees'] = 102500
df.loc['BELFER ROBERT','deferred_income'] = -102500
df.loc['BELFER ROBERT','exercised_stock_options'] = 0
df.loc['BELFER ROBERT','expenses'] = 3285
df.loc['BELFER ROBERT',]
df.loc['BHATNAGAR SANJAY','expenses'] = 137864
df.loc['BHATNAGAR SANJAY','total_payments'] = 137864
df.loc['BHATNAGAR SANJAY','exercised_stock_options'] = 1.54563e+07
df.loc['BHATNAGAR SANJAY','restricted_stock'] = 2.60449e+06
df.loc['BHATNAGAR SANJAY','restricted_stock_deferred'] = -2.60449e+06
df.loc['BHATNAGAR SANJAY','other'] = 0
df.loc['BHATNAGAR SANJAY','director_fees'] = 0
df.loc['BHATNAGAR SANJAY','total_stock_value'] = 1.54563e+07
df.loc['BHATNAGAR SANJAY',]

# Reviewing to see if the totals are correct now
df[df[payment_feature_list].sum(axis='columns') != df.total_payments]
df[df[stock_feature_list].sum(axis='columns') != df.total_stock_value]

Unnamed: 0,poi,salary,bonus,long_term_incentive,deferred_income,deferral_payments,loan_advances,other,expenses,director_fees,total_payments,exercised_stock_options,restricted_stock,restricted_stock_deferred,total_stock_value,to_messages,from_messages,from_this_person_to_poi,from_poi_to_this_person


In [585]:
# Task 2.4: identify & remove the outliers using interquantile range (IQR) in descriptive statistics
# IQR = df.quantile(.75)-df.quantile(.25)
# Upper outliers definition: df.quantile(.75) + (1.5 * IQR)
# lower outliers definition: df.quantile(.25) - (1.5 * IQR)

# determine the number of lower outliers for each row/person => we will ignore this based on the results   
lower_outliers = df.quantile(.25) - 1.5 * (df.quantile(.75)-df.quantile(.25))
pd.DataFrame((df[1:] < lower_outliers[1:]).sum(axis = 1), columns = ['# of lower outliers']).\
    sort_values('# of lower outliers',  ascending = [0]).head(7)

# determine the number of upper outliers for each row/person 
upper_outliers = df.quantile(.5) + 1.5 * (df.quantile(.75)-df.quantile(.25))
pd.DataFrame((df[1:] > upper_outliers[1:]).sum(axis = 1), columns = ['# of upper outliers']).\
    sort_values('# of upper outliers',  ascending = [0]).head(7)

# "TOTAL" doesn't add much value to the set so we will remove it.
# Kenneth Lay and Jeffrey Skilling are very important personas in Enron case 
# We will leave the rest of the outliers since they maybe anomalies vs outliers
df = df.drop(['TOTAL'],0)

In [586]:
### Task 3: Create new feature(s) & store in the dataframe

# feature scaling:fraction of person's email to POI to all sent messages
df['to_poi_message_ratio'] = df['from_this_person_to_poi']/df['from_messages']
#clean all 'inf' values which we got if the person's from_messages = 0
df = df.replace('inf', 0)

#feature scaling: fraction of person's email from POI to all messages received
df['from_poi_message_ratio'] = df['from_poi_to_this_person']/df['to_messages']
#clean all 'inf' values which we got if the person's to_messages = 0
df = df.replace('inf', 0)

initial_features_list.extend(['to_poi_message_ratio', 'from_poi_message_ratio'])
features_list.extend(['to_poi_message_ratio', 'from_poi_message_ratio'])


In [587]:
### normalize the training data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_norm = df[initial_features_list]
df_norm = scaler.fit_transform(df_norm.iloc[:,1:])

In [588]:
### Task 4.1: Trying GaussianNB
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Trying GaussianNB => eliminated for final model based on results
clf = GaussianNB()
temp_features_list = ['poi']+range(7)

my_dataset_GNB = pd.DataFrame(SelectKBest(f_classif, k = 7).fit_transform(df_norm, df.poi), index = df.index)
my_dataset_GNB.insert(0, "poi", df.poi)
my_dataset_GNB = my_dataset_GNB.to_dict(orient = 'index')

dump_classifier_and_data(clf, my_dataset_GNB, temp_features_list)
tester.main()

GaussianNB(priors=None)
	Accuracy: 0.86020	Precision: 0.47162	Recall: 0.40300	F1: 0.43462	F2: 0.41508
	Total predictions: 15000	True positives:  806	False positives:  903	False negatives: 1194	True negatives: 12097



In [589]:
# Trying PCA + GaussianNB => eliminated for final model based on results
pca = PCA(n_components=3)
temp_features_list = ['poi']+range(3)
my_dataset_GNB = pd.DataFrame(SelectKBest(f_classif, k=8).fit_transform(df_norm, df.poi), index = df.index)
PCA_dataset = pd.DataFrame(pca.fit_transform(my_dataset_GNB),  index=df.index)
PCA_dataset.insert(0, "poi", df.poi)
PCA_dataset = PCA_dataset.to_dict(orient = 'index')  

dump_classifier_and_data(clf, PCA_dataset, temp_features_list)
tester.main()

GaussianNB(priors=None)
	Accuracy: 0.87213	Precision: 0.52774	Recall: 0.39000	F1: 0.44853	F2: 0.41148
	Total predictions: 15000	True positives:  780	False positives:  698	False negatives: 1220	True negatives: 12302



In [590]:
# Trying Decision tree => eliminated for final model based on the results
clf = DecisionTreeClassifier(random_state = 75)
my_dataset_DT = df[initial_features_list].to_dict(orient = 'index')
tester.dump_classifier_and_data(clf, my_dataset_DT, initial_features_list)
tester.main() 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=75, splitter='best')
	Accuracy: 0.86793	Precision: 0.50475	Recall: 0.50500	F1: 0.50487	F2: 0.50495
	Total predictions: 15000	True positives: 1010	False positives:  991	False negatives:  990	True negatives: 12009



In [591]:
# Trying Decision tree with feature importance => eliminated for final model based on the results
clf = DecisionTreeClassifier(random_state = 75)
my_dataset_DT = df[initial_features_list].to_dict(orient = 'index')
clf.fit(df_norm, df['poi'])

# create and sort features_list of non-null importance features for the model
features_importance = []
len (clf.feature_importances_)
for i in range(len(clf.feature_importances_)):
   if clf.feature_importances_[i] > 0:
       features_importance.append([df.columns[i+1], clf.feature_importances_[i]])
features_importance.sort(key=lambda x: x[1], reverse = True)
for f_i in features_importance:
    print f_i
features_list = [x[0] for x in features_importance]
features_list.insert(0, 'poi')

# Searchgrid for tuning parameters
param_grid = {'bootstrap': [False],
 'criterion': ['entropy'],
 'max_depth': [None],
 'max_features': [1],
 'min_samples_leaf': [1],
 'min_samples_split': [9]}

grid_search = GridSearchCV(clf,param_grid=param_grid)

my_dataset_DT = df[features_list].to_dict(orient = 'index')
tester.dump_classifier_and_data(clf, my_dataset_DT, features_list)
tester.main() 

['to_poi_message_ratio', 0.34758155230596183]
['expenses', 0.31100626310600066]
['to_messages', 0.13531641878098569]
['total_stock_value', 0.084572761738116065]
['deferred_income', 0.070477301448430049]
['from_messages', 0.051045702620505777]
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=75, splitter='best')
	Accuracy: 0.89500	Precision: 0.61530	Recall: 0.56700	F1: 0.59016	F2: 0.57604
	Total predictions: 15000	True positives: 1134	False positives:  709	False negatives:  866	True negatives: 12291



In [553]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. 

#data = featureFormat(my_dataset, initial_features_list, sort_keys = True)
#labels, features = targetFeatureSplit(data)
#features_train, features_test, labels_train, labels_test = \
#train_test_split(features, labels, test_size=0.3, random_state=42)

### Trying random forest along with grid_search for tuning the parameters
clf = RandomForestClassifier()
my_dataset = df[features_list].to_dict(orient = 'index')

# Searchgrid for random forest: specify parameters and distributions to sample from
param_grid = {'bootstrap': [False],
 'criterion': ['entropy'],
 'max_depth': [None],
 'max_features': [1],
 'min_samples_leaf': [1],
 'min_samples_split': [9]}

grid_search = GridSearchCV(clf,param_grid=param_grid)
dump_classifier_and_data(clf, my_dataset, features_list)
tester.main()
#grid_search.fit(features_train, labels_train)
#predictions=grid_search.predict(features_test)
#print classification_report(labels_test,predictions)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
	Accuracy: 0.88893	Precision: 0.66277	Recall: 0.34000	F1: 0.44944	F2: 0.37669
	Total predictions: 15000	True positives:  680	False positives:  346	False negatives: 1320	True negatives: 12654



In [410]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)