In [76]:
#Inspirations :
#https://github.com/MarcCollado/enron
#https://scikit-learn.org/stable/tutorial/machine_learning_map/
#https://williamkoehrsen.medium.com/machine-learning-with-python-on-the-enron-dataset-8d71015be26d
#https://review.udacity.com/#!/rubrics/27/view
#https://nbviewer.jupyter.org/github/riched158/Data-Analyst-Udacity-Nanodegree/blob/master/P5/poi_id.html
#https://www.kaggle.com/tsilveira/machine-learning-tutorial-enron-e-mails


import sys
import math
import operator
import pickle
from pprint import pprint as pp
import numpy as np
#np.set_printoptions(threshold=sys.maxsize)
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import GenericUnivariateSelect, chi2
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn import preprocessing



#sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



def isnan(value):
    """Return True when *value* converts to a float NaN, False otherwise.

    Anything ``float()`` accepts is supported (numbers, numeric strings,
    the string ``'NaN'``).  Values that cannot be converted to a float
    (``None``, arbitrary text, containers, integers too large for a float)
    are reported as "not NaN" instead of raising.
    """
    try:
        return math.isnan(float(value))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are expected here; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return False












### Task 1: Select what features you'll use
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# The label 'poi' stays in first position; the remaining entries keep their
# original order.  'email_address' is intentionally left out of the list.
features_list = [
    'poi',
    'salary', 'to_messages', 'deferral_payments', 'total_payments',
    'exercised_stock_options', 'bonus', 'restricted_stock',
    'shared_receipt_with_poi', 'restricted_stock_deferred',
    'total_stock_value', 'expenses', 'loan_advances', 'from_messages',
    'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income',
    'long_term_incentive', 'from_poi_to_this_person',
]



### Load the dictionary containing the dataset
# Pickle data is a byte stream: open the file in binary mode ("rb") so the
# load also works on Windows and under Python 3, where text mode can alter
# or reject the bytes.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load a dataset file that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)



### KPI (exploratory dataset statistics; the code below is disabled by being wrapped in a string literal)

"""

print "Total people:", len(data_dict)
print

poi = set()
for empl in data_dict:
    if data_dict[empl]['poi'] == True:poi.add(empl)
print "Total POI = {} / non-POI = {}".format(len(poi), (len(data_dict) - len(poi)))
print "Ratio POI = {} % / non-POI = {} %".format(\
    round((float(len(poi))/len(data_dict))*100, 2) , round((float((len(data_dict) - len(poi)))/len(data_dict))*100, 2))
print

print "Total label + features:", len(data_dict['LAY KENNETH L'])
feature_nan_dict = dict()
for _, values in data_dict.items():
    for key, value in values.items():
        if isnan(value):
            if key not in feature_nan_dict:
                feature_nan_dict[key] = 1
            else: feature_nan_dict[key] += 1
print "Quantity of NaN per feature:"
pp(feature_nan_dict)

"""

















### Task 2: Remove outliers

# Flag the records that are mostly empty: people for whom at least
# `ratio_high_NaN` of the fields are NaN.  The result is only printed for
# inspection; nothing is removed from data_dict here.
ratio_high_NaN = 0.8
high_NaN = dict()
for name, values in data_dict.items():
    if not values:
        # A record without any field has no ratio to compute.
        continue
    nan_count = sum(1 for value in values.values() if isnan(value))
    # Divide by the record's actual number of fields rather than a
    # hard-coded 21, so the ratio stays right if fields are added/removed.
    nan_ratio = float(nan_count) / len(values)
    if nan_ratio >= ratio_high_NaN:
        high_NaN[name] = nan_ratio

pp(high_NaN)







# Replace every NaN entry with 0, in place, in each person's record.
for record in data_dict.values():
    nan_fields = [field for field in record if isnan(record[field])]
    for field in nan_fields:
        record[field] = 0





# Count before
#print "nb of poi = {}, nb of empl. = {}. Ratio = {} %".format(len(poi), len(data_dict),\
#                                                            round( float( len(poi)) / len(data_dict), 4)*100 )


# Computing the IQR using pandas functionality.
# The DataFrame must be built here: `pd_data` is otherwise only created
# further down the file, so running the script top to bottom raised a
# NameError on the next line.
pd_data = pd.DataFrame.from_dict(data_dict, orient='index')
Q1 = pd_data.quantile(0.25)
Q3 = pd_data.quantile(0.75)
IQR = Q3 - Q1
# Cells lying outside the [Q1 - 1.5*IQR, Q3 + 1.5*IQR] fences of their column.
is_outlier = (pd_data > (Q3 + 1.5 * IQR)) | (pd_data < (Q1 - 1.5 * IQR))
# Per person: how many features fall outside the fences, next to the label.
series_outl = [pd_data[is_outlier].count(axis=1), pd_data['poi']]
series_outl[0].name = "bad_feats"
outl = pd.concat(series_outl, axis=1)

# People with the most out-of-fence features first.
outl.sort_values(by='bad_feats', ascending=False, inplace=True)
#print outl.head(8)

# Drop the records treated as outliers; a name that is not present in
# data_dict is skipped silently.
outliers = (
    'LAY KENNETH L',
    'TOTAL',
    'FREVERT MARK A',
    'BELDEN TIMOTHY N',
    'BAXTER JOHN C',
    'SKILLING JEFFREY K',
    'LAVORATO JOHN J',
    'HAEDICKE MARK E',
)
for k in outliers:
    if k in data_dict:
        del data_dict[k]


# Count after
# (Comment kept ASCII-only: without an encoding declaration a non-ASCII
# character in the source is a SyntaxError when run as a Python 2 script.)
# poi2 = names of the people still in data_dict whose 'poi' flag is set.
poi2 = {empl for empl, record in data_dict.items() if record['poi']}

#print "nb of poi = {}, nb of empl. = {}. Ratio = {} %".format(len(poi2), len(data_dict),\
#                                                            round( float( len(poi2)) / len(data_dict), 4)*100 )

















### Task 2 bis: Cleaning data

# Check sums
# Components whose sum is compared against 'total_payments'.
payment_sum = [
    'salary',
    'deferral_payments',
    'bonus',
    'expenses',
    'loan_advances',
    'other',
    'director_fees',
    'deferred_income',
    'long_term_incentive',
]

# Components whose sum is compared against 'total_stock_value'.
stock_sum = [
    'exercised_stock_options',
    'restricted_stock',
    'restricted_stock_deferred',
]

# Build the frame from the current content of data_dict: relying on a
# `pd_data` defined elsewhere would raise a NameError (or read stale data)
# when the script is run from top to bottom.
pd_data = pd.DataFrame.from_dict(data_dict, orient='index')

# True for each person whose components do not add up to the stated total.
delta_pay = pd_data[payment_sum].sum(axis=1) != pd_data['total_payments']
delta_stock = pd_data[stock_sum].sum(axis=1) != pd_data['total_stock_value']
#print list(delta_pay[delta_pay==True].index), list(delta_pay[delta_stock==True].index)



# Corrections
# Overwrite the financial fields of two records with hand-entered values
# (presumably the records flagged by the check sums -- verify against the
# source data).
data_dict['BELFER ROBERT'].update({
    'deferral_payments': 0,
    'total_payments': 3285,
    'exercised_stock_options': 0,
    'restricted_stock': 44093,
    'restricted_stock_deferred': -44093,
    'total_stock_value': 0,
    'expenses': 3285,
    'deferred_income': -102500,
    'director_fees': 102500,
})

data_dict['BHATNAGAR SANJAY'].update({
    'total_payments': 137864,
    'exercised_stock_options': 15456290,
    'restricted_stock': 2604490,
    'restricted_stock_deferred': -2604490,
    'total_stock_value': 15456290,
    'expenses': 137864,
    'other': 0,
    'director_fees': 0,
})

# Rebuild a frame from the (now corrected) records and run both check sums
# again; a True entry marks a person whose total still differs from the
# sum of its components.
pd_data_corr = pd.DataFrame.from_dict(data_dict, orient='index')

delta_pay = pd_data_corr['total_payments'] != pd_data_corr[payment_sum].sum(axis=1)
delta_stock = pd_data_corr['total_stock_value'] != pd_data_corr[stock_sum].sum(axis=1)

#print list(delta_pay[delta_pay==True].index), list(delta_pay[delta_stock==True].index)



















### Task 3: Create new feature(s)

def computeFraction(poi_messages, all_messages):
    """Return the fraction of a person's messages exchanged with a POI.

    poi_messages -- number of messages to/from a POI (numerator)
    all_messages -- number of all messages to/from the person (denominator)

    Both arguments may be numbers or anything ``float()`` accepts, such as
    the string ``'NaN'``.

    Returns ``poi_messages / all_messages`` rounded to 3 decimals.  When
    either count is NaN or zero there is no meaningful ratio and 0. is
    returned; in particular a zero denominator no longer raises
    ZeroDivisionError and a NaN denominator no longer yields NaN.
    """
    poi_messages = float(poi_messages)
    all_messages = float(all_messages)

    # Missing counts: nothing to divide.
    if math.isnan(poi_messages) or math.isnan(all_messages):
        return 0.
    # Zero numerator gives 0 anyway; zero denominator would divide by zero.
    if poi_messages == 0 or all_messages == 0:
        return 0.

    return round(poi_messages / all_messages, 3)



# Add two derived features to every record, in place:
#   fraction_from_poi = from_poi_to_this_person / to_messages
#   fraction_to_poi   = from_this_person_to_poi / from_messages
for data_point in data_dict.values():
    data_point["fraction_from_poi"] = computeFraction(
        data_point["from_poi_to_this_person"],
        data_point["to_messages"],
    )
    data_point["fraction_to_poi"] = computeFraction(
        data_point["from_this_person_to_poi"],
        data_point["from_messages"],
    )

    

#Correlation Matrix with Heatmap (from https://towardsdatascience.com/)
#To identify added value of new features
# One row per person (dict key), one column per field of the records.
pd_data = pd.DataFrame.from_dict(data_dict, orient='index')

# Pairwise correlation between the columns of pd_data.
corrmat = pd_data.corr()
# Labels of the correlation matrix; only referenced by the disabled
# heatmap code below.
top_corr_features = corrmat.index
#plt.figure(figsize=(20,20))
#g=sns.heatmap(pd_data[top_corr_features].corr(),annot=True,cmap="RdYlGn")

















### Store to my_dataset for easy export below.
# my_dataset is another name for the same dict object as data_dict (no copy).
my_dataset = data_dict



### Extract features and labels from dataset for local testing
# featureFormat / targetFeatureSplit come from the project's feature_format
# module; presumably they return a numeric array with the label in the first
# column and then split it into labels and features -- TODO confirm against
# that module.
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

# Print numpy floats without scientific notation, with zero decimals in a
# 16-character field, on lines up to 130 characters wide.
np.set_printoptions(suppress=True,
   formatter={'float_kind':'{:16.0f}'.format}, linewidth=130)





























# Min-max scaling: rescales every feature column (to [0, 1] with the
# scaler's default range), which also gives chi2 the non-negative inputs
# it requires.
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)


# Univariate feature selector with configurable strategy
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.GenericUnivariateSelect.html

# Number of best-scoring features to keep.
k = 10

# print(...) with a single argument behaves the same under Python 2 and 3.
print(len(features[0]))
transformer = GenericUnivariateSelect(chi2, mode='k_best', param=k)
features_uni = transformer.fit_transform(features, labels)
scores = transformer.scores_
# Pair each feature name (the label 'poi' is skipped) with its chi2 score.
feat_scores = list(zip(features_list[1:], scores))
# Sort the pairs directly: going through a dict first added nothing and made
# the order of equal scores depend on dict ordering.
feat_scores_order = sorted(feat_scores, key=operator.itemgetter(1), reverse=True)

# Show as many top features as the selector keeps.
pp(feat_scores_order[:k])
print(len(features_uni[0]))

























### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
# The classifier is only instantiated here; it is not fitted anywhere in
# the visible part of this file.
clf = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples; random_state fixes the shuffle so the split
# is reproducible.  The four resulting arrays are not used further in the
# visible part of this file.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# Hands the classifier, the dataset dict and the feature names to the
# project's tester module (presumably written out as .pkl files -- see
# tester.py).
dump_classifier_and_data(clf, my_dataset, features_list)

{'CLINE KENNETH W': 0.8095238095238095,
 'GILLIS JOHN': 0.8095238095238095,
 'GRAMM WENDY L': 0.8571428571428571,
 'LOCKHART EUGENE E': 0.9523809523809523,
 'SAVAGE FRANK': 0.8095238095238095,
 'SCRIMSHAW MATTHEW': 0.8095238095238095,
 'THE TRAVEL AGENCY IN THE PARK': 0.8571428571428571,
 'WAKEHAM JOHN': 0.8095238095238095,
 'WHALEY DAVID A': 0.8571428571428571,
 'WODRASKA JOHN': 0.8095238095238095,
 'WROBEL BRUCE': 0.8571428571428571}
19
[('exercised_stock_options', 2.9002530056800802),
 ('total_stock_value', 2.8494853617346152),
 ('bonus', 2.3407059657201494),
 ('salary', 1.9923391654947711),
 ('from_poi_to_this_person', 1.7797177306026659),
 ('expenses', 1.46399832108206),
 ('director_fees', 1.3752164329335206),
 ('long_term_incentive', 1.3072310950338606),
 ('from_this_person_to_poi', 1.2279728928522513),
 ('shared_receipt_with_poi', 0.97799523929517618)]
10
