In [1]:
#!/usr/bin/python
print("#############LOADING PACKAGES...    #####################")
from __future__ import division
import os
import sys
sys.path.append("..\\tools") # see tester.py
import pickle
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
print("#############LOADING VISUALIZATION LIBRARIES... #########")
import matplotlib.pyplot as plt
plt.style.use('classic')
import seaborn as sns
%matplotlib inline
#######################################################################
import pandas as pd
import numpy as np
import pprint
print("#############LOADING SCIKIT-LEARN PACKAGES... ############")
# from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.grid_search import GridSearchCV    # --> now under model_selection
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit#, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from ml_functions import *


# ---------------------------------------------------------------------------
# Load the pickled nested dictionary and turn it into a DataFrame.
# load_dict / dict_to_df / cols_to_numeric come from ml_functions.
# ---------------------------------------------------------------------------
print("############# LOAD NESTED DICTIONARY - DATASET... #######")
print("############# AND CREATING DATAFRAME... #################")
data_dict = load_dict("final_project_dataset.pkl")
df = dict_to_df(data_dict)

# Binary flag: 1 when the e-mail field holds something other than the "NaN"
# placeholder string, 0 otherwise.
df["bin_email_address"] = np.where(df["email_address"].ne("NaN"), 1, 0)

# NOTE(review): presumably casts every column except "email_address" to a
# numeric dtype -- confirm against ml_functions.cols_to_numeric.
df = cols_to_numeric(df, "email_address")
# show_null_values(df, False)


# ---------------------------------------------------------------------------
# Remove unwanted columns, then unwanted rows, announcing each list first.
# ---------------------------------------------------------------------------
print("######## DELETING THE FOLLOWING FEATURES... #############")
features_to_drop = [
    "loan_advances",
    "director_fees",
    "restricted_stock_deferred",
    "deferral_payments",
]
print(features_to_drop)
df = drop_bad_features(df, features_to_drop)
print("#########################################################")
print("############# DROPING THE FOLLOWING OUTLIERS...  ########")

# Records excluded from the analysis (row labels of the DataFrame).
outliers_to_drop = [
    "TOTAL",
    "THE TRAVEL AGENCY IN THE PARK",
    "LAVORATO JOHN J",
    "FREVERT MARK A",
    "ALLEN PHILLIP K",
    "LOCKHART EUGENE E",
    "MCMAHON JEFFREY",
    "FALLON JAMES B",
    "KITCHEN LOUISE",
    "WHALLEY LAWRENCE G",
    "SHANKMAN JEFFREY A",
    "HICKERSON GARY J",
]
print(outliers_to_drop)
df = drop_outliers(df, outliers_to_drop)
print("############# CREATING NEW FEATURES... ##################")
# Fraction of a person's received mail that came from a POI, and fraction of
# their sent mail that went to a POI.
df["ratio_from_poi"] = df["from_poi_to_this_person"] / df["to_messages"]
df["ratio_to_poi"] = df["from_this_person_to_poi"] / df["from_messages"]

# Log-transformed copies of both ratios (helper defined in ml_functions).
# Order matters only for the resulting column order: "to" first, then "from".
for ratio_col in ("ratio_to_poi", "ratio_from_poi"):
    df["log_" + ratio_col] = df[ratio_col].apply(
        take_log_on_non_zero_entries_of_this_feature
    )

# Any remaining "NaN" placeholder strings become 0.
df = df.replace("NaN", 0)
print("#########################################################")
print("############# FEATURES LIST... ######################")
print("#########################################################")
# Candidate feature lists. The keys presumably name the selection approach
# that produced each list (confirm with the author). In every list the first
# entry is the label column "poi"; the remaining entries are the predictors
# (downstream code slices them off with features_list[1:]).
#
# Each predictor must appear only once per list: a repeated name would feed
# the same column to the model twice.
features_dict = {
    "SelectKBest": [
        "poi", "salary", "ratio_to_poi",
        "total_stock_value",
        "exercised_stock_options",
        "bonus", "deferred_income",
        "total_payments", "restricted_stock", "other",
    ],
    "Lasso": [
        "poi", "from_poi_to_this_person",
        "from_this_person_to_poi",
        "shared_receipt_with_poi",
        "total_stock_value",
    ],
    "RandomForest": [
        "poi", "shared_receipt_with_poi", "salary",
        "exercised_stock_options", "from_poi_to_this_person",
        "other", "from_this_person_to_poi", "deferred_income",
        "expenses",
        "long_term_incentive", "restricted_stock",
    ],
    "DecisionTree": [
        "poi", "bonus", "expenses",
        "exercised_stock_options", "restricted_stock",
    ],
    "all": [
        "poi", "bonus", "deferred_income", "bin_email_address",
        "exercised_stock_options", "expenses", "from_messages",
        "from_poi_to_this_person", "from_this_person_to_poi",
        "long_term_incentive", "other", "restricted_stock",
        "salary", "shared_receipt_with_poi", "to_messages",
        "total_payments", "total_stock_value", "ratio_from_poi",
        "ratio_to_poi", "log_ratio_to_poi", "log_ratio_from_poi",
    ],
    "RF_clf_on_DT_list": [
        "poi", "bonus", "expenses",
        "exercised_stock_options", "restricted_stock",
    ],
    "RF_clf_on_DT_list2": [
        "poi", "bonus", "expenses", "exercised_stock_options",
    ],
}
###########################################################################################
# Change the key on the next line to run the rest of the notebook on another list.
features_list = features_dict["RF_clf_on_DT_list2"]
###########################################################################################
print(features_list)
print("#########################################################")
print("############# CREATE MY_DATASET... ######################")
print("#########################################################")
# Zero-fill whatever is still missing, then go back to the nested-dict form
# (transpose_to_dict comes from ml_functions).
df = df.replace(to_replace=np.nan, value=0)
my_dataset = transpose_to_dict(df)

print("#############b############################################")
print("############# CREATE LABELS AND FEATURES... #############")
print("#########################################################")
# NOTE(review): the meaning of the third positional argument (True) is
# defined in ml_functions.labels_features -- confirm there.
labels, features = labels_features(my_dataset, features_list, True)

print("#########################################################")
print("############# FEATURE SCALING ...   #####################")
print("#########################################################")
# Standardise the predictors in one step (fit and transform on the same data).
# NOTE(review): the scaler is fitted on every row before the cross-validated
# grid search runs, and it is not a step of the pipeline handed to the tester.
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Labelled view of the scaled predictors; features_list[0] is the label "poi",
# so it is skipped when naming the columns.
df_feat = pd.DataFrame(features, columns=features_list[1:])

print("#########################################################")
print("###### FEATURE SELECTION ... ############################")
print("###### PARAMETER GRID ... ###############################")
# Univariate feature selection scored with the ANOVA F-value.
kbest = SelectKBest(f_classif)

# Candidate estimators. Only `dt` is wired into the pipeline below; the others
# are instantiated so they can be swapped in by editing the pipeline step.
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
ada = AdaBoostClassifier()
lr = LogisticRegression()
knn = KNeighborsClassifier()

pipeline = Pipeline([
    ("kbest", kbest),
    ("dt", dt),
])

# Parameter grid for the pipeline above. Keys use the "<step>__<param>"
# convention so GridSearchCV can route each value to the right step.
# NOTE: keep this a plain dict -- a trailing comma after the closing brace
# would silently turn `pg` into a 1-tuple wrapping the dict.
pg = {
    "kbest__k": [1, 2, 3],
    "dt__min_samples_split": np.arange(10, 120, 30).tolist(),
    "dt__criterion": ["gini", "entropy"],
    "dt__splitter": ["best", "random"],
    "dt__max_depth": [3, 6, 8, 11],
}

# Reference grids for the other estimators (not used by the pipeline above;
# prefix the keys with the pipeline step name before using them):
#   rf:  {"n_estimators": np.arange(20, 220, 40).tolist(),
#         "min_samples_split": np.arange(2, 12, 3).tolist(),
#         "criterion": ['gini', 'entropy']}
#   knn: {'n_neighbors': np.arange(1, 6).tolist(),
#         'weights': ['distance', 'uniform'],
#         'algorithm': ['kd_tree', 'ball_tree', 'auto', 'brute']}
#   ada: {'algorithm': ['SAMME', 'SAMME.R'],
#         'n_estimators': [25, 50, 100],
#         'learning_rate': [.5, 1., 1.5]}
#   lr:  {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}

###########################################################################################
# clf = clf_dict["rf"]
# pg = pg_dict["rf"]
# print(pg_dict["rf"])
###########################################################################################
print("#########################################################")
print("###### EXAMINE FIRST TUPLE OF SCORES... #################")
print("#########################################################")
# Exhaustive search over `pg`, 5-fold cross-validation, ranked by accuracy.
grid = GridSearchCV(estimator=pipeline, param_grid=pg, cv=5, scoring="accuracy")
grid.fit(features, labels)

# Look at the first evaluated parameter combination: its settings, the
# per-fold scores and their mean.
# NOTE(review): grid_scores_ belongs to the legacy sklearn.grid_search API
# imported at the top of this file.
first_score = grid.grid_scores_[0]
print(first_score.parameters)
print(first_score.cv_validation_scores)
print(first_score.mean_validation_score)

print("#########################################################")
print("###### CREATE A LIST WITH THE MEAN SCORES... ############")
print("#########################################################")
# grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
# print(grid_mean_scores)

print("#########################################################")
print("###### EXAMINE THE BEST MODEL... ########################")
print("#########################################################")
# Best cross-validated score, the parameters that achieved it, and the
# corresponding pipeline.
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

# The winning pipeline is what gets evaluated and exported further down.
clf = grid.best_estimator_

#################################################################
### Decision Tree importances
################################################################
# feat_list = features_list[1:]

# print("#########################################################")
# print("###### GINI IMPORTANCE OF EACH FEATURE... ###############")
# print("#########################################################")
# # Print the name and gini importance of each feature
# features_list = ["poi"]
# features_importances = zip(feat_list, clf.feature_importances_)
# for f, s in sorted(features_importances, key = lambda x:x[1], reverse = True):
#     print('{:>25}: {:.3f}'.format(f, s))
#     if s > 0.0:
#         features_list.append(f)

print("#########################################################")
print("###### FEATURES LIST FOR THE TESTER... ##################")
print("#########################################################")
print(features_list)

# Keep only the label and the selected predictors, then rebuild the
# nested-dict dataset from that reduced frame.
df = df[features_list]
my_dataset = transpose_to_dict(df)

print("#########################################################")
print("###### PRINT CLASSIFICATION REPORT... ###################")
print("#########################################################")
# NOTE(review): the predictions are made on the same `features` the grid
# search was fitted on, so this report describes training-set performance.
report = classification_report(labels, clf.predict(features))
# Function-call form of print: consistent with the rest of the file and valid
# on both Python 2 and Python 3.
print(report)

# Hand the classifier, dataset and feature list to the tester module
# (dump_classifier_and_data is imported from tester; presumably it pickles
# them for tester.py to reload -- confirm there).
dump_classifier_and_data(clf, my_dataset, features_list)

# Banner only: the feature-score printout below is currently disabled.
print("#########################################################")
print("####### GETTING FEATURE SCORES... #######################")
print("#########################################################")
# Disabled snippet that would print each feature's SelectKBest score in
# descending order.
# NOTE(review): the parameter path below refers to a step called
# 'SelectKBest' with a transformer_list, which does not match the
# ("kbest", "dt") pipeline built earlier -- adjust before re-enabling.
# k = grid.get_params(True)['estimator__SelectKBest__transformer_list'][0][1]
# features_scores = zip(features_list[1:], k.scores_)
# for f, s in sorted(features_scores, key=lambda x: x[1], reverse=True):
#     print('%s: %s'%(f, s))


print("#########################################################")
print("####### RUNNING TESTER.PY ###############################")
print("#########################################################")
# IPython magic (not plain Python): executes tester.py from the notebook's
# working directory. Presumably it evaluates the classifier exported by
# dump_classifier_and_data -- confirm in tester.py.
%run "tester.py"

#############LOADING PACKAGES...    #####################
#############LOADING VISUALIZATION LIBRARIES... #########
#############LOADING SCIKIT-LEARN PACKAGES... ############
############# LOAD NESTED DICTIONARY - DATASET... #######
############# AND CREATING DATAFRAME... #################


  from numpy.core.umath_tests import inner1d


######## DELETING THE FOLLOWING FEATURES... #############
['loan_advances', 'director_fees', 'restricted_stock_deferred', 'deferral_payments']
#########################################################
############# DROPING THE FOLLOWING OUTLIERS...  ########
['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LAVORATO JOHN J', 'FREVERT MARK A', 'ALLEN PHILLIP K', 'LOCKHART EUGENE E', 'MCMAHON JEFFREY', 'FALLON JAMES B', 'KITCHEN LOUISE', 'WHALLEY LAWRENCE G', 'SHANKMAN JEFFREY A', 'HICKERSON GARY J']
############# CREATING NEW FEATURES... ##################
#########################################################
############# FEATURES LIST... ######################
#########################################################
['poi', 'bonus', 'expenses', 'exercised_stock_options']
#########################################################
############# CREATE MY_DATASET... ######################
#########################################################
#############b##############################

  if np.issubdtype(mask.dtype, np.int):


{'dt__criterion': 'gini', 'dt__max_depth': 3, 'kbest__k': 1, 'dt__min_samples_split': 10, 'dt__splitter': 'best'}
[0.92       0.92       0.88       0.95833333 0.875     ]
0.9105691056910569
#########################################################
###### CREATE A LIST WITH THE MEAN SCORES... ############
#########################################################
#########################################################
###### EXAMINE THE BEST MODEL... ########################
#########################################################
0.926829268292683
{'dt__criterion': 'entropy', 'dt__max_depth': 6, 'kbest__k': 3, 'dt__min_samples_split': 10, 'dt__splitter': 'random'}
Pipeline(steps=[('kbest', SelectKBest(k=3, score_func=<function f_classif at 0x0EF971F0>)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            pr

In [2]:
#######################################################
# All Classifiers over the selected features list
#######################################################
# Metrics recorded for each classifier, keyed by classifier name and then by
# metric name.
RF_dict = {
    "LogisticRegression": {
        "Accuracy": 0.63949,
        "Precision": 0.11167,
        "Recall": 0.23980,
        "F1": 0.15238,
        "F2": 0.19504,
        "Total predictions": 37000,
        "True positives": 1199,
        "False positives": 9538,
        "False negatives": 3801,
        "True negatives": 22462,
    },
    "AdaBoost": {
        "Accuracy": 0.88508,
        "Precision": 0.57968,
        "Recall": 0.54420,
        "F1": 0.56138,
        "F2": 0.55094,
        "Total predictions": 37000,
        "True positives": 2721,
        "False positives": 1973,
        "False negatives": 2279,
        "True negatives": 30027,
    },
    "RandomForest": {
        "Accuracy": 0.90811,
        "Precision": 0.74585,
        "Recall": 0.48540,
        "F1": 0.58808,
        "F2": 0.52185,
        "Total predictions": 37000,
        "True positives": 2427,
        "False positives": 827,
        "False negatives": 2573,
        "True negatives": 31173,
    },
    "DecisionTree": {
        "Accuracy": 0.89200,
        "Precision": 0.62972,
        "Recall": 0.48740,
        "F1": 0.54949,
        "F2": 0.51047,
        "Total predictions": 37000,
        "True positives": 2437,
        "False positives": 1433,
        "False negatives": 2563,
        "True negatives": 30567,
    },
    "K-NearestNeighbors": {
        "Accuracy": 0.84557,
        "Precision": 0.43413,
        "Recall": 0.47060,
        "F1": 0.45163,
        "F2": 0.46282,
        "Total predictions": 37000,
        "True positives": 2353,
        "False positives": 3067,
        "False negatives": 2647,
        "True negatives": 28933,
    },
}

# Presentation order: metrics down the rows, classifiers across the columns.
metric_rows = [
    "Accuracy", "Precision", "Recall", "F1", "F2",
    "Total predictions",
    "True positives", "False positives",
    "False negatives", "True negatives",
]
classifier_columns = [
    "LogisticRegression", "AdaBoost", "RandomForest",
    "DecisionTree", "K-NearestNeighbors",
]

# NOTE: this rebinds `df`, replacing the dataset frame built in the first cell.
df = pd.DataFrame(RF_dict).reindex(index=metric_rows, columns=classifier_columns)
df

Unnamed: 0,LogisticRegression,AdaBoost,RandomForest,DecisionTree,K-NearestNeighbors
Accuracy,0.63949,0.88508,0.90811,0.892,0.84557
Precision,0.11167,0.57968,0.74585,0.62972,0.43413
Recall,0.2398,0.5442,0.4854,0.4874,0.4706
F1,0.15238,0.56138,0.58808,0.54949,0.45163
F2,0.19504,0.55094,0.52185,0.51047,0.46282
Total predictions,37000.0,37000.0,37000.0,37000.0,37000.0
True positives,1199.0,2721.0,2427.0,2437.0,2353.0
False positives,9538.0,1973.0,827.0,1433.0,3067.0
False negatives,3801.0,2279.0,2573.0,2563.0,2647.0
True negatives,22462.0,30027.0,31173.0,30567.0,28933.0


In [3]:
#######################################################
# RANDOMFOREST OVER ALL LISTS
#######################################################
# RF_dict = {"SelectKBest": {"Accuracy": 0.84362,
#             "Precision": 0.37818,
#             "Recall": 0.24400,
#             "F1": 0.29662,
#             "F2": 0.26264,
#             "Total predictions": 37000,
#             "True positives": 1220,
#             "False positives": 2006,
#             "False negatives": 3780,
#             "True negatives": 29994},
# "Lasso": {"Accuracy": 0.81789,
#             "Precision": 0.22290,
#             "Recall": 0.13980,
#             "F1": 0.17183,
#             "F2": 0.15106,
#             "Total predictions": 37000,
#             "True positives": 699,
#             "False positives": 2437,
#             "False negatives": 4301,
#             "True negatives": 29563},
# "RandomForest": {"Accuracy": 0.83377,
#             "Precision": 0.35877,
#             "Recall": 0.20780,
#             "F1": 0.26317,
#             "F2": 0.22690,
#             "Total predictions": 35000,
#             "True positives": 1039,
#             "False positives": 1857,
#             "False negatives": 3961,
#             "True negatives": 28143},
# "DecisionTree": {"Accuracy": 0.90926,
#             "Precision": 0.70630,
#             "Recall": 0.50020,
#             "F1": 0.58565,
#             "F2": 0.53120,
#             "Total predictions": 39000,
#             "True positives": 2501,
#             "False positives": 1040,
#             "False negatives": 2499,
#             "True negatives": 32960},
# "RF_clf_on_DT_list": {'Accuracy': 0.90811,
#             'Precision': 0.74585,
#             'Recall': 0.48540,
#             'F1': 0.58808,
#             'F2': 0.52185,
#             'Total predictions': 37000,
#             'True positives': 2427,
#             'False positives':  827,
#             'False negatives': 2573,
#             'True negatives': 31173}}


# df = pd.DataFrame(RF_dict)

# df = df.reindex(index = ["Accuracy","Precision", "Recall", "F1",
#             "F2",
#             "Total predictions",
#             "True positives",
#             "False positives",
#             "False negatives",
#             "True negatives"])

# df = df[["SelectKBest", "Lasso", "RandomForest", "DecisionTree"]]; df

In [4]:
######################################################
# DECISIONTREE OVER ALL LISTS
######################################################
# DT_dict = {"all features": {'Accuracy': 0.87107,
#             'Precision': 0.56409,
#             'Recall': 0.52367,
#             'F1': 0.54313,
#             'F2': 0.53128,
#             'Total predictions': 41000,
#             'True positives': 3142,
#             'False positives': 2428,
#             'False negatives': 2858,
#             'True negatives': 32572},
# "SelectKBest": {'Accuracy': 0.85512,
#             'Precision': 0.50545,
#             'Recall': 0.46333,
#             'F1': 0.48348,
#             'F2': 0.47119,
#             'Total predictions': 41000,
#             'True positives': 2780,
#             'False positives': 2720,
#             'False negatives': 3220,
#             'True negatives': 32280},
# "Lasso": {'Accuracy': 0.85941,
#             'Precision': 0.45643,
#             'Recall': 0.21160,
#             'F1': 0.28915,
#             'F2': 0.23703,
#             'Total predictions': 37000,
#             'True positives': 1058,
#             'False positives': 1260,
#             'False negatives': 3942,
#             'True negatives': 30740},
# "RandomForest": {'Accuracy': 0.84477,
#             'Precision': 0.41316,
#             'Recall': 0.20600,
#             'F1': 0.27492,
#             'F2': 0.22896,
#             'Total predictions': 35000,
#             'True positives': 1030,
#             'False positives': 1463,
#             'False negatives': 3970,
#             'True negatives': 28537},
# "DecisionTree": {'Accuracy': 0.89200,
#             'Precision': 0.62972,
#             'Recall': 0.48740,
#             'F1': 0.54949,
#             'F2': 0.51047,
#             'Total predictions': 37000,
#             'True positives': 2437,
#             'False positives': 1433,
#             'False negatives': 2563,
#             'True negatives': 30567},
# "RF_clf_on_DT_list": {'Accuracy': 0.89456,
#             'Precision': 0.61111,
#             'Recall': 0.48840,
#             'F1': 0.54291,
#             'F2': 0.50883,
#             'Total predictions': 39000,
#             'True positives': 2442,
#             'False positives': 1554,
#             'False negatives': 2558,
#             'True negatives': 32446},
# "RF_clf_on_DT_list2": {'Accuracy': 0.89200,
#             'Precision': 0.62972,
#             'Recall': 0.48740,
#             'F1': 0.54949,
#             'F2': 0.51047,
#             'Total predictions': 37000,
#             'True positives': 2437,
#             'False positives': 1433,
#             'False negatives': 2563,
#             'True negatives': 30567}}
# df = pd.DataFrame(DT_dict)

# df = df.reindex(index = ["Accuracy","Precision", "Recall", "F1",
#             "F2",
#             "Total predictions",
#             "True positives",
#             "False positives",
#             "False negatives",
#             "True negatives"])

# df = df[["SelectKBest", "Lasso", "RandomForest", "DecisionTree"]]; df

In [5]:
# #######################################################
# # K-Nearest Neighbors OVER ALL LISTS
# #######################################################
# DT_dict = {"RF_clf_on_DT_list": {'Accuracy': 0.86133,
#             'Precision': 0.46083,
#             'Recall': 0.48000,
#             'F1': 0.47022,
#             'F2': 0.47604,
#             'Total predictions': 39000,
#             'True positives': 2400,
#             'False positives': 2808,
#             'False negatives': 2600,
#             'True negatives': 31192},
# "SelectKBest": {'Accuracy': 0.84727,
#             'Precision': 0.40480,
#             'Recall': 0.27680,
#             'F1': 0.32878,
#             'F2': 0.29549,
#             'Total predictions': 37000,
#             'True positives': 1384,
#             'False positives': 2035,
#             'False negatives': 3616,
#             'True negatives': 29965},
# "Lasso": {'Accuracy': 0.77438,
#             'Precision': 0.22620,
#             'Recall': 0.27660,
#             'F1': 0.24888,
#             'F2': 0.26480,
#             'Total predictions': 37000,
#             'True positives': 1383,
#             'False positives': 4731,
#             'False negatives': 3617,
#             'True negatives': 27269},
# "RandomForest": {'Accuracy': 0.81971,
#             'Precision': 0.34102,
#             'Recall': 0.28100,
#             'F1': 0.30811,
#             'F2': 0.29125,
#             'Total predictions': 35000,
#             'True positives': 1405,
#             'False positives': 2715,
#             'False negatives': 3595,
#             'True negatives': 27285},
# "DecisionTree": {'Accuracy': 0.86133,
#             'Precision': 0.46083,
#             'Recall': 0.48000,
#             'F1': 0.47022,
#             'F2': 0.47604,
#             'Total predictions': 39000,
#             'True positives': 2400,
#             'False positives': 2808,
#             'False negatives': 2600,
#             'True negatives': 31192}
# }

# df = pd.DataFrame(DT_dict)

# df = df.reindex(index = ["Accuracy","Precision", "Recall", "F1",
#             "F2",
#             "Total predictions",
#             "True positives",
#             "False positives",
#             "False negatives",
#             "True negatives"])

# df = df[["SelectKBest", "Lasso", "RandomForest", "DecisionTree", "RF_clf_on_DT_list"]]; df