In [21]:
import pickle
from collections import defaultdict
from functools import reduce

import pandas as pd
from pandas import DataFrame

from utils import DATA_DIR

# Path to the pickled final-project dataset.
# NOTE: despite the `_DIR` suffix this is the `.pkl` file itself; the loaders
# in this notebook open it directly with `.open('rb')`.
FINAL_PROJECT_DIR = DATA_DIR / "final_project" / "final_project_dataset.pkl"

# Columns that get_final_project_dataset() passes through pd.to_numeric.
# NOTE(review): 'poi' is listed here, yet a later cell uses the "poi" column as
# a boolean row mask — confirm it is meant to be treated as a numeric feature.
NUMERIC_FEATURES = ['salary',
                    'to_messages',
                    'deferral_payments',
                    'total_payments',
                    'loan_advances',
                    'bonus',
                    'restricted_stock_deferred',
                    'deferred_income',
                    'total_stock_value',
                    'expenses',
                    'from_poi_to_this_person',
                    'exercised_stock_options',
                    'from_messages',
                    'other',
                    'from_this_person_to_poi',
                    'poi',
                    'long_term_incentive',
                    'shared_receipt_with_poi',
                    'restricted_stock',
                    'director_fees']

# Exploratory Data Analysis of Enron data

**Goals**: get a sense for the data and start to form hypotheses about how to use it

Data:
- `download-Enron-dataset.py` has already been run
- files from [udacity/ud120-projects/final_project](https://github.com/udacity/ud120-projects/tree/master/final_project) have been added to `/data`

In [27]:
def build_feature_dict(dataset_path=None):
    """Load the pickled dataset and pivot it into per-feature value lists.

    The pickle is expected to hold a mapping of ``person -> {feature: value}``.
    The result maps ``'person'`` to the list of top-level keys and every
    feature name to the list of its values, in the mapping's iteration order.

    Args:
        dataset_path: Optional location of the pickle file (``str`` or
            path-like). When omitted, ``FINAL_PROJECT_DIR`` is used, so
            existing zero-argument calls behave exactly as before.

    Returns:
        A ``defaultdict(list)`` mapping feature name -> list of values.

    Note:
        No padding is performed: if a record lacks a feature, that feature's
        list ends up shorter than the ``'person'`` list.
    """
    if dataset_path is None:
        dataset_path = FINAL_PROJECT_DIR

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point this at a trusted dataset.
    with open(dataset_path, 'rb') as f:
        final_project_dataset_dict = pickle.load(f)

    # `list` is the idiomatic factory (equivalent to `lambda: []`).
    feature_dict = defaultdict(list)
    for person, data in final_project_dataset_dict.items():
        feature_dict['person'].append(person)
        for key, value in data.items():
            feature_dict[key].append(value)

    return feature_dict

In [None]:
def sum_dict(dictionary, value):
    """Increment the tally stored under ``value`` and return the dictionary.

    A ``value`` not yet present starts from 0. The dictionary is updated in
    place and the very same object is returned, which lets this function act
    as the step function of ``functools.reduce``.
    """
    dictionary[value] = dictionary.get(value, 0) + 1
    return dictionary


# Per-feature profile: how often each Python type occurs among the raw values,
# and how many values are the literal string 'NaN'.
{
    feature_name: {
        "types": reduce(sum_dict, (type(entry) for entry in entries), {}),
        "NaNs": sum(1 for entry in entries if entry == 'NaN'),
    }
    for feature_name, entries in build_feature_dict().items()
}

## final_project_dataset.pkl

In [22]:
def get_final_project_dataset() -> pd.DataFrame:
    """Load the pickled dataset into a DataFrame with numeric columns coerced.

    The pickle at ``FINAL_PROJECT_DIR`` is read as a dict and turned into a
    frame with one row per top-level key (``orient="index"``). Every column
    named in ``NUMERIC_FEATURES`` is then passed through ``pd.to_numeric``
    with ``errors='coerce'``, so values that cannot be parsed become NaN.

    Returns:
        The resulting DataFrame.
    """
    with FINAL_PROJECT_DIR.open('rb') as pickle_file:
        records_by_key = pickle.load(pickle_file)

    dataset = pd.DataFrame.from_dict(records_by_key, orient="index")

    # Coerce column by column; unparseable entries turn into NaN.
    for column in NUMERIC_FEATURES:
        raw_column = dataset[column]
        dataset[column] = pd.to_numeric(raw_column, errors='coerce', downcast='integer')

    return dataset


# Load the dataset once at notebook scope; the cells below read from this frame.
final_project_dataset_df: DataFrame = get_final_project_dataset()

In [23]:
# Preview the first rows of the loaded frame.
final_project_dataset_df.head()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,deferred_income,total_stock_value,expenses,from_poi_to_this_person,exercised_stock_options,from_messages,other,from_this_person_to_poi,long_term_incentive,shared_receipt_with_poi,restricted_stock,director_fees
count,95.0,86.0,39.0,125.0,4.0,82.0,18.0,49.0,126.0,95.0,86.0,102.0,86.0,93.0,86.0,66.0,86.0,110.0,17.0
mean,562194.3,2073.860465,1642674.0,5081526.0,41962500.0,2374235.0,166410.6,-1140475.0,6773957.0,108728.9,64.895349,5987054.0,608.790698,919065.0,41.232558,1470361.0,1176.465116,2321741.0,166804.9
std,2716369.0,2582.700981,5161930.0,29061720.0,47083210.0,10713330.0,4201494.0,4025406.0,38957770.0,533534.8,86.979244,31062010.0,1841.033949,4589253.0,100.073111,5942759.0,1178.317641,12518280.0,319891.4
min,477.0,57.0,-102500.0,148.0,400000.0,70000.0,-7576788.0,-27992890.0,-44093.0,148.0,0.0,3285.0,12.0,2.0,0.0,69223.0,2.0,-2604490.0,3285.0
25%,211816.0,541.25,81573.0,394475.0,1600000.0,431250.0,-389621.8,-694862.0,494510.2,22614.0,10.0,527886.2,22.75,1215.0,1.0,281250.0,249.75,254018.0,98784.0
50%,259996.0,1211.0,227449.0,1101393.0,41762500.0,769375.0,-146975.0,-159792.0,1102872.0,46950.0,35.0,1310814.0,41.0,52382.0,8.0,442035.0,740.5,451740.0,108579.0
75%,312117.0,2634.75,1002672.0,2093263.0,82125000.0,1200000.0,-75009.75,-38346.0,2949847.0,79952.5,72.25,2547724.0,145.5,362096.0,24.75,938672.0,1888.25,1002370.0,113784.0
max,26704230.0,15149.0,32083400.0,309886600.0,83925000.0,97343620.0,15456290.0,-833.0,434509500.0,5235198.0,528.0,311764000.0,14368.0,42667590.0,609.0,48521930.0,5521.0,130322300.0,1398517.0


In [26]:
# Column dtypes after the pd.to_numeric coercion done in get_final_project_dataset().
final_project_dataset_df.dtypes

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,loan_advances,bonus,email_address,restricted_stock_deferred,deferred_income,total_stock_value,...,from_poi_to_this_person,exercised_stock_options,from_messages,other,from_this_person_to_poi,poi,long_term_incentive,shared_receipt_with_poi,restricted_stock,director_fees
METTS MARK,365788.0,807.0,,1061827.0,,600000.0,mark.metts@enron.com,,,585062.0,...,38.0,,29.0,1740.0,1.0,False,,702.0,585062.0,
BAXTER JOHN C,267102.0,,1295738.0,5634343.0,,1200000.0,,,-1386055.0,10623258.0,...,,6680544.0,,2660303.0,,False,1586055.0,,3942714.0,
ELLIOTT STEVEN,170941.0,,,211725.0,,350000.0,steven.elliott@enron.com,,-400729.0,6678735.0,...,,4890344.0,,12961.0,,False,,,1788391.0,
CORDES WILLIAM R,,764.0,,,,,bill.cordes@enron.com,,,1038185.0,...,10.0,651850.0,12.0,,0.0,False,,58.0,386335.0,
HANNON KEVIN P,243293.0,1045.0,,288682.0,,1500000.0,kevin.hannon@enron.com,,-3117011.0,6391065.0,...,32.0,5538001.0,32.0,11350.0,21.0,True,1617011.0,1035.0,853064.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GRAMM WENDY L,,,,119292.0,,,,,,,...,,,,,,False,,,,119292.0
CAUSEY RICHARD A,415189.0,1892.0,,1868758.0,,1000000.0,richard.causey@enron.com,,-235000.0,2502063.0,...,58.0,,49.0,307895.0,12.0,True,350000.0,1585.0,2502063.0,
TAYLOR MITCHELL S,265214.0,533.0,227449.0,1092663.0,,600000.0,mitchell.taylor@enron.com,,,3745048.0,...,0.0,3181250.0,29.0,,0.0,False,,300.0,563798.0,
DONAHUE JR JEFFREY M,278601.0,865.0,,875760.0,,800000.0,jeff.donahue@enron.com,,-300000.0,1080988.0,...,188.0,765920.0,22.0,891.0,11.0,False,,772.0,315068.0,


In [None]:
# Summary statistics of the frame's columns.
final_project_dataset_df.describe()

In [None]:
# How many missing values (NaN) does each column have?
final_project_dataset_df.isna().sum()

In [None]:
# Who received the largest total payments? (largest values first)
final_project_dataset_df["total_payments"].sort_values(ascending=False).head()

In [None]:
# Who has the highest total stock value? (largest values first)
final_project_dataset_df["total_stock_value"].sort_values(ascending=False).head()

In [None]:
# How many people have no salary value (NaN)?
final_project_dataset_df["salary"].isna().sum()

In [None]:
# How many people in the E+F dataset (as it currently exists) have “NaN” for their total payments?
# The count and the total number of rows are kept in variables for later cells;
# the value displayed by this cell is their ratio (a fraction, not a raw count).
general_total_payment_nan_count = final_project_dataset_df["total_payments"].isna().sum()
general_total_payment_count = len(final_project_dataset_df["total_payments"])
general_total_payment_nan_count / general_total_payment_count

In [None]:
# How many POIs in the E+F dataset have “NaN” for their total payments?
# What percentage of POIs as a whole is this?
# The "poi" column is used directly as a row mask, so it must hold booleans.
# The displayed value is a fraction (NaN count / POI count), not a percentage.
poi_df = final_project_dataset_df[final_project_dataset_df["poi"]]
poi_total_payment_nan_count = poi_df["total_payments"].isna().sum()
poi_total_payment_count = len(poi_df["total_payments"])
poi_total_payment_nan_count / poi_total_payment_count

In [None]:
# What is the new number of people in the dataset?
# What is the new number of folks with “NaN” for total payments?
# NOTE(review): the literal 10 presumably stands for ten hypothetical extra
# people who all have NaN total payments — confirm against the exercise text.
# Relies on the counts computed in the earlier total-payments cell.
print('new ppl with NaN for total pay: ', general_total_payment_nan_count + 10)
print('new total ppl in dataset: ', general_total_payment_count + 10)

In [None]:
# What is the new number of POIs in the dataset?
# What is the new number of POIs with NaN for total_payments?
# NOTE(review): the literal 10 presumably stands for ten hypothetical extra
# POIs who all have NaN total payments — confirm against the exercise text.
# Relies on the POI counts computed in the earlier POI cell.
print('new poi with NaN for total pay: ', poi_total_payment_nan_count + 10)
print('new total poi in dataset: ', poi_total_payment_count + 10)