In [1]:
# Importações necessárias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import Markdown, display

In [2]:
# Load the dataset from a CSV file in the current working directory.
# `df` is the raw frame that every later cell reads from.
df = pd.read_csv('enron.csv')

In [3]:
# Show the first rows of the dataset as a sanity check that it loaded correctly.
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value,name
0,4175000.0,2869717.0,-3081055.0,,phillip.allen@enron.com,1729541.0,13868.0,2195.0,47.0,65.0,...,152.0,False,126027.0,-126027.0,201955.0,1407.0,2902.0,4484442.0,1729541.0,ALLEN PHILLIP K
1,,178980.0,,,,257817.0,3486.0,,,,...,,False,,,,,,182466.0,257817.0,BADUM JAMES P
2,,,-5104.0,,james.bannantine@enron.com,4046157.0,56301.0,29.0,39.0,0.0,...,864523.0,False,1757552.0,-560222.0,477.0,465.0,566.0,916197.0,5243487.0,BANNANTINE JAMES M
3,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200.0,,,,...,2660303.0,False,3942714.0,,267102.0,,,5634343.0,10623258.0,BAXTER JOHN C
4,400000.0,260455.0,-201641.0,,frank.bay@enron.com,,129142.0,,,,...,69.0,False,145796.0,-82782.0,239671.0,,,827696.0,63014.0,BAY FRANKLIN R


In [4]:
# Number of individuals (rows / instances) in the dataset, rendered as Markdown
# so the value shows up in bold.
display(Markdown(f'Quantidade de indivíduos (instâncias) no dataset: **{df.shape[0]}**'))

Quantidade de indivíduos (instâncias) no dataset: **146**

In [22]:
# Descriptive (non-numeric) variables: columns whose dtype is object or category.
variaveis_descritivas = df.select_dtypes(include=['object', 'category'])

# Report how many descriptive columns there are (len() of the column index)
# followed by the list of their names.
display(Markdown('**Variáveis descritivas do dataset**: {} variáveis.'.format(len(variaveis_descritivas.columns))))
display(Markdown('**Variáveis descritivas:**: {}'.format(list(variaveis_descritivas.columns))))

**Variáveis descritivas do dataset**: 2 variáveis.

**Variáveis descritivas:**: ['email_address', 'name']

In [6]:
# Count persons of interest (POI) and non-POI.
# `poi` is a boolean column, so summing it counts the True values and summing
# its negation counts the False values.
display(Markdown(f'**Quantidade de Pessoas de Interesse**: {df["poi"].sum()}'))
display(Markdown(f'**Quantidade de Pessoas fora de interesse**: {(~df["poi"]).sum()}'))

**Quantidade de Pessoas de Interesse**: 18

**Quantidade de Pessoas fora de interesse**: 128

In [7]:
# Summary statistics for every numeric column of the original dataset.
# `variaveis_numericas` holds the names of the numeric columns; the boolean
# `poi` column and the text columns are not selected by include=['number'].
variaveis_numericas = df.select_dtypes(include=['number']).columns

# Select the numeric sub-frame once instead of re-indexing for each statistic.
dados_numericos = df[variaveis_numericas]

# One entry per statistic; each value is a Series indexed by column name.
estatisticas = {
    'Média': dados_numericos.mean(),
    'Mediana': dados_numericos.median(),
    'Variância': dados_numericos.var(),
    'Desvio Padrão': dados_numericos.std(),
    '1º Quartil (25%)': dados_numericos.quantile(0.25),
    '3º Quartil (75%)': dados_numericos.quantile(0.75),
}

# Transpose so that statistics become rows and variables become columns.
resumo = pd.DataFrame(estatisticas).T

# Render as a Markdown table with two decimal places.
display(Markdown(resumo.to_markdown(floatfmt=".2f")))

|                  |              bonus |   deferral_payments |   deferred_income |   director_fees |   exercised_stock_options |        expenses |   from_messages |   from_poi_to_this_person |   from_this_person_to_poi |       loan_advances |   long_term_incentive |             other |   restricted_stock |   restricted_stock_deferred |           salary |   shared_receipt_with_poi |   to_messages |     total_payments |   total_stock_value |
|:-----------------|-------------------:|--------------------:|------------------:|----------------:|--------------------------:|----------------:|----------------:|--------------------------:|--------------------------:|--------------------:|----------------------:|------------------:|-------------------:|----------------------------:|-----------------:|--------------------------:|--------------:|-------------------:|--------------------:|
| Média            |         2374234.61 |          1642674.15 |       -1140475.14 |       166804.88 |                5987053.77 |       108728.92 |          608.79 |                     64.90 |                     41.23 |         41962500.00 |            1470361.45 |         919064.97 |         2321741.14 |                   166410.56 |        562194.29 |                   1176.47 |       2073.86 |         5081526.49 |          6773957.45 |
| Mediana          |          769375.00 |           227449.00 |        -159792.00 |       108579.00 |                1310813.50 |        46950.00 |           41.00 |                     35.00 |                      8.00 |         41762500.00 |             442035.00 |          52382.00 |          451740.00 |                  -146975.00 |        259996.00 |                    740.50 |       1211.00 |         1101393.00 |          1102872.50 |
| Variância        | 114775396172335.09 |   26645521052091.14 | 16203896512113.12 | 102330514030.11 |        964848252082401.75 | 284659397866.12 |      3389406.00 |                   7565.39 |                  10014.63 | 2216828541666666.75 |     35316388281936.87 | 21061242250265.09 | 156707288526355.62 |           17652554476485.44 | 7378661383806.17 |                1388432.46 |    6670344.36 | 844583360181500.75 | 1517708056259895.75 |
| Desvio Padrão    |        10713327.97 |          5161929.97 |        4025406.38 |       319891.41 |               31062006.57 |       533534.81 |         1841.03 |                     86.98 |                    100.07 |         47083208.70 |            5942759.32 |        4589252.91 |        12518278.18 |                  4201494.31 |       2716369.15 |                   1178.32 |       2582.70 |        29061716.40 |         38957772.73 |
| 1º Quartil (25%) |          431250.00 |            81573.00 |        -694862.00 |        98784.00 |                 527886.25 |        22614.00 |           22.75 |                     10.00 |                      1.00 |          1600000.00 |             281250.00 |           1215.00 |          254018.00 |                  -389621.75 |        211816.00 |                    249.75 |        541.25 |          394475.00 |           494510.25 |
| 3º Quartil (75%) |         1200000.00 |          1002671.50 |         -38346.00 |       113784.00 |                2547724.00 |        79952.50 |          145.50 |                     72.25 |                     24.75 |         82125000.00 |             938672.00 |         362096.00 |         1002369.75 |                   -75009.75 |        312117.00 |                   1888.25 |       2634.75 |         2093263.00 |          2949846.75 |

In [8]:
# Compute and display the mode of the descriptive (text) variables.
moda = variaveis_descritivas.mode()

# NOTE(review): the result lists essentially every value of each column rather
# than a single mode, which suggests the values are all (or nearly all) unique,
# so they tie for most frequent; confirm with value_counts(). The shorter
# email_address column appears padded with missing values in the output.
moda

Unnamed: 0,email_address,name
0,a..martin@enron.com,ALLEN PHILLIP K
1,adam.umanoff@enron.com,BADUM JAMES P
2,andrew.fastow@enron.com,BANNANTINE JAMES M
3,ben.glisan@enron.com,BAXTER JOHN C
4,bill.cordes@enron.com,BAY FRANKLIN R
...,...,...
141,,WINOKUR JR. HERBERT S
142,,WODRASKA JOHN
143,,WROBEL BRUCE
144,,YEAGER F SCOTT


In [9]:
# Print the distinct values (missing values included) of each descriptive column.
for coluna, valores in variaveis_descritivas.items():
    print(f'\nValores únicos da variável {coluna}:')
    print(valores.unique())


Valores únicos da variável email_address:
['phillip.allen@enron.com' nan 'james.bannantine@enron.com'
 'frank.bay@enron.com' 'sally.beck@enron.com' 'tim.belden@enron.com'
 'david.berberian@enron.com' 'rick.bergsieker@enron.com'
 'sanjay.bhatnagar@enron.com' 'philippe.bibi@enron.com'
 'jeremy.blachman@enron.com' 'raymond.bowen@enron.com'
 'michael.brown@enron.com' 'john.buchanan@enron.com' 'bob.butts@enron.com'
 'rick.buy@enron.com' 'christopher.calger@enron.com'
 'rebecca.carter@enron.com' 'richard.causey@enron.com'
 'diomedes.christodoulou@enron.com' 'wes.colwell@enron.com'
 'bill.cordes@enron.com' 'chip.cox@enron.com' 'joseph.deffner@enron.com'
 'david.delainey@enron.com' 'james.derrick@enron.com'
 'timothy.detmering@enron.com' 'janet.dietrich@enron.com'
 'richard.dimichele@enron.com' 'keith.dodson@enron.com'
 'jeff.donahue@enron.com' 'w.duran@enron.com' 'john.echols@enron.com'
 'steven.elliott@enron.com' 'jim.fallon@enron.com'
 'andrew.fastow@enron.com' 'jay.fitzgerald@enron.com'
 

In [11]:
# Unique, non-null values of each descriptive column, each kept as a
# one-column DataFrame and keyed by the column name.
valores_unicos = {
    coluna: variaveis_descritivas[[coluna]].dropna().drop_duplicates()
    for coluna in variaveis_descritivas.columns
}

# Bind the frames to explicit names instead of injecting them through
# globals(): dynamically created variables are invisible to readers, linters
# and autocompletion, and silently depend on the column names in the data.
# A missing column now fails here with a clear KeyError.
df_email_address = valores_unicos['email_address']
df_name = valores_unicos['name']

In [20]:
# Markdown heading followed by the table of unique, non-null e-mail addresses.
# The bare last expression is rendered as the cell's output.
display(Markdown('Valores únicos da variável categórica **email_address**'))
df_email_address

Valores únicos da variável categórica **email_address**

Unnamed: 0,email_address
0,phillip.allen@enron.com
2,james.bannantine@enron.com
4,frank.bay@enron.com
6,sally.beck@enron.com
7,tim.belden@enron.com
...,...
137,dick.westfahl@enron.com
139,greg.whalley@enron.com
140,thomas.white@enron.com
142,john.wodraska@enron.com


In [19]:
# Markdown heading followed by the table of unique, non-null names.
# The bare last expression is rendered as the cell's output.
display(Markdown('Valores únicos da variável categórica **name**'))
df_name

Valores únicos da variável categórica **name**

Unnamed: 0,name
0,ALLEN PHILLIP K
1,BADUM JAMES P
2,BANNANTINE JAMES M
3,BAXTER JOHN C
4,BAY FRANKLIN R
...,...
141,WINOKUR JR. HERBERT S
142,WODRASKA JOHN
143,WROBEL BRUCE
144,YEAGER F SCOTT


In [None]:
# Built-in descriptive statistics (count, mean, std, min, quartiles, max) for
# the numeric columns; useful as a cross-check of the summary table computed above.
df.describe()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
count,82.0,39.0,49.0,17.0,102.0,95.0,86.0,86.0,86.0,4.0,66.0,93.0,110.0,18.0,95.0,86.0,86.0,125.0,126.0
mean,2374235.0,1642674.0,-1140475.0,166804.9,5987054.0,108728.9,608.790698,64.895349,41.232558,41962500.0,1470361.0,919065.0,2321741.0,166410.6,562194.3,1176.465116,2073.860465,5081526.0,6773957.0
std,10713330.0,5161930.0,4025406.0,319891.4,31062010.0,533534.8,1841.033949,86.979244,100.073111,47083210.0,5942759.0,4589253.0,12518280.0,4201494.0,2716369.0,1178.317641,2582.700981,29061720.0,38957770.0
min,70000.0,-102500.0,-27992890.0,3285.0,3285.0,148.0,12.0,0.0,0.0,400000.0,69223.0,2.0,-2604490.0,-7576788.0,477.0,2.0,57.0,148.0,-44093.0
25%,431250.0,81573.0,-694862.0,98784.0,527886.2,22614.0,22.75,10.0,1.0,1600000.0,281250.0,1215.0,254018.0,-389621.8,211816.0,249.75,541.25,394475.0,494510.2
50%,769375.0,227449.0,-159792.0,108579.0,1310814.0,46950.0,41.0,35.0,8.0,41762500.0,442035.0,52382.0,451740.0,-146975.0,259996.0,740.5,1211.0,1101393.0,1102872.0
75%,1200000.0,1002672.0,-38346.0,113784.0,2547724.0,79952.5,145.5,72.25,24.75,82125000.0,938672.0,362096.0,1002370.0,-75009.75,312117.0,1888.25,2634.75,2093263.0,2949847.0
max,97343620.0,32083400.0,-833.0,1398517.0,311764000.0,5235198.0,14368.0,528.0,609.0,83925000.0,48521930.0,42667590.0,130322300.0,15456290.0,26704230.0,5521.0,15149.0,309886600.0,434509500.0


In [None]:
# Column dtypes and non-null counts; shows how much data is missing per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   bonus                      82 non-null     float64
 1   deferral_payments          39 non-null     float64
 2   deferred_income            49 non-null     float64
 3   director_fees              17 non-null     float64
 4   email_address              111 non-null    object 
 5   exercised_stock_options    102 non-null    float64
 6   expenses                   95 non-null     float64
 7   from_messages              86 non-null     float64
 8   from_poi_to_this_person    86 non-null     float64
 9   from_this_person_to_poi    86 non-null     float64
 10  loan_advances              4 non-null      float64
 11  long_term_incentive        66 non-null     float64
 12  other                      93 non-null     float64
 13  poi                        146 non-null    bool   