In [27]:
# Dependencies
import pandas as pd
import numpy as np

In [29]:
# Name of the CSV file
file = 'Resources/donors2008.csv'

In [60]:
# The correct encoding must be used to read the CSV in pandas
df = pd.read_csv(file, encoding="ISO-8859-1")

In [58]:
# Preview of the DataFrame
# Note that FIELD8 is likely a meaningless column
df.head()

Unnamed: 0,LastName,FirstName,Employer,City,State,Zip,Amount
0,Aaron,Eugene,State Department,Dulles,VA,20189,500.0
1,Abadi,Barbara,Abadi & Co.,New York,NY,10021,200.0
2,Adamany,Anthony,Retired,Rockford,IL,61103,500.0
3,Adams,Lorraine,Self,New York,NY,10026,200.0
4,Adams,Marion,,Exeter,NH,3833,100.0


In [61]:
# Delete extraneous column
del df['FIELD8']
df.head()


Unnamed: 0,LastName,FirstName,Employer,City,State,Zip,Amount
0,Aaron,Eugene,State Department,Dulles,VA,20189,500.0
1,Abadi,Barbara,Abadi & Co.,New York,NY,10021,200.0
2,Adamany,Anthony,Retired,Rockford,IL,61103,500.0
3,Adams,Lorraine,Self,New York,NY,10026,200.0
4,Adams,Marion,,Exeter,NH,3833,100.0


In [62]:
# Identify incomplete rows
df.count()

LastName     1776
FirstName    1776
Employer     1743
City         1776
State        1776
Zip          1776
Amount       1776
dtype: int64

In [64]:
# Drop all rows with missing information
df.dropna(how='any')

Unnamed: 0,LastName,FirstName,Employer,City,State,Zip,Amount
0,Aaron,Eugene,State Department,Dulles,VA,20189,500.0
1,Abadi,Barbara,Abadi & Co.,New York,NY,10021,200.0
2,Adamany,Anthony,Retired,Rockford,IL,61103,500.0
3,Adams,Lorraine,Self,New York,NY,10026,200.0
4,Adams,Marion,,Exeter,NH,03833,100.0
...,...,...,...,...,...,...,...
1769,Zeluf,Craig,TD Ameritrade,Omaha,NE,68118,250.0
1770,Zimmer,Charles,RZO LLC / QED Productions,Atlanta,GA,30307,250.0
1771,Zinczenko,David,Rodale,Allentown,PA,18102,500.0
1774,Zwerdling,David,"Montg Cnty, Md",Silver Spring,MD,20910,35.0


In [65]:
# Verify dropped rows
df.count()

LastName     1776
FirstName    1776
Employer     1743
City         1776
State        1776
Zip          1776
Amount       1776
dtype: int64

In [67]:
# The Amount column is the wrong data type. It should be numeric.
df.dtypes

LastName      object
FirstName     object
Employer      object
City          object
State         object
Zip           object
Amount       float64
dtype: object

In [70]:
# Use pd.to_numeric() method to convert the datatype of the Amount column
df['Amount'] = pd.to_numeric(df['Amount'])

In [71]:
# Verify that the Amount column datatype has been made numeric
df['Amount'].dtype


dtype('float64')

In [74]:
# Display an overview of the Employers column
df['Employer'].value_counts()

None                                   249
Self                                   241
Retired                                126
Self Employed                           39
Self-Employed                           34
                                      ... 
Capri Capital Partners LLC               1
Derivix Corp                             1
Sony Electronics                         1
Teresa Trabue, Psy.D.                    1
Mooney, Green, Baker & Saindon, PC.      1
Name: Employer, Length: 1011, dtype: int64

In [75]:
# Clean up Employer category. Replace 'Self Employed' and 'Self' with 'Self-Employed'
df['Employer']=df['Employer'].replace({'Self Employed':'Self-Employed','Self':'Self-Employed'})

In [77]:
# Verify clean-up.
df['Employer'].value_counts()



Self-Employed                          314
None                                   249
Retired                                126
Google                                   6
Unemployed                               4
                                      ... 
TWJ Capital LLC                          1
Capri Capital Partners LLC               1
Derivix Corp                             1
Sony Electronics                         1
Mooney, Green, Baker & Saindon, PC.      1
Name: Employer, Length: 1009, dtype: int64

In [80]:
df['Employer']=df['Employer'].replace({'Not Employed':'Unemployed'})
df['Employer'].value_counts()

Self-Employed                          314
None                                   249
Retired                                126
Unemployed                               8
Google                                   6
                                      ... 
TWJ Capital LLC                          1
Capri Capital Partners LLC               1
Derivix Corp                             1
Sony Electronics                         1
Mooney, Green, Baker & Saindon, PC.      1
Name: Employer, Length: 1008, dtype: int64

In [85]:
# Display a statistical overview
df.describe()
# We can infer the maximum allowable individual contribution from 'max'


Unnamed: 0,Amount
count,1776.0
mean,659.311622
std,1274.416858
min,5.0
25%,200.0
50%,250.0
75%,500.0
max,5000.0
