In [1]:
#Dependencies and setup
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#File to load
adult_census_to_load = Path("adult_census_income_clean.csv")

In [3]:
#Read adult census income data and store into Pandas DataFrames
adult_census = pd.read_csv(adult_census_to_load)
adult_census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
#Remove unnecessary columns
adult_census = adult_census.drop(['fnlwgt', 'education.num'], axis=1)
adult_census.head()

Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,HS-grad,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,Some-college,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [13]:
#Rename some of the columns
adult_census = adult_census.rename(columns={"marital.status": "marital status", "capital.gain": "capital gain", "capital.loss": "capital loss", "hours.per.week": "hours per week", "native.country": "native country"})
adult_census.head()

Unnamed: 0,age,workclass,education,marital status,occupation,relationship,race,sex,capital gain,capital loss,hours per week,native country,income
0,90,?,HS-grad,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,Some-college,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [14]:
#Identify incomplete rows
adult_census.count()

age               32561
workclass         32561
education         32561
marital status    32561
occupation        32561
relationship      32561
race              32561
sex               32561
capital gain      32561
capital loss      32561
hours per week    32561
native country    32561
income            32561
dtype: int64

In [16]:
#Get count of each unique value in 'native-country'
country_counts = adult_census['native country'].value_counts()
print(country_counts)

native country
United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
Greece       

In [17]:
#Filter to only include United States data
adult_census_us = adult_census[adult_census['native country'] == 'United-States']
us_only = adult_census_us['native country'].value_counts()
print(us_only)

native country
United-States    29170
Name: count, dtype: int64


In [18]:
adult_census_us.describe()

Unnamed: 0,age,capital gain,capital loss,hours per week
count,29170.0,29170.0,29170.0,29170.0
mean,38.655674,1089.229928,88.510593,40.447755
std,13.722408,7381.898528,405.681709,12.417203
min,17.0,0.0,0.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,0.0,0.0,40.0
75%,48.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


In [19]:
#Get count of each unique value in 'income'
income_counts = adult_census_us['income'].value_counts()
print(income_counts)

income
<=50K    21999
>50K      7171
Name: count, dtype: int64


In [25]:
#Get count of each unique value in 'hours.per.week'
hours_per_week_counts = adult_census_us['hours per week'].value_counts()
print(hours_per_week_counts)

hours per week
40    13418
50     2593
45     1676
60     1332
35     1135
      ...  
94        1
82        1
87        1
92        1
74        1
Name: count, Length: 94, dtype: int64


In [27]:
#Create new column to identify FT versus PT employees
adult_census_us.loc[:,'employment type'] = np.where(adult_census_us['hours per week'] >=40, 'FT', 'PT')
adult_census_us.head()


Unnamed: 0,age,workclass,education,marital status,occupation,relationship,race,sex,capital gain,capital loss,hours per week,native country,income,employment type
0,90,?,HS-grad,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,FT
1,82,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,PT
2,66,?,Some-college,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,FT
3,54,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,FT
4,41,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,FT


In [28]:
#Get count of each unique value in 'sex'
gender_counts = adult_census_us['sex'].value_counts()
print(gender_counts)

sex
Male      19488
Female     9682
Name: count, dtype: int64


In [29]:
#Get count of each unique value in 'race'
race_counts = adult_census_us['race'].value_counts()
print(race_counts)

race
White                 25621
Black                  2832
Amer-Indian-Eskimo      296
Asian-Pac-Islander      292
Other                   129
Name: count, dtype: int64


In [30]:
#Get count of each unique value in 'relationship'
relationship_counts = adult_census_us['relationship'].value_counts()
print(relationship_counts)

relationship
Husband           11861
Not-in-family      7528
Own-child          4691
Unmarried          3033
Wife               1361
Other-relative      696
Name: count, dtype: int64


In [32]:
#Get count of each unique value in 'marital status'
marital_status_counts = adult_census_us['marital status'].value_counts()
print(marital_status_counts)

marital status
Married-civ-spouse       13368
Never-married             9579
Divorced                  4162
Widowed                    902
Separated                  883
Married-spouse-absent      253
Married-AF-spouse           23
Name: count, dtype: int64


In [31]:
#Get count of each unique value in 'occupation'
occupation_counts = adult_census_us['occupation'].value_counts()
print(occupation_counts)

occupation
Exec-managerial      3735
Prof-specialty       3693
Craft-repair         3685
Adm-clerical         3449
Sales                3364
Other-service        2777
Machine-op-inspct    1687
?                    1666
Transport-moving     1491
Handlers-cleaners    1189
Farming-fishing       879
Tech-support          850
Protective-serv       606
Priv-house-serv        90
Armed-Forces            9
Name: count, dtype: int64


In [33]:
#Get count of each unique value in 'education'
education_counts = adult_census_us['education'].value_counts()
print(education_counts)

education
HS-grad         9702
Some-college    6740
Bachelors       4766
Masters         1527
Assoc-voc       1289
11th            1067
Assoc-acdm       982
10th             848
Prof-school      502
7th-8th          499
9th              395
12th             365
Doctorate        328
5th-6th           97
1st-4th           46
Preschool         17
Name: count, dtype: int64


In [35]:
#Get count of each unique value in 'workclass'
work_class_counts = adult_census_us['workclass'].value_counts()
print(work_class_counts)

workclass
Private             20135
Self-emp-not-inc     2313
Local-gov            1956
?                    1659
State-gov            1210
Self-emp-inc          991
Federal-gov           886
Without-pay            13
Never-worked            7
Name: count, dtype: int64
