In [11]:
# import libraries

import pandas as pd
import numpy as np

In [12]:
# read the data

df = pd.read_csv('adult.data.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [13]:
# rename some column names to avoid errors down the road

df.rename(columns={'hours-per-week' : 'hours_per_week',
                   'native-country': 'native_country',
                   'capital-gain': 'capital_gain',
                   'capital-loss': 'capital_loss',
                   'education-num': 'education_num',
                   'marital-status': 'marital_satatus'}, inplace = True)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_satatus,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [14]:
# check for missing values

for col in df.columns:
  percent_missing = np.mean(df[col].isnull())
  print(f'{col} - {percent_missing}%')

age - 0.0%
workclass - 0.0%
fnlwgt - 0.0%
education - 0.0%
education_num - 0.0%
marital_satatus - 0.0%
occupation - 0.0%
relationship - 0.0%
race - 0.0%
sex - 0.0%
capital_gain - 0.0%
capital_loss - 0.0%
hours_per_week - 0.0%
native_country - 0.0%
salary - 0.0%


In [15]:
# find how many people of each race are represented in dataset

df.race.value_counts()


White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

In [16]:
# calculate the average age for men

avg_men_age = round(df.where(df['sex'] == 'Male').age.mean())

print(f'The average age for men is {avg_men_age} years.')


The average age for men is 39 years.


In [17]:
# calculate the percentage of people who have a Bachelor's degree

bach_degree = df.where(df['education'] == 'Bachelors').education.count()
total_education = df.education.count()

percent_bach_degree = round((bach_degree/total_education)*100,2)

print(f'The percentage of people who have a Bachelor\'s degree is: {percent_bach_degree}%')

The percentage of people who have a Bachelor's degree is: 16.45%


In [18]:
# calculate the percentage of people with advanced education(Bachelors, Masters, Doctorate) who make more than 50K

adv_edu = df.where((df['education']== 'Bachelors') | 
                   (df['education']== 'Masters' )| 
                   (df['education']== 'Doctorate')).education.count()

adv_edu_50K = df.where(((df['education']== 'Bachelors') | 
                        (df['education']== 'Masters' )| 
                        (df['education']== 'Doctorate')) &
                        (df['salary'] =='>50K')).education.count()

percent_adv_edu_50K = round((adv_edu_50K/adv_edu)*100,2)

print(f'The percent of people with advanced education who make more than 50K is: {percent_adv_edu_50K}%')

The percent of people with advanced education who make more than 50K is: 46.54%


In [19]:
# calculate the percent of people without advanced education who make more than 50k

no_adv_edu = total_education - adv_edu

total_50K_salary = df.where(df['salary'] == '>50K').salary.count()

no_adv_edu_50k = total_50K_salary - adv_edu_50K

percent_no_adv_edu_50K = round((no_adv_edu_50k/no_adv_edu)*100,2)

print(f'The percent of people without advanced education who make more than 50K is: {percent_no_adv_edu_50K}%')


The percent of people without advanced education who make more than 50K is: 17.37%


In [20]:
# calculate the minimum number of hours a person works per week

min_h = df.hours_per_week.min()

print(f'The min number of hours a person works per week is : {min_h} h')

The min number of hours a person works per week is : 1 h


In [105]:
# calculate the percentage of people who work the min numbers per week and have a salary > 50k

min_hour_great_salary = df[(df['hours_per_week'] == min_h) & (df['salary'] == '>50K')].salary.count()

total_people_min_hour = df.hours_per_week.nsmallest().count()

percent_min_hour_great_salary = round((min_hour_great_salary / total_people_min_hour)*100,2)

print(f'The percent of people who work the min numbers per week and have a salary more than 50k: {percent_min_hour_great_salary}%')

The percent of people who work the min numbers per week and have a salary more than 50k: 40.0%


In [119]:
# find the country who has the highest percentage of people that earn >50K and what is that percentage

countries_50K = df[df.salary == '>50K'].native_country.value_counts()

country_name = countries_50K.idxmax()
country_value = countries_50K.max()
percent = round((country_value/total_50K_salary)*100,2)

print(f'Country who has the highest percentage of people that earn more than 50K is {country_name} : {percent}% ')

Country who has the highest percentage of people that earn more than 50K is United-States : 91.46% 


In [126]:
# identify the most popular occupation for those who earn >50K in India

df[(df.salary == '>50K') & (df.native_country == 'India')].occupation.value_counts().idxmax()



'Prof-specialty'