In [1]:
import pandas as pd

### Read data from file

In [2]:
# Load adult.data.csv (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='adult.data.csv')

In [33]:
# Display the loaded frame; the rich display elides the middle rows of a long frame.
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


### How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.

In [4]:
# Number of rows per race, as a Series indexed by race name (most frequent first).
race_column = df['race']
race_count = race_column.value_counts()
race_count

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

### What is the average age of men?

In [38]:
# Boolean mask selecting the rows whose sex is 'Male' (despite the name, it holds no ages).
age_men = df['sex'] == 'Male'
# Ages of the selected rows; the mean is computed as sum / row count and rounded to one decimal.
male_ages = df.loc[age_men, 'age']
average_age_men = (male_ages.sum() / len(male_ages)).round(1)
average_age_men

39.4

### What is the percentage of people who have a Bachelor's degree?

In [43]:
# Boolean mask selecting the rows whose education is exactly 'Bachelors'.
bachelor = df['education'] == 'Bachelors'
bachelor_rows = df.loc[bachelor]
# Note: this is a fraction in [0, 1] (matching rows / all rows), not scaled by 100.
percentage_bachelors = len(bachelor_rows) / len(df)
print(f"{percentage_bachelors:.2f}")

0.16


### What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?

#### What percentage of people without advanced education make more than 50K?

#### With and without `Bachelors`, `Masters`, or `Doctorate`

In [7]:
# Boolean mask: True where education is one of the three advanced degrees.
advanced_degrees = ["Bachelors", "Masters", "Doctorate"]
higher_education = df.education.isin(advanced_degrees)

In [8]:
# Count of rows that do NOT hold one of the advanced degrees.
rows_without_degree = df.loc[~higher_education]
lower_education = len(rows_without_degree)
lower_education

25070

### Percentage of people with salary >50K

In [9]:
# Boolean mask: True where the salary label is exactly ">50K". Reused by the later cells.
rich = df['salary'] == ">50K"
rich

0        False
1        False
2        False
3        False
4        False
         ...  
32556    False
32557     True
32558    False
32559    False
32560     True
Name: salary, Length: 32561, dtype: bool

In [10]:
# Rows with an advanced degree, and the subset of those that also earn >50K.
high_edu = df.loc[higher_education]
high_edu_rich = df.loc[higher_education & rich]
# Fraction (0-1) of advanced-degree holders earning >50K.
higher_education_rich = len(high_edu_rich) / len(high_edu)

print(f"{higher_education_rich:.2f}")

0.47


In [11]:
# Negate the degree mask once so the precedence of `~` over `&` is explicit below.
no_advanced_degree = ~higher_education
low_edu = df.loc[no_advanced_degree]
low_edu_rich = df.loc[no_advanced_degree & rich]
# Fraction (0-1) of people without an advanced degree earning >50K.
lower_education_rich = len(low_edu_rich) / len(low_edu)

print(f"{lower_education_rich:.2f}")

0.17


### What is the minimum number of hours a person works per week (hours-per-week feature)?

In [12]:
# Swap hyphens for underscores in every column label so that columns such as
# "hours-per-week" become valid identifiers ("hours_per_week") usable via attribute access.
df.columns = [label.replace("-", "_") for label in df.columns]
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'salary'],
      dtype='object')

In [13]:
# Frequency of each distinct weekly-hours value, most common first.
df['hours_per_week'].value_counts()

40    15217
50     2819
45     1824
60     1475
35     1297
      ...  
82        1
92        1
87        1
74        1
94        1
Name: hours_per_week, Length: 94, dtype: int64

In [14]:
# Smallest weekly-hours value present in the data.
df.hours_per_week.min()

1

### What percentage of the people who work the minimum number of hours per week have a salary of >50K?

In [15]:
# Mask of the rows that work the minimum number of hours per week. The minimum is
# derived from the data instead of being hard-coded, so the mask stays correct if
# the dataset changes.
min_hours = df.hours_per_week == df.hours_per_week.min()
# Rows that work the minimum hours AND earn >50K (previously this name was bound to
# every >50K row, which did not match its meaning).
min_hours_rich = df.loc[min_hours & rich]

min_hours_rich

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
189,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
20072,65,?,76043,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,1,United-States,>50K


In [16]:
# Fraction (0-1) of the minimum-hours workers who earn >50K.
min_hours_rows = df.loc[min_hours]
min_hours_rich_rows = df.loc[min_hours & rich]
rich_percentage = len(min_hours_rich_rows) / len(min_hours_rows)
rich_percentage

0.1

### What country has the highest percentage of people that earn >50K?

In [17]:
# Number of >50K earners per native country (rows and column selected in one .loc call).
rich_country = df.loc[rich, 'native_country'].value_counts()

In [44]:
# Total number of people per native country, regardless of salary.
country_column = df.native_country
people_by_country = country_column.value_counts()
people_by_country

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                      

In [19]:
# Re-order the per-country totals by country label (instead of by count) so they can
# be lined up against the per-country >50K counts.
sorted_people = people_by_country.sort_index()

In [20]:
# Re-order the per-country >50K counts by country label, matching sorted_people's ordering.
sorted_rich_country = rich_country.sort_index()

In [21]:
# Keep only the countries that have at least one >50K earner, so both Series cover
# the same set of country labels.
has_rich_people = sorted_people.index.isin(sorted_rich_country.index)
sorted_people = sorted_people[has_rich_people]
len(sorted_people)

40

In [22]:
# Share of >50K earners per country, in the order of sorted_rich_country.
# The totals are looked up by country label, so each count is divided by the total of
# the SAME country. The earlier loop used integer keys on string-labelled Series,
# which relies on a positional fallback that pandas has deprecated.
totals_for_rich_countries = sorted_people.loc[sorted_rich_country.index]
# Kept as a plain list because the following cells call percentages.index(...).
percentages = (sorted_rich_country / totals_for_rich_countries).tolist()

In [23]:
# Position (within the list) of the largest >50K share.
highest_share = max(percentages)
percentages.index(highest_share)

19

In [30]:
# Translate the position of the largest share back into its country label.
top_position = percentages.index(max(percentages))
sorted_people.index[top_position]

'Iran'

In [31]:
# Country with the largest absolute NUMBER of >50K earners (not the highest share).
rich_counts_by_country = df.loc[rich, 'native_country'].value_counts()
rich_counts_by_country.index[0]

'United-States'

### Identify the most popular occupation for those who earn >50K in India.

In [26]:
# Occupation counts among rows that earn >50K and list India as native country.
earns_over_50k = df['salary'] == ">50K"
from_india = df['native_country'] == 'India'
top_occupation = df.loc[earns_over_50k & from_india, 'occupation'].value_counts()

In [27]:
# value_counts() orders by descending count, so the first label is the most common occupation.
top_occupation.index[0]

'Prof-specialty'