In [1]:
import pandas as pd

### Read data from file

In [2]:
# Load the dataset from the CSV file that sits next to the notebook.
DATA_FILE = 'adult.data.csv'
df = pd.read_csv(DATA_FILE)

In [None]:
# Preview only the first rows; rendering the full frame (32561 rows) bloats the
# notebook without adding information.
df.head()

### How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.

In [3]:
# Count the rows per race; the result is a Series indexed by race name.
race_count = df['race'].value_counts()
race_count

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

### What is the average age of men?

In [4]:
# Boolean mask selecting the rows recorded as 'Male'.
age_men = df['sex'] == 'Male'

# Pull the age column for those rows once, then average it by hand.
men_ages = df.loc[age_men, 'age']
average_age_men = men_ages.sum() / len(men_ages)
print(f"{average_age_men:.2f}")

39.43


### What is the percentage of people who have a Bachelor's degree?

In [5]:
# Mask of rows whose education is exactly 'Bachelors'.
bachelor = df['education'].eq('Bachelors')

# Ratio of Bachelors rows to all rows. Note: this is a fraction in [0, 1]
# (e.g. 0.16), not a value scaled to 100.
bachelor_rows = len(df.loc[bachelor])
total_rows = len(df)
percentage_bachelors = bachelor_rows / total_rows
print(f"{percentage_bachelors:.2f}")

0.16


### What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?

#### What percentage of people without advanced education make more than 50K?

#### with and without `Bachelors`, `Masters`, or `Doctorate`

In [7]:
# Mask of rows whose education is one of the three advanced degrees.
advanced_degrees = ["Bachelors", "Masters", "Doctorate"]
higher_education = df.education.isin(advanced_degrees)

In [8]:
# Number of rows that fall outside the advanced-education mask.
lower_education = int((~higher_education).sum())
lower_education

25070

### percentage with salary >50K

In [9]:
# Mask of rows whose salary label is '>50K'.
rich = df['salary'].eq(">50K")
rich

0        False
1        False
2        False
3        False
4        False
         ...  
32556    False
32557     True
32558    False
32559    False
32560     True
Name: salary, Length: 32561, dtype: bool

In [10]:
# Share of advanced-education rows that also earn >50K (a fraction, not x100).
high_edu = df.loc[higher_education]
high_edu_rich = df.loc[higher_education & rich]
higher_education_rich = len(high_edu_rich) / len(high_edu)

print(f"{higher_education_rich:.2f}")

0.47


In [11]:
# Share of rows without an advanced degree that earn >50K (a fraction, not x100).
no_higher_education = ~higher_education
low_edu = df[no_higher_education]
low_edu_rich = df[no_higher_education & rich]
lower_education_rich = len(low_edu_rich) / len(low_edu)

print(f"{lower_education_rich:.2f}")

0.17


### What is the minimum number of hours a person works per week (hours-per-week feature)?

In [12]:
# Swap hyphens for underscores in every column label so that columns such as
# "hours-per-week" become reachable through attribute access (df.hours_per_week).
df.columns = [label.replace("-", "_") for label in df.columns]
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'salary'],
      dtype='object')

In [13]:
# Frequency of each distinct hours-per-week value.
df['hours_per_week'].value_counts()

40    15217
50     2819
45     1824
60     1475
35     1297
      ...  
82        1
92        1
87        1
74        1
94        1
Name: hours_per_week, Length: 94, dtype: int64

In [14]:
# Smallest hours-per-week value present in the data.
df.hours_per_week.min()

1

### What percentage of the people who work the minimum number of hours per week have a salary of >50K?

In [15]:
# Mask of people working the minimum number of hours per week. The minimum is
# read from the data instead of being hard-coded.
min_hours = df.hours_per_week == df.hours_per_week.min()

# Rows that both work the minimum hours and earn >50K. This name previously
# held every >50K row regardless of hours worked.
min_hours_rich = df.loc[min_hours & rich]
min_hours_rich

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
189,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
20072,65,?,76043,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,1,United-States,>50K


In [16]:
# Fraction of minimum-hours workers who earn >50K (not scaled to 100).
min_hours_total = len(df[min_hours])
min_hours_rich_total = len(df[min_hours & rich])
rich_percentage = min_hours_rich_total / min_hours_total
rich_percentage

0.1

### What country has the highest percentage of people that earn >50K?

In [20]:
# Number of >50K earners per native country.
rich_country = df.loc[rich, 'native_country'].value_counts()
rich_country

United-States         7171
?                      146
Philippines             61
Germany                 44
India                   40
Canada                  39
Mexico                  33
England                 30
Italy                   25
Cuba                    25
Japan                   24
Taiwan                  20
China                   20
Iran                    18
South                   16
Puerto-Rico             12
Poland                  12
France                  12
Jamaica                 10
El-Salvador              9
Greece                   8
Cambodia                 7
Hong                     6
Yugoslavia               6
Ireland                  5
Vietnam                  5
Portugal                 4
Haiti                    4
Ecuador                  4
Thailand                 3
Hungary                  3
Guatemala                3
Scotland                 3
Nicaragua                2
Trinadad&Tobago          2
Laos                     2
Columbia                 2
D

In [21]:
# Total number of rows per native country, regardless of salary.
people_by_country = df.native_country.value_counts()
people_by_country

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                      

In [22]:
# Per-country totals reordered alphabetically by country label.
sorted_people = people_by_country.sort_index(ascending=True)
sorted_people

?                               583
Cambodia                         19
Canada                          121
China                            75
Columbia                         59
Cuba                             95
Dominican-Republic               70
Ecuador                          28
El-Salvador                     106
England                          90
France                           29
Germany                         137
Greece                           29
Guatemala                        64
Haiti                            44
Holand-Netherlands                1
Honduras                         13
Hong                             20
Hungary                          13
India                           100
Iran                             43
Ireland                          24
Italy                            73
Jamaica                          81
Japan                            62
Laos                             18
Mexico                          643
Nicaragua                   

In [23]:
# Per-country >50K counts reordered alphabetically by country label.
sorted_rich_country = rich_country.sort_index(ascending=True)
sorted_rich_country

?                      146
Cambodia                 7
Canada                  39
China                   20
Columbia                 2
Cuba                    25
Dominican-Republic       2
Ecuador                  4
El-Salvador              9
England                 30
France                  12
Germany                 44
Greece                   8
Guatemala                3
Haiti                    4
Honduras                 1
Hong                     6
Hungary                  3
India                   40
Iran                    18
Ireland                  5
Italy                   25
Jamaica                 10
Japan                   24
Laos                     2
Mexico                  33
Nicaragua                2
Peru                     2
Philippines             61
Poland                  12
Portugal                 4
Puerto-Rico             12
Scotland                 3
South                   16
Taiwan                  20
Thailand                 3
Trinadad&Tobago          2
U

In [24]:
# How many countries have at least one >50K earner.
len(sorted_rich_country.index)

40

In [25]:
# How many countries appear in the dataset overall.
len(sorted_people.index)

42

In [None]:
# Fraction of >50K earners per country, matched by country label.
#
# The two Series do not have the same entries (40 vs 42 countries; e.g.
# 'Holand-Netherlands' is in `sorted_people` but has no >50K rows), so pairing
# them by position divides counts that belong to different countries. Looking
# the totals up by label keeps each ratio tied to a single country.
totals_for_rich_countries = sorted_people.reindex(sorted_rich_country.index)
rich_share_by_country = sorted_rich_country / totals_for_rich_countries

# Plain list in the order of `sorted_rich_country`; use
# `rich_share_by_country` when the country labels are needed (e.g. idxmax()).
percentages = rich_share_by_country.tolist()

In [None]:
# Inspect the `percentages` values computed in the preceding cell.
percentages