In [197]:
import pandas as pd
import numpy as np

![separator2](https://i.imgur.com/4gX5WFr.png)

### Loading data:

In [198]:
# Load the Adult census data; a comma is read_csv's default separator.
frame = pd.read_csv('../datasets/adult.data.csv')

In [199]:
# Preview the first rows to check that the columns were parsed as expected.
frame.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [200]:
# Summarise column dtypes and non-null counts (quick check for missing values).
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [201]:
# Number of rows for each value of the 'sex' column.
frame['sex'].value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

![separator1](https://i.imgur.com/ZUWYTii.png)

### How many people of each race are represented in this dataset? 



In [202]:
# Count rows per race directly on the 'race' column. value_counts() already
# sorts in descending order, and the result no longer depends on (or is
# labelled after) the unrelated 'sex' column.
frame['race'].value_counts()

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: sex, dtype: int64

![separator1](https://i.imgur.com/ZUWYTii.png)

### What is the average age of men?

In [203]:
# Mean age for each sex; the 'Male' row answers the question above.
age_by_sex = frame.groupby('sex')['age']
age_by_sex.mean()

sex
Female    36.858230
Male      39.433547
Name: age, dtype: float64

![separator1](https://i.imgur.com/ZUWYTii.png)

### What is the percentage of people who have a Bachelor's degree?

In [204]:
# Share of rows whose education level is exactly 'Bachelors', as a percentage.
is_bachelor = frame['education'] == 'Bachelors'
percentage_bachelor = round(is_bachelor.sum() / len(frame) * 100, 1)
print(f"The percentage of people who have a Bachelor's degree is {percentage_bachelor} %.")

The percentage of people who have a Bachelor's degree is 16.4 %.


![separator1](https://i.imgur.com/ZUWYTii.png)

### What percentage of people `with` advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [205]:
# Rows whose education is one of the advanced degrees.
filter1 = frame['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
# Rows with a salary above 50K.
filter2 = frame['salary'] == '>50K'
# People who satisfy both conditions.
filter3 = frame.loc[filter1 & filter2]
# High earners as a percentage of everyone with an advanced degree.
advanced_education = round(len(filter3) / filter1.sum() * 100, 1)
print(f'The percentage of people with advanced education and make more than 50K is {advanced_education} %.')

The percentage of people with advanced education and make more than 50K is 46.5 %.


![separator1](https://i.imgur.com/ZUWYTii.png)

### What percentage of people `without` advanced education make more than 50K?

In [206]:
# Rows whose education is NOT one of the advanced degrees.
filter1 = ~frame['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
# Rows with a salary above 50K.
filter2 = frame['salary'].isin(['>50K'])
# People without an advanced degree who still earn more than 50K.
filter3 = frame[filter1 & filter2]
# Kept under its own name so the "with advanced education" figure computed
# earlier in the notebook is not overwritten.
without_advanced_education = round(filter3['education'].count() / filter1.sum() * 100, 1)
# The message must say "without": this cell reports the non-advanced group.
print(f'The percentage of people without advanced education and make more than 50K is {without_advanced_education} %.')

The percentage of people with advanced education and make more than 50K is 17.4 %.


![separator1](https://i.imgur.com/ZUWYTii.png)

###  What is the minimum number of hours a person works per week?

In [207]:
# Smallest value in the 'hours-per-week' column.
min_hours = frame['hours-per-week'].min()
print(f'The minimum time a person works per week is {min_hours} hour.')

The minimum time a person works per week is 1 hour.


![separator1](https://i.imgur.com/ZUWYTii.png)

### What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [208]:
# People who work the minimum number of hours per week.
filter4 = frame['hours-per-week'] == min_hours
# Build the >50K mask here rather than relying on `filter2` left over from
# the education cells, so this cell does not depend on unrelated state.
high_salary = frame['salary'] == '>50K'
# Minimum-hours workers who earn more than 50K.
filter5 = frame[filter4 & high_salary]
# High earners as a percentage of all minimum-hours workers.
min_number = round(len(filter5) / filter4.sum() * 100, 1)
print(f'The percentage of the people who work the minimum number of hours per week and have a salary of more than 50K is {min_number} %.')

The percentage of the people who work the minimum number of hours per week have a salary of more than 10.0 %.


![separator1](https://i.imgur.com/ZUWYTii.png)

### What country has the highest percentage of people that earn >50K and what is that percentage?

In [209]:
# Number of people per country, overall and among those earning more than 50K.
people_per_country = frame['native-country'].value_counts()
rich_per_country = frame.loc[frame['salary'] == '>50K', 'native-country'].value_counts()
# The division aligns on country name; countries with no >50K earners become
# NaN, which idxmax()/max() skip by default. Computed once and reused below.
rich_share = rich_per_country / people_per_country * 100
top_country = rich_share.idxmax()
top_share = round(rich_share.max(), 2)
print(f'The country is the: {top_country} with percentage of {top_share} %.')

The country is the: Iran with percentage of 41.86 %.


![separator1](https://i.imgur.com/ZUWYTii.png)

### Identify the most popular occupation for those who earn >50K in India.

In [210]:
# People whose native country is India and who earn more than 50K.
from_india = frame['native-country'] == 'India'
earns_over_50k = frame['salary'] == '>50K'
df_india = frame.loc[from_india & earns_over_50k]

In [211]:
# Occupation counts among the >50K earners from India. df_india was already
# filtered to salary == '>50K', so grouping by 'salary' yields a single group.
df_india.groupby('salary')['occupation'].value_counts()

salary  occupation      
>50K    Prof-specialty      25
        Exec-managerial      8
        Other-service        2
        Tech-support         2
        Adm-clerical         1
        Sales                1
        Transport-moving     1
Name: occupation, dtype: int64

In [212]:
# Index label of the largest count: a (salary, occupation) pair.
df_india.groupby('salary')['occupation'].value_counts().idxmax()

('>50K', 'Prof-specialty')

In [213]:
# Optional alternative: count (occupation, salary) combinations directly and
# take the most frequent pair.
occupation_salary = df_india[['occupation', 'salary']]
occupation_salary.value_counts().idxmax()

('Prof-specialty', '>50K')

![separator2](https://i.imgur.com/4gX5WFr.png)

