In [0]:
import numpy as np
import pandas as pd
# Show up to 100 columns when a DataFrame is displayed. Use the canonical
# option name: the dotted spelling 'display.max.columns' is not a registered key
# and only resolved because pandas treats unknown keys as regex patterns.
pd.set_option('display.max_columns', 100)
# to draw pictures in jupyter notebook
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
# we don't like warnings
# you can comment the following 2 lines if you'd like to
import warnings
warnings.filterwarnings('ignore')

# Location of the CSV analysed in this notebook; it is fetched from this URL
# each time the cell runs.
url = 'https://raw.githubusercontent.com/Androidik/ML-2019/master/ML-LR2/adult.csv'
# Parse the CSV into `data`, the DataFrame that every later cell reads from.
# NOTE(review): the head() preview shows an 'Unnamed: 0' column -- presumably
# the row index saved into the file. Consider index_col=0 once it is confirmed
# that no other cell relies on that column.
data = pd.read_csv(url)

In [5]:
# Preview the first five rows to check that the file parsed as expected.
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [6]:
# Number of rows for each value of the gender column.
data.groupby(by='gender').size()


gender
Female    16192
Male      32650
dtype: int64

In [0]:
# Mean age over the rows whose gender is 'Female' (single .loc selection
# instead of filtering the frame and then picking the column).
data.loc[data['gender'] == 'Female', 'age'].mean()

36.92798913043478

In [0]:
# Share of rows whose native-country is Germany, expressed as a fraction of
# all rows (multiply by 100 to read it as a percentage).
data['native-country'].eq("Germany").sum() / len(data)

0.00421768150362393

In [0]:
# Mean and standard deviation of age for each income bracket.
ages1 = data.loc[data['income'] == '>50K', 'age']   # rows with income above 50K
ages2 = data.loc[data['income'] == '<=50K', 'age']  # rows with income of 50K or less
# Means are rounded to whole years, standard deviations to one decimal place.
age_stats = (round(ages1.mean()), round(ages1.std(), 1),
             round(ages2.mean()), round(ages2.std(), 1))
print("The average age of the rich: {0} +- {1} years, poor - {2} +- {3} years.".format(*age_stats))

The average age of the rich: 44 +- 10.6 years, poor - 37 +- 14.1 years.


In [0]:
# Distinct education values that occur among rows with income above 50K.
data['education'][data['income'] == '>50K'].unique()

array(['Assoc-acdm', 'Some-college', 'Prof-school', 'HS-grad', 'Masters',
       'Doctorate', 'Bachelors', 'Assoc-voc', '9th', '10th', '7th-8th',
       '11th', '5th-6th', '1st-4th', '12th', 'Preschool'], dtype=object)

In [0]:
# Print summary statistics of age for every (race, gender) combination.
groups_by_race_gender = data.groupby(['race', 'gender'])
for (race, sex), sub_df in groups_by_race_gender:
    group_header = "Race: {0}, gender: {1}".format(race, sex)
    print(group_header)
    print(sub_df['age'].describe())

Race: Amer-Indian-Eskimo, gender: Female
count    185.000000
mean      36.237838
std       12.840056
min       17.000000
25%       26.000000
50%       35.000000
75%       46.000000
max       80.000000
Name: age, dtype: float64
Race: Amer-Indian-Eskimo, gender: Male
count    285.000000
mean      36.989474
std       11.703943
min       17.000000
25%       29.000000
50%       35.000000
75%       44.000000
max       82.000000
Name: age, dtype: float64
Race: Asian-Pac-Islander, gender: Female
count    517.000000
mean      35.657640
std       12.637799
min       17.000000
25%       25.000000
50%       34.000000
75%       44.000000
max       81.000000
Name: age, dtype: float64
Race: Asian-Pac-Islander, gender: Male
count    1002.000000
mean       38.994012
std        12.824878
min        18.000000
25%        29.000000
50%        37.000000
75%        46.000000
max        90.000000
Name: age, dtype: float64
Race: Black, gender: Female
count    2308.000000
mean       37.905979
std        12.7360

In [0]:
#Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? 

In [0]:
# Income-bracket counts for men whose marital-status is one of the
# "no current spouse" categories listed below.
data.loc[
    data['gender'].eq('Male')
    & data['marital-status'].isin(['Never-married', 'Separated', 'Divorced', 'Widowed']),
    'income'
].value_counts()

<=50K    11414
>50K      1001
Name: income, dtype: int64

In [0]:
# Income-bracket counts for men whose marital-status starts with 'Married'.
data.loc[
    data['gender'].eq('Male') & data['marital-status'].str.startswith('Married'),
    'income'
].value_counts()

<=50K    11318
>50K      8917
Name: income, dtype: int64

In [0]:
# Row counts per marital-status value, most frequent first.
data.loc[:, 'marital-status'].value_counts()

Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: marital-status, dtype: int64

In [0]:
# Longest working week reported in the data.
max_load = data['hours-per-week'].max()
print("Max time - {0} hours./week.".format(max_load))

# Rows that report exactly that maximum; the mask is built once and reused.
works_max_load = data['hours-per-week'] == max_load
num_workaholics = len(data[works_max_load])
print("Total number of such hard workers {0}".format(num_workaholics))

# Fraction of those rows in the >50K bracket. int() truncates the percentage
# towards zero rather than rounding it.
rich_workaholics = len(data[works_max_load & (data['income'] == '>50K')])
rich_share = rich_workaholics / num_workaholics
print("Percentage of rich among them {0}%".format(int(100 * rich_share)))

Max time - 99 hours./week.
Total number of such hard workers 137
Percentage of rich among them 29%


In [0]:
# Mean weekly working hours for every (native-country, income bracket) pair,
# printed one pair per line and rounded to two decimals.
groups_by_country_income = data.groupby(['native-country', 'income'])
for (country, salary), sub_df in groups_by_country_income:
    mean_hours = sub_df['hours-per-week'].mean()
    print(country, salary, round(mean_hours, 2))

? <=50K 39.55
? >50K 45.32
Cambodia <=50K 41.16
Cambodia >50K 43.89
Canada <=50K 37.38
Canada >50K 46.13
China <=50K 36.69
China >50K 42.03
Columbia <=50K 39.12
Columbia >50K 56.25
Cuba <=50K 39.2
Cuba >50K 42.85
Dominican-Republic <=50K 41.56
Dominican-Republic >50K 42.8
Ecuador <=50K 37.95
Ecuador >50K 47.83
El-Salvador <=50K 35.82
El-Salvador >50K 43.45
England <=50K 39.38
England >50K 46.3
France <=50K 40.09
France >50K 46.5
Germany <=50K 38.9
Germany >50K 45.71
Greece <=50K 41.87
Greece >50K 55.56
Guatemala <=50K 38.79
Guatemala >50K 36.67
Haiti <=50K 36.41
Haiti >50K 40.67
Holand-Netherlands <=50K 40.0
Honduras <=50K 34.06
Honduras >50K 50.0
Hong <=50K 39.0
Hong >50K 43.75
Hungary <=50K 34.08
Hungary >50K 46.33
India <=50K 38.04
India >50K 46.27
Iran <=50K 39.97
Iran >50K 47.95
Ireland <=50K 41.27
Ireland >50K 45.18
Italy <=50K 38.69
Italy >50K 45.65
Jamaica <=50K 38.63
Jamaica >50K 42.4
Japan <=50K 39.73
Japan >50K 47.06
Laos <=50K 39.33
Laos >50K 40.0
Mexico <=50K 39.93
Mexico 