# Author: Yury Kashnitsky. Translated and edited by Sergey Isaev, Artem Trunov, Anastasia Manokhina, and Yuanyuan Pao. All content is distributed under the Creative Commons CC BY-NC-SA 4.0 license. https://mlcourse.ai/

In [1]:
import numpy as np
import pandas as pd

# Use the option's full canonical name. pandas treats the key as a regular
# expression that must match exactly one option, so "display.max.columns" only
# worked because its dots happened to match the underscore in "max_columns".
pd.set_option("display.max_columns", 100)
# to draw pictures in jupyter notebook
%matplotlib inline
# we don't like warnings
# you can comment out `import warnings` and the `warnings.filterwarnings("ignore")` call below if you'd like to see them
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

In [2]:
# Base location of the course datasets; the adult census CSV lives under it.
DATA_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"
adult_csv_url = f"{DATA_URL}adult.data.csv"

# Read the CSV into the notebook-wide frame and preview its first rows.
df = pd.read_csv(adult_csv_url)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# 1. How many men and women (sex feature) are represented in this dataset?

In [3]:
# Total number of rows with a recorded sex: add up the per-category counts.
sex_counts = df['sex'].value_counts()
sum_data = sex_counts.sum()
sum_data

32561

In [4]:
# Relative frequency of each value of the sex feature (fractions summing to 1).
sex_column = df['sex']
repartition = sex_column.value_counts(normalize=True)
repartition

sex
Male      0.669205
Female    0.330795
Name: proportion, dtype: float64

In [5]:
# Look the shares up by label. Positional access such as repartition[0] on a
# string-indexed Series is deprecated in pandas 2.x, and it would silently swap
# the two numbers if 'Female' ever became the most frequent value.
share_men = repartition["Male"]
share_women = repartition["Female"]
print(f"There is a total of {sum_data} people in the dataset, with a distribution of {share_men} men and {share_women} women.")


There is a total of 32561 people in the dataset, with a distribution of 0.6692054912318418 men and 0.33079450876815825 women.


# 2. What is the average age (age feature) of women?

In [6]:
# Keep only the rows whose sex feature is 'Female'.
is_female = df['sex'].eq('Female')
female_df = df.loc[is_female]

In [7]:
# Average of the age feature over the female-only subset.
female_ages = female_df['age']
female_ages.mean()

36.85823043357163

# 3. What is the percentage of German citizens (native-country feature)?

In [8]:
# Distinct values of native-country, in order of first appearance.
pd.unique(df['native-country'])

array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
       'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [9]:
# Share of every native-country value, then the 'Germany' entry as a percentage.
country_share = df['native-country'].value_counts(normalize=True)
proportion_germany = country_share.loc['Germany'] * 100
proportion_germany

0.42074874850281013

# 4-5. What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?

In [10]:
# Distinct labels of the salary feature.
pd.unique(df['salary'])

array(['<=50K', '>50K'], dtype=object)

In [25]:
# Subset of people whose salary label is '>50K', and the spread of their ages
more = df.loc[df['salary'].eq('>50K')]
ages_of_more = more['age']
more_std = ages_of_more.std()
more_mean = ages_of_more.mean()
print(f"The average age of people earning more than 50K is {more_mean:.2f} years with a standard deviation of {more_std:.2f}.")

The average age of people earning more than 50K is 44.25 years with a standard deviation of 10.52.


In [12]:
# Subset of people whose salary label is '<=50K', and the spread of their ages
less = df.loc[df['salary'].eq('<=50K')]
ages_of_less = less['age']
less_std = ages_of_less.std()
less_mean = ages_of_less.mean()
print(f"The average age of people earning less than 50K is {less_mean:.2f} years with a standard deviation of {less_std:.2f}.")

The average age of people earning less than 50K is 36.78 years with a standard deviation of 14.02.


# 6. Is it true that people who earn more than 50K have at least high school education?

In [13]:
# Distinct education levels present in the dataset.
pd.unique(df['education'])

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [14]:
# How many '>50K' earners fall into each education level (most frequent first).
education_of_more = more['education']
education_of_more.value_counts()

education
Bachelors       2221
HS-grad         1675
Some-college    1387
Masters          959
Prof-school      423
Assoc-voc        361
Doctorate        306
Assoc-acdm       265
10th              62
11th              60
7th-8th           40
12th              33
9th               27
5th-6th           16
1st-4th            6
Name: count, dtype: int64

# 7. Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.

In [23]:
# Summary statistics of age for every (race, sex) combination.
age_by_race_and_sex = df.groupby(['race', 'sex'])['age']
age_by_race_and_sex.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
race,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Amer-Indian-Eskimo,Female,119.0,37.117647,13.114991,17.0,27.0,36.0,46.0,80.0
Amer-Indian-Eskimo,Male,192.0,37.208333,12.049563,17.0,28.0,35.0,45.0,82.0
Asian-Pac-Islander,Female,346.0,35.089595,12.300845,17.0,25.0,33.0,43.75,75.0
Asian-Pac-Islander,Male,693.0,39.073593,12.883944,18.0,29.0,37.0,46.0,90.0
Black,Female,1555.0,37.854019,12.637197,17.0,28.0,37.0,46.0,90.0
Black,Male,1569.0,37.6826,12.882612,17.0,27.0,36.0,46.0,90.0
Other,Female,109.0,31.678899,11.631599,17.0,23.0,29.0,39.0,74.0
Other,Male,162.0,34.654321,11.355531,17.0,26.0,32.0,42.0,77.0
White,Female,8642.0,36.811618,14.329093,17.0,25.0,35.0,46.0,90.0
White,Male,19174.0,39.652498,13.436029,17.0,29.0,38.0,49.0,90.0


# 8. Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors.

In [36]:
# Work on an independent copy of the male rows so a helper column can be added.
men = df.loc[df['sex'] == 'Male'].copy()
married_statuses = ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']

# Label every man 'Married' when his marital-status is one of the married
# statuses, 'Single' otherwise.
is_married = men['marital-status'].isin(married_statuses)
men['marital_group'] = is_married.map({True: 'Married', False: 'Single'})

# Fraction of '>50K' earners inside each marital group.
earns_a_lot = men['salary'] == '>50K'
result = earns_a_lot.groupby(men['marital_group']).mean()

print(result)

print("Groupe avec la plus grande proportion de hauts revenus (>50K) :", result.idxmax())

marital_group
Married    0.440514
Single     0.084495
Name: salary, dtype: float64
Groupe avec la plus grande proportion de hauts revenus (>50K) : Married


# 9. What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number of hours, and what is the percentage of those who earn a lot (>50K) among them?

In [38]:
# List the column labels of the frame (the next cells use 'hours-per-week' and 'salary').
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [40]:
# Largest value recorded in the hours-per-week feature.
weekly_hours = df['hours-per-week']
max_hours = weekly_hours.max()
max_hours

99

In [48]:
# Rows of the people who work exactly the maximum number of weekly hours,
# and how many of them there are.
works_max_hours = df['hours-per-week'].eq(max_hours)
max_hour_workers = df.loc[works_max_hours]
count = max_hour_workers.shape[0]
count

85

In [50]:
# Percentage of '>50K' earners among the people working the maximum hours.
is_rich = max_hour_workers['salary'].eq('>50K')
rich_percentage = is_rich.mean() * 100
rich_percentage

29.411764705882355

In [51]:
# Report the three answers to question 9, one per line.
summary_lines = [
    f"Nombre maximum d'heures travaillées par semaine : {max_hours}",
    f"Nombre de personnes qui travaillent {max_hours} heures : {count}",
    f"Pourcentage de ceux qui gagnent >50K : {rich_percentage:.2f}%",
]
print("\n".join(summary_lines))

Nombre maximum d'heures travaillées par semaine : 99
Nombre de personnes qui travaillent 99 heures : 85
Pourcentage de ceux qui gagnent >50K : 29.41%


# 10. Count the average time of work (hours-per-week) for those who earn a little and a lot (salary) for each country (native-country). What will these be for Japan?

In [61]:
# Column labels again, for reference (the cells below use 'hours-per-week', 'native-country' and 'salary').
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [63]:
# Average weekly working hours for each native country (all salary levels together).
hours_by_country = df.groupby('native-country')['hours-per-week']
hours_by_country.mean()

native-country
?                             41.512864
Cambodia                      40.894737
Canada                        40.404959
China                         37.786667
Columbia                      39.067797
Cuba                          39.157895
Dominican-Republic            42.471429
Ecuador                       39.571429
El-Salvador                   36.792453
England                       41.833333
France                        45.068966
Germany                       41.014599
Greece                        44.241379
Guatemala                     39.234375
Haiti                         36.909091
Holand-Netherlands            40.000000
Honduras                      36.307692
Hong                          40.900000
Hungary                       35.615385
India                         41.530000
Iran                          43.976744
Ireland                       42.416667
Italy                         41.602740
Jamaica                       38.592593
Japan                    

In [65]:
# Average weekly hours per (native-country, salary) pair, then keep Japan's two
# salary rows by selecting on the first index level.
mean_hours = df.groupby(['native-country', 'salary'])['hours-per-week'].mean()
mean_hours.loc['Japan']

salary
<=50K    41.000000
>50K     47.958333
Name: hours-per-week, dtype: float64