In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import plotly.express as px

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", None)
warnings.filterwarnings("ignore")


DATA_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

data = pd.read_csv(DATA_URL + "adult.data.csv")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
pd.DataFrame(data.select_dtypes("object").apply(lambda col: col.unique(), 
                                                axis=0), 
            columns=['unique'])

Unnamed: 0,unique
workclass,"[State-gov, Self-emp-not-inc, Private, Federal-gov, Local-gov, ?, Self-emp-inc, Without-pay, Never-worked]"
education,"[Bachelors, HS-grad, 11th, Masters, 9th, Some-college, Assoc-acdm, Assoc-voc, 7th-8th, Doctorate, Prof-school, 5th-6th, 10th, 1st-4th, Preschool, 12th]"
marital-status,"[Never-married, Married-civ-spouse, Divorced, Married-spouse-absent, Separated, Married-AF-spouse, Widowed]"
occupation,"[Adm-clerical, Exec-managerial, Handlers-cleaners, Prof-specialty, Other-service, Sales, Craft-repair, Transport-moving, Farming-fishing, Machine-op-inspct, Tech-support, ?, Protective-serv, Armed-Forces, Priv-house-serv]"
relationship,"[Not-in-family, Husband, Wife, Own-child, Unmarried, Other-relative]"
race,"[White, Black, Asian-Pac-Islander, Amer-Indian-Eskimo, Other]"
sex,"[Male, Female]"
native-country,"[United-States, Cuba, Jamaica, India, ?, Mexico, South, Puerto-Rico, Honduras, England, Canada, Germany, Iran, Philippines, Italy, Poland, Columbia, Cambodia, Thailand, Ecuador, Laos, Taiwan, Haiti, Portugal, Dominican-Republic, El-Salvador, France, Guatemala, China, Japan, Yugoslavia, Peru, Outlying-US(Guam-USVI-etc), Scotland, Trinadad&Tobago, Greece, Nicaragua, Vietnam, Hong, Ireland, Hungary, Holand-Netherlands]"
salary,"[<=50K, >50K]"


In [11]:
# 1. How many men and women (sex feature) are represented in this dataset?
# data['sex'].value_counts() # Численное представление
# data['sex'].value_counts().plot(kind='bar') # встроенными средствами pandas
px.bar(data['sex'].value_counts().reset_index(), x='sex', y='count', 
       labels={'sex': 'Sex', 'count': 'Count'},
       title='Sex Distribution', color='sex') # с помощью plotly

In [13]:
# 2. What is the average age (age feature) of women?
data[data['sex']=='Female']['age'].mean()

36.85823043357163

In [19]:
# 3. What is the percentage of German citizens (native-country feature)?
data['native-country'].value_counts(normalize=True)['Germany']

0.004207487485028101

In [20]:
# 4-5. What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?
data.groupby(['salary'])['age'].agg(['mean', 'std'])

Unnamed: 0_level_0,mean,std
salary,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,36.783738,14.020088
>50K,44.249841,10.519028


In [42]:
# 6. Is it true that people who earn more than 50K have at least high school education? (education – Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters or Doctorate feature)
data['High School'] = data['education'].isin(['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate'])
pd.crosstab(data['salary'], data['High School'], normalize=True)

High School,False,True
salary,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,0.575504,0.183686
>50K,0.101533,0.139277


In [38]:
# 7. Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.
data.pivot_table(values=['age'], index=['race','sex'], aggfunc=['mean','median','min','max'], margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,min,max
Unnamed: 0_level_1,Unnamed: 1_level_1,age,age,age,age
race,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Amer-Indian-Eskimo,Female,37.117647,36.0,17,80
Amer-Indian-Eskimo,Male,37.208333,35.0,17,82
Asian-Pac-Islander,Female,35.089595,33.0,17,75
Asian-Pac-Islander,Male,39.073593,37.0,18,90
Black,Female,37.854019,37.0,17,90
Black,Male,37.6826,36.0,17,90
Other,Female,31.678899,29.0,17,74
Other,Male,34.654321,32.0,17,77
White,Female,36.811618,35.0,17,90
White,Male,39.652498,38.0,17,90


In [60]:
# 8. Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors.
data['married'] = data["marital-status"].str.startswith("Married")
df_male = data[data['sex']=='Male']
pd.crosstab(df_male['salary'], df_male['married'], normalize=True)

married,False,True
salary,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,0.346581,0.347682
>50K,0.031987,0.273749


In [56]:
# 9. What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number of hours, and what is the percentage of those who earn a lot (>50K) among them?
max_hours_per_week = data['hours-per-week'].max()
print("Maximum hours per week: ", max_hours_per_week)
data['max_hours_per_week'] = data['hours-per-week'] == max_hours_per_week
pd.crosstab(data['salary'], data['max_hours_per_week'], normalize=True, margins=True)

Maximum hours per week:  99


max_hours_per_week,False,True,All
salary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<=50K,0.757348,0.001843,0.75919
>50K,0.240042,0.000768,0.24081
All,0.99739,0.00261,1.0


In [68]:
# 10. Count the average time of work (hours-per-week) for those who earn a little and a lot (salary) for each country (native-country). What will these be for Japan?
df_Japan = data[data['native-country']=='Japan']
df_Japan.pivot_table(index=['native-country', 'salary'], values=['hours-per-week'], aggfunc=['mean','count'], margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
Unnamed: 0_level_1,Unnamed: 1_level_1,hours-per-week,hours-per-week
native-country,salary,Unnamed: 2_level_2,Unnamed: 3_level_2
Japan,<=50K,41.0,38
Japan,>50K,47.958333,24
All,,43.693548,62
